Scaffold Reverso MVP pipeline structure

Set up the project skeleton per PLAN.md §4:
- src/ package: identifiers, disease, drugs, scoring, provenance
  with pydantic schemas and confidence-tier logic (working);
  data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring
  reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-23 20:19:38 +02:00
parent e717cf40ed
commit b731478f5d
25 changed files with 1038 additions and 4 deletions

72
src/provenance.py Normal file
View File

@@ -0,0 +1,72 @@
"""Provenance and confidence-tier tracking.
The confidence tier is the most commercially important design decision in the pipeline
(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
a tier and the provenance needed to justify it.
Tier A — measured data, peer-reviewed source, n>10 per group, recent, not single-source
Tier B — measured but small-n, older, or single-source
Tier C — inferred / extrapolated / hypothesis-only
"""
from __future__ import annotations
from datetime import date
from enum import Enum
from pydantic import BaseModel, Field
class ConfidenceTier(str, Enum):
    """Closed set of confidence tiers carried by persisted artifacts.

    Members also subclass ``str``, so each member compares equal to its
    one-letter value (``ConfidenceTier.A == "A"``).
    """

    # Measured data, peer-reviewed source, n>10 per group, recent.
    A = "A"

    # Measured, but small-n, older, or single-source.
    B = "B"

    # Inferred / extrapolated / hypothesis-only.
    C = "C"
class Provenance(BaseModel):
    """Where a record came from and when. Attached to every persisted artifact."""

    # ``source`` is the only required field; everything below it is optional
    # and defaults to ``None``.
    source: str = Field(
        ...,
        description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.",
    )

    # Identifier of the record inside the source.
    source_id: str | None = Field(
        default=None,
        description="Accession / identifier within the source, e.g. 'GSE53441'.",
    )

    # Free-form link to the source; no description metadata is attached.
    source_url: str | None = None

    # Release tag, for sources that publish versioned datasets.
    source_version: str | None = Field(
        default=None,
        description="Dataset/release version where the source is versioned.",
    )

    # Calendar date (no time component) of the download.
    download_date: date | None = Field(
        default=None,
        description="Date the underlying data was downloaded (reproducibility).",
    )

    # Optional free-text fields without description metadata.
    license: str | None = None
    notes: str | None = None
def assign_tier(
    *,
    is_measured: bool,
    n_per_group: int | None,
    peer_reviewed: bool,
    single_source: bool,
) -> ConfidenceTier:
    """Map evidence characteristics onto a ``ConfidenceTier``.

    Centralising the rule here keeps tier assignment uniform and auditable
    instead of being re-derived ad hoc in each notebook (rules: PLAN.md §3).

    Decision order:
        1. Not measured -> Tier C.
        2. Measured, peer-reviewed, more than 10 samples per group, and not
           single-source -> Tier A.
        3. Any other measured evidence -> Tier B.

    Args:
        is_measured: True if the value is directly measured (vs
            inferred/extrapolated).
        n_per_group: Sample size per group; ``None`` when a sample size is
            not meaningful. ``None`` never satisfies the Tier A threshold.
        peer_reviewed: Whether the source is peer-reviewed.
        single_source: Whether the evidence rests on a single source.

    Returns:
        The assigned ``ConfidenceTier``.
    """
    # NOTE(review): the tier definitions mention recency ("recent" / "older"),
    # but this function takes no recency input, so age does not influence the
    # result here — confirm whether that is intended.

    # Anything not directly measured is Tier C regardless of other inputs.
    if not is_measured:
        return ConfidenceTier.C

    # From here on the evidence is measured; each failed Tier A requirement
    # drops it to Tier B. Checks run in the same order as the criteria are
    # listed: peer review, sample size, then source count.
    if not peer_reviewed:
        return ConfidenceTier.B
    if n_per_group is None or not n_per_group > 10:
        # Threshold is strict: exactly 10 per group is still Tier B.
        return ConfidenceTier.B
    if single_source:
        return ConfidenceTier.B

    return ConfidenceTier.A