Scaffold Reverso MVP pipeline structure
Set up the project skeleton per PLAN.md §4:

- src/ package: identifiers, disease, drugs, scoring, provenance with pydantic schemas and confidence-tier logic (working); data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
72
src/provenance.py
Normal file
72
src/provenance.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Provenance and confidence-tier tracking.
|
||||
|
||||
The confidence tier is the most commercially important design decision in the pipeline
(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
a tier and the provenance needed to justify it.

    Tier A — measured data, peer-reviewed source, n>10 per group, recent
    Tier B — measured but small-n, older, single-source, or not peer-reviewed
    Tier C — inferred / extrapolated / hypothesis-only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ConfidenceTier(str, Enum):
    """Confidence tier for a persisted artifact. See module docstring.

    Members also subclass ``str``, so each one compares equal to its
    one-letter value (``ConfidenceTier.A == "A"``) and can be recovered from
    it (``ConfidenceTier("A")``).
    """

    # Tier meanings below are summarized from the module docstring.
    A = "A"  # measured, peer-reviewed, n>10 per group, recent
    B = "B"  # measured but small-n, older, or single-source
    C = "C"  # inferred / extrapolated / hypothesis-only
|
||||
|
||||
|
||||
class Provenance(BaseModel):
    """Where a record came from and when. Attached to every persisted artifact.

    Only ``source`` is required; every other field is optional and defaults
    to ``None``.
    """

    # Required: ``...`` as the default marks the field as mandatory.
    source: str = Field(..., description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.")
    source_id: str | None = Field(
        None, description="Accession / identifier within the source, e.g. 'GSE53441'."
    )
    source_url: str | None = None
    source_version: str | None = Field(
        None, description="Dataset/release version where the source is versioned."
    )
    download_date: date | None = Field(
        None, description="Date the underlying data was downloaded (reproducibility)."
    )
    license: str | None = None
    notes: str | None = None
|
||||
|
||||
|
||||
def assign_tier(
    *,
    is_measured: bool,
    n_per_group: int | None,
    peer_reviewed: bool,
    single_source: bool,
    is_recent: bool = True,
) -> ConfidenceTier:
    """Assign a confidence tier from the evidence characteristics.

    This encodes the tier rules from PLAN.md §3 so tier assignment is consistent and
    auditable rather than ad-hoc per notebook.

    Decision order:

    1. Anything not directly measured is Tier C, regardless of the other flags.
    2. Measured evidence is Tier A only when it is peer-reviewed, has more than
       10 samples per group, rests on more than one source, and is recent.
    3. All other measured evidence is Tier B.

    Args:
        is_measured: True if the value is directly measured (vs inferred/extrapolated).
        n_per_group: Sample size per group, if applicable (None when not meaningful).
            ``None`` never satisfies the Tier A sample-size requirement.
        peer_reviewed: Whether the source is peer-reviewed.
        single_source: Whether the evidence rests on a single source.
        is_recent: Whether the evidence counts as recent. The module-level tier
            rules list recency as a Tier A requirement; what qualifies as
            "recent" is left to the caller. Defaults to True so callers that do
            not pass it get the same result as before this parameter existed.

    Returns:
        The assigned ``ConfidenceTier``.
    """
    if not is_measured:
        return ConfidenceTier.C

    # Strictly greater than 10: exactly 10 per group does not qualify.
    has_adequate_n = n_per_group is not None and n_per_group > 10
    if peer_reviewed and has_adequate_n and not single_source and is_recent:
        return ConfidenceTier.A

    # Measured, but small-n / older / single-source falls to Tier B.
    return ConfidenceTier.B
|
||||
Reference in New Issue
Block a user