Set up the project skeleton per PLAN.md §4: - src/ package: identifiers, disease, drugs, scoring, provenance with pydantic schemas and confidence-tier logic (working); data-pull/compute functions stubbed per their build week - 5 starter notebooks (01-05) with PLAN-referenced steps - tests/test_scoring.py: tier-assignment tests pass; scoring reference test xfail until Week 3 - docs/: recovery_test_report, data_sources, known_limitations skeletons - pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README - data/ tree preserved via .gitkeep; raw/processed/results gitignored Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
73 lines
2.4 KiB
Python
73 lines
2.4 KiB
Python
"""Provenance and confidence-tier tracking.
|
|
|
|
The confidence tier is the most commercially important design decision in the pipeline
|
|
(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
|
|
a tier and the provenance needed to justify it.
|
|
|
|
Tier A — measured data, peer-reviewed source, n>10 per group, recent
|
|
Tier B — measured but small-n, older, or single-source
|
|
Tier C — inferred / extrapolated / hypothesis-only
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import date
|
|
from enum import Enum
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class ConfidenceTier(str, Enum):
    """Closed set of confidence tiers a persisted artifact can carry.

    Members mix in ``str``, so each one compares equal to its single-letter
    value. The meaning of each tier is spelled out in the module docstring.
    """

    # Measured data, peer-reviewed source, n>10 per group, recent.
    A = "A"
    # Measured, but small-n, older, or single-source.
    B = "B"
    # Inferred / extrapolated / hypothesis-only.
    C = "C"
|
|
|
|
|
|
class Provenance(BaseModel):
    """Where a record came from and when. Attached to every persisted artifact."""

    # --- Required -----------------------------------------------------------
    # Only `source` must be supplied; every field below defaults to None.
    source: str = Field(..., description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.")

    # --- Locating the record inside the source ------------------------------
    source_id: str | None = Field(
        default=None,
        description="Accession / identifier within the source, e.g. 'GSE53441'.",
    )
    source_url: str | None = None

    # --- Reproducibility: which release was used, and when it was pulled ----
    source_version: str | None = Field(
        default=None,
        description="Dataset/release version where the source is versioned.",
    )
    download_date: date | None = Field(
        default=None,
        description="Date the underlying data was downloaded (reproducibility).",
    )

    # --- Free-form extras ---------------------------------------------------
    license: str | None = None
    notes: str | None = None
|
|
|
|
|
|
def assign_tier(
    *,
    is_measured: bool,
    n_per_group: int | None,
    peer_reviewed: bool,
    single_source: bool,
    is_recent: bool = True,
) -> ConfidenceTier:
    """Assign a confidence tier from the evidence characteristics.

    This encodes the tier rules from PLAN.md §3 so tier assignment is consistent and
    auditable rather than ad-hoc per notebook.

    Rules, in order:

    * Not measured (inferred / extrapolated / hypothesis-only) -> Tier C.
    * Measured, peer-reviewed, n>10 per group, recent, and backed by more than one
      source -> Tier A.
    * Any other measured evidence (small-n, unknown n, older, single-source, or not
      peer-reviewed) -> Tier B.

    Args:
        is_measured: True if the value is directly measured (vs inferred/extrapolated).
        n_per_group: Sample size per group, if applicable (None when not meaningful).
            ``None`` never satisfies the n>10 requirement, so it cannot yield Tier A.
        peer_reviewed: Whether the source is peer-reviewed.
        single_source: Whether the evidence rests on a single source.
        is_recent: Whether the evidence counts as recent. Defaults to ``True`` so
            existing callers that do not pass it get the same tier as before; pass
            ``False`` to demote older measured evidence to Tier B.

    Returns:
        The assigned ``ConfidenceTier``.
    """
    # Anything that was not directly measured is capped at Tier C, regardless of
    # how strong the other characteristics look.
    if not is_measured:
        return ConfidenceTier.C

    # Strictly greater than 10; an unknown sample size is treated as insufficient.
    has_adequate_n = n_per_group is not None and n_per_group > 10

    # Tier A needs every quality criterion to hold at once.
    if peer_reviewed and has_adequate_n and is_recent and not single_source:
        return ConfidenceTier.A

    # Measured, but small-n / older / single-source falls to Tier B.
    return ConfidenceTier.B
|