"""Provenance and confidence-tier tracking. The confidence tier is the most commercially important design decision in the pipeline (PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry a tier and the provenance needed to justify it. Tier A — measured data, peer-reviewed source, n>10 per group, recent Tier B — measured but small-n, older, or single-source Tier C — inferred / extrapolated / hypothesis-only """ from __future__ import annotations from datetime import date from enum import Enum from pydantic import BaseModel, Field class ConfidenceTier(str, Enum): """Confidence tier for a persisted artifact. See module docstring.""" A = "A" B = "B" C = "C" class Provenance(BaseModel): """Where a record came from and when. Attached to every persisted artifact.""" source: str = Field(..., description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.") source_id: str | None = Field( None, description="Accession / identifier within the source, e.g. 'GSE53441'." ) source_url: str | None = None source_version: str | None = Field( None, description="Dataset/release version where the source is versioned." ) download_date: date | None = Field( None, description="Date the underlying data was downloaded (reproducibility)." ) license: str | None = None notes: str | None = None def assign_tier( *, is_measured: bool, n_per_group: int | None, peer_reviewed: bool, single_source: bool, ) -> ConfidenceTier: """Assign a confidence tier from the evidence characteristics. This encodes the tier rules from PLAN.md §3 so tier assignment is consistent and auditable rather than ad-hoc per notebook. Args: is_measured: True if the value is directly measured (vs inferred/extrapolated). n_per_group: Sample size per group, if applicable (None when not meaningful). peer_reviewed: Whether the source is peer-reviewed. single_source: Whether the evidence rests on a single source. Returns: The assigned ``ConfidenceTier``. """ if not is_measured: return ConfidenceTier.C if peer_reviewed and (n_per_group is not None and n_per_group > 10) and not single_source: return ConfidenceTier.A # Measured, but small-n / older / single-source falls to Tier B. return ConfidenceTier.B