# Reverso/src/provenance.py

"""Provenance and confidence-tier tracking.

The confidence tier is the most commercially important design decision in the pipeline
(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
a tier and the provenance needed to justify it.

    Tier A — measured data, peer-reviewed source, n>10 per group, recent
    Tier B — measured but small-n, older, or single-source
    Tier C — inferred / extrapolated / hypothesis-only
"""

from __future__ import annotations

from datetime import date
from enum import Enum

from pydantic import BaseModel, Field


class ConfidenceTier(str, Enum):
    """Evidence-strength grade carried by a persisted artifact.

    Mixing in ``str`` makes every member compare equal to its one-letter value,
    e.g. ``ConfidenceTier.A == "A"``.
    """

    # Measured data, peer-reviewed source, n>10 per group, recent.
    A = "A"
    # Measured, but small-n, older, or single-source.
    B = "B"
    # Inferred / extrapolated / hypothesis-only.
    C = "C"


class Provenance(BaseModel):
    """Origin metadata for a record: which source it came from and when it was fetched.

    Only ``source`` is required; every other field is optional and defaults to ``None``.
    """

    source: str = Field(
        default=..., description="Human-readable source name, e.g. 'GEO', 'ChEMBL'."
    )
    source_id: str | None = Field(
        default=None,
        description="Accession / identifier within the source, e.g. 'GSE53441'.",
    )
    source_url: str | None = None
    source_version: str | None = Field(
        default=None,
        description="Dataset/release version where the source is versioned.",
    )
    download_date: date | None = Field(
        default=None,
        description="Date the underlying data was downloaded (reproducibility).",
    )
    # The remaining optional fields carry no description metadata.
    license: str | None = None
    notes: str | None = None


def assign_tier(
    *,
    is_measured: bool,
    n_per_group: int | None,
    peer_reviewed: bool,
    single_source: bool,
    is_recent: bool = True,
) -> ConfidenceTier:
    """Assign a confidence tier from the evidence characteristics.

    This encodes the tier rules from PLAN.md §3 so tier assignment is consistent and
    auditable rather than ad-hoc per notebook:

        Tier A — measured, peer-reviewed, n>10 per group, recent, multi-source
        Tier B — measured, but missing at least one Tier A criterion
        Tier C — not measured (inferred / extrapolated / hypothesis-only)

    Args:
        is_measured: True if the value is directly measured (vs inferred/extrapolated).
        n_per_group: Sample size per group, if applicable (None when not meaningful).
            ``None`` never satisfies the n>10 requirement.
        peer_reviewed: Whether the source is peer-reviewed.
        single_source: Whether the evidence rests on a single source.
        is_recent: Whether the evidence counts as recent. What "recent" means is left
            to the caller. Defaults to True so callers that do not pass it get the
            same tier as before this parameter existed.

    Returns:
        The assigned ``ConfidenceTier``.
    """
    # Anything not directly measured is Tier C regardless of the other inputs.
    if not is_measured:
        return ConfidenceTier.C

    # The threshold is strict: exactly 10 per group does not qualify for Tier A.
    has_adequate_n = n_per_group is not None and n_per_group > 10
    if peer_reviewed and has_adequate_n and is_recent and not single_source:
        return ConfidenceTier.A

    # Measured, but small-n / older / single-source / not peer-reviewed falls to Tier B.
    return ConfidenceTier.B