Scaffold Reverso MVP pipeline structure

Set up the project skeleton per PLAN.md §4:
- src/ package: identifiers, disease, drugs, scoring, provenance with pydantic schemas and confidence-tier logic (working); data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 20:19:38 +02:00
parent e717cf40ed
commit b731478f5d
25 changed files with 1038 additions and 4 deletions
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1,36 @@
+"""Reverso MVP — sickle cell drug repurposing pipeline.
+
+A disease-signature + drug-profile matching pipeline using CMap-style connectivity
+scoring. See PLAN.md for the full specification.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+__version__ = "0.1.0"
+PIPELINE_VERSION = "v1"
+
+# Reproducibility anchor (PLAN.md §8): every source of randomness in the pipeline
+# is expected to be seeded from this single value.
+RANDOM_SEED = 42
+
+# Project directory layout. This module sits one directory below the repo root, so
+# the root is the parent of the (resolved) package directory.
+_PACKAGE_DIR = Path(__file__).resolve().parent
+REPO_ROOT = _PACKAGE_DIR.parent
+DOCS_DIR = REPO_ROOT / "docs"
+DATA_DIR = REPO_ROOT / "data"
+RAW_DIR, PROCESSED_DIR, RESULTS_DIR = (
+    DATA_DIR / leaf for leaf in ("raw", "processed", "results")
+)
+
+# Names exported by ``from <package> import *``.
+__all__ = [
+    "__version__",
+    "PIPELINE_VERSION",
+    "RANDOM_SEED",
+    "REPO_ROOT",
+    "DATA_DIR",
+    "RAW_DIR",
+    "PROCESSED_DIR",
+    "RESULTS_DIR",
+    "DOCS_DIR",
+]
--- a/src/disease.py
+++ b/src/disease.py
@@ -0,0 +1,106 @@
+"""Disease signature construction.
+
+Week 1 (PLAN.md §6). Builds a Tier-A sickle cell signature from GEO expression data via
+differential expression, then persists it with full provenance to
+``data/processed/sickle_cell_signature_v1.json``.
+
+This module defines the persisted schema (pydantic) and the construction stubs. The actual
+data pull + differential expression is driven from ``notebooks/02_disease_signature.ipynb``.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from . import PIPELINE_VERSION, PROCESSED_DIR
+from .provenance import ConfidenceTier
+
+# Number of genes to take per direction (PLAN.md §6, Week 1 task 5).
+# Serves as the default ``top_n`` argument of ``build_signature``.
+TOP_N_PER_DIRECTION = 250
+# Adjusted p-value (q-value) threshold; the default ``qvalue_cutoff`` argument of
+# ``build_signature``.
+QVALUE_CUTOFF = 0.05
+
+
+class GeneEntry(BaseModel):
+    """One differentially expressed gene as stored inside a signature.
+
+    The symbol and the two statistics are mandatory; the Entrez and Ensembl
+    cross-references are optional and default to ``None``.
+    """
+
+    gene: str = Field(description="HGNC gene symbol, e.g. 'HBG2'.")
+    entrez_id: str | None = Field(default=None)
+    ensembl_id: str | None = Field(default=None)
+    # Log fold change and adjusted p-value as produced by the differential
+    # expression step; this schema applies no range checks to either.
+    log_fc: float
+    qvalue: float
+
+
+class SignatureProvenance(BaseModel):
+    """Origin metadata stored alongside a disease signature (PLAN.md §6 schema).
+
+    Every field is required.
+    """
+
+    geo_accession: str
+    # Presumably the number of samples in each comparison group — confirm against
+    # the notebook that fills these in.
+    n_disease: int
+    n_healthy: int
+    platform: str
+    method: str = Field(description="Differential expression method, e.g. 'limma', 'deseq2'.")
+    # Stored as a plain string; the schema does not enforce a date format.
+    created_date: str
+
+
+class DiseaseSignature(BaseModel):
+    """Top-level record for the stored sickle cell signature (PLAN.md §6 schema).
+
+    The three identifying fields come pre-filled (signature id, MONDO disease id and
+    ``PIPELINE_VERSION``); the gene lists, provenance, tier, rationale and limitations
+    must be supplied by the caller.
+    """
+
+    signature_id: str = Field(default="sickle_cell_v1")
+    disease_mondo_id: str = Field(default="MONDO:0011382")
+    pipeline_version: str = Field(default=PIPELINE_VERSION)
+    # One gene list per direction of change.
+    up_regulated: list[GeneEntry]
+    down_regulated: list[GeneEntry]
+    provenance: SignatureProvenance
+    # The assigned tier plus the free-text justification for it.
+    confidence_tier: ConfidenceTier
+    tier_rationale: str
+    limitations: list[str]
+
+
+def compute_differential_expression(
+    expression: pd.DataFrame,
+    sample_groups: pd.Series,
+    *,
+    method: str,
+) -> pd.DataFrame:
+    """Derive per-gene log fold change and adjusted p-values (not implemented yet).
+
+    Planned behaviour (PLAN.md §6, Week 1 task 4): RNA-seq input goes through
+    ``pydeseq2``; microarray input is log2-transformed/normalized and handed to a
+    limma-equivalent.
+
+    Args:
+        expression: Expression matrix laid out as genes (rows) x samples (columns).
+        sample_groups: Group label for each sample ('disease' / 'healthy'), indexed by
+            sample.
+        method: Either 'deseq2' (RNA-seq) or 'limma' (microarray).
+
+    Returns:
+        A gene-indexed table that has at least ``log_fc`` and ``qvalue`` columns.
+
+    Raises:
+        NotImplementedError: Always, until the Week 1 implementation lands.
+    """
+    pending = "Differential expression: implement in Week 1 (notebook 02)."
+    raise NotImplementedError(pending)
+
+
+def build_signature(
+    de_table: pd.DataFrame,
+    provenance: SignatureProvenance,
+    *,
+    tier: ConfidenceTier,
+    tier_rationale: str,
+    limitations: list[str],
+    top_n: int = TOP_N_PER_DIRECTION,
+    qvalue_cutoff: float = QVALUE_CUTOFF,
+) -> DiseaseSignature:
+    """Turn a differential expression table into a ``DiseaseSignature`` (not implemented yet).
+
+    Planned behaviour (PLAN.md §6, Week 1 task 5): keep the ``top_n`` up-regulated and
+    the ``top_n`` down-regulated genes, selected by qvalue and cut at ``qvalue_cutoff``.
+
+    Raises:
+        NotImplementedError: Always, until the Week 1 implementation lands.
+    """
+    pending = "Signature assembly: implement in Week 1 (notebook 02)."
+    raise NotImplementedError(pending)
+
+
+def persist_signature(signature: DiseaseSignature, out_path: Path | None = None) -> Path:
+    """Serialise a signature to JSON on disk and return the path written.
+
+    Args:
+        signature: The signature to store; serialised with ``model_dump_json(indent=2)``.
+        out_path: Destination file. When omitted, the signature goes to
+            ``data/processed/sickle_cell_signature_v1.json``.
+
+    Returns:
+        The path that was written. Missing parent directories are created first.
+    """
+    if out_path is None:
+        out_path = PROCESSED_DIR / "sickle_cell_signature_v1.json"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    # Pin the encoding: without it ``write_text`` falls back to the locale's preferred
+    # encoding, and the JSON dump keeps non-ASCII characters unescaped, so the bytes on
+    # disk would differ between machines (or fail to encode at all).
+    out_path.write_text(signature.model_dump_json(indent=2), encoding="utf-8")
+    return out_path
--- a/src/drugs.py
+++ b/src/drugs.py
@@ -0,0 +1,85 @@
+"""Drug profile construction.
+
+Week 2 (PLAN.md §6). Curates the ~300-drug set, pulls ChEMBL structure/target data and LINCS
+L1000 signatures, and assembles ``data/processed/drug_profiles_v1.parquet``.
+
+The drug set is deliberately composed (PLAN.md §6, Week 2 task 1):
+    - ground truth (n=2): hydroxyurea, L-glutamine
+    - related-mechanism (n~50)
+    - negative controls (n~50)
+    - general random sample (n~200), fixed seed
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+
+import pandas as pd
+from pydantic import BaseModel, Field
+
+from . import PROCESSED_DIR, RANDOM_SEED
+from .provenance import ConfidenceTier, Provenance
+
+# LINCS L1000 landmark gene count (PLAN.md §6, Week 2 task 3).
+# This is the length expected of ``DrugProfile.lincs_signature`` vectors; the schema
+# itself does not enforce it.
+LINCS_LANDMARK_GENES = 978
+
+
+class InclusionReason(str, Enum):
+    """Why a drug is in the curated set (PLAN.md §6, Week 2 task 1).
+
+    Members are ``str`` subclasses, so each one compares equal to its plain string value.
+    The four categories mirror the planned composition of the drug set.
+    """
+
+    # Ground-truth drugs (n=2): hydroxyurea, L-glutamine.
+    GROUND_TRUTH = "ground_truth"
+    # Related-mechanism drugs (n~50).
+    RELATED_MECHANISM = "related_mechanism"
+    # Negative controls (n~50).
+    NEGATIVE_CONTROL = "negative_control"
+    # General random sample (n~200), drawn with a fixed seed.
+    GENERAL_SAMPLE = "general_sample"
+
+
+class DrugProfile(BaseModel):
+    """Schema for one row of the drug profile table (PLAN.md §6, Week 2 task 4).
+
+    ``chembl_id``, ``name``, ``inclusion_reason`` and ``confidence_tier`` are required;
+    structure, target, mechanism and signature data are optional.
+    """
+
+    chembl_id: str
+    name: str
+    inchikey: str | None = Field(default=None)
+    smiles: str | None = Field(default=None)
+    targets: list[str] = Field(default_factory=list)
+    mechanism_of_action: str | None = Field(default=None)
+    # LINCS landmark z-score vector (expected length 978), or None when the drug has
+    # no signature. The length is not validated here.
+    lincs_signature: list[float] | None = Field(default=None)
+    inclusion_reason: InclusionReason
+    provenance: list[Provenance] = Field(default_factory=list)
+    confidence_tier: ConfidenceTier
+
+
+def curate_drug_set(seed: int = RANDOM_SEED) -> pd.DataFrame:
+    """Assemble the deliberately-composed ~300-drug set (not implemented yet).
+
+    Args:
+        seed: Seed for the random sampling, kept fixed for reproducibility (PLAN.md §8).
+
+    Returns:
+        A table with at least ``chembl_id``, ``name`` and ``inclusion_reason`` columns.
+        The notebook writes it to ``data/processed/drug_set_v1.csv``.
+
+    Raises:
+        NotImplementedError: Always, until the Week 2 implementation lands.
+    """
+    pending = "Drug-set curation: implement in Week 2 (notebook 03)."
+    raise NotImplementedError(pending)
+
+
+def fetch_chembl_profile(chembl_id: str) -> dict:
+    """Look up one drug's structure, targets and mechanism in ChEMBL (not implemented yet).
+
+    The planned implementation goes through ``chembl_webresource_client``
+    (PLAN.md §6, Week 2 task 2).
+
+    Raises:
+        NotImplementedError: Always, until the Week 2 implementation lands.
+    """
+    pending = "ChEMBL fetch: implement in Week 2 (notebook 03)."
+    raise NotImplementedError(pending)
+
+
+def fetch_lincs_signature(chembl_id: str) -> list[float] | None:
+    """Retrieve a drug's LINCS L1000 Level-5 consensus (MODZ) signature (not implemented yet).
+
+    Planned contract (PLAN.md §6, Week 2 task 3): return a 978-length z-score vector,
+    or ``None`` when the drug has no signature (e.g. L-glutamine). Such gaps are to be
+    documented in docs/known_limitations.md.
+
+    Raises:
+        NotImplementedError: Always, until the Week 2 implementation lands.
+    """
+    pending = "LINCS fetch: implement in Week 2 (notebook 03)."
+    raise NotImplementedError(pending)
+
+
+def persist_drug_profiles(profiles: pd.DataFrame, out_path: Path | None = None) -> Path:
+    """Save the assembled drug profiles as parquet and return the path written.
+
+    Without an explicit ``out_path`` the file goes to
+    ``data/processed/drug_profiles_v1.parquet``. Missing parent directories are created,
+    and the frame's index is not written.
+    """
+    target = out_path if out_path else PROCESSED_DIR / "drug_profiles_v1.parquet"
+    target.parent.mkdir(parents=True, exist_ok=True)
+    profiles.to_parquet(target, index=False)
+    return target
--- a/src/identifiers.py
+++ b/src/identifiers.py
@@ -0,0 +1,71 @@
+"""Canonical identifier resolution and the pinned identifiers for the MVP.
+
+Week 1, task 1 (PLAN.md §6). The disease and causal gene identifiers are pinned constants
+so the whole pipeline resolves to the same canonical IDs. ``persist_identifiers`` writes them
+to ``data/processed/identifiers.json``.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from pydantic import BaseModel
+
+from . import PROCESSED_DIR
+
+# --- Pinned identifiers (PLAN.md §6, Week 1 task 1) -------------------------------------
+# These three mappings are what ``build_identifier_set`` packages into an
+# ``IdentifierSet``.
+
+# Disease identifiers for sickle cell, keyed by the issuing resource.
+SICKLE_CELL_IDS: dict[str, str] = {
+    "mondo": "MONDO:0011382",
+    "orphanet": "Orphanet:232",
+    "omim": "OMIM:603903",
+}
+
+# Identifiers for the causal gene (HBB): symbol plus Ensembl and HGNC accessions.
+HBB_GENE_IDS: dict[str, str] = {
+    "symbol": "HBB",
+    "ensembl": "ENSG00000244734",
+    "hgnc": "HGNC:4827",
+}
+
+# Ground-truth drugs for the recovery test (PLAN.md §6, Week 2 task 1).
+# Maps lower-case drug name -> ChEMBL ID.
+GROUND_TRUTH_DRUGS: dict[str, str] = {
+    "hydroxyurea": "CHEMBL467",
+    "l-glutamine": "CHEMBL930",
+}
+
+
+class IdentifierSet(BaseModel):
+    """The pinned identifier set persisted at the start of the pipeline.
+
+    All three fields are required string-to-string mappings.
+    """
+
+    # Disease identifiers (filled from SICKLE_CELL_IDS by build_identifier_set).
+    disease: dict[str, str]
+    # Causal gene identifiers (filled from HBB_GENE_IDS).
+    causal_gene: dict[str, str]
+    # Ground-truth drug name -> ChEMBL ID (filled from GROUND_TRUTH_DRUGS).
+    ground_truth_drugs: dict[str, str]
+
+
+def build_identifier_set() -> IdentifierSet:
+    """Package the module's pinned constants into an ``IdentifierSet`` for the MVP."""
+    pinned = {
+        "disease": SICKLE_CELL_IDS,
+        "causal_gene": HBB_GENE_IDS,
+        "ground_truth_drugs": GROUND_TRUTH_DRUGS,
+    }
+    return IdentifierSet(**pinned)
+
+
+def persist_identifiers(out_path: Path | None = None) -> Path:
+    """Write the pinned identifier set to disk as JSON.
+
+    Args:
+        out_path: Destination file. When omitted, the set goes to
+            ``data/processed/identifiers.json``.
+
+    Returns:
+        The path written. Missing parent directories are created first.
+    """
+    if out_path is None:
+        out_path = PROCESSED_DIR / "identifiers.json"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    # Pin the encoding so the artifact's bytes do not depend on the locale's preferred
+    # encoding, which is what ``write_text`` uses when none is given.
+    out_path.write_text(build_identifier_set().model_dump_json(indent=2), encoding="utf-8")
+    return out_path
+
+
+def resolve_drug_to_chembl(name_or_alias: str) -> str:
+    """Map a drug name or alias onto its canonical ChEMBL ID (not implemented yet).
+
+    The planned implementation goes through ``chembl_webresource_client`` and is
+    scheduled for Week 2 (PLAN.md §6, task 2).
+
+    Raises:
+        NotImplementedError: Always, until the Week 2 implementation lands.
+    """
+    pending = "Drug -> ChEMBL resolution: implement in Week 2 (notebook 03)."
+    raise NotImplementedError(pending)
--- a/src/provenance.py
+++ b/src/provenance.py
@@ -0,0 +1,72 @@
+"""Provenance and confidence-tier tracking.
+
+The confidence tier is the most commercially important design decision in the pipeline
+(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
+a tier and the provenance needed to justify it.
+
+    Tier A — measured data, peer-reviewed source, n>10 per group, recent
+    Tier B — measured but small-n, older, or single-source
+    Tier C — inferred / extrapolated / hypothesis-only
+"""
+
+from __future__ import annotations
+
+from datetime import date
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+
+class ConfidenceTier(str, Enum):
+    """Confidence tier for a persisted artifact. See module docstring.
+
+    Members are ``str`` subclasses, so each one compares equal to its single-letter value.
+    """
+
+    # Measured data, peer-reviewed source, n>10 per group, recent.
+    A = "A"
+    # Measured but small-n, older, or single-source.
+    B = "B"
+    # Inferred / extrapolated / hypothesis-only.
+    C = "C"
+
+
+class Provenance(BaseModel):
+    """Origin record for a piece of data: which source, which entry, and when fetched.
+
+    Meant to accompany every persisted artifact. Only ``source`` is required; every
+    other field defaults to ``None``.
+    """
+
+    source: str = Field(description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.")
+    source_id: str | None = Field(
+        default=None,
+        description="Accession / identifier within the source, e.g. 'GSE53441'.",
+    )
+    source_url: str | None = Field(default=None)
+    source_version: str | None = Field(
+        default=None,
+        description="Dataset/release version where the source is versioned.",
+    )
+    download_date: date | None = Field(
+        default=None,
+        description="Date the underlying data was downloaded (reproducibility).",
+    )
+    license: str | None = Field(default=None)
+    notes: str | None = Field(default=None)
+
+
+def assign_tier(
+    *,
+    is_measured: bool,
+    n_per_group: int | None,
+    peer_reviewed: bool,
+    single_source: bool,
+    is_recent: bool = True,
+) -> ConfidenceTier:
+    """Assign a confidence tier from the evidence characteristics.
+
+    This encodes the tier rules from PLAN.md §3 so tier assignment is consistent and
+    auditable rather than ad-hoc per notebook:
+
+        Tier A — measured, peer-reviewed, n>10 per group, recent, more than one source
+        Tier B — measured, but failing any of the Tier A conditions
+        Tier C — not measured (inferred / extrapolated / hypothesis-only)
+
+    Args:
+        is_measured: True if the value is directly measured (vs inferred/extrapolated).
+        n_per_group: Sample size per group, if applicable (None when not meaningful).
+            ``None`` never satisfies the Tier A size requirement.
+        peer_reviewed: Whether the source is peer-reviewed.
+        single_source: Whether the evidence rests on a single source.
+        is_recent: Whether the evidence counts as recent. What "recent" means is left
+            to the caller. Defaults to True so that callers which do not pass the
+            flag get the same result as before it existed.
+
+    Returns:
+        The assigned ``ConfidenceTier``.
+    """
+    if not is_measured:
+        return ConfidenceTier.C
+
+    # Tier A needs strictly more than 10 samples per group; an unknown size fails.
+    has_adequate_n = n_per_group is not None and n_per_group > 10
+    if peer_reviewed and has_adequate_n and is_recent and not single_source:
+        return ConfidenceTier.A
+
+    # Measured, but small-n / older / single-source / not peer-reviewed falls to Tier B.
+    return ConfidenceTier.B
--- a/src/scoring.py
+++ b/src/scoring.py
@@ -0,0 +1,85 @@
+"""CMap-style connectivity scoring — the matching engine.
+
+Week 3 (PLAN.md §6). Scores each drug's LINCS signature against the disease signature using
+weighted Kolmogorov-Smirnov enrichment (Lamb 2006 / Subramanian 2017). Strongly *negative*
+connectivity = strong reversal of the disease signature = candidate match.
+
+Uses ``cmapPy`` as the reference implementation. ``tests/test_scoring.py`` verifies the
+implementation against a known reference.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+from pydantic import BaseModel
+
+from . import RESULTS_DIR
+
+
+class ConnectivityResult(BaseModel):
+    """Connectivity score for a single drug against the disease signature.
+
+    ``chembl_id``, ``drug_name``, ``connectivity_score`` and ``scored`` must always be
+    passed (``connectivity_score`` may be passed as ``None``); the remaining fields
+    default to ``None``. The schema does not cross-check ``scored`` against
+    ``connectivity_score``.
+    """
+
+    chembl_id: str
+    drug_name: str
+    connectivity_score: float | None  # None when the drug has no LINCS signature.
+    normalized_score: float | None = None
+    p_value: float | None = None
+    scored: bool  # False => no signature available, not scored (do not skip silently).
+    # Presumably the number of disease-signature genes also present in the drug
+    # signature — confirm once scoring is implemented.
+    n_genes_overlap: int | None = None
+
+
+def connectivity_score(
+    up_genes: list[str],
+    down_genes: list[str],
+    drug_signature: pd.Series,
+) -> float:
+    """Score one drug against the disease up/down gene sets with weighted KS (not implemented yet).
+
+    Planned contract (PLAN.md §6, Week 3 task 2): only genes present in both the disease
+    signature and the LINCS landmark set are scored, and it is the caller's job to
+    record how many genes overlapped.
+
+    Args:
+        up_genes: Identifiers of the disease up-regulated genes.
+        down_genes: Identifiers of the disease down-regulated genes.
+        drug_signature: The drug's expression vector, indexed by gene identifier.
+
+    Returns:
+        A connectivity score in roughly [-1, 1]; strongly negative = strong reversal.
+
+    Raises:
+        NotImplementedError: Always, until the Week 3 implementation lands.
+    """
+    pending = "Connectivity scoring: implement in Week 3 (notebook 04)."
+    raise NotImplementedError(pending)
+
+
+def rank_drugs(
+    signature_up: list[str],
+    signature_down: list[str],
+    drug_profiles: pd.DataFrame,
+) -> pd.DataFrame:
+    """Score every drug against the disease signature and rank them (not implemented yet).
+
+    Planned contract (PLAN.md §6, Week 3 task 2): a drug with no LINCS signature is
+    marked ``scored=False`` and left out of the ranking, rather than being dropped
+    silently.
+
+    Returns:
+        A ranked table with the columns described in PLAN.md §6: rank, drug_name,
+        chembl_id, connectivity_score, normalized_score, p_value, inclusion_reason,
+        known_targets, mechanism_summary.
+
+    Raises:
+        NotImplementedError: Always, until the Week 3 implementation lands.
+    """
+    pending = "Drug ranking: implement in Week 3 (notebook 04)."
+    raise NotImplementedError(pending)
+
+
+def mechanistic_prior(targets: list[str]) -> float:
+    """Weight a drug by how relevant its targets are to sickle cell (not implemented yet).
+
+    Pathways of interest (PLAN.md §6, Week 3 task 3): HbF regulation, hemoglobin, NO
+    signaling, inflammation and oxidative stress. The weight feeds the secondary,
+    prior-weighted ranking.
+
+    Raises:
+        NotImplementedError: Always, until the Week 3 implementation lands.
+    """
+    pending = "Mechanistic prior: implement in Week 3 (notebook 04)."
+    raise NotImplementedError(pending)
+
+
+def persist_ranking(ranking: pd.DataFrame, out_path: Path | None = None) -> Path:
+    """Save the ranked candidate list as CSV and return the path written.
+
+    Without an explicit ``out_path`` the file goes to
+    ``data/results/ranked_candidates_v1.csv``. Missing parent directories are created,
+    and the frame's index is not written.
+    """
+    target = out_path if out_path else RESULTS_DIR / "ranked_candidates_v1.csv"
+    target.parent.mkdir(parents=True, exist_ok=True)
+    ranking.to_csv(target, index=False)
+    return target