Scaffold Reverso MVP pipeline structure

Set up the project skeleton per PLAN.md §4:

- src/ package: identifiers, disease, drugs, scoring, provenance with pydantic schemas and
  confidence-tier logic (working); data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 20:19:38 +02:00
parent e717cf40ed
commit b731478f5d
25 changed files with 1038 additions and 4 deletions
--- a/src/scoring.py
+++ b/src/scoring.py
@@ -0,0 +1,85 @@
+"""CMap-style connectivity scoring — the matching engine.
+
+Week 3 (PLAN.md §6). Scores each drug's LINCS signature against the disease signature using
+weighted Kolmogorov-Smirnov enrichment (Lamb 2006 / Subramanian 2017). Strongly *negative*
+connectivity = strong reversal of the disease signature = candidate match.
+
+Uses ``cmapPy`` as the reference implementation. ``tests/test_scoring.py`` verifies the
+implementation against a known reference.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+from pydantic import BaseModel
+
+from . import RESULTS_DIR
+
+
+class ConnectivityResult(BaseModel):
+    """Connectivity outcome for one drug scored against the disease signature."""
+
+    chembl_id: str
+    drug_name: str
+    connectivity_score: float | None  # Required field; None when the drug has no LINCS signature.
+    normalized_score: float | None = None  # Optional; defaults to None.
+    p_value: float | None = None  # Optional; defaults to None.
+    scored: bool  # False => no signature, so the drug was not scored; never drop it silently.
+    n_genes_overlap: int | None = None  # Presumably the scored gene-overlap count; TODO confirm.
+
+
+def connectivity_score(
+    up_genes: list[str],
+    down_genes: list[str],
+    drug_signature: pd.Series,
+) -> float:
+    """Compute the weighted-KS connectivity of one drug signature with the disease gene sets.
+
+    Stub: always raises for now. Planned: score only genes shared by the disease signature and the
+    LINCS landmark set; callers must record the overlap count (PLAN.md §6, Week 3 task 2).
+
+    Args:
+        up_genes: Identifiers of the genes up-regulated in disease.
+        down_genes: Identifiers of the genes down-regulated in disease.
+        drug_signature: The drug's expression vector, indexed by gene identifier.
+
+    Returns:
+        A connectivity score of roughly -1 to 1; the more negative, the stronger the reversal.
+    """
+    raise NotImplementedError("Connectivity scoring: implement in Week 3 (notebook 04).")
+
+
+def rank_drugs(
+    signature_up: list[str],
+    signature_down: list[str],
+    drug_profiles: pd.DataFrame,
+) -> pd.DataFrame:
+    """Score every drug against the disease signature and return the ranked table.
+
+    Stub: always raises for now. Planned: drugs without a LINCS signature get ``scored=False``
+    and are kept out of the ranking instead of being silently dropped (PLAN.md §6, Week 3 task 2).
+
+    The ranked table is to carry the PLAN.md §6 columns: rank, drug_name, chembl_id,
+    connectivity_score, normalized_score, p_value, inclusion_reason, known_targets and
+    mechanism_summary.
+    """
+    raise NotImplementedError("Drug ranking: implement in Week 3 (notebook 04).")
+
+
+def mechanistic_prior(targets: list[str]) -> float:
+    """Weight a drug by its sickle-cell-relevant targets, for the secondary prior-weighted ranking.
+
+    Stub: always raises for now. Planned (PLAN.md §6, Week 3 task 3): weigh targets against the
+    pathways HbF regulation, hemoglobin, NO signaling, inflammation and oxidative stress.
+    """
+    raise NotImplementedError("Mechanistic prior: implement in Week 3 (notebook 04).")
+
+
+def persist_ranking(ranking: pd.DataFrame, out_path: Path | None = None) -> Path:
+    """Save ``ranking`` as CSV, by default to ``RESULTS_DIR/ranked_candidates_v1.csv``."""
+    destination = out_path if out_path else RESULTS_DIR / "ranked_candidates_v1.csv"
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    ranking.to_csv(destination, index=False)
+    return destination