Scaffold Reverso MVP pipeline structure

Set up the project skeleton per PLAN.md §4:
- src/ package: identifiers, disease, drugs, scoring, provenance
  with pydantic schemas and confidence-tier logic (working);
  data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring
  reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-23 20:19:38 +02:00
parent e717cf40ed
commit b731478f5d
25 changed files with 1038 additions and 4 deletions

85
src/scoring.py Normal file
View File

@@ -0,0 +1,85 @@
"""CMap-style connectivity scoring — the matching engine.
Week 3 (PLAN.md §6). Scores each drug's LINCS signature against the disease signature using
weighted Kolmogorov-Smirnov enrichment (Lamb 2006 / Subramanian 2017). Strongly *negative*
connectivity = strong reversal of the disease signature = candidate match.
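Intended to use ``cmapPy`` as the reference implementation; the scoring functions in this
module are still stubs that raise ``NotImplementedError``. ``tests/test_scoring.py`` is meant
to verify the implementation against a known reference (that test is xfail until Week 3).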
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
from pydantic import BaseModel
from . import RESULTS_DIR
class ConnectivityResult(BaseModel):
    """Outcome of scoring one drug against the disease signature.

    A drug that has no LINCS signature is still represented by an instance: ``scored`` is
    False and ``connectivity_score`` is None, so the drug is recorded as "not scored"
    instead of disappearing from the output. The model itself does not enforce that the
    two fields agree; callers are responsible for setting them consistently.
    """

    # --- Drug identity -------------------------------------------------------------
    chembl_id: str
    drug_name: str

    # --- Scores --------------------------------------------------------------------
    # Required, but may be None: None means the drug has no LINCS signature.
    connectivity_score: float | None
    # Optional derived statistics; left as None when not computed.
    normalized_score: float | None = None
    p_value: float | None = None

    # --- Bookkeeping ---------------------------------------------------------------
    # False => no signature was available and the drug was not scored
    # (record it explicitly; do not skip it silently).
    scored: bool
    # Presumably the number of disease-signature genes that overlap the drug signature's
    # genes -- confirm against the caller that fills it in.
    n_genes_overlap: int | None = None
def connectivity_score(
    up_genes: list[str],
    down_genes: list[str],
    drug_signature: pd.Series,
) -> float:
    """Compute the weighted KS connectivity score of one drug (not implemented yet).

    Planned contract (PLAN.md §6, Week 3 task 2): score the drug's signature against the
    disease up/down gene sets, considering only the genes shared between the disease
    signature and the LINCS landmark genes. Callers are expected to record that overlap
    count themselves.

    Args:
        up_genes: Identifiers of the genes up-regulated in the disease.
        down_genes: Identifiers of the genes down-regulated in the disease.
        drug_signature: The drug's expression vector, indexed by gene identifier.

    Returns:
        A connectivity score in roughly [-1, 1]; a strongly negative value means the drug
        strongly reverses the disease signature.

    Raises:
        NotImplementedError: Always, until the Week 3 implementation lands.
    """
    # Placeholder: the arguments are intentionally unused until the scoring is written.
    message = "Connectivity scoring: implement in Week 3 (notebook 04)."
    raise NotImplementedError(message)
def rank_drugs(
    signature_up: list[str],
    signature_down: list[str],
    drug_profiles: pd.DataFrame,
) -> pd.DataFrame:
    """Score every drug against the disease signature and rank them (not implemented yet).

    Planned contract (PLAN.md §6, Week 3 task 2): a drug without a LINCS signature is
    flagged with ``scored=False`` and left out of the ranking, but it must not be dropped
    silently.

    The ranked table is planned to carry the columns listed in PLAN.md §6: rank,
    drug_name, chembl_id, connectivity_score, normalized_score, p_value,
    inclusion_reason, known_targets, mechanism_summary.

    Raises:
        NotImplementedError: Always, until the Week 3 implementation lands.
    """
    # Placeholder: the arguments are intentionally unused until the ranking is written.
    message = "Drug ranking: implement in Week 3 (notebook 04)."
    raise NotImplementedError(message)
def mechanistic_prior(targets: list[str]) -> float:
    """Return a prior weight for a drug from its targets (not implemented yet).

    Planned contract (PLAN.md §6, Week 3 task 3): weight a drug by how relevant its target
    pathways are to sickle cell disease. The pathways of interest are HbF regulation,
    hemoglobin, NO signaling, inflammation and oxidative stress. The weight feeds the
    secondary, prior-weighted ranking.

    Raises:
        NotImplementedError: Always, until the Week 3 implementation lands.
    """
    # Placeholder: ``targets`` is intentionally unused until the prior is written.
    message = "Mechanistic prior: implement in Week 3 (notebook 04)."
    raise NotImplementedError(message)
def persist_ranking(ranking: pd.DataFrame, out_path: Path | str | None = None) -> Path:
    """Write the ranked candidate list to a CSV file and return its location.

    Args:
        ranking: Ranked candidate table to persist. Its index is not written to the file.
        out_path: Destination file, as a ``Path`` or a plain string. When omitted (or
            otherwise falsy) the file ``ranked_candidates_v1.csv`` under ``RESULTS_DIR``
            is used (documented as ``data/results/ranked_candidates_v1.csv``).

    Returns:
        The path of the CSV file that was written, always as a ``Path``.
    """
    if out_path:
        # Coerce so that string paths (e.g. typed in a notebook) work too; previously a
        # string reached ``.parent`` below and failed with AttributeError.
        destination = Path(out_path)
    else:
        destination = RESULTS_DIR / "ranked_candidates_v1.csv"
    # Create missing parent directories so a fresh checkout or a new output folder works.
    destination.parent.mkdir(parents=True, exist_ok=True)
    ranking.to_csv(destination, index=False)
    return destination