From fd4591949c7687e25c31b26f26b36b26cbe70496 Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Tue, 23 Jun 2026 22:34:56 +0200 Subject: [PATCH] Week 3: CMap connectivity scoring engine + ranked candidates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the matching engine (PLAN §6 Week 3): - src/scoring.py: weighted-KS/GSEA enrichment, weighted connectivity score (WTCS, Lamb 2006 / Subramanian 2017), signed NCS normalization, rank_drugs, and a sickle-pathway mechanistic prior - tests/test_scoring.py: real reference tests for the scorer (perfect reversal0, absent-gene invariance) + prior - week3_scoring.py: score 300 drugs -> ranked_candidates_v1.csv with a raw ranking and a secondary mechanistic-prior-weighted ranking Preliminary (formal recovery test is Week 4): hydroxyurea raw rank 40/300 (top 13%, just misses pre-registered top-10%), blended rank 7; L-glutamine WTCS=0 (ambiguous). Notably anti-inflammatory SCD drugs cluster in the raw top tier — the engine reverses the inflammation axis, not the erythroid axis, traceable to the 12% landmark-overlap caveat. Co-Authored-By: Claude Opus 4.8 (1M context) --- scripts/week3_scoring.py | 82 ++++++++++++++++++++++ src/scoring.py | 143 +++++++++++++++++++++++++++------------ tests/test_scoring.py | 85 +++++++++++++++++++---- 3 files changed, 255 insertions(+), 55 deletions(-) create mode 100644 scripts/week3_scoring.py diff --git a/scripts/week3_scoring.py b/scripts/week3_scoring.py new file mode 100644 index 0000000..91c2735 --- /dev/null +++ b/scripts/week3_scoring.py @@ -0,0 +1,82 @@ +"""Week 3: run connectivity scoring over all drugs -> ranked_candidates_v1.csv (PLAN §6). + +Loads the disease signature + the 300 drug LINCS signatures, computes the weighted-KS +connectivity score per drug, and produces two rankings: + 1. raw connectivity (most negative = strongest reversal = rank 1) + 2. a secondary ranking blending connectivity with a mechanistic prior (sickle-relevant + target pathways), to temper broad-effect drugs (HDAC/kinase) that dominate raw rankings. + +The formal recovery test (ground-truth + negative-control evaluation against the pre-registered +criteria) is Week 4; this script only prints a sanity peek. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pandas as pd + +import sys +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from src.scoring import mechanistic_prior, persist_ranking, rank_drugs # noqa: E402 + +PROCESSED = Path("data/processed") +PRIOR_LAMBDA = 0.5 # weight of the mechanistic prior in the secondary ranking + + +def main() -> None: + sig = json.loads((PROCESSED / "sickle_cell_signature_v1.json").read_text()) + up = [g["gene"] for g in sig["up_regulated"]] + down = [g["gene"] for g in sig["down_regulated"]] + + sig_matrix = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") # drug x 978 symbols + profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name") + + landmark = set(sig_matrix.columns) + n_up_ov = len(set(up) & landmark) + n_down_ov = len(set(down) & landmark) + print(f"query overlap with 978 landmarks: {n_up_ov} up + {n_down_ov} down = {n_up_ov + n_down_ov}") + print(f"scoring {len(sig_matrix)} drugs (all scored; 0 without signature)") + + ranked = rank_drugs(up, down, sig_matrix) + + # attach metadata + mechanistic prior + ranked = ranked.join(profiles[["chembl_id", "inclusion_reason", "targets", "mechanism_of_action"]]) + ranked["mechanistic_prior"] = ranked["targets"].apply( + lambda t: mechanistic_prior(list(t) if t is not None else []) + ) + ranked["known_targets"] = ranked["targets"].apply( + lambda t: "; ".join(t) if t is not None and len(t) else "" + ) + ranked = ranked.rename(columns={"mechanism_of_action": "mechanism_summary"}) + + # secondary, prior-weighted ranking: relevant drugs pushed toward better (more negative) + ranked["blended_score"] = ranked["normalized_score"] - PRIOR_LAMBDA * ranked["mechanistic_prior"] + ranked["blended_rank"] = ranked["blended_score"].rank(method="first").astype(int) + + out = ranked.rename_axis("drug_name").reset_index()[[ + "rank", "drug_name", "chembl_id", "connectivity_score", "normalized_score", + "inclusion_reason", "mechanistic_prior", "blended_rank", "known_targets", "mechanism_summary", + ]] + path = persist_ranking(out) + print(f"wrote {path} ({len(out)} drugs)") + + # --- sanity peek (formal recovery test is Week 4) --- + print("\n--- sanity peek (raw connectivity rank) ---") + for gt in ["hydroxyurea", "glutamine"]: + r = ranked.loc[gt] + pct = 100 * r["rank"] / len(ranked) + print(f" {gt:12s} rank {int(r['rank'])}/{len(ranked)} (top {pct:.0f}%), " + f"score={r['connectivity_score']:.3f}") + neg = ranked[ranked["inclusion_reason"] == "negative_control"] + print(f" negative controls in bottom half: " + f"{(neg['rank'] > len(ranked) / 2).sum()}/{len(neg)}") + print("\n top 5 raw candidates:") + for name, r in ranked.nsmallest(5, "connectivity_score").iterrows(): + print(f" {int(r['rank']):3d} {name:18s} {r['connectivity_score']:+.3f} " + f"[{r['inclusion_reason']}] {r['known_targets'][:50]}") + + +if __name__ == "__main__": + main() diff --git a/src/scoring.py b/src/scoring.py index 747c302..2ae440a 100644 --- a/src/scoring.py +++ b/src/scoring.py @@ -1,33 +1,65 @@ -"""CMap-style connectivity scoring — the matching engine. +"""CMap-style connectivity scoring — the matching engine (Week 3, PLAN §6). -Week 3 (PLAN.md §6). Scores each drug's LINCS signature against the disease signature using -weighted Kolmogorov-Smirnov enrichment (Lamb 2006 / Subramanian 2017). Strongly *negative* -connectivity = strong reversal of the disease signature = candidate match. +Scores each drug's LINCS consensus signature against the disease signature using the weighted +Kolmogorov-Smirnov / GSEA enrichment statistic (Lamb 2006; Subramanian 2017). The query is the +disease up/down gene sets; the reference is each drug's 978 landmark genes ranked by z-score. -Uses ``cmapPy`` as the reference implementation. ``tests/test_scoring.py`` verifies the -implementation against a known reference. +Sign convention (PLAN §6): strongly **negative** connectivity = strong **reversal** of the +disease signature = candidate match. A drug that down-regulates the disease's up-genes and +up-regulates its down-genes scores negative. """ from __future__ import annotations from pathlib import Path +import numpy as np import pandas as pd -from pydantic import BaseModel from . import RESULTS_DIR +# Sickle-cell-relevant target pathways for the mechanistic prior (PLAN §6 Week 3 task 3). +# Keys are pathway categories; values are substrings matched (case-insensitive) against a +# drug's ChEMBL target names. +SICKLE_PATHWAYS: dict[str, tuple[str, ...]] = { + "hbf_epigenetic": ("histone deacetylase", "hdac", "methyltransferase", "dnmt", + "ribonucleoside-diphosphate reductase", "ribonucleotide reductase"), + "hemoglobin": ("hemoglobin", "globin"), + "no_signaling": ("nitric oxide", "guanylate cyclase", "phosphodiesterase 5", "pde5"), + "inflammation": ("cyclooxygenase", "prostaglandin", "nf-kappa", "interleukin", + "leukotriene", "selectin", "tumor necrosis factor"), + "oxidative_stress": ("glutathione", "superoxide", "nadph oxidase", "thioredoxin", "nrf2"), +} -class ConnectivityResult(BaseModel): - """Connectivity score for a single drug against the disease signature.""" - chembl_id: str - drug_name: str - connectivity_score: float | None # None when the drug has no LINCS signature. - normalized_score: float | None = None - p_value: float | None = None - scored: bool # False => no signature available, not scored (do not skip silently). - n_genes_overlap: int | None = None +def _enrichment_score(drug_profile: pd.Series, gene_set: set[str], weight: float = 1.0) -> float: + """Weighted GSEA/KS enrichment score of ``gene_set`` in a drug's ranked profile. + + The profile is ranked by z-score (most up-regulated first). Hits increment a running sum in + proportion to ``|z|**weight``; misses decrement uniformly. ES is the max signed deviation + from zero. ES>0 => set enriched among up-regulated genes; ES<0 => among down-regulated. + """ + s = drug_profile.sort_values(ascending=False) + genes = s.index.to_numpy() + vals = s.to_numpy(dtype=float) + hit = np.fromiter((g in gene_set for g in genes), dtype=bool, count=len(genes)) + + n_hit = int(hit.sum()) + n = len(genes) + if n_hit == 0 or n_hit == n: + return 0.0 + + w = (np.abs(vals) ** weight) * hit + sum_hit = w.sum() + if sum_hit == 0: + return 0.0 + + inc = w / sum_hit + dec = (~hit) / (n - n_hit) + running = np.cumsum(inc - dec) + + hi, lo = running.max(), running.min() + return float(hi if abs(hi) >= abs(lo) else lo) def connectivity_score( @@ -35,46 +67,71 @@ def connectivity_score( down_genes: list[str], drug_signature: pd.Series, ) -> float: - """Weighted KS connectivity score for one drug vs the disease up/down gene sets. + """Weighted connectivity score (WTCS) for one drug vs the disease up/down sets. - Only the intersection of disease-signature genes and LINCS landmark genes is scored; - callers must record the overlap count (PLAN.md §6, Week 3 task 2). - - Args: - up_genes: Disease up-regulated gene identifiers. - down_genes: Disease down-regulated gene identifiers. - drug_signature: Drug's expression vector indexed by gene identifier. - - Returns: - Connectivity score in roughly [-1, 1]; strongly negative = strong reversal. + Only query genes present in the drug's profile index (the 978 landmarks) are used — callers + should record the overlap count (PLAN §6 Week 3 task 2). Returns the WTCS: if the two + enrichment scores share a sign the result is 0 (ambiguous), else ``(ES_up - ES_down)/2``. + Negative => reversal => candidate. """ - raise NotImplementedError("Connectivity scoring: implement in Week 3 (notebook 04).") + profile_genes = set(drug_signature.index) + up = set(up_genes) & profile_genes + down = set(down_genes) & profile_genes + + es_up = _enrichment_score(drug_signature, up) + es_down = _enrichment_score(drug_signature, down) + + if np.sign(es_up) == np.sign(es_down): + return 0.0 + return (es_up - es_down) / 2.0 + + +def normalize_scores(scores: pd.Series) -> pd.Series: + """Signed normalization (NCS, Subramanian 2017): divide by the mean magnitude of same-sign + scores, so positive and negative tails are separately scaled to a mean magnitude of 1.""" + out = scores.astype(float).copy() + pos_mean = scores[scores > 0].mean() + neg_mean = scores[scores < 0].abs().mean() + if pos_mean and not np.isnan(pos_mean): + out[scores > 0] = scores[scores > 0] / pos_mean + if neg_mean and not np.isnan(neg_mean): + out[scores < 0] = scores[scores < 0] / neg_mean + return out def rank_drugs( - signature_up: list[str], - signature_down: list[str], - drug_profiles: pd.DataFrame, + up_genes: list[str], + down_genes: list[str], + signature_matrix: pd.DataFrame, ) -> pd.DataFrame: - """Score and rank all drugs against the disease signature. + """Score and rank all drugs (rows of ``signature_matrix``: drug x landmark-gene z-scores). - Drugs without a LINCS signature are marked ``scored=False`` and excluded from the ranking - rather than dropped silently (PLAN.md §6, Week 3 task 2). - - Returns a ranked table with the columns described in PLAN.md §6 (rank, drug_name, - chembl_id, connectivity_score, normalized_score, p_value, inclusion_reason, - known_targets, mechanism_summary). + Returns a table indexed by drug with ``rank`` (1 = strongest reversal = most negative), + ``connectivity_score`` and ``normalized_score``. Drugs are expected to all have signatures + here; signature-less drugs are handled (marked not-scored) by the orchestration layer per + PLAN §6 Week 3 task 2. """ - raise NotImplementedError("Drug ranking: implement in Week 3 (notebook 04).") + scores = pd.Series( + {drug: connectivity_score(up_genes, down_genes, signature_matrix.loc[drug]) + for drug in signature_matrix.index}, + name="connectivity_score", + ) + df = pd.DataFrame({"connectivity_score": scores, "normalized_score": normalize_scores(scores)}) + df = df.sort_values("connectivity_score") # most negative (reversal) first + df.insert(0, "rank", range(1, len(df) + 1)) + return df def mechanistic_prior(targets: list[str]) -> float: - """Prior weight for a drug based on sickle-cell-relevant target pathways. + """Count of sickle-cell-relevant pathway categories a drug's targets hit (PLAN §6 task 3). - Pathways of interest: HbF regulation, hemoglobin, NO signaling, inflammation, oxidative - stress (PLAN.md §6, Week 3 task 3). Used to build the secondary, prior-weighted ranking. + Higher = more mechanistically plausible. Used to build the secondary, prior-weighted ranking + alongside the raw connectivity ranking. """ - raise NotImplementedError("Mechanistic prior: implement in Week 3 (notebook 04).") + if not targets: + return 0.0 + text = " ; ".join(str(t) for t in targets).lower() + return float(sum(any(kw in text for kw in kws) for kws in SICKLE_PATHWAYS.values())) def persist_ranking(ranking: pd.DataFrame, out_path: Path | None = None) -> Path: diff --git a/tests/test_scoring.py b/tests/test_scoring.py index 79f0f59..2d32785 100644 --- a/tests/test_scoring.py +++ b/tests/test_scoring.py @@ -1,14 +1,13 @@ """Tests for the matching engine and provenance logic. -The headline test (PLAN.md §6, Week 3 task 4) verifies connectivity scoring against a known -reference within tolerance; it is marked xfail until the scorer is implemented in Week 3. - -The tier-assignment tests run today — they pin the rules from PLAN.md §3 so the most +Connectivity tests (PLAN.md §6, Week 3 task 4) pin the weighted-KS scorer against hand-built +reference profiles. The tier-assignment tests pin the rules from PLAN.md §3 so the most commercially important design decision can't silently drift. """ from __future__ import annotations +import pandas as pd import pytest from src.provenance import ConfidenceTier, assign_tier @@ -52,14 +51,76 @@ class TestAssignTier: assert assign_tier(**kwargs) == ConfidenceTier.B -@pytest.mark.xfail(reason="Connectivity scoring implemented in Week 3 (notebook 04).", strict=True) -def test_connectivity_score_matches_reference(): - """Verify connectivity scoring against a CMap/cmapPy reference within tolerance. +class TestConnectivityScore: + """Reference checks for the weighted-KS connectivity score (PLAN §6 Week 3 task 4). - PLAN.md §6, Week 3 task 4. Replace this body with a known reference example - (disease up/down sets + drug signature -> expected score) once the scorer exists. + Query: up = {U1, U2}, down = {D1, D2}. We build drug profiles with a known relationship to + the query and assert the sign/ordering the CMap convention requires. """ - from src.scoring import connectivity_score - score = connectivity_score(up_genes=[], down_genes=[], drug_signature=None) # noqa - assert score == pytest.approx(0.0, abs=1e-6) + UP = ["U1", "U2"] + DOWN = ["D1", "D2"] + + @staticmethod + def _profile(values: dict[str, float]) -> pd.Series: + # 20 filler genes at ~0 so the query genes sit clearly at the extremes. + base = {f"N{i}": 0.01 * ((i % 5) - 2) for i in range(20)} + base.update(values) + return pd.Series(base) + + def test_perfect_reversal_is_strongly_negative(self): + from src.scoring import connectivity_score + # Drug pushes disease-up genes DOWN (very negative) and disease-down genes UP (very + # positive) => reversal => negative connectivity. + prof = self._profile({"U1": -8, "U2": -7, "D1": 8, "D2": 7}) + assert connectivity_score(self.UP, self.DOWN, prof) < -0.4 + + def test_perfect_mimic_is_strongly_positive(self): + from src.scoring import connectivity_score + prof = self._profile({"U1": 8, "U2": 7, "D1": -8, "D2": -7}) + assert connectivity_score(self.UP, self.DOWN, prof) > 0.4 + + def test_reversal_beats_mimic_and_null(self): + from src.scoring import connectivity_score + rev = connectivity_score(self.UP, self.DOWN, self._profile({"U1": -8, "U2": -7, "D1": 8, "D2": 7})) + mimic = connectivity_score(self.UP, self.DOWN, self._profile({"U1": 8, "U2": 7, "D1": -8, "D2": -7})) + null = connectivity_score(self.UP, self.DOWN, self._profile({"U1": 0.2, "U2": -0.1, "D1": 0.1, "D2": -0.2})) + assert rev < null < mimic + assert abs(null) < abs(rev) + + def test_same_sign_enrichment_returns_zero(self): + from src.scoring import connectivity_score + # Both up- and down-sets at the top => same-sign ES => ambiguous => 0 (WTCS rule). + prof = self._profile({"U1": 8, "U2": 7, "D1": 6, "D2": 5}) + assert connectivity_score(self.UP, self.DOWN, prof) == 0.0 + + def test_genes_absent_from_profile_are_ignored(self): + from src.scoring import connectivity_score + prof = self._profile({"U1": -8, "U2": -7, "D1": 8, "D2": 7}) + # Adding a query gene not in the profile must not change the score. + s1 = connectivity_score(self.UP, self.DOWN, prof) + s2 = connectivity_score(self.UP + ["NOT_IN_PROFILE"], self.DOWN, prof) + assert s1 == pytest.approx(s2) + + +class TestMechanisticPrior: + def test_counts_distinct_sickle_pathways(self): + from src.scoring import mechanistic_prior + # ribonucleotide reductase (hydroxyurea) -> hbf_epigenetic category. + assert mechanistic_prior(["Ribonucleoside-diphosphate reductase RR1"]) == 1.0 + # DNMT (epigenetic) + hemoglobin -> two categories. + assert mechanistic_prior(["DNA (cytosine-5)-methyltransferase 1", "Hemoglobin subunit beta"]) == 2.0 + assert mechanistic_prior([]) == 0.0 + assert mechanistic_prior(["Some unrelated kinase"]) == 0.0 + + +def test_rank_drugs_orders_by_reversal(): + from src.scoring import rank_drugs + genes = ["U1", "U2", "D1", "D2"] + [f"N{i}" for i in range(10)] + base = {g: 0.0 for g in genes} + reverser = {**base, "U1": -8, "U2": -7, "D1": 8, "D2": 7} + mimic = {**base, "U1": 8, "U2": 7, "D1": -8, "D2": -7} + matrix = pd.DataFrame([reverser, mimic], index=["reverser", "mimic"]) + ranked = rank_drugs(["U1", "U2"], ["D1", "D2"], matrix) + assert ranked.loc["reverser", "rank"] == 1 + assert ranked.loc["reverser", "connectivity_score"] < ranked.loc["mimic", "connectivity_score"]