Implement the matching engine (PLAN §6 Week 3): - src/scoring.py: weighted-KS/GSEA enrichment, weighted connectivity score (WTCS, Lamb 2006 / Subramanian 2017), signed NCS normalization, rank_drugs, and a sickle-pathway mechanistic prior - tests/test_scoring.py: real reference tests for the scorer (perfect reversal<null<mimic, same-sign->0, absent-gene invariance) + prior - week3_scoring.py: score 300 drugs -> ranked_candidates_v1.csv with a raw ranking and a secondary mechanistic-prior-weighted ranking Preliminary (formal recovery test is Week 4): hydroxyurea raw rank 40/300 (top 13%, just misses pre-registered top-10%), blended rank 7; L-glutamine WTCS=0 (ambiguous). Notably anti-inflammatory SCD drugs cluster in the raw top tier — the engine reverses the inflammation axis, not the erythroid axis, traceable to the 12% landmark-overlap caveat. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
83 lines
3.6 KiB
Python
83 lines
3.6 KiB
Python
"""Week 3: run connectivity scoring over all drugs -> ranked_candidates_v1.csv (PLAN §6).

Loads the disease signature + the 300 drug LINCS signatures, computes the weighted-KS
connectivity score per drug, and produces two rankings:
  1. raw connectivity (most negative = strongest reversal = rank 1)
  2. a secondary ranking blending connectivity with a mechanistic prior (sickle-relevant
     target pathways), to temper broad-effect drugs (HDAC/kinase) that dominate raw rankings.

The formal recovery test (ground-truth + negative-control evaluation against the pre-registered
criteria) is Week 4; this script only prints a sanity peek.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
from src.scoring import mechanistic_prior, persist_ranking, rank_drugs # noqa: E402
|
|
|
|
# Directory holding the processed pipeline artifacts; a relative path, so it is
# resolved against the current working directory at run time.
PROCESSED = Path("data") / "processed"

# Weight of the mechanistic prior in the secondary (blended) ranking.
PRIOR_LAMBDA = 0.5
|
|
|
|
|
|
def main() -> None:
    """Score every drug against the disease signature and write the ranked table.

    Reads the signature JSON and the two parquet tables from ``PROCESSED``, scores
    the drugs with ``rank_drugs``, attaches drug metadata and the mechanistic prior,
    derives the prior-weighted (blended) ranking, persists the selected columns via
    ``persist_ranking`` and prints a short sanity peek.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    sig = json.loads(
        (PROCESSED / "sickle_cell_signature_v1.json").read_text(encoding="utf-8")
    )
    up = [g["gene"] for g in sig["up_regulated"]]
    down = [g["gene"] for g in sig["down_regulated"]]

    sig_matrix = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet")  # drug x landmark symbols
    profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")

    landmark = set(sig_matrix.columns)
    n_up_ov = len(set(up) & landmark)
    n_down_ov = len(set(down) & landmark)
    # Report the landmark count actually loaded rather than a hard-coded 978.
    print(
        f"query overlap with {len(landmark)} landmarks: "
        f"{n_up_ov} up + {n_down_ov} down = {n_up_ov + n_down_ov}"
    )
    # NOTE(review): "0 without signature" is a fixed string, not a computed count —
    # confirm it still holds whenever the drug library or signature matrix changes.
    print(f"scoring {len(sig_matrix)} drugs (all scored; 0 without signature)")

    ranked = _attach_metadata(rank_drugs(up, down, sig_matrix), profiles)

    out = ranked.rename_axis("drug_name").reset_index()[[
        "rank", "drug_name", "chembl_id", "connectivity_score", "normalized_score",
        "inclusion_reason", "mechanistic_prior", "blended_rank", "known_targets", "mechanism_summary",
    ]]
    path = persist_ranking(out)
    print(f"wrote {path} ({len(out)} drugs)")

    _print_sanity_peek(ranked)


def _as_target_list(targets: object) -> list:
    """Coerce one ``targets`` cell to a plain list; missing values become ``[]``."""
    if isinstance(targets, str):
        # A bare string is one target, not a sequence of characters.
        return [targets] if targets else []
    if not pd.api.types.is_list_like(targets):
        # Covers None and the NaN that the left join leaves for drugs that have
        # no row in the profiles table.
        return []
    return list(targets)


def _attach_metadata(ranked: pd.DataFrame, profiles: pd.DataFrame) -> pd.DataFrame:
    """Join profile metadata onto the ranking and add the prior / blended columns."""
    ranked = ranked.join(
        profiles[["chembl_id", "inclusion_reason", "targets", "mechanism_of_action"]]
    )
    # Normalise the targets once so both derived columns see the same clean lists.
    target_lists = ranked["targets"].apply(_as_target_list)
    ranked["mechanistic_prior"] = target_lists.apply(mechanistic_prior)
    ranked["known_targets"] = target_lists.apply(lambda ts: "; ".join(ts))
    ranked = ranked.rename(columns={"mechanism_of_action": "mechanism_summary"})

    # Secondary, prior-weighted ranking: relevant drugs are pushed toward better
    # (more negative) scores. method="first" breaks ties by row order so the
    # ranks are unique integers.
    ranked["blended_score"] = ranked["normalized_score"] - PRIOR_LAMBDA * ranked["mechanistic_prior"]
    ranked["blended_rank"] = ranked["blended_score"].rank(method="first").astype(int)
    return ranked


def _print_sanity_peek(
    ranked: pd.DataFrame,
    ground_truth: tuple[str, ...] = ("hydroxyurea", "glutamine"),
) -> None:
    """Print an informal look at the raw ranking (the formal recovery test is Week 4)."""
    total = len(ranked)
    print("\n--- sanity peek (raw connectivity rank) ---")
    for gt in ground_truth:
        if gt not in ranked.index:
            # The peek is informational only; a missing drug must not crash the
            # script after the ranking file has already been written.
            print(f" {gt:12s} not in ranked table")
            continue
        r = ranked.loc[gt]
        pct = 100 * r["rank"] / total
        print(f" {gt:12s} rank {int(r['rank'])}/{total} (top {pct:.0f}%), "
              f"score={r['connectivity_score']:.3f}")

    neg = ranked[ranked["inclusion_reason"] == "negative_control"]
    print(f" negative controls in bottom half: "
          f"{(neg['rank'] > total / 2).sum()}/{len(neg)}")

    print("\n top 5 raw candidates:")
    for name, r in ranked.nsmallest(5, "connectivity_score").iterrows():
        print(f" {int(r['rank']):3d} {name:18s} {r['connectivity_score']:+.3f} "
              f"[{r['inclusion_reason']}] {r['known_targets'][:50]}")


if __name__ == "__main__":
    main()
|