"""Week 3: run connectivity scoring over all drugs -> ranked_candidates_v1.csv (PLAN ยง6). Loads the disease signature + the 300 drug LINCS signatures, computes the weighted-KS connectivity score per drug, and produces two rankings: 1. raw connectivity (most negative = strongest reversal = rank 1) 2. a secondary ranking blending connectivity with a mechanistic prior (sickle-relevant target pathways), to temper broad-effect drugs (HDAC/kinase) that dominate raw rankings. The formal recovery test (ground-truth + negative-control evaluation against the pre-registered criteria) is Week 4; this script only prints a sanity peek. """ from __future__ import annotations import json from pathlib import Path import pandas as pd import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.scoring import mechanistic_prior, persist_ranking, rank_drugs # noqa: E402 PROCESSED = Path("data/processed") PRIOR_LAMBDA = 0.5 # weight of the mechanistic prior in the secondary ranking def main() -> None: sig = json.loads((PROCESSED / "sickle_cell_signature_v1.json").read_text()) up = [g["gene"] for g in sig["up_regulated"]] down = [g["gene"] for g in sig["down_regulated"]] sig_matrix = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") # drug x 978 symbols profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name") landmark = set(sig_matrix.columns) n_up_ov = len(set(up) & landmark) n_down_ov = len(set(down) & landmark) print(f"query overlap with 978 landmarks: {n_up_ov} up + {n_down_ov} down = {n_up_ov + n_down_ov}") print(f"scoring {len(sig_matrix)} drugs (all scored; 0 without signature)") ranked = rank_drugs(up, down, sig_matrix) # attach metadata + mechanistic prior ranked = ranked.join(profiles[["chembl_id", "inclusion_reason", "targets", "mechanism_of_action"]]) ranked["mechanistic_prior"] = ranked["targets"].apply( lambda t: mechanistic_prior(list(t) if t is not None else []) ) ranked["known_targets"] = ranked["targets"].apply( lambda t: "; ".join(t) if t is not None and len(t) else "" ) ranked = ranked.rename(columns={"mechanism_of_action": "mechanism_summary"}) # secondary, prior-weighted ranking: relevant drugs pushed toward better (more negative) ranked["blended_score"] = ranked["normalized_score"] - PRIOR_LAMBDA * ranked["mechanistic_prior"] ranked["blended_rank"] = ranked["blended_score"].rank(method="first").astype(int) out = ranked.rename_axis("drug_name").reset_index()[[ "rank", "drug_name", "chembl_id", "connectivity_score", "normalized_score", "inclusion_reason", "mechanistic_prior", "blended_rank", "known_targets", "mechanism_summary", ]] path = persist_ranking(out) print(f"wrote {path} ({len(out)} drugs)") # --- sanity peek (formal recovery test is Week 4) --- print("\n--- sanity peek (raw connectivity rank) ---") for gt in ["hydroxyurea", "glutamine"]: r = ranked.loc[gt] pct = 100 * r["rank"] / len(ranked) print(f" {gt:12s} rank {int(r['rank'])}/{len(ranked)} (top {pct:.0f}%), " f"score={r['connectivity_score']:.3f}") neg = ranked[ranked["inclusion_reason"] == "negative_control"] print(f" negative controls in bottom half: " f"{(neg['rank'] > len(ranked) / 2).sum()}/{len(neg)}") print("\n top 5 raw candidates:") for name, r in ranked.nsmallest(5, "connectivity_score").iterrows(): print(f" {int(r['rank']):3d} {name:18s} {r['connectivity_score']:+.3f} " f"[{r['inclusion_reason']}] {r['known_targets'][:50]}") if __name__ == "__main__": main()