"""Week 3 (v1.1): connectivity scoring over the full gene space with tau calibration. Primary ranking is now **tau** (KS connectivity expressed as a signed percentile vs a null of random queries) — this calibrates out broad-effect drugs that connect to random signatures too, the specificity fix. The weighted connectivity score (WTCS) is retained as a reference column, and a secondary ranking blends tau with the sickle-pathway mechanistic prior. Output: data/results/ranked_candidates_v1.csv. """ from __future__ import annotations import json from pathlib import Path import pandas as pd import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.scoring import ( # noqa: E402 connectivity_score, mechanistic_prior, normalize_scores, persist_ranking, tau_calibrate, ) PROCESSED = Path("data/processed") N_NULL = 1000 PRIOR_LAMBDA = 0.5 # spec_z credit per matched sickle pathway, for the blended ranking def main() -> None: sig = json.loads((PROCESSED / "sickle_cell_signature_v1.json").read_text()) up = [g["gene"] for g in sig["up_regulated"]] down = [g["gene"] for g in sig["down_regulated"]] sig_matrix = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") # drug x 12,328 genes profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name") n_up = len(set(up) & set(sig_matrix.columns)) n_down = len(set(down) & set(sig_matrix.columns)) print(f"gene space: {sig_matrix.shape[1]} genes; query overlap {n_up} up + {n_down} down = {n_up + n_down}") # primary: tau calibration print(f"computing tau over {N_NULL} random-query permutations ...", flush=True) ranked = tau_calibrate(up, down, sig_matrix, n_null=N_NULL) # reference: weighted connectivity score (WTCS) + NCS wtcs = pd.Series({d: connectivity_score(up, down, sig_matrix.loc[d]) for d in sig_matrix.index}, name="connectivity_score") ranked["connectivity_score"] = wtcs ranked["normalized_score"] = normalize_scores(wtcs) # metadata + mechanistic prior ranked = ranked.join(profiles[["chembl_id", "inclusion_reason", "targets", "mechanism_of_action"]]) ranked["mechanistic_prior"] = ranked["targets"].apply( lambda t: mechanistic_prior(list(t) if t is not None else [])) ranked["known_targets"] = ranked["targets"].apply( lambda t: "; ".join(t) if t is not None and len(t) else "") ranked = ranked.rename(columns={"mechanism_of_action": "mechanism_summary"}) # secondary, prior-weighted ranking (relevant drugs pushed toward more-negative spec_z) ranked["blended_score"] = ranked["spec_z"] - PRIOR_LAMBDA * ranked["mechanistic_prior"] ranked["blended_rank"] = ranked["blended_score"].rank(method="first").astype(int) out = ranked.rename_axis("drug_name").reset_index()[[ "rank", "drug_name", "chembl_id", "spec_z", "tau", "connectivity_ks", "connectivity_score", "inclusion_reason", "mechanistic_prior", "blended_rank", "known_targets", "mechanism_summary", ]] path = persist_ranking(out) print(f"wrote {path} ({len(out)} drugs)") print("\n--- sanity peek (spec_z ranking) ---") for gt in ["hydroxyurea", "glutamine"]: r = ranked.loc[gt] print(f" {gt:12s} rank {int(r['rank'])}/{len(ranked)} (top {100*r['rank']/len(ranked):.0f}%), " f"spec_z={r['spec_z']:.2f}") print(" top 10 by spec_z:") for name, r in ranked.nsmallest(10, "spec_z").iterrows(): print(f" {int(r['rank']):2d} {name:18s} z={r['spec_z']:6.2f} [{r['inclusion_reason']:16s}] " f"{str(r['known_targets'])[:38]}") if __name__ == "__main__": main()