"""Experiment (v1.1): re-score on a larger LINCS gene space and re-run the recovery test. v1 used only the 978 landmark genes (12% signature overlap). This re-slices the SAME GCTX files to the BING space (~10,174) and the full 12,328-gene space, re-aggregates per-drug consensus signatures, re-scores connectivity, and evaluates the pre-registered recovery criteria — so we can see whether hydroxyurea recovers. Writes nothing to the committed v1 artifacts. """ from __future__ import annotations import gzip import io import json from pathlib import Path import pandas as pd import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.scoring import rank_drugs # noqa: E402 LINCS = Path("data/raw/lincs") PROCESSED = Path("data/processed") GCTX = {1: LINCS / "phase1_level5.gctx", 2: LINCS / "phase2_level5.gctx"} SIG_INFO = {1: "GSE92742_sig_info.txt.gz", 2: "GSE70138_sig_info.txt.gz"} NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"] def read_gz(name): return pd.read_csv(io.BytesIO(gzip.decompress((LINCS / name).read_bytes())), sep="\t", low_memory=False) def gene_ids_for_space(space: str): g = pd.read_csv(LINCS / "GSE92742_gene_info.txt.gz", sep="\t") if space == "bing": g = g[g.pr_is_bing == 1] # 'all' -> keep everything ids = [str(x) for x in g.pr_gene_id] id_to_symbol = {str(r.pr_gene_id): r.pr_gene_symbol for r in g.itertuples()} return ids, id_to_symbol def extract(space, drug_names, gene_ids, id_to_symbol): from cmapPy.pandasGEXpress.parse import parse frames = [] for ph in (1, 2): sig = read_gz(SIG_INFO[ph]) sig = sig[(sig.pert_type == "trt_cp") & (sig.pert_iname.isin(drug_names))] if sig.empty: continue gct = parse(str(GCTX[ph]), rid=gene_ids, cid=sig.sig_id.tolist()) data = gct.data_df s2d = dict(zip(sig.sig_id, sig.pert_iname)) frames.append(data.T.groupby(data.columns.map(s2d)).mean()) print(f" [{space}] phase {ph}: {sig.pert_iname.nunique()} drugs sliced", flush=True) combined = pd.concat(frames).groupby(level=0).mean() combined.columns = [id_to_symbol.get(c, c) for c in combined.columns] combined = combined.loc[:, ~combined.columns.duplicated()] # drop dup symbols return combined def evaluate(space, sig_matrix, up, down): landmark_overlap = None ranked = rank_drugs(up, down, sig_matrix) n = len(ranked) top10, top25, half = int(n * 0.10), int(n * 0.25), n // 2 profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name") ranked = ranked.join(profiles[["inclusion_reason"]]) hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"]) glut_s = ranked.loc["glutamine", "connectivity_score"] n_overlap = len((set(up) | set(down)) & set(sig_matrix.columns)) negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index} n_bottom = sum(r > half for r in negs.values()) print(f"\n=== gene space: {space.upper()} ({sig_matrix.shape[1]} genes; query overlap {n_overlap}) ===") print(f" hydroxyurea: rank {hu}/{n} (top {100*hu/n:.1f}%) top-10%? {hu <= top10}") print(f" L-glutamine: rank {glut}/{n} (top {100*glut/n:.1f}%), WTCS={glut_s:.3f} top-25%? {glut <= top25}") print(f" neg controls in bottom half: {n_bottom}/5 {negs}") crit = (hu <= top10) and (glut <= top25) and (n_bottom >= 4) print(f" OVERALL: {'PASS' if crit else 'FAIL'}") print(" top 8:") for name, r in ranked.nsmallest(8, "connectivity_score").iterrows(): print(f" {int(r['rank']):2d} {name:18s} {r['connectivity_score']:+.3f} [{r['inclusion_reason']}]") return ranked def main(): sig = json.loads((PROCESSED / "sickle_cell_signature_v1.json").read_text()) up = [g["gene"] for g in sig["up_regulated"]] down = [g["gene"] for g in sig["down_regulated"]] drug_names = set(pd.read_csv(PROCESSED / "drug_set_v1.csv").pert_iname) for space in ("bing", "all"): print(f"\n>>> extracting {space} ...", flush=True) ids, id2sym = gene_ids_for_space(space) mat = extract(space, drug_names, ids, id2sym) evaluate(space, mat, up, down) if __name__ == "__main__": main()