Reverso/scripts/exp_genespace.py

"""Experiment (v1.1): re-score on a larger LINCS gene space and re-run the recovery test.

v1 used only the 978 landmark genes (12% signature overlap). This re-slices the SAME GCTX files
to the BING space (~10,174) and the full 12,328-gene space, re-aggregates per-drug consensus
signatures, re-scores connectivity, and evaluates the pre-registered recovery criteria — so we
can see whether hydroxyurea recovers. Writes nothing to the committed v1 artifacts.
"""

from __future__ import annotations

import gzip
import io
import json
from pathlib import Path

import pandas as pd

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.scoring import rank_drugs  # noqa: E402

LINCS = Path("data/raw/lincs")
PROCESSED = Path("data/processed")
GCTX = {1: LINCS / "phase1_level5.gctx", 2: LINCS / "phase2_level5.gctx"}
SIG_INFO = {1: "GSE92742_sig_info.txt.gz", 2: "GSE70138_sig_info.txt.gz"}
NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"]


def read_gz(name):
    return pd.read_csv(io.BytesIO(gzip.decompress((LINCS / name).read_bytes())), sep="\t", low_memory=False)


def gene_ids_for_space(space: str):
    g = pd.read_csv(LINCS / "GSE92742_gene_info.txt.gz", sep="\t")
    if space == "bing":
        g = g[g.pr_is_bing == 1]
    # 'all' -> keep everything
    ids = [str(x) for x in g.pr_gene_id]
    id_to_symbol = {str(r.pr_gene_id): r.pr_gene_symbol for r in g.itertuples()}
    return ids, id_to_symbol


def extract(space, drug_names, gene_ids, id_to_symbol):
    from cmapPy.pandasGEXpress.parse import parse
    frames = []
    for ph in (1, 2):
        sig = read_gz(SIG_INFO[ph])
        sig = sig[(sig.pert_type == "trt_cp") & (sig.pert_iname.isin(drug_names))]
        if sig.empty:
            continue
        gct = parse(str(GCTX[ph]), rid=gene_ids, cid=sig.sig_id.tolist())
        data = gct.data_df
        s2d = dict(zip(sig.sig_id, sig.pert_iname))
        frames.append(data.T.groupby(data.columns.map(s2d)).mean())
        print(f"    [{space}] phase {ph}: {sig.pert_iname.nunique()} drugs sliced", flush=True)
    combined = pd.concat(frames).groupby(level=0).mean()
    combined.columns = [id_to_symbol.get(c, c) for c in combined.columns]
    combined = combined.loc[:, ~combined.columns.duplicated()]  # drop dup symbols
    return combined


def evaluate(space, sig_matrix, up, down):
    landmark_overlap = None
    ranked = rank_drugs(up, down, sig_matrix)
    n = len(ranked)
    top10, top25, half = int(n * 0.10), int(n * 0.25), n // 2
    profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
    ranked = ranked.join(profiles[["inclusion_reason"]])

    hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"])
    glut_s = ranked.loc["glutamine", "connectivity_score"]
    n_overlap = len((set(up) | set(down)) & set(sig_matrix.columns))
    negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index}
    n_bottom = sum(r > half for r in negs.values())

    print(f"\n=== gene space: {space.upper()} ({sig_matrix.shape[1]} genes; query overlap {n_overlap}) ===")
    print(f"  hydroxyurea: rank {hu}/{n} (top {100*hu/n:.1f}%)  top-10%? {hu <= top10}")
    print(f"  L-glutamine: rank {glut}/{n} (top {100*glut/n:.1f}%), WTCS={glut_s:.3f}  top-25%? {glut <= top25}")
    print(f"  neg controls in bottom half: {n_bottom}/5  {negs}")
    crit = (hu <= top10) and (glut <= top25) and (n_bottom >= 4)
    print(f"  OVERALL: {'PASS' if crit else 'FAIL'}")
    print("  top 8:")
    for name, r in ranked.nsmallest(8, "connectivity_score").iterrows():
        print(f"    {int(r['rank']):2d} {name:18s} {r['connectivity_score']:+.3f} [{r['inclusion_reason']}]")
    return ranked


def main():
    sig = json.loads((PROCESSED / "sickle_cell_signature_v1.json").read_text())
    up = [g["gene"] for g in sig["up_regulated"]]
    down = [g["gene"] for g in sig["down_regulated"]]
    drug_names = set(pd.read_csv(PROCESSED / "drug_set_v1.csv").pert_iname)

    for space in ("bing", "all"):
        print(f"\n>>> extracting {space} ...", flush=True)
        ids, id2sym = gene_ids_for_space(space)
        mat = extract(space, drug_names, ids, id2sym)
        evaluate(space, mat, up, down)


if __name__ == "__main__":
    main()