# Reverso/scripts/exp_deconv_signature.py

"""Experiment: composition-adjusted sickle signature, to fix specificity (option 1).

The v1 signature is confounded by cell composition (SS patients have very different WBC/RBC
than AA controls). GSE35007 *measured* those counts per sample, so we adjust the differential
expression for them directly (a measured-covariate alternative to estimated deconvolution):

    expression ~ disease + WBC + RBC + MCV + age + sex     (per gene, vectorized OLS)

We compare the composition-ADJUSTED signature against an UNADJUSTED single-study signature
(same samples, model without the covariates), score both with the v1.1 engine (full gene space
+ spec_z), and report the recovery test for each. Writes nothing to committed artifacts.
"""

from __future__ import annotations

import warnings
from pathlib import Path

import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import false_discovery_control
from scipy.stats import t as tdist

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.disease import collapse_probes_to_symbols  # noqa: E402
from src.scoring import tau_calibrate  # noqa: E402

# Blanket filter: silences every warning category for the whole process, not just one library.
warnings.filterwarnings("ignore")
# Directory the parquet inputs are read from; relative, so it resolves against the current working directory.
PROCESSED = Path("data/processed")
# Drugs reported as negative controls by evaluate(), which counts how many of them rank in the bottom half.
NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"]
# Candidate gene-symbol column names in the platform table; load_data() uses the first one that is present.
SYMBOL_COLS = ["Symbol", "ILMN_Gene", "Gene Symbol", "GeneSymbol"]


def cval(gsm, key):
    """Return the value of the first sample characteristic whose text starts with *key*.

    Scans ``gsm.metadata["characteristics_ch1"]`` (a missing key is treated as an empty
    list) for entries of the form ``"label: value"``. The match is a case-insensitive
    *prefix* match on the whole entry, so ``key="age"`` also matches ``"age at visit: 9"``.

    Args:
        gsm: object exposing a ``metadata`` mapping (a GEOparse sample in this script).
        key: label prefix to look for.

    Returns:
        The text after the first ``":"`` with surrounding whitespace stripped, or ``None``
        when no entry matches. Matching entries that contain no ``":"`` are skipped rather
        than raising ``IndexError``.
    """
    prefix = key.lower()
    for entry in gsm.metadata.get("characteristics_ch1", []):
        if not entry.lower().startswith(prefix):
            continue
        _label, sep, value = entry.partition(":")
        if sep:  # malformed entry without a "label: value" separator -> keep looking
            return value.strip()
    return None


def load_data():
    """Load GSE35007 expression, per-sample covariates and the probe -> symbol map.

    Downloads (or reuses from ``data/raw/geo``) the GEO series, parses the sample
    characteristics into a covariate table, and assembles the expression matrix for the
    retained samples.

    Returns:
        expr: DataFrame of probes (``ID_REF``) x samples, numeric, with every probe that has
            a missing/non-numeric value in any retained sample dropped. If the largest value
            exceeds 50 the matrix is treated as un-logged and replaced by ``log2(max(x, 0) + 1)``.
        keep: covariate DataFrame indexed by GSM id, restricted to SS and AA samples with
            non-missing wbc, rbc, mcv and age. ``disease`` is 1.0 for SS and 0.0 for AA.
            ``expr`` columns are built from ``keep.index`` in the same order.
        sym: Series mapping platform probe ``ID`` to a non-empty gene symbol string.

    Raises:
        KeyError: if the platform table has none of the ``SYMBOL_COLS`` columns.
    """
    gse = GEOparse.get_GEO(geo="GSE35007", destdir="data/raw/geo", silent=True)
    meta = []
    for gid, g in gse.gsms.items():
        meta.append({"gsm": gid, "hb": cval(g, "hb phenotype"), "wbc": cval(g, "white blood cells"),
                     "rbc": cval(g, "red blood cells"), "mcv": cval(g, "mean corpuscular volume"),
                     "age": cval(g, "age"), "sex": cval(g, "Sex")})
    meta = pd.DataFrame(meta).set_index("gsm")
    # Unparseable numeric covariates become NaN and the sample is dropped below.
    for c in ["wbc", "rbc", "mcv", "age"]:
        meta[c] = pd.to_numeric(meta[c], errors="coerce")
    meta["disease"] = meta["hb"].map({"SS": 1.0, "AA": 0.0})
    # NOTE(review): only the exact string "M" counts as male; a missing or differently coded
    # sex (e.g. "male") silently becomes 0.0 and is NOT filtered out -- confirm the coding.
    meta["sex_m"] = (meta["sex"] == "M").astype(float)
    keep = meta[meta["hb"].isin(["SS", "AA"])].dropna(subset=["wbc", "rbc", "mcv", "age", "disease"])

    expr = pd.DataFrame({gid: gse.gsms[gid].table.set_index("ID_REF")["VALUE"] for gid in keep.index})
    expr = expr.apply(pd.to_numeric, errors="coerce").dropna(how="any")
    # Heuristic: values above 50 cannot be log2 intensities, so log-transform raw data.
    if float(np.nanmax(expr.to_numpy())) > 50:
        expr = np.log2(expr.clip(lower=0) + 1.0)

    gpl = list(gse.gpls.values())[0]
    col = next((c for c in SYMBOL_COLS if c in gpl.table.columns), None)
    if col is None:
        # Previously this surfaced as an opaque ``KeyError: None`` from the column lookup.
        raise KeyError(
            f"platform table has no gene-symbol column; tried {SYMBOL_COLS}, "
            f"found {list(gpl.table.columns)}"
        )
    sym = gpl.table.set_index("ID")[col].astype(str).str.strip().replace({"": np.nan, "nan": np.nan})
    return expr, keep, sym.dropna()


def ols_de(expr, design, disease_idx):
    """Vectorized per-gene OLS; return DE table (log_fc=disease coef, pvalue, qvalue)."""
    X = design.to_numpy(dtype=float)
    Y = expr.T.to_numpy(dtype=float)  # samples x genes
    n, p = X.shape
    XtX_inv = np.linalg.pinv(X.T @ X)
    B = XtX_inv @ X.T @ Y
    resid = Y - X @ B
    dof = n - p
    sigma2 = (resid ** 2).sum(0) / dof
    se = np.sqrt(sigma2 * XtX_inv[disease_idx, disease_idx])
    t = B[disease_idx] / se
    pval = 2 * tdist.sf(np.abs(t), dof)
    out = pd.DataFrame({"log_fc": B[disease_idx], "pvalue": pval}, index=expr.index).dropna()
    out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh")
    return out


def make_signature(de, sym, expr, top_n=250):
    """Build the (up, down) gene lists from a DE table.

    The table is first passed through ``collapse_probes_to_symbols`` (defined in
    ``src.disease``; presumably yields one row per gene symbol -- confirm there), then
    filtered to ``qvalue < 0.05``. Within each sign of ``log_fc`` the ``top_n`` rows with the
    smallest q-value are kept.

    Returns:
        Tuple ``(up, down)`` of index-label lists: positive and negative ``log_fc`` respectively.
    """
    collapsed = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr)
    significant = collapsed.loc[collapsed["qvalue"] < 0.05]

    def _most_significant(direction_mask):
        # Same selection rule for both directions; only the sign mask differs.
        return significant.loc[direction_mask].nsmallest(top_n, "qvalue").index.tolist()

    up_genes = _most_significant(significant["log_fc"] > 0)
    down_genes = _most_significant(significant["log_fc"] < 0)
    return up_genes, down_genes


def evaluate(label, up, down, lincs):
    """Score a signature against the LINCS matrix and print the recovery report.

    Calls ``tau_calibrate(up, down, lincs, n_null=1000)`` and left-joins the
    ``inclusion_reason`` column from ``drug_profiles_v1.parquet`` (indexed by ``name``) onto
    the result. The result is used here as a table indexed by drug name with ``rank`` and
    ``spec_z`` columns -- that schema comes from ``src.scoring`` and is assumed, not checked.

    Printed report:
      * signature sizes and how many signature genes are columns of ``lincs``;
      * rank of hydroxyurea (pass if within the top 10%) and glutamine (top 25%);
      * how many ``NEG5`` drugs present in the ranking fall in the bottom half;
      * the 8 rows with the smallest ``spec_z``, tagged with the first three characters of
        their inclusion reason (``n/a`` when the drug has no entry in the profiles table).

    Args:
        label: heading for the printed block.
        up, down: gene lists of the signature.
        lincs: DataFrame whose columns are compared against the signature genes.

    Returns:
        The ranked table with the ``inclusion_reason`` column added.

    Raises:
        KeyError: if hydroxyurea or glutamine is absent from the ranking.
    """
    ranked = tau_calibrate(up, down, lincs, n_null=1000)
    n = len(ranked)
    top10, top25, half = int(n * .10), int(n * .25), n // 2
    profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
    ranked = ranked.join(profiles[["inclusion_reason"]])
    hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"])
    negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index}
    n_bottom = sum(r > half for r in negs.values())
    n_ov = len((set(up) | set(down)) & set(lincs.columns))

    def _reason_tag(reason):
        # The join above is a left join, so drugs missing from the profiles table carry NaN
        # here; slicing a float would raise TypeError and abort the whole report.
        try:
            return reason[:3]
        except TypeError:
            return "n/a"

    print(f"\n### {label}: {len(up)} up / {len(down)} down (query overlap {n_ov})")
    print(f"  hydroxyurea rank {hu}/{n} (top {100*hu/n:.1f}%)  top-10%? {hu <= top10}")
    print(f"  L-glutamine rank {glut}/{n} (top {100*glut/n:.1f}%)  top-25%? {glut <= top25}")
    # Denominator follows the control panel instead of a hard-coded 5.
    print(f"  neg controls bottom-half: {n_bottom}/{len(NEG5)}  {negs}")
    print("  top 8: " + ", ".join(
        f"{name}[{_reason_tag(r['inclusion_reason'])}]"
        for name, r in ranked.nsmallest(8, "spec_z").iterrows()))
    return ranked


def main():
    """Run the experiment: unadjusted vs composition-adjusted signature, both evaluated.

    Both models are fitted on the same samples. The unadjusted design has an intercept and
    the disease indicator; the adjusted design adds wbc, rbc, mcv, age and sex_m. The disease
    coefficient (column position 1 in both designs) is the tested effect. Results are only
    printed.
    """
    expr, meta, sym = load_data()
    n_samples, n_probes = expr.shape[1], expr.shape[0]
    n_ss = int(meta.disease.sum())
    n_aa = int((meta.disease == 0).sum())
    print(f"loaded {n_samples} samples x {n_probes} probes; {n_ss} SS / {n_aa} AA")
    lincs = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet")

    # Two nested designs over the same rows; the covariates are appended in a fixed order.
    base = pd.DataFrame({"intercept": 1.0, "disease": meta["disease"]}, index=meta.index)
    covariates = ("wbc", "rbc", "mcv", "age", "sex_m")
    adj = base.assign(**{name: meta[name] for name in covariates})

    de_tables = [ols_de(expr, design, disease_idx=1) for design in (base, adj)]
    (up_u, dn_u), (up_a, dn_a) = [make_signature(de, sym, expr) for de in de_tables]

    # how much does adjustment change the gene set?
    genes_unadjusted = set(up_u) | set(dn_u)
    genes_adjusted = set(up_a) | set(dn_a)
    overlap = len(genes_unadjusted & genes_adjusted)
    print(f"\nsignature gene overlap unadjusted vs adjusted: {overlap}/{len(genes_unadjusted)}")

    evaluate("UNADJUSTED (GSE35007 only)", up_u, dn_u, lincs)
    evaluate("COMPOSITION-ADJUSTED", up_a, dn_a, lincs)


# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()