Experiment: composition-adjusted signature (negative result)

Test whether fixing the cell-composition confound rescues the recovery test. GSE35007 measured WBC/RBC/MCV per sample, so adjust DE directly for them (per-gene OLS: expression ~ disease + WBC + RBC + MCV + age + sex) — a measured-covariate deconvolution, compared vs unadjusted. Finding (negative, informative): adjustment HURT hydroxyurea (rank 20 -> 35, fails top-10%) — it was recovering partly via composition genes — and did NOT fix negative-control specificity (still 2/5). Only gain: top hits become more mechanistically coherent (resveratrol, aspirin enter). Conclusion: cleaning the disease signature does not rescue connectivity; the binding constraint is matching-side (needs a reference-signature library for proper specificity calibration), which is a multi-disease investment. v1.1 signature NOT replaced. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 23:05:08 +02:00
parent 3417f85eb1
commit b62048614d
1 changed files with 137 additions and 0 deletions
--- a/scripts/exp_deconv_signature.py
+++ b/scripts/exp_deconv_signature.py
@@ -0,0 +1,137 @@
+"""Experiment: composition-adjusted sickle signature, to fix specificity (option 1).
+
+The v1 signature is confounded by cell composition (SS patients have very different WBC/RBC
+than AA controls). GSE35007 *measured* those counts per sample, so we adjust the differential
+expression for them directly (a measured-covariate alternative to estimated deconvolution):
+
+    expression ~ disease + WBC + RBC + MCV + age + sex     (per gene, vectorized OLS)
+
+We compare the composition-ADJUSTED signature against an UNADJUSTED single-study signature
+(same samples, model without the covariates), score both with the v1.1 engine (full gene space
+ spec_z), and report the recovery test for each. Writes nothing to committed artifacts.
+"""
+
+from __future__ import annotations
+
+import warnings
+from pathlib import Path
+
+import GEOparse
+import numpy as np
+import pandas as pd
+from scipy.stats import false_discovery_control
+from scipy.stats import t as tdist
+
+import sys
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.disease import collapse_probes_to_symbols  # noqa: E402
+from src.scoring import tau_calibrate  # noqa: E402
+
+warnings.filterwarnings("ignore")
+PROCESSED = Path("data/processed")
+NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"]
+SYMBOL_COLS = ["Symbol", "ILMN_Gene", "Gene Symbol", "GeneSymbol"]
+
+
+def cval(gsm, key):
+    for c in gsm.metadata.get("characteristics_ch1", []):
+        if c.lower().startswith(key.lower()):
+            return c.split(":", 1)[1].strip()
+    return None
+
+
+def load_data():
+    gse = GEOparse.get_GEO(geo="GSE35007", destdir="data/raw/geo", silent=True)
+    meta = []
+    for gid, g in gse.gsms.items():
+        meta.append({"gsm": gid, "hb": cval(g, "hb phenotype"), "wbc": cval(g, "white blood cells"),
+                     "rbc": cval(g, "red blood cells"), "mcv": cval(g, "mean corpuscular volume"),
+                     "age": cval(g, "age"), "sex": cval(g, "Sex")})
+    meta = pd.DataFrame(meta).set_index("gsm")
+    for c in ["wbc", "rbc", "mcv", "age"]:
+        meta[c] = pd.to_numeric(meta[c], errors="coerce")
+    meta["disease"] = meta["hb"].map({"SS": 1.0, "AA": 0.0})
+    meta["sex_m"] = (meta["sex"] == "M").astype(float)
+    keep = meta[meta["hb"].isin(["SS", "AA"])].dropna(subset=["wbc", "rbc", "mcv", "age", "disease"])
+
+    expr = pd.DataFrame({gid: gse.gsms[gid].table.set_index("ID_REF")["VALUE"] for gid in keep.index})
+    expr = expr.apply(pd.to_numeric, errors="coerce").dropna(how="any")
+    if float(np.nanmax(expr.to_numpy())) > 50:
+        expr = np.log2(expr.clip(lower=0) + 1.0)
+
+    gpl = list(gse.gpls.values())[0]
+    col = next((c for c in SYMBOL_COLS if c in gpl.table.columns), None)
+    sym = gpl.table.set_index("ID")[col].astype(str).str.strip().replace({"": np.nan, "nan": np.nan})
+    return expr, keep, sym.dropna()
+
+
+def ols_de(expr, design, disease_idx):
+    """Vectorized per-gene OLS; return DE table (log_fc=disease coef, pvalue, qvalue)."""
+    X = design.to_numpy(dtype=float)
+    Y = expr.T.to_numpy(dtype=float)  # samples x genes
+    n, p = X.shape
+    XtX_inv = np.linalg.pinv(X.T @ X)
+    B = XtX_inv @ X.T @ Y
+    resid = Y - X @ B
+    dof = n - p
+    sigma2 = (resid ** 2).sum(0) / dof
+    se = np.sqrt(sigma2 * XtX_inv[disease_idx, disease_idx])
+    t = B[disease_idx] / se
+    pval = 2 * tdist.sf(np.abs(t), dof)
+    out = pd.DataFrame({"log_fc": B[disease_idx], "pvalue": pval}, index=expr.index).dropna()
+    out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh")
+    return out
+
+
+def make_signature(de, sym, expr, top_n=250):
+    de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr)
+    sig = de_sym[de_sym["qvalue"] < 0.05]
+    up = sig[sig["log_fc"] > 0].nsmallest(top_n, "qvalue").index.tolist()
+    down = sig[sig["log_fc"] < 0].nsmallest(top_n, "qvalue").index.tolist()
+    return up, down
+
+
+def evaluate(label, up, down, lincs):
+    ranked = tau_calibrate(up, down, lincs, n_null=1000)
+    n = len(ranked)
+    top10, top25, half = int(n * .10), int(n * .25), n // 2
+    profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
+    ranked = ranked.join(profiles[["inclusion_reason"]])
+    hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"])
+    negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index}
+    n_bottom = sum(r > half for r in negs.values())
+    n_ov = len((set(up) | set(down)) & set(lincs.columns))
+    print(f"\n### {label}: {len(up)} up / {len(down)} down (query overlap {n_ov})")
+    print(f"  hydroxyurea rank {hu}/{n} (top {100*hu/n:.1f}%)  top-10%? {hu <= top10}")
+    print(f"  L-glutamine rank {glut}/{n} (top {100*glut/n:.1f}%)  top-25%? {glut <= top25}")
+    print(f"  neg controls bottom-half: {n_bottom}/5  {negs}")
+    print("  top 8: " + ", ".join(
+        f"{name}[{r['inclusion_reason'][:3]}]" for name, r in ranked.nsmallest(8, "spec_z").iterrows()))
+    return ranked
+
+
+def main():
+    expr, meta, sym = load_data()
+    print(f"loaded {expr.shape[1]} samples x {expr.shape[0]} probes; "
+          f"{int(meta.disease.sum())} SS / {int((meta.disease==0).sum())} AA")
+    lincs = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet")
+
+    base = pd.DataFrame({"intercept": 1.0, "disease": meta["disease"]}, index=meta.index)
+    adj = base.assign(wbc=meta["wbc"], rbc=meta["rbc"], mcv=meta["mcv"], age=meta["age"], sex_m=meta["sex_m"])
+
+    de_unadj = ols_de(expr, base, disease_idx=1)
+    de_adj = ols_de(expr, adj, disease_idx=1)
+
+    up_u, dn_u = make_signature(de_unadj, sym, expr)
+    up_a, dn_a = make_signature(de_adj, sym, expr)
+
+    # how much does adjustment change the gene set?
+    overlap = len((set(up_u) | set(dn_u)) & (set(up_a) | set(dn_a)))
+    print(f"\nsignature gene overlap unadjusted vs adjusted: {overlap}/{len(set(up_u)|set(dn_u))}")
+
+    evaluate("UNADJUSTED (GSE35007 only)", up_u, dn_u, lincs)
+    evaluate("COMPOSITION-ADJUSTED", up_a, dn_a, lincs)
+
+
+if __name__ == "__main__":
+    main()