diff --git a/scripts/exp_deconv_signature.py b/scripts/exp_deconv_signature.py new file mode 100644 index 0000000..30aa00e --- /dev/null +++ b/scripts/exp_deconv_signature.py @@ -0,0 +1,137 @@ +"""Experiment: composition-adjusted sickle signature, to fix specificity (option 1). + +The v1 signature is confounded by cell composition (SS patients have very different WBC/RBC +than AA controls). GSE35007 *measured* those counts per sample, so we adjust the differential +expression for them directly (a measured-covariate alternative to estimated deconvolution): + + expression ~ disease + WBC + RBC + MCV + age + sex (per gene, vectorized OLS) + +We compare the composition-ADJUSTED signature against an UNADJUSTED single-study signature +(same samples, model without the covariates), score both with the v1.1 engine (full gene space ++ spec_z), and report the recovery test for each. Writes nothing to committed artifacts. +""" + +from __future__ import annotations + +import warnings +from pathlib import Path + +import GEOparse +import numpy as np +import pandas as pd +from scipy.stats import false_discovery_control +from scipy.stats import t as tdist + +import sys +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from src.disease import collapse_probes_to_symbols # noqa: E402 +from src.scoring import tau_calibrate # noqa: E402 + +warnings.filterwarnings("ignore") +PROCESSED = Path("data/processed") +NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"] +SYMBOL_COLS = ["Symbol", "ILMN_Gene", "Gene Symbol", "GeneSymbol"] + + +def cval(gsm, key): + for c in gsm.metadata.get("characteristics_ch1", []): + if c.lower().startswith(key.lower()): + return c.split(":", 1)[1].strip() + return None + + +def load_data(): + gse = GEOparse.get_GEO(geo="GSE35007", destdir="data/raw/geo", silent=True) + meta = [] + for gid, g in gse.gsms.items(): + meta.append({"gsm": gid, "hb": cval(g, "hb phenotype"), "wbc": cval(g, "white blood cells"), + "rbc": cval(g, "red blood cells"), "mcv": cval(g, "mean corpuscular volume"), + "age": cval(g, "age"), "sex": cval(g, "Sex")}) + meta = pd.DataFrame(meta).set_index("gsm") + for c in ["wbc", "rbc", "mcv", "age"]: + meta[c] = pd.to_numeric(meta[c], errors="coerce") + meta["disease"] = meta["hb"].map({"SS": 1.0, "AA": 0.0}) + meta["sex_m"] = (meta["sex"] == "M").astype(float) + keep = meta[meta["hb"].isin(["SS", "AA"])].dropna(subset=["wbc", "rbc", "mcv", "age", "disease"]) + + expr = pd.DataFrame({gid: gse.gsms[gid].table.set_index("ID_REF")["VALUE"] for gid in keep.index}) + expr = expr.apply(pd.to_numeric, errors="coerce").dropna(how="any") + if float(np.nanmax(expr.to_numpy())) > 50: + expr = np.log2(expr.clip(lower=0) + 1.0) + + gpl = list(gse.gpls.values())[0] + col = next((c for c in SYMBOL_COLS if c in gpl.table.columns), None) + sym = gpl.table.set_index("ID")[col].astype(str).str.strip().replace({"": np.nan, "nan": np.nan}) + return expr, keep, sym.dropna() + + +def ols_de(expr, design, disease_idx): + """Vectorized per-gene OLS; return DE table (log_fc=disease coef, pvalue, qvalue).""" + X = design.to_numpy(dtype=float) + Y = expr.T.to_numpy(dtype=float) # samples x genes + n, p = X.shape + XtX_inv = np.linalg.pinv(X.T @ X) + B = XtX_inv @ X.T @ Y + resid = Y - X @ B + dof = n - p + sigma2 = (resid ** 2).sum(0) / dof + se = np.sqrt(sigma2 * XtX_inv[disease_idx, disease_idx]) + t = B[disease_idx] / se + pval = 2 * tdist.sf(np.abs(t), dof) + out = pd.DataFrame({"log_fc": B[disease_idx], "pvalue": pval}, index=expr.index).dropna() + out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh") + return out + + +def make_signature(de, sym, expr, top_n=250): + de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr) + sig = de_sym[de_sym["qvalue"] < 0.05] + up = sig[sig["log_fc"] > 0].nsmallest(top_n, "qvalue").index.tolist() + down = sig[sig["log_fc"] < 0].nsmallest(top_n, "qvalue").index.tolist() + return up, down + + +def evaluate(label, up, down, lincs): + ranked = tau_calibrate(up, down, lincs, n_null=1000) + n = len(ranked) + top10, top25, half = int(n * .10), int(n * .25), n // 2 + profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name") + ranked = ranked.join(profiles[["inclusion_reason"]]) + hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"]) + negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index} + n_bottom = sum(r > half for r in negs.values()) + n_ov = len((set(up) | set(down)) & set(lincs.columns)) + print(f"\n### {label}: {len(up)} up / {len(down)} down (query overlap {n_ov})") + print(f" hydroxyurea rank {hu}/{n} (top {100*hu/n:.1f}%) top-10%? {hu <= top10}") + print(f" L-glutamine rank {glut}/{n} (top {100*glut/n:.1f}%) top-25%? {glut <= top25}") + print(f" neg controls bottom-half: {n_bottom}/5 {negs}") + print(" top 8: " + ", ".join( + f"{name}[{r['inclusion_reason'][:3]}]" for name, r in ranked.nsmallest(8, "spec_z").iterrows())) + return ranked + + +def main(): + expr, meta, sym = load_data() + print(f"loaded {expr.shape[1]} samples x {expr.shape[0]} probes; " + f"{int(meta.disease.sum())} SS / {int((meta.disease==0).sum())} AA") + lincs = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") + + base = pd.DataFrame({"intercept": 1.0, "disease": meta["disease"]}, index=meta.index) + adj = base.assign(wbc=meta["wbc"], rbc=meta["rbc"], mcv=meta["mcv"], age=meta["age"], sex_m=meta["sex_m"]) + + de_unadj = ols_de(expr, base, disease_idx=1) + de_adj = ols_de(expr, adj, disease_idx=1) + + up_u, dn_u = make_signature(de_unadj, sym, expr) + up_a, dn_a = make_signature(de_adj, sym, expr) + + # how much does adjustment change the gene set? + overlap = len((set(up_u) | set(dn_u)) & (set(up_a) | set(dn_a))) + print(f"\nsignature gene overlap unadjusted vs adjusted: {overlap}/{len(set(up_u)|set(dn_u))}") + + evaluate("UNADJUSTED (GSE35007 only)", up_u, dn_u, lincs) + evaluate("COMPOSITION-ADJUSTED", up_a, dn_a, lincs) + + +if __name__ == "__main__": + main()