"""Experiment: composition-adjusted sickle signature, to fix specificity (option 1). The v1 signature is confounded by cell composition (SS patients have very different WBC/RBC than AA controls). GSE35007 *measured* those counts per sample, so we adjust the differential expression for them directly (a measured-covariate alternative to estimated deconvolution): expression ~ disease + WBC + RBC + MCV + age + sex (per gene, vectorized OLS) We compare the composition-ADJUSTED signature against an UNADJUSTED single-study signature (same samples, model without the covariates), score both with the v1.1 engine (full gene space + spec_z), and report the recovery test for each. Writes nothing to committed artifacts. """ from __future__ import annotations import warnings from pathlib import Path import GEOparse import numpy as np import pandas as pd from scipy.stats import false_discovery_control from scipy.stats import t as tdist import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.disease import collapse_probes_to_symbols # noqa: E402 from src.scoring import tau_calibrate # noqa: E402 warnings.filterwarnings("ignore") PROCESSED = Path("data/processed") NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"] SYMBOL_COLS = ["Symbol", "ILMN_Gene", "Gene Symbol", "GeneSymbol"] def cval(gsm, key): for c in gsm.metadata.get("characteristics_ch1", []): if c.lower().startswith(key.lower()): return c.split(":", 1)[1].strip() return None def load_data(): gse = GEOparse.get_GEO(geo="GSE35007", destdir="data/raw/geo", silent=True) meta = [] for gid, g in gse.gsms.items(): meta.append({"gsm": gid, "hb": cval(g, "hb phenotype"), "wbc": cval(g, "white blood cells"), "rbc": cval(g, "red blood cells"), "mcv": cval(g, "mean corpuscular volume"), "age": cval(g, "age"), "sex": cval(g, "Sex")}) meta = pd.DataFrame(meta).set_index("gsm") for c in ["wbc", "rbc", "mcv", "age"]: meta[c] = pd.to_numeric(meta[c], errors="coerce") meta["disease"] = meta["hb"].map({"SS": 1.0, "AA": 0.0}) meta["sex_m"] = (meta["sex"] == "M").astype(float) keep = meta[meta["hb"].isin(["SS", "AA"])].dropna(subset=["wbc", "rbc", "mcv", "age", "disease"]) expr = pd.DataFrame({gid: gse.gsms[gid].table.set_index("ID_REF")["VALUE"] for gid in keep.index}) expr = expr.apply(pd.to_numeric, errors="coerce").dropna(how="any") if float(np.nanmax(expr.to_numpy())) > 50: expr = np.log2(expr.clip(lower=0) + 1.0) gpl = list(gse.gpls.values())[0] col = next((c for c in SYMBOL_COLS if c in gpl.table.columns), None) sym = gpl.table.set_index("ID")[col].astype(str).str.strip().replace({"": np.nan, "nan": np.nan}) return expr, keep, sym.dropna() def ols_de(expr, design, disease_idx): """Vectorized per-gene OLS; return DE table (log_fc=disease coef, pvalue, qvalue).""" X = design.to_numpy(dtype=float) Y = expr.T.to_numpy(dtype=float) # samples x genes n, p = X.shape XtX_inv = np.linalg.pinv(X.T @ X) B = XtX_inv @ X.T @ Y resid = Y - X @ B dof = n - p sigma2 = (resid ** 2).sum(0) / dof se = np.sqrt(sigma2 * XtX_inv[disease_idx, disease_idx]) t = B[disease_idx] / se pval = 2 * tdist.sf(np.abs(t), dof) out = pd.DataFrame({"log_fc": B[disease_idx], "pvalue": pval}, index=expr.index).dropna() out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh") return out def make_signature(de, sym, expr, top_n=250): de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr) sig = de_sym[de_sym["qvalue"] < 0.05] up = sig[sig["log_fc"] > 0].nsmallest(top_n, "qvalue").index.tolist() down = sig[sig["log_fc"] < 0].nsmallest(top_n, "qvalue").index.tolist() return up, down def evaluate(label, up, down, lincs): ranked = tau_calibrate(up, down, lincs, n_null=1000) n = len(ranked) top10, top25, half = int(n * .10), int(n * .25), n // 2 profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name") ranked = ranked.join(profiles[["inclusion_reason"]]) hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"]) negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index} n_bottom = sum(r > half for r in negs.values()) n_ov = len((set(up) | set(down)) & set(lincs.columns)) print(f"\n### {label}: {len(up)} up / {len(down)} down (query overlap {n_ov})") print(f" hydroxyurea rank {hu}/{n} (top {100*hu/n:.1f}%) top-10%? {hu <= top10}") print(f" L-glutamine rank {glut}/{n} (top {100*glut/n:.1f}%) top-25%? {glut <= top25}") print(f" neg controls bottom-half: {n_bottom}/5 {negs}") print(" top 8: " + ", ".join( f"{name}[{r['inclusion_reason'][:3]}]" for name, r in ranked.nsmallest(8, "spec_z").iterrows())) return ranked def main(): expr, meta, sym = load_data() print(f"loaded {expr.shape[1]} samples x {expr.shape[0]} probes; " f"{int(meta.disease.sum())} SS / {int((meta.disease==0).sum())} AA") lincs = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") base = pd.DataFrame({"intercept": 1.0, "disease": meta["disease"]}, index=meta.index) adj = base.assign(wbc=meta["wbc"], rbc=meta["rbc"], mcv=meta["mcv"], age=meta["age"], sex_m=meta["sex_m"]) de_unadj = ols_de(expr, base, disease_idx=1) de_adj = ols_de(expr, adj, disease_idx=1) up_u, dn_u = make_signature(de_unadj, sym, expr) up_a, dn_a = make_signature(de_adj, sym, expr) # how much does adjustment change the gene set? overlap = len((set(up_u) | set(dn_u)) & (set(up_a) | set(dn_a))) print(f"\nsignature gene overlap unadjusted vs adjusted: {overlap}/{len(set(up_u)|set(dn_u))}") evaluate("UNADJUSTED (GSE35007 only)", up_u, dn_u, lincs) evaluate("COMPOSITION-ADJUSTED", up_a, dn_a, lincs) if __name__ == "__main__": main()