Experiment: composition-adjusted signature (negative result)

Test whether fixing the cell-composition confound rescues the recovery
test. GSE35007 measured WBC/RBC/MCV per sample, so adjust DE directly
for them (per-gene OLS: expression ~ disease + WBC + RBC + MCV + age +
sex) — a measured-covariate deconvolution, compared vs unadjusted.

Finding (negative, informative): adjustment HURT hydroxyurea (rank
20 -> 35, fails top-10%) — it was recovering partly via composition
genes — and did NOT fix negative-control specificity (still 2/5). Only
gain: top hits become more mechanistically coherent (resveratrol,
aspirin enter). Conclusion: cleaning the disease signature does not
rescue connectivity; the binding constraint is matching-side (needs a
reference-signature library for proper specificity calibration), which
is a multi-disease investment. v1.1 signature NOT replaced.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-23 23:05:08 +02:00
parent 3417f85eb1
commit b62048614d

View File

@@ -0,0 +1,137 @@
"""Experiment: composition-adjusted sickle signature, to fix specificity (option 1).
The v1 signature is confounded by cell composition (SS patients have very different WBC/RBC
than AA controls). GSE35007 *measured* those counts per sample, so we adjust the differential
expression for them directly (a measured-covariate alternative to estimated deconvolution):
expression ~ disease + WBC + RBC + MCV + age + sex (per gene, vectorized OLS)
We compare the composition-ADJUSTED signature against an UNADJUSTED single-study signature
(same samples, model without the covariates), score both with the v1.1 engine (full gene space
+ spec_z), and report the recovery test for each. Writes nothing to committed artifacts.
"""
from __future__ import annotations
import warnings
from pathlib import Path
import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import false_discovery_control
from scipy.stats import t as tdist
import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.disease import collapse_probes_to_symbols # noqa: E402
from src.scoring import tau_calibrate # noqa: E402
warnings.filterwarnings("ignore")
PROCESSED = Path("data/processed")
NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"]
SYMBOL_COLS = ["Symbol", "ILMN_Gene", "Gene Symbol", "GeneSymbol"]
def cval(gsm, key):
for c in gsm.metadata.get("characteristics_ch1", []):
if c.lower().startswith(key.lower()):
return c.split(":", 1)[1].strip()
return None
def load_data():
gse = GEOparse.get_GEO(geo="GSE35007", destdir="data/raw/geo", silent=True)
meta = []
for gid, g in gse.gsms.items():
meta.append({"gsm": gid, "hb": cval(g, "hb phenotype"), "wbc": cval(g, "white blood cells"),
"rbc": cval(g, "red blood cells"), "mcv": cval(g, "mean corpuscular volume"),
"age": cval(g, "age"), "sex": cval(g, "Sex")})
meta = pd.DataFrame(meta).set_index("gsm")
for c in ["wbc", "rbc", "mcv", "age"]:
meta[c] = pd.to_numeric(meta[c], errors="coerce")
meta["disease"] = meta["hb"].map({"SS": 1.0, "AA": 0.0})
meta["sex_m"] = (meta["sex"] == "M").astype(float)
keep = meta[meta["hb"].isin(["SS", "AA"])].dropna(subset=["wbc", "rbc", "mcv", "age", "disease"])
expr = pd.DataFrame({gid: gse.gsms[gid].table.set_index("ID_REF")["VALUE"] for gid in keep.index})
expr = expr.apply(pd.to_numeric, errors="coerce").dropna(how="any")
if float(np.nanmax(expr.to_numpy())) > 50:
expr = np.log2(expr.clip(lower=0) + 1.0)
gpl = list(gse.gpls.values())[0]
col = next((c for c in SYMBOL_COLS if c in gpl.table.columns), None)
sym = gpl.table.set_index("ID")[col].astype(str).str.strip().replace({"": np.nan, "nan": np.nan})
return expr, keep, sym.dropna()
def ols_de(expr, design, disease_idx):
"""Vectorized per-gene OLS; return DE table (log_fc=disease coef, pvalue, qvalue)."""
X = design.to_numpy(dtype=float)
Y = expr.T.to_numpy(dtype=float) # samples x genes
n, p = X.shape
XtX_inv = np.linalg.pinv(X.T @ X)
B = XtX_inv @ X.T @ Y
resid = Y - X @ B
dof = n - p
sigma2 = (resid ** 2).sum(0) / dof
se = np.sqrt(sigma2 * XtX_inv[disease_idx, disease_idx])
t = B[disease_idx] / se
pval = 2 * tdist.sf(np.abs(t), dof)
out = pd.DataFrame({"log_fc": B[disease_idx], "pvalue": pval}, index=expr.index).dropna()
out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh")
return out
def make_signature(de, sym, expr, top_n=250):
de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr)
sig = de_sym[de_sym["qvalue"] < 0.05]
up = sig[sig["log_fc"] > 0].nsmallest(top_n, "qvalue").index.tolist()
down = sig[sig["log_fc"] < 0].nsmallest(top_n, "qvalue").index.tolist()
return up, down
def evaluate(label, up, down, lincs):
ranked = tau_calibrate(up, down, lincs, n_null=1000)
n = len(ranked)
top10, top25, half = int(n * .10), int(n * .25), n // 2
profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
ranked = ranked.join(profiles[["inclusion_reason"]])
hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"])
negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index}
n_bottom = sum(r > half for r in negs.values())
n_ov = len((set(up) | set(down)) & set(lincs.columns))
print(f"\n### {label}: {len(up)} up / {len(down)} down (query overlap {n_ov})")
print(f" hydroxyurea rank {hu}/{n} (top {100*hu/n:.1f}%) top-10%? {hu <= top10}")
print(f" L-glutamine rank {glut}/{n} (top {100*glut/n:.1f}%) top-25%? {glut <= top25}")
print(f" neg controls bottom-half: {n_bottom}/5 {negs}")
print(" top 8: " + ", ".join(
f"{name}[{r['inclusion_reason'][:3]}]" for name, r in ranked.nsmallest(8, "spec_z").iterrows()))
return ranked
def main():
expr, meta, sym = load_data()
print(f"loaded {expr.shape[1]} samples x {expr.shape[0]} probes; "
f"{int(meta.disease.sum())} SS / {int((meta.disease==0).sum())} AA")
lincs = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet")
base = pd.DataFrame({"intercept": 1.0, "disease": meta["disease"]}, index=meta.index)
adj = base.assign(wbc=meta["wbc"], rbc=meta["rbc"], mcv=meta["mcv"], age=meta["age"], sex_m=meta["sex_m"])
de_unadj = ols_de(expr, base, disease_idx=1)
de_adj = ols_de(expr, adj, disease_idx=1)
up_u, dn_u = make_signature(de_unadj, sym, expr)
up_a, dn_a = make_signature(de_adj, sym, expr)
# how much does adjustment change the gene set?
overlap = len((set(up_u) | set(dn_u)) & (set(up_a) | set(dn_a)))
print(f"\nsignature gene overlap unadjusted vs adjusted: {overlap}/{len(set(up_u)|set(dn_u))}")
evaluate("UNADJUSTED (GSE35007 only)", up_u, dn_u, lincs)
evaluate("COMPOSITION-ADJUSTED", up_a, dn_a, lincs)
if __name__ == "__main__":
main()