Test whether fixing the cell-composition confound rescues the recovery test. GSE35007 measured WBC/RBC/MCV per sample, so adjust DE directly for them (per-gene OLS: expression ~ disease + WBC + RBC + MCV + age + sex) — a measured-covariate deconvolution, compared vs unadjusted. Finding (negative, informative): adjustment HURT hydroxyurea (rank 20 -> 35, fails top-10%) — it was recovering partly via composition genes — and did NOT fix negative-control specificity (still 2/5). Only gain: top hits become more mechanistically coherent (resveratrol, aspirin enter). Conclusion: cleaning the disease signature does not rescue connectivity; the binding constraint is matching-side (needs a reference-signature library for proper specificity calibration), which is a multi-disease investment. v1.1 signature NOT replaced. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
138 lines
5.9 KiB
Python
138 lines
5.9 KiB
Python
"""Experiment: composition-adjusted sickle signature, to fix specificity (option 1).
|
|
|
|
The v1 signature is confounded by cell composition (SS patients have very different WBC/RBC
|
|
than AA controls). GSE35007 *measured* those counts per sample, so we adjust the differential
|
|
expression for them directly (a measured-covariate alternative to estimated deconvolution):
|
|
|
|
expression ~ disease + WBC + RBC + MCV + age + sex (per gene, vectorized OLS)
|
|
|
|
We compare the composition-ADJUSTED signature against an UNADJUSTED single-study signature
|
|
(same samples, model without the covariates), score both with the v1.1 engine (full gene space
|
|
+ spec_z), and report the recovery test for each. Writes nothing to committed artifacts.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import warnings
|
|
from pathlib import Path
|
|
|
|
import GEOparse
|
|
import numpy as np
|
|
import pandas as pd
|
|
from scipy.stats import false_discovery_control
|
|
from scipy.stats import t as tdist
|
|
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
from src.disease import collapse_probes_to_symbols # noqa: E402
|
|
from src.scoring import tau_calibrate # noqa: E402
|
|
|
|
warnings.filterwarnings("ignore")
|
|
PROCESSED = Path("data/processed")
|
|
NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"]
|
|
SYMBOL_COLS = ["Symbol", "ILMN_Gene", "Gene Symbol", "GeneSymbol"]
|
|
|
|
|
|
def cval(gsm, key):
|
|
for c in gsm.metadata.get("characteristics_ch1", []):
|
|
if c.lower().startswith(key.lower()):
|
|
return c.split(":", 1)[1].strip()
|
|
return None
|
|
|
|
|
|
def load_data():
|
|
gse = GEOparse.get_GEO(geo="GSE35007", destdir="data/raw/geo", silent=True)
|
|
meta = []
|
|
for gid, g in gse.gsms.items():
|
|
meta.append({"gsm": gid, "hb": cval(g, "hb phenotype"), "wbc": cval(g, "white blood cells"),
|
|
"rbc": cval(g, "red blood cells"), "mcv": cval(g, "mean corpuscular volume"),
|
|
"age": cval(g, "age"), "sex": cval(g, "Sex")})
|
|
meta = pd.DataFrame(meta).set_index("gsm")
|
|
for c in ["wbc", "rbc", "mcv", "age"]:
|
|
meta[c] = pd.to_numeric(meta[c], errors="coerce")
|
|
meta["disease"] = meta["hb"].map({"SS": 1.0, "AA": 0.0})
|
|
meta["sex_m"] = (meta["sex"] == "M").astype(float)
|
|
keep = meta[meta["hb"].isin(["SS", "AA"])].dropna(subset=["wbc", "rbc", "mcv", "age", "disease"])
|
|
|
|
expr = pd.DataFrame({gid: gse.gsms[gid].table.set_index("ID_REF")["VALUE"] for gid in keep.index})
|
|
expr = expr.apply(pd.to_numeric, errors="coerce").dropna(how="any")
|
|
if float(np.nanmax(expr.to_numpy())) > 50:
|
|
expr = np.log2(expr.clip(lower=0) + 1.0)
|
|
|
|
gpl = list(gse.gpls.values())[0]
|
|
col = next((c for c in SYMBOL_COLS if c in gpl.table.columns), None)
|
|
sym = gpl.table.set_index("ID")[col].astype(str).str.strip().replace({"": np.nan, "nan": np.nan})
|
|
return expr, keep, sym.dropna()
|
|
|
|
|
|
def ols_de(expr, design, disease_idx):
|
|
"""Vectorized per-gene OLS; return DE table (log_fc=disease coef, pvalue, qvalue)."""
|
|
X = design.to_numpy(dtype=float)
|
|
Y = expr.T.to_numpy(dtype=float) # samples x genes
|
|
n, p = X.shape
|
|
XtX_inv = np.linalg.pinv(X.T @ X)
|
|
B = XtX_inv @ X.T @ Y
|
|
resid = Y - X @ B
|
|
dof = n - p
|
|
sigma2 = (resid ** 2).sum(0) / dof
|
|
se = np.sqrt(sigma2 * XtX_inv[disease_idx, disease_idx])
|
|
t = B[disease_idx] / se
|
|
pval = 2 * tdist.sf(np.abs(t), dof)
|
|
out = pd.DataFrame({"log_fc": B[disease_idx], "pvalue": pval}, index=expr.index).dropna()
|
|
out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh")
|
|
return out
|
|
|
|
|
|
def make_signature(de, sym, expr, top_n=250):
|
|
de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr)
|
|
sig = de_sym[de_sym["qvalue"] < 0.05]
|
|
up = sig[sig["log_fc"] > 0].nsmallest(top_n, "qvalue").index.tolist()
|
|
down = sig[sig["log_fc"] < 0].nsmallest(top_n, "qvalue").index.tolist()
|
|
return up, down
|
|
|
|
|
|
def evaluate(label, up, down, lincs):
|
|
ranked = tau_calibrate(up, down, lincs, n_null=1000)
|
|
n = len(ranked)
|
|
top10, top25, half = int(n * .10), int(n * .25), n // 2
|
|
profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
|
|
ranked = ranked.join(profiles[["inclusion_reason"]])
|
|
hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"])
|
|
negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index}
|
|
n_bottom = sum(r > half for r in negs.values())
|
|
n_ov = len((set(up) | set(down)) & set(lincs.columns))
|
|
print(f"\n### {label}: {len(up)} up / {len(down)} down (query overlap {n_ov})")
|
|
print(f" hydroxyurea rank {hu}/{n} (top {100*hu/n:.1f}%) top-10%? {hu <= top10}")
|
|
print(f" L-glutamine rank {glut}/{n} (top {100*glut/n:.1f}%) top-25%? {glut <= top25}")
|
|
print(f" neg controls bottom-half: {n_bottom}/5 {negs}")
|
|
print(" top 8: " + ", ".join(
|
|
f"{name}[{r['inclusion_reason'][:3]}]" for name, r in ranked.nsmallest(8, "spec_z").iterrows()))
|
|
return ranked
|
|
|
|
|
|
def main():
|
|
expr, meta, sym = load_data()
|
|
print(f"loaded {expr.shape[1]} samples x {expr.shape[0]} probes; "
|
|
f"{int(meta.disease.sum())} SS / {int((meta.disease==0).sum())} AA")
|
|
lincs = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet")
|
|
|
|
base = pd.DataFrame({"intercept": 1.0, "disease": meta["disease"]}, index=meta.index)
|
|
adj = base.assign(wbc=meta["wbc"], rbc=meta["rbc"], mcv=meta["mcv"], age=meta["age"], sex_m=meta["sex_m"])
|
|
|
|
de_unadj = ols_de(expr, base, disease_idx=1)
|
|
de_adj = ols_de(expr, adj, disease_idx=1)
|
|
|
|
up_u, dn_u = make_signature(de_unadj, sym, expr)
|
|
up_a, dn_a = make_signature(de_adj, sym, expr)
|
|
|
|
# how much does adjustment change the gene set?
|
|
overlap = len((set(up_u) | set(dn_u)) & (set(up_a) | set(dn_a)))
|
|
print(f"\nsignature gene overlap unadjusted vs adjusted: {overlap}/{len(set(up_u)|set(dn_u))}")
|
|
|
|
evaluate("UNADJUSTED (GSE35007 only)", up_u, dn_u, lincs)
|
|
evaluate("COMPOSITION-ADJUSTED", up_a, dn_a, lincs)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|