Week 1: Tier-A sickle cell signature via 2-study concordance
Implement and run the Week 1 disease-signature pipeline: - src/disease.py: Welch t-test + BH DE (microarray), probe->symbol collapse, cross-study concordance filter, 2-study provenance schema - scripts/week1_explore.py: download GSE35007 + GSE16728, DE + concordance - scripts/week1_finalize.py: mygene ID mapping + persist signature - tests/test_disease.py: synthetic-data tests for DE/collapse/concordance - docs/data_sources.md: chosen datasets, group defs, reproduction steps Result: sickle_cell_signature_v1.json (gitignored), Tier A, 250 up / 227 down genes from 671 concordant (GSE35007 Illumina whole blood SS/AA + GSE16728 Affymetrix whole blood patient/control). Documented caveats: missing HbF axis (globin depletion) and reticulocyte composition confound. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
139
scripts/week1_explore.py
Normal file
139
scripts/week1_explore.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""Week 1 exploratory run: build per-study DE + concordance for the whole-blood pair.
|
||||
|
||||
Read-only decision aid (NOT the final pipeline): downloads GSE35007 + GSE16728, runs DE on
|
||||
each, collapses to HGNC symbols, computes concordance, and reports whether the sanity-gate
|
||||
genes survive — so we can choose Tier A (concordance pair) vs Tier B (GSE35007 alone) with
|
||||
real numbers. Intermediate DE tables are cached under data/raw/geo/.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import GEOparse
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from src.disease import ( # noqa: E402
|
||||
DISEASE_LABEL,
|
||||
HEALTHY_LABEL,
|
||||
collapse_probes_to_symbols,
|
||||
compute_differential_expression,
|
||||
concordance_filter,
|
||||
)
|
||||
|
||||
GEO_DIR = Path("data/raw/geo")
|
||||
GEO_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
SANITY_GENES = ["HBG1", "HBG2", "ALAS2", "CA1", "AHSP", "SLC4A1", "HBB", "ERAF"]
|
||||
SYMBOL_COL_CANDIDATES = [
|
||||
"Symbol", "ILMN_Gene", "Gene Symbol", "GENE_SYMBOL", "Gene symbol",
|
||||
"gene_assignment", "GeneSymbol",
|
||||
]
|
||||
|
||||
|
||||
def log(msg: str) -> None:
|
||||
print(msg, flush=True)
|
||||
|
||||
|
||||
def get_symbol_map(gpl) -> pd.Series:
|
||||
tbl = gpl.table
|
||||
col = next((c for c in SYMBOL_COL_CANDIDATES if c in tbl.columns), None)
|
||||
if col is None:
|
||||
raise RuntimeError(f"No symbol column in GPL {gpl.name}; columns={list(tbl.columns)[:15]}")
|
||||
s = tbl.set_index("ID")[col].astype(str).str.strip()
|
||||
# gene_assignment fields are '// SYMBOL // ...'; take the first real token.
|
||||
if col == "gene_assignment":
|
||||
s = s.str.split("//").str[1].str.strip()
|
||||
s = s.replace({"": np.nan, "nan": np.nan, "---": np.nan})
|
||||
return s.dropna()
|
||||
|
||||
|
||||
def build_expression(gse, sample_ids: list[str]) -> pd.DataFrame:
|
||||
cols = {}
|
||||
for gsm_id in sample_ids:
|
||||
tbl = gse.gsms[gsm_id].table
|
||||
cols[gsm_id] = tbl.set_index("ID_REF")["VALUE"]
|
||||
return pd.DataFrame(cols)
|
||||
|
||||
|
||||
def char_value(gsm, key: str) -> str | None:
|
||||
for c in gsm.metadata.get("characteristics_ch1", []):
|
||||
if c.lower().startswith(key.lower()):
|
||||
return c.split(":", 1)[1].strip()
|
||||
return None
|
||||
|
||||
|
||||
def run_study(gse, groups: dict[str, str], label: str) -> pd.DataFrame:
|
||||
"""groups: gsm_id -> 'disease'/'healthy'. Returns symbol-level DE table."""
|
||||
sample_ids = list(groups.keys())
|
||||
log(f"[{label}] {len(sample_ids)} samples "
|
||||
f"({sum(v==DISEASE_LABEL for v in groups.values())} disease / "
|
||||
f"{sum(v==HEALTHY_LABEL for v in groups.values())} healthy)")
|
||||
expr = build_expression(gse, sample_ids).apply(pd.to_numeric, errors="coerce").dropna(how="all")
|
||||
sample_groups = pd.Series(groups)
|
||||
de = compute_differential_expression(expr, sample_groups, method="welch")
|
||||
gpl = list(gse.gpls.values())[0]
|
||||
sym = get_symbol_map(gpl)
|
||||
de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr)
|
||||
log(f"[{label}] probes->symbols: {len(de)} probes -> {len(de_sym)} symbols; "
|
||||
f"sig q<0.05: {(de_sym['qvalue']<0.05).sum()}")
|
||||
de_sym.to_parquet(GEO_DIR / f"de_{label}.parquet")
|
||||
return de_sym
|
||||
|
||||
|
||||
def main() -> None:
|
||||
# --- GSE35007: whole blood, hb phenotype SS (disease) vs AA (healthy) ---
|
||||
log("downloading GSE35007 ...")
|
||||
gse1 = GEOparse.get_GEO(geo="GSE35007", destdir=str(GEO_DIR), silent=True)
|
||||
g1 = {}
|
||||
for gsm_id, gsm in gse1.gsms.items():
|
||||
hb = char_value(gsm, "hb phenotype")
|
||||
if hb == "SS":
|
||||
g1[gsm_id] = DISEASE_LABEL
|
||||
elif hb == "AA":
|
||||
g1[gsm_id] = HEALTHY_LABEL
|
||||
de1 = run_study(gse1, g1, "GSE35007")
|
||||
|
||||
# --- GSE16728: whole blood only (PAXgene preps; exclude PBMC), patient vs control ---
|
||||
log("downloading GSE16728 ...")
|
||||
gse2 = GEOparse.get_GEO(geo="GSE16728", destdir=str(GEO_DIR), silent=True)
|
||||
g2 = {}
|
||||
for gsm_id, gsm in gse2.gsms.items():
|
||||
prep = char_value(gsm, "rna prep") or ""
|
||||
subj = char_value(gsm, "subject") or ""
|
||||
if "PBMC" in prep:
|
||||
continue # whole-blood only
|
||||
if subj.lower().startswith("sickle"):
|
||||
g2[gsm_id] = DISEASE_LABEL
|
||||
elif subj.lower().startswith("control"):
|
||||
g2[gsm_id] = HEALTHY_LABEL
|
||||
de2 = run_study(gse2, g2, "GSE16728")
|
||||
|
||||
# --- Concordance ---
|
||||
keep, summary = concordance_filter(de1, de2)
|
||||
log("\n==== CONCORDANCE (whole-blood pair) ====")
|
||||
log(f"genes tested in both: {summary.n_genes_tested}")
|
||||
log(f"concordant (q<0.05 both + same sign): {summary.n_concordant} "
|
||||
f"(up={summary.n_up}, down={summary.n_down})")
|
||||
keep.to_parquet(GEO_DIR / "concordant_genes.parquet")
|
||||
|
||||
log("\n==== SANITY-GATE GENES ====")
|
||||
for g in SANITY_GENES:
|
||||
in1 = g in de1.index
|
||||
in2 = g in de2.index
|
||||
row1 = de1.loc[g] if in1 else None
|
||||
row2 = de2.loc[g] if in2 else None
|
||||
concord = "YES" if g in keep.index else "no"
|
||||
d1 = f"lfc={row1['log_fc']:+.2f},q={row1['qvalue']:.1e}" if in1 else "absent"
|
||||
d2 = f"lfc={row2['log_fc']:+.2f},q={row2['qvalue']:.1e}" if in2 else "absent"
|
||||
log(f" {g:7s} | GSE35007 {d1:28s} | GSE16728 {d2:28s} | concordant={concord}")
|
||||
|
||||
log("\n==== TOP 15 CONCORDANT (by worst-case q) ====")
|
||||
log(keep.head(15).to_string())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user