"""Week 1 exploratory run: build per-study DE + concordance for the whole-blood pair. Read-only decision aid (NOT the final pipeline): downloads GSE35007 + GSE16728, runs DE on each, collapses to HGNC symbols, computes concordance, and reports whether the sanity-gate genes survive — so we can choose Tier A (concordance pair) vs Tier B (GSE35007 alone) with real numbers. Intermediate DE tables are cached under data/raw/geo/. """ from __future__ import annotations import sys from pathlib import Path import GEOparse import numpy as np import pandas as pd sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.disease import ( # noqa: E402 DISEASE_LABEL, HEALTHY_LABEL, collapse_probes_to_symbols, compute_differential_expression, concordance_filter, ) GEO_DIR = Path("data/raw/geo") GEO_DIR.mkdir(parents=True, exist_ok=True) SANITY_GENES = ["HBG1", "HBG2", "ALAS2", "CA1", "AHSP", "SLC4A1", "HBB", "ERAF"] SYMBOL_COL_CANDIDATES = [ "Symbol", "ILMN_Gene", "Gene Symbol", "GENE_SYMBOL", "Gene symbol", "gene_assignment", "GeneSymbol", ] def log(msg: str) -> None: print(msg, flush=True) def get_symbol_map(gpl) -> pd.Series: tbl = gpl.table col = next((c for c in SYMBOL_COL_CANDIDATES if c in tbl.columns), None) if col is None: raise RuntimeError(f"No symbol column in GPL {gpl.name}; columns={list(tbl.columns)[:15]}") s = tbl.set_index("ID")[col].astype(str).str.strip() # gene_assignment fields are '// SYMBOL // ...'; take the first real token. if col == "gene_assignment": s = s.str.split("//").str[1].str.strip() s = s.replace({"": np.nan, "nan": np.nan, "---": np.nan}) return s.dropna() def build_expression(gse, sample_ids: list[str]) -> pd.DataFrame: cols = {} for gsm_id in sample_ids: tbl = gse.gsms[gsm_id].table cols[gsm_id] = tbl.set_index("ID_REF")["VALUE"] return pd.DataFrame(cols) def char_value(gsm, key: str) -> str | None: for c in gsm.metadata.get("characteristics_ch1", []): if c.lower().startswith(key.lower()): return c.split(":", 1)[1].strip() return None def run_study(gse, groups: dict[str, str], label: str) -> pd.DataFrame: """groups: gsm_id -> 'disease'/'healthy'. Returns symbol-level DE table.""" sample_ids = list(groups.keys()) log(f"[{label}] {len(sample_ids)} samples " f"({sum(v==DISEASE_LABEL for v in groups.values())} disease / " f"{sum(v==HEALTHY_LABEL for v in groups.values())} healthy)") expr = build_expression(gse, sample_ids).apply(pd.to_numeric, errors="coerce").dropna(how="all") sample_groups = pd.Series(groups) de = compute_differential_expression(expr, sample_groups, method="welch") gpl = list(gse.gpls.values())[0] sym = get_symbol_map(gpl) de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr) log(f"[{label}] probes->symbols: {len(de)} probes -> {len(de_sym)} symbols; " f"sig q<0.05: {(de_sym['qvalue']<0.05).sum()}") de_sym.to_parquet(GEO_DIR / f"de_{label}.parquet") return de_sym def main() -> None: # --- GSE35007: whole blood, hb phenotype SS (disease) vs AA (healthy) --- log("downloading GSE35007 ...") gse1 = GEOparse.get_GEO(geo="GSE35007", destdir=str(GEO_DIR), silent=True) g1 = {} for gsm_id, gsm in gse1.gsms.items(): hb = char_value(gsm, "hb phenotype") if hb == "SS": g1[gsm_id] = DISEASE_LABEL elif hb == "AA": g1[gsm_id] = HEALTHY_LABEL de1 = run_study(gse1, g1, "GSE35007") # --- GSE16728: whole blood only (PAXgene preps; exclude PBMC), patient vs control --- log("downloading GSE16728 ...") gse2 = GEOparse.get_GEO(geo="GSE16728", destdir=str(GEO_DIR), silent=True) g2 = {} for gsm_id, gsm in gse2.gsms.items(): prep = char_value(gsm, "rna prep") or "" subj = char_value(gsm, "subject") or "" if "PBMC" in prep: continue # whole-blood only if subj.lower().startswith("sickle"): g2[gsm_id] = DISEASE_LABEL elif subj.lower().startswith("control"): g2[gsm_id] = HEALTHY_LABEL de2 = run_study(gse2, g2, "GSE16728") # --- Concordance --- keep, summary = concordance_filter(de1, de2) log("\n==== CONCORDANCE (whole-blood pair) ====") log(f"genes tested in both: {summary.n_genes_tested}") log(f"concordant (q<0.05 both + same sign): {summary.n_concordant} " f"(up={summary.n_up}, down={summary.n_down})") keep.to_parquet(GEO_DIR / "concordant_genes.parquet") log("\n==== SANITY-GATE GENES ====") for g in SANITY_GENES: in1 = g in de1.index in2 = g in de2.index row1 = de1.loc[g] if in1 else None row2 = de2.loc[g] if in2 else None concord = "YES" if g in keep.index else "no" d1 = f"lfc={row1['log_fc']:+.2f},q={row1['qvalue']:.1e}" if in1 else "absent" d2 = f"lfc={row2['log_fc']:+.2f},q={row2['qvalue']:.1e}" if in2 else "absent" log(f" {g:7s} | GSE35007 {d1:28s} | GSE16728 {d2:28s} | concordant={concord}") log("\n==== TOP 15 CONCORDANT (by worst-case q) ====") log(keep.head(15).to_string()) if __name__ == "__main__": main()