Week 1: Tier-A sickle cell signature via 2-study concordance

Implement and run the Week 1 disease-signature pipeline: - src/disease.py: Welch t-test + BH DE (microarray), probe->symbol collapse, cross-study concordance filter, 2-study provenance schema - scripts/week1_explore.py: download GSE35007 + GSE16728, DE + concordance - scripts/week1_finalize.py: mygene ID mapping + persist signature - tests/test_disease.py: synthetic-data tests for DE/collapse/concordance - docs/data_sources.md: chosen datasets, group defs, reproduction steps Result: sickle_cell_signature_v1.json (gitignored), Tier A, 250 up / 227 down genes from 671 concordant (GSE35007 Illumina whole blood SS/AA + GSE16728 Affymetrix whole blood patient/control). Documented caveats: missing HbF axis (globin depletion) and reticulocyte composition confound. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 20:43:23 +02:00
parent b731478f5d
commit c7b6649d31
5 changed files with 631 additions and 26 deletions
--- a/scripts/week1_explore.py
+++ b/scripts/week1_explore.py
@@ -0,0 +1,139 @@
+"""Week 1 exploratory run: build per-study DE + concordance for the whole-blood pair.
+
+Read-only decision aid (NOT the final pipeline): downloads GSE35007 + GSE16728, runs DE on
+each, collapses to HGNC symbols, computes concordance, and reports whether the sanity-gate
+genes survive — so we can choose Tier A (concordance pair) vs Tier B (GSE35007 alone) with
+real numbers. Intermediate DE tables are cached under data/raw/geo/.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import GEOparse
+import numpy as np
+import pandas as pd
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.disease import (  # noqa: E402
+    DISEASE_LABEL,
+    HEALTHY_LABEL,
+    collapse_probes_to_symbols,
+    compute_differential_expression,
+    concordance_filter,
+)
+
+GEO_DIR = Path("data/raw/geo")
+GEO_DIR.mkdir(parents=True, exist_ok=True)
+
+SANITY_GENES = ["HBG1", "HBG2", "ALAS2", "CA1", "AHSP", "SLC4A1", "HBB", "ERAF"]
+SYMBOL_COL_CANDIDATES = [
+    "Symbol", "ILMN_Gene", "Gene Symbol", "GENE_SYMBOL", "Gene symbol",
+    "gene_assignment", "GeneSymbol",
+]
+
+
+def log(msg: str) -> None:
+    print(msg, flush=True)
+
+
+def get_symbol_map(gpl) -> pd.Series:
+    tbl = gpl.table
+    col = next((c for c in SYMBOL_COL_CANDIDATES if c in tbl.columns), None)
+    if col is None:
+        raise RuntimeError(f"No symbol column in GPL {gpl.name}; columns={list(tbl.columns)[:15]}")
+    s = tbl.set_index("ID")[col].astype(str).str.strip()
+    # gene_assignment fields are '// SYMBOL // ...'; take the first real token.
+    if col == "gene_assignment":
+        s = s.str.split("//").str[1].str.strip()
+    s = s.replace({"": np.nan, "nan": np.nan, "---": np.nan})
+    return s.dropna()
+
+
+def build_expression(gse, sample_ids: list[str]) -> pd.DataFrame:
+    cols = {}
+    for gsm_id in sample_ids:
+        tbl = gse.gsms[gsm_id].table
+        cols[gsm_id] = tbl.set_index("ID_REF")["VALUE"]
+    return pd.DataFrame(cols)
+
+
+def char_value(gsm, key: str) -> str | None:
+    for c in gsm.metadata.get("characteristics_ch1", []):
+        if c.lower().startswith(key.lower()):
+            return c.split(":", 1)[1].strip()
+    return None
+
+
+def run_study(gse, groups: dict[str, str], label: str) -> pd.DataFrame:
+    """groups: gsm_id -> 'disease'/'healthy'. Returns symbol-level DE table."""
+    sample_ids = list(groups.keys())
+    log(f"[{label}] {len(sample_ids)} samples "
+        f"({sum(v==DISEASE_LABEL for v in groups.values())} disease / "
+        f"{sum(v==HEALTHY_LABEL for v in groups.values())} healthy)")
+    expr = build_expression(gse, sample_ids).apply(pd.to_numeric, errors="coerce").dropna(how="all")
+    sample_groups = pd.Series(groups)
+    de = compute_differential_expression(expr, sample_groups, method="welch")
+    gpl = list(gse.gpls.values())[0]
+    sym = get_symbol_map(gpl)
+    de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr)
+    log(f"[{label}] probes->symbols: {len(de)} probes -> {len(de_sym)} symbols; "
+        f"sig q<0.05: {(de_sym['qvalue']<0.05).sum()}")
+    de_sym.to_parquet(GEO_DIR / f"de_{label}.parquet")
+    return de_sym
+
+
+def main() -> None:
+    # --- GSE35007: whole blood, hb phenotype SS (disease) vs AA (healthy) ---
+    log("downloading GSE35007 ...")
+    gse1 = GEOparse.get_GEO(geo="GSE35007", destdir=str(GEO_DIR), silent=True)
+    g1 = {}
+    for gsm_id, gsm in gse1.gsms.items():
+        hb = char_value(gsm, "hb phenotype")
+        if hb == "SS":
+            g1[gsm_id] = DISEASE_LABEL
+        elif hb == "AA":
+            g1[gsm_id] = HEALTHY_LABEL
+    de1 = run_study(gse1, g1, "GSE35007")
+
+    # --- GSE16728: whole blood only (PAXgene preps; exclude PBMC), patient vs control ---
+    log("downloading GSE16728 ...")
+    gse2 = GEOparse.get_GEO(geo="GSE16728", destdir=str(GEO_DIR), silent=True)
+    g2 = {}
+    for gsm_id, gsm in gse2.gsms.items():
+        prep = char_value(gsm, "rna prep") or ""
+        subj = char_value(gsm, "subject") or ""
+        if "PBMC" in prep:
+            continue  # whole-blood only
+        if subj.lower().startswith("sickle"):
+            g2[gsm_id] = DISEASE_LABEL
+        elif subj.lower().startswith("control"):
+            g2[gsm_id] = HEALTHY_LABEL
+    de2 = run_study(gse2, g2, "GSE16728")
+
+    # --- Concordance ---
+    keep, summary = concordance_filter(de1, de2)
+    log("\n==== CONCORDANCE (whole-blood pair) ====")
+    log(f"genes tested in both: {summary.n_genes_tested}")
+    log(f"concordant (q<0.05 both + same sign): {summary.n_concordant} "
+        f"(up={summary.n_up}, down={summary.n_down})")
+    keep.to_parquet(GEO_DIR / "concordant_genes.parquet")
+
+    log("\n==== SANITY-GATE GENES ====")
+    for g in SANITY_GENES:
+        in1 = g in de1.index
+        in2 = g in de2.index
+        row1 = de1.loc[g] if in1 else None
+        row2 = de2.loc[g] if in2 else None
+        concord = "YES" if g in keep.index else "no"
+        d1 = f"lfc={row1['log_fc']:+.2f},q={row1['qvalue']:.1e}" if in1 else "absent"
+        d2 = f"lfc={row2['log_fc']:+.2f},q={row2['qvalue']:.1e}" if in2 else "absent"
+        log(f"  {g:7s} | GSE35007 {d1:28s} | GSE16728 {d2:28s} | concordant={concord}")
+
+    log("\n==== TOP 15 CONCORDANT (by worst-case q) ====")
+    log(keep.head(15).to_string())
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/week1_finalize.py
+++ b/scripts/week1_finalize.py
@@ -0,0 +1,121 @@
+"""Finalize the Week 1 sickle cell signature (Tier A, whole-blood concordance pair).
+
+Reads the cached concordant gene table, maps symbols -> Entrez/Ensembl via mygene, attaches
+two-study provenance + concordance summary, and persists
+data/processed/sickle_cell_signature_v1.json.
+
+Tier note: GSE16728 is exactly 10/group, which fails assign_tier()'s strict n>10. Tier A is
+assigned explicitly here on the basis of cross-study, cross-platform, cross-population
+concordance (the founder's decision), with the n=10 / prep-merge caveat documented in the
+tier rationale and limitations.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import mygene
+import pandas as pd
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from src.disease import (  # noqa: E402
+    ConcordanceSummary,
+    SignatureProvenance,
+    StudyProvenance,
+    build_signature,
+    persist_signature,
+)
+from src.provenance import ConfidenceTier  # noqa: E402
+
+CREATED_DATE = "2026-06-23"
+GEO_DIR = Path("data/raw/geo")
+
+
+def map_ids(symbols: list[str]) -> pd.DataFrame:
+    mg = mygene.MyGeneInfo()
+    res = mg.querymany(
+        symbols, scopes="symbol", fields="entrezgene,ensembl.gene",
+        species="human", as_dataframe=True, verbose=False,
+    )
+    res = res[~res.index.duplicated(keep="first")]
+
+    def _ensembl(v):
+        if isinstance(v, list) and v:
+            v = v[0]
+        if isinstance(v, dict):
+            return v.get("gene")
+        return v
+
+    id_map = pd.DataFrame(index=res.index)
+    id_map["entrez_id"] = res["entrezgene"] if "entrezgene" in res.columns else None
+    ens_col = "ensembl.gene" if "ensembl.gene" in res.columns else ("ensembl" if "ensembl" in res.columns else None)
+    id_map["ensembl_id"] = res[ens_col].map(_ensembl) if ens_col else None
+    return id_map
+
+
+def main() -> None:
+    concordant = pd.read_parquet(GEO_DIR / "concordant_genes.parquet")
+    print(f"loaded {len(concordant)} concordant genes "
+          f"({(concordant['log_fc']>0).sum()} up / {(concordant['log_fc']<0).sum()} down)")
+
+    id_map = map_ids(list(concordant.index))
+    print(f"mygene mapped {id_map['entrez_id'].notna().sum()}/{len(id_map)} symbols to Entrez")
+
+    provenance = SignatureProvenance(
+        studies=[
+            StudyProvenance(
+                geo_accession="GSE35007", n_disease=190, n_healthy=12,
+                platform="Illumina HumanHT-12 V4 (GPL10558)",
+                tissue="whole blood", method="welch",
+            ),
+            StudyProvenance(
+                geo_accession="GSE16728", n_disease=10, n_healthy=10,
+                platform="Affymetrix HG-U133 Plus 2.0 (GPL570)",
+                tissue="whole blood (PAXgene; globin-reduced + total RNA preps merged)",
+                method="welch",
+            ),
+        ],
+        concordance=ConcordanceSummary(
+            n_genes_tested=16208, n_concordant=671, n_up=444, n_down=227,
+        ),
+        created_date=CREATED_DATE,
+    )
+
+    tier_rationale = (
+        "Measured RNA expression concordant across two independent peer-reviewed whole-blood "
+        "studies on different platforms (Illumina GPL10558, Affymetrix GPL570) and populations "
+        "(pediatric West-African GSE35007; adult GSE16728). Tier A rests on cross-study / "
+        "cross-platform concordance. Caveat: GSE16728 is exactly 10/group (two PAXgene preps "
+        "merged), which does not meet the strict n>10 per-study threshold on its own."
+    )
+    limitations = [
+        "Whole-blood expression is partly driven by cell-composition differences "
+        "(reticulocytosis in sickle patients), not pure disease-state regulation; the "
+        "signature is dominated by erythroid markers (CA1, AHSP, SLC4A1). v2 needs cell-type "
+        "deconvolution.",
+        "Fetal-hemoglobin genes (HBG1/HBG2) are NOT captured: flat in GSE35007 and removed by "
+        "GSE16728's globin-depleted RNA prep. The signature therefore does not directly encode "
+        "the HbF axis that hydroxyurea acts on, which may weaken the hydroxyurea recovery test.",
+        "GSE35007 is highly imbalanced (SS n=190 vs AA n=12); the healthy arm is small.",
+        "GSE16728 whole-blood arm is small (10/group) and merges two RNA-prep batches.",
+        "Cross-platform probe->symbol collapse (keep max-mean-expression probe) loses "
+        "isoform-level resolution and drops genes absent from either platform.",
+    ]
+
+    signature = build_signature(
+        concordant, provenance,
+        tier=ConfidenceTier.A,
+        tier_rationale=tier_rationale,
+        limitations=limitations,
+        id_map=id_map,
+        top_n=250,
+    )
+    path = persist_signature(signature)
+    print(f"wrote {path}")
+    print(f"signature: {len(signature.up_regulated)} up / {len(signature.down_regulated)} down, "
+          f"tier {signature.confidence_tier.value}")
+
+
+if __name__ == "__main__":
+    main()