Implement and run the Week 1 disease-signature pipeline: - src/disease.py: Welch t-test + BH DE (microarray), probe->symbol collapse, cross-study concordance filter, 2-study provenance schema - scripts/week1_explore.py: download GSE35007 + GSE16728, DE + concordance - scripts/week1_finalize.py: mygene ID mapping + persist signature - tests/test_disease.py: synthetic-data tests for DE/collapse/concordance - docs/data_sources.md: chosen datasets, group defs, reproduction steps Result: sickle_cell_signature_v1.json (gitignored), Tier A, 250 up / 227 down genes from 671 concordant (GSE35007 Illumina whole blood SS/AA + GSE16728 Affymetrix whole blood patient/control). Documented caveats: missing HbF axis (globin depletion) and reticulocyte composition confound. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
122 lines
4.8 KiB
Python
122 lines
4.8 KiB
Python
"""Finalize the Week 1 sickle cell signature (Tier A, whole-blood concordance pair).
|
|
|
|
Reads the cached concordant gene table, maps symbols -> Entrez/Ensembl via mygene, attaches
|
|
two-study provenance + concordance summary, and persists
|
|
data/processed/sickle_cell_signature_v1.json.
|
|
|
|
Tier note: GSE16728 is exactly 10/group, which fails assign_tier()'s strict n>10. Tier A is
|
|
assigned explicitly here on the basis of cross-study, cross-platform, cross-population
|
|
concordance (the founder's decision), with the n=10 / prep-merge caveat documented in the
|
|
tier rationale and limitations.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import mygene
|
|
import pandas as pd
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
from src.disease import ( # noqa: E402
|
|
ConcordanceSummary,
|
|
SignatureProvenance,
|
|
StudyProvenance,
|
|
build_signature,
|
|
persist_signature,
|
|
)
|
|
from src.provenance import ConfidenceTier # noqa: E402
|
|
|
|
CREATED_DATE = "2026-06-23"
|
|
GEO_DIR = Path("data/raw/geo")
|
|
|
|
|
|
def map_ids(symbols: list[str]) -> pd.DataFrame:
|
|
mg = mygene.MyGeneInfo()
|
|
res = mg.querymany(
|
|
symbols, scopes="symbol", fields="entrezgene,ensembl.gene",
|
|
species="human", as_dataframe=True, verbose=False,
|
|
)
|
|
res = res[~res.index.duplicated(keep="first")]
|
|
|
|
def _ensembl(v):
|
|
if isinstance(v, list) and v:
|
|
v = v[0]
|
|
if isinstance(v, dict):
|
|
return v.get("gene")
|
|
return v
|
|
|
|
id_map = pd.DataFrame(index=res.index)
|
|
id_map["entrez_id"] = res["entrezgene"] if "entrezgene" in res.columns else None
|
|
ens_col = "ensembl.gene" if "ensembl.gene" in res.columns else ("ensembl" if "ensembl" in res.columns else None)
|
|
id_map["ensembl_id"] = res[ens_col].map(_ensembl) if ens_col else None
|
|
return id_map
|
|
|
|
|
|
def main() -> None:
|
|
concordant = pd.read_parquet(GEO_DIR / "concordant_genes.parquet")
|
|
print(f"loaded {len(concordant)} concordant genes "
|
|
f"({(concordant['log_fc']>0).sum()} up / {(concordant['log_fc']<0).sum()} down)")
|
|
|
|
id_map = map_ids(list(concordant.index))
|
|
print(f"mygene mapped {id_map['entrez_id'].notna().sum()}/{len(id_map)} symbols to Entrez")
|
|
|
|
provenance = SignatureProvenance(
|
|
studies=[
|
|
StudyProvenance(
|
|
geo_accession="GSE35007", n_disease=190, n_healthy=12,
|
|
platform="Illumina HumanHT-12 V4 (GPL10558)",
|
|
tissue="whole blood", method="welch",
|
|
),
|
|
StudyProvenance(
|
|
geo_accession="GSE16728", n_disease=10, n_healthy=10,
|
|
platform="Affymetrix HG-U133 Plus 2.0 (GPL570)",
|
|
tissue="whole blood (PAXgene; globin-reduced + total RNA preps merged)",
|
|
method="welch",
|
|
),
|
|
],
|
|
concordance=ConcordanceSummary(
|
|
n_genes_tested=16208, n_concordant=671, n_up=444, n_down=227,
|
|
),
|
|
created_date=CREATED_DATE,
|
|
)
|
|
|
|
tier_rationale = (
|
|
"Measured RNA expression concordant across two independent peer-reviewed whole-blood "
|
|
"studies on different platforms (Illumina GPL10558, Affymetrix GPL570) and populations "
|
|
"(pediatric West-African GSE35007; adult GSE16728). Tier A rests on cross-study / "
|
|
"cross-platform concordance. Caveat: GSE16728 is exactly 10/group (two PAXgene preps "
|
|
"merged), which does not meet the strict n>10 per-study threshold on its own."
|
|
)
|
|
limitations = [
|
|
"Whole-blood expression is partly driven by cell-composition differences "
|
|
"(reticulocytosis in sickle patients), not pure disease-state regulation; the "
|
|
"signature is dominated by erythroid markers (CA1, AHSP, SLC4A1). v2 needs cell-type "
|
|
"deconvolution.",
|
|
"Fetal-hemoglobin genes (HBG1/HBG2) are NOT captured: flat in GSE35007 and removed by "
|
|
"GSE16728's globin-depleted RNA prep. The signature therefore does not directly encode "
|
|
"the HbF axis that hydroxyurea acts on, which may weaken the hydroxyurea recovery test.",
|
|
"GSE35007 is highly imbalanced (SS n=190 vs AA n=12); the healthy arm is small.",
|
|
"GSE16728 whole-blood arm is small (10/group) and merges two RNA-prep batches.",
|
|
"Cross-platform probe->symbol collapse (keep max-mean-expression probe) loses "
|
|
"isoform-level resolution and drops genes absent from either platform.",
|
|
]
|
|
|
|
signature = build_signature(
|
|
concordant, provenance,
|
|
tier=ConfidenceTier.A,
|
|
tier_rationale=tier_rationale,
|
|
limitations=limitations,
|
|
id_map=id_map,
|
|
top_n=250,
|
|
)
|
|
path = persist_signature(signature)
|
|
print(f"wrote {path}")
|
|
print(f"signature: {len(signature.up_regulated)} up / {len(signature.down_regulated)} down, "
|
|
f"tier {signature.confidence_tier.value}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|