"""Finalize the Week 1 sickle cell signature (Tier A, whole-blood concordance pair). Reads the cached concordant gene table, maps symbols -> Entrez/Ensembl via mygene, attaches two-study provenance + concordance summary, and persists data/processed/sickle_cell_signature_v1.json. Tier note: GSE16728 is exactly 10/group, which fails assign_tier()'s strict n>10. Tier A is assigned explicitly here on the basis of cross-study, cross-platform, cross-population concordance (the founder's decision), with the n=10 / prep-merge caveat documented in the tier rationale and limitations. """ from __future__ import annotations import sys from pathlib import Path import mygene import pandas as pd sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.disease import ( # noqa: E402 ConcordanceSummary, SignatureProvenance, StudyProvenance, build_signature, persist_signature, ) from src.provenance import ConfidenceTier # noqa: E402 CREATED_DATE = "2026-06-23" GEO_DIR = Path("data/raw/geo") def map_ids(symbols: list[str]) -> pd.DataFrame: mg = mygene.MyGeneInfo() res = mg.querymany( symbols, scopes="symbol", fields="entrezgene,ensembl.gene", species="human", as_dataframe=True, verbose=False, ) res = res[~res.index.duplicated(keep="first")] def _ensembl(v): if isinstance(v, list) and v: v = v[0] if isinstance(v, dict): return v.get("gene") return v id_map = pd.DataFrame(index=res.index) id_map["entrez_id"] = res["entrezgene"] if "entrezgene" in res.columns else None ens_col = "ensembl.gene" if "ensembl.gene" in res.columns else ("ensembl" if "ensembl" in res.columns else None) id_map["ensembl_id"] = res[ens_col].map(_ensembl) if ens_col else None return id_map def main() -> None: concordant = pd.read_parquet(GEO_DIR / "concordant_genes.parquet") print(f"loaded {len(concordant)} concordant genes " f"({(concordant['log_fc']>0).sum()} up / {(concordant['log_fc']<0).sum()} down)") id_map = map_ids(list(concordant.index)) print(f"mygene mapped {id_map['entrez_id'].notna().sum()}/{len(id_map)} symbols to Entrez") provenance = SignatureProvenance( studies=[ StudyProvenance( geo_accession="GSE35007", n_disease=190, n_healthy=12, platform="Illumina HumanHT-12 V4 (GPL10558)", tissue="whole blood", method="welch", ), StudyProvenance( geo_accession="GSE16728", n_disease=10, n_healthy=10, platform="Affymetrix HG-U133 Plus 2.0 (GPL570)", tissue="whole blood (PAXgene; globin-reduced + total RNA preps merged)", method="welch", ), ], concordance=ConcordanceSummary( n_genes_tested=16208, n_concordant=671, n_up=444, n_down=227, ), created_date=CREATED_DATE, ) tier_rationale = ( "Measured RNA expression concordant across two independent peer-reviewed whole-blood " "studies on different platforms (Illumina GPL10558, Affymetrix GPL570) and populations " "(pediatric West-African GSE35007; adult GSE16728). Tier A rests on cross-study / " "cross-platform concordance. Caveat: GSE16728 is exactly 10/group (two PAXgene preps " "merged), which does not meet the strict n>10 per-study threshold on its own." ) limitations = [ "Whole-blood expression is partly driven by cell-composition differences " "(reticulocytosis in sickle patients), not pure disease-state regulation; the " "signature is dominated by erythroid markers (CA1, AHSP, SLC4A1). v2 needs cell-type " "deconvolution.", "Fetal-hemoglobin genes (HBG1/HBG2) are NOT captured: flat in GSE35007 and removed by " "GSE16728's globin-depleted RNA prep. The signature therefore does not directly encode " "the HbF axis that hydroxyurea acts on, which may weaken the hydroxyurea recovery test.", "GSE35007 is highly imbalanced (SS n=190 vs AA n=12); the healthy arm is small.", "GSE16728 whole-blood arm is small (10/group) and merges two RNA-prep batches.", "Cross-platform probe->symbol collapse (keep max-mean-expression probe) loses " "isoform-level resolution and drops genes absent from either platform.", ] signature = build_signature( concordant, provenance, tier=ConfidenceTier.A, tier_rationale=tier_rationale, limitations=limitations, id_map=id_map, top_n=250, ) path = persist_signature(signature) print(f"wrote {path}") print(f"signature: {len(signature.up_regulated)} up / {len(signature.down_regulated)} down, " f"tier {signature.confidence_tier.value}") if __name__ == "__main__": main()