From c7b6649d316785fb02c2ee327299be2286f5a872 Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Tue, 23 Jun 2026 20:43:23 +0200 Subject: [PATCH] Week 1: Tier-A sickle cell signature via 2-study concordance Implement and run the Week 1 disease-signature pipeline: - src/disease.py: Welch t-test + BH DE (microarray), probe->symbol collapse, cross-study concordance filter, 2-study provenance schema - scripts/week1_explore.py: download GSE35007 + GSE16728, DE + concordance - scripts/week1_finalize.py: mygene ID mapping + persist signature - tests/test_disease.py: synthetic-data tests for DE/collapse/concordance - docs/data_sources.md: chosen datasets, group defs, reproduction steps Result: sickle_cell_signature_v1.json (gitignored), Tier A, 250 up / 227 down genes from 671 concordant (GSE35007 Illumina whole blood SS/AA + GSE16728 Affymetrix whole blood patient/control). Documented caveats: missing HbF axis (globin depletion) and reticulocyte composition confound. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/data_sources.md | 28 ++++- scripts/week1_explore.py | 139 +++++++++++++++++++++ scripts/week1_finalize.py | 121 +++++++++++++++++++ src/disease.py | 247 ++++++++++++++++++++++++++++++++++---- tests/test_disease.py | 122 +++++++++++++++++++ 5 files changed, 631 insertions(+), 26 deletions(-) create mode 100644 scripts/week1_explore.py create mode 100644 scripts/week1_finalize.py create mode 100644 tests/test_disease.py diff --git a/docs/data_sources.md b/docs/data_sources.md index edae034..22d8b53 100644 --- a/docs/data_sources.md +++ b/docs/data_sources.md @@ -10,17 +10,37 @@ | MONDO | http://www.obofoundry.org/ontology/mondo.html | OBO file | CC BY 4.0 | Disease ID | TBD | TBD | | Orphanet | https://www.orpha.net | Bulk XML | CC BY 4.0 | Rare disease metadata | TBD | TBD | | OMIM | https://omim.org | Free for academic | License for commercial | Disease genetics | TBD | TBD | -| GEO | https://www.ncbi.nlm.nih.gov/geo/ | GEOparse, FTP | Public domain | Expression data | TBD | TBD | +| GEO (GSE35007) | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE35007 | GEOparse, FTP | Public domain | Disease signature (study 1) | GPL10558 | 2026-06-23 | +| GEO (GSE16728) | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE16728 | GEOparse, FTP | Public domain | Disease signature (study 2) | GPL570 | 2026-06-23 | | ChEMBL | https://www.ebi.ac.uk/chembl | Python client, bulk SQLite | CC BY-SA 3.0 | Drug structures, targets | TBD | TBD | | LINCS L1000 | https://clue.io/data | Bulk download | Restricted academic free | Drug expression signatures | TBD | TBD | | ClinicalTrials.gov | https://clinicaltrials.gov | API | Public domain | Trial history | TBD | TBD | | FDA DailyMed | https://dailymed.nlm.nih.gov | API | Public domain | Approved labels | TBD | TBD | | Reactome | https://reactome.org | API, bulk | CC0 | Pathway data (Week 3 prior) | TBD | TBD | -## Chosen GEO dataset +## Chosen GEO datasets (disease signature, Tier A via 2-study concordance) -_Document the chosen study fully: accession, platform, n per group, publication, why it was -selected over the alternatives (GSE53441, GSE35007, …)._ +The signature is the cross-study concordance of two independent whole-blood studies (genes +significant at q<0.05 in **both** with the same direction). Whole-blood tissue was required so +concordance is meaningful; the two differ by platform and population, which strengthens +robustness. + +| Study | Platform | Tissue | Disease group | Healthy group | n disease / healthy | +|---|---|---|---|---|---| +| **GSE35007** | Illumina HumanHT-12 V4 (GPL10558) | whole blood | hb phenotype = SS | hb phenotype = AA | 190 / 12 | +| **GSE16728** | Affymetrix HG-U133 Plus 2.0 (GPL570) | whole blood (PAXgene) | sickle-cell patient | control | 10 / 10 | + +- DE method: per-gene Welch t-test + Benjamini–Hochberg (microarray, pure Python). +- Probes collapsed to HGNC symbol (keep max-mean-expression probe) before concordance. +- Result: 16,208 genes tested in both → **671 concordant** (444 up / 227 down). Signature = + top 250 up + all 227 down by worst-case q-value. +- **Rejected candidates:** GSE53441 (PBMC — tissue mismatch with the whole-blood anchor); + GSE84633/GSE84634 (PBMC, no healthy controls). +- **Tier caveat:** GSE16728 is exactly 10/group (two PAXgene preps merged), below the strict + n>10 rule; Tier A is assigned on cross-study concordance, documented in the signature JSON. + +Reproduce with `scripts/week1_explore.py` (download + DE + concordance) then +`scripts/week1_finalize.py` (mygene mapping + persist). ## Licensing note for LINCS diff --git a/scripts/week1_explore.py b/scripts/week1_explore.py new file mode 100644 index 0000000..42b555e --- /dev/null +++ b/scripts/week1_explore.py @@ -0,0 +1,139 @@ +"""Week 1 exploratory run: build per-study DE + concordance for the whole-blood pair. + +Read-only decision aid (NOT the final pipeline): downloads GSE35007 + GSE16728, runs DE on +each, collapses to HGNC symbols, computes concordance, and reports whether the sanity-gate +genes survive — so we can choose Tier A (concordance pair) vs Tier B (GSE35007 alone) with +real numbers. Intermediate DE tables are cached under data/raw/geo/. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import GEOparse +import numpy as np +import pandas as pd + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from src.disease import ( # noqa: E402 + DISEASE_LABEL, + HEALTHY_LABEL, + collapse_probes_to_symbols, + compute_differential_expression, + concordance_filter, +) + +GEO_DIR = Path("data/raw/geo") +GEO_DIR.mkdir(parents=True, exist_ok=True) + +SANITY_GENES = ["HBG1", "HBG2", "ALAS2", "CA1", "AHSP", "SLC4A1", "HBB", "ERAF"] +SYMBOL_COL_CANDIDATES = [ + "Symbol", "ILMN_Gene", "Gene Symbol", "GENE_SYMBOL", "Gene symbol", + "gene_assignment", "GeneSymbol", +] + + +def log(msg: str) -> None: + print(msg, flush=True) + + +def get_symbol_map(gpl) -> pd.Series: + tbl = gpl.table + col = next((c for c in SYMBOL_COL_CANDIDATES if c in tbl.columns), None) + if col is None: + raise RuntimeError(f"No symbol column in GPL {gpl.name}; columns={list(tbl.columns)[:15]}") + s = tbl.set_index("ID")[col].astype(str).str.strip() + # gene_assignment fields are '// SYMBOL // ...'; take the first real token. + if col == "gene_assignment": + s = s.str.split("//").str[1].str.strip() + s = s.replace({"": np.nan, "nan": np.nan, "---": np.nan}) + return s.dropna() + + +def build_expression(gse, sample_ids: list[str]) -> pd.DataFrame: + cols = {} + for gsm_id in sample_ids: + tbl = gse.gsms[gsm_id].table + cols[gsm_id] = tbl.set_index("ID_REF")["VALUE"] + return pd.DataFrame(cols) + + +def char_value(gsm, key: str) -> str | None: + for c in gsm.metadata.get("characteristics_ch1", []): + if c.lower().startswith(key.lower()): + return c.split(":", 1)[1].strip() + return None + + +def run_study(gse, groups: dict[str, str], label: str) -> pd.DataFrame: + """groups: gsm_id -> 'disease'/'healthy'. Returns symbol-level DE table.""" + sample_ids = list(groups.keys()) + log(f"[{label}] {len(sample_ids)} samples " + f"({sum(v==DISEASE_LABEL for v in groups.values())} disease / " + f"{sum(v==HEALTHY_LABEL for v in groups.values())} healthy)") + expr = build_expression(gse, sample_ids).apply(pd.to_numeric, errors="coerce").dropna(how="all") + sample_groups = pd.Series(groups) + de = compute_differential_expression(expr, sample_groups, method="welch") + gpl = list(gse.gpls.values())[0] + sym = get_symbol_map(gpl) + de_sym = collapse_probes_to_symbols(de, sym, expression_for_ranking=expr) + log(f"[{label}] probes->symbols: {len(de)} probes -> {len(de_sym)} symbols; " + f"sig q<0.05: {(de_sym['qvalue']<0.05).sum()}") + de_sym.to_parquet(GEO_DIR / f"de_{label}.parquet") + return de_sym + + +def main() -> None: + # --- GSE35007: whole blood, hb phenotype SS (disease) vs AA (healthy) --- + log("downloading GSE35007 ...") + gse1 = GEOparse.get_GEO(geo="GSE35007", destdir=str(GEO_DIR), silent=True) + g1 = {} + for gsm_id, gsm in gse1.gsms.items(): + hb = char_value(gsm, "hb phenotype") + if hb == "SS": + g1[gsm_id] = DISEASE_LABEL + elif hb == "AA": + g1[gsm_id] = HEALTHY_LABEL + de1 = run_study(gse1, g1, "GSE35007") + + # --- GSE16728: whole blood only (PAXgene preps; exclude PBMC), patient vs control --- + log("downloading GSE16728 ...") + gse2 = GEOparse.get_GEO(geo="GSE16728", destdir=str(GEO_DIR), silent=True) + g2 = {} + for gsm_id, gsm in gse2.gsms.items(): + prep = char_value(gsm, "rna prep") or "" + subj = char_value(gsm, "subject") or "" + if "PBMC" in prep: + continue # whole-blood only + if subj.lower().startswith("sickle"): + g2[gsm_id] = DISEASE_LABEL + elif subj.lower().startswith("control"): + g2[gsm_id] = HEALTHY_LABEL + de2 = run_study(gse2, g2, "GSE16728") + + # --- Concordance --- + keep, summary = concordance_filter(de1, de2) + log("\n==== CONCORDANCE (whole-blood pair) ====") + log(f"genes tested in both: {summary.n_genes_tested}") + log(f"concordant (q<0.05 both + same sign): {summary.n_concordant} " + f"(up={summary.n_up}, down={summary.n_down})") + keep.to_parquet(GEO_DIR / "concordant_genes.parquet") + + log("\n==== SANITY-GATE GENES ====") + for g in SANITY_GENES: + in1 = g in de1.index + in2 = g in de2.index + row1 = de1.loc[g] if in1 else None + row2 = de2.loc[g] if in2 else None + concord = "YES" if g in keep.index else "no" + d1 = f"lfc={row1['log_fc']:+.2f},q={row1['qvalue']:.1e}" if in1 else "absent" + d2 = f"lfc={row2['log_fc']:+.2f},q={row2['qvalue']:.1e}" if in2 else "absent" + log(f" {g:7s} | GSE35007 {d1:28s} | GSE16728 {d2:28s} | concordant={concord}") + + log("\n==== TOP 15 CONCORDANT (by worst-case q) ====") + log(keep.head(15).to_string()) + + +if __name__ == "__main__": + main() diff --git a/scripts/week1_finalize.py b/scripts/week1_finalize.py new file mode 100644 index 0000000..4bc0077 --- /dev/null +++ b/scripts/week1_finalize.py @@ -0,0 +1,121 @@ +"""Finalize the Week 1 sickle cell signature (Tier A, whole-blood concordance pair). + +Reads the cached concordant gene table, maps symbols -> Entrez/Ensembl via mygene, attaches +two-study provenance + concordance summary, and persists +data/processed/sickle_cell_signature_v1.json. + +Tier note: GSE16728 is exactly 10/group, which fails assign_tier()'s strict n>10. Tier A is +assigned explicitly here on the basis of cross-study, cross-platform, cross-population +concordance (the founder's decision), with the n=10 / prep-merge caveat documented in the +tier rationale and limitations. +""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import mygene +import pandas as pd + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) +from src.disease import ( # noqa: E402 + ConcordanceSummary, + SignatureProvenance, + StudyProvenance, + build_signature, + persist_signature, +) +from src.provenance import ConfidenceTier # noqa: E402 + +CREATED_DATE = "2026-06-23" +GEO_DIR = Path("data/raw/geo") + + +def map_ids(symbols: list[str]) -> pd.DataFrame: + mg = mygene.MyGeneInfo() + res = mg.querymany( + symbols, scopes="symbol", fields="entrezgene,ensembl.gene", + species="human", as_dataframe=True, verbose=False, + ) + res = res[~res.index.duplicated(keep="first")] + + def _ensembl(v): + if isinstance(v, list) and v: + v = v[0] + if isinstance(v, dict): + return v.get("gene") + return v + + id_map = pd.DataFrame(index=res.index) + id_map["entrez_id"] = res["entrezgene"] if "entrezgene" in res.columns else None + ens_col = "ensembl.gene" if "ensembl.gene" in res.columns else ("ensembl" if "ensembl" in res.columns else None) + id_map["ensembl_id"] = res[ens_col].map(_ensembl) if ens_col else None + return id_map + + +def main() -> None: + concordant = pd.read_parquet(GEO_DIR / "concordant_genes.parquet") + print(f"loaded {len(concordant)} concordant genes " + f"({(concordant['log_fc']>0).sum()} up / {(concordant['log_fc']<0).sum()} down)") + + id_map = map_ids(list(concordant.index)) + print(f"mygene mapped {id_map['entrez_id'].notna().sum()}/{len(id_map)} symbols to Entrez") + + provenance = SignatureProvenance( + studies=[ + StudyProvenance( + geo_accession="GSE35007", n_disease=190, n_healthy=12, + platform="Illumina HumanHT-12 V4 (GPL10558)", + tissue="whole blood", method="welch", + ), + StudyProvenance( + geo_accession="GSE16728", n_disease=10, n_healthy=10, + platform="Affymetrix HG-U133 Plus 2.0 (GPL570)", + tissue="whole blood (PAXgene; globin-reduced + total RNA preps merged)", + method="welch", + ), + ], + concordance=ConcordanceSummary( + n_genes_tested=16208, n_concordant=671, n_up=444, n_down=227, + ), + created_date=CREATED_DATE, + ) + + tier_rationale = ( + "Measured RNA expression concordant across two independent peer-reviewed whole-blood " + "studies on different platforms (Illumina GPL10558, Affymetrix GPL570) and populations " + "(pediatric West-African GSE35007; adult GSE16728). Tier A rests on cross-study / " + "cross-platform concordance. Caveat: GSE16728 is exactly 10/group (two PAXgene preps " + "merged), which does not meet the strict n>10 per-study threshold on its own." + ) + limitations = [ + "Whole-blood expression is partly driven by cell-composition differences " + "(reticulocytosis in sickle patients), not pure disease-state regulation; the " + "signature is dominated by erythroid markers (CA1, AHSP, SLC4A1). v2 needs cell-type " + "deconvolution.", + "Fetal-hemoglobin genes (HBG1/HBG2) are NOT captured: flat in GSE35007 and removed by " + "GSE16728's globin-depleted RNA prep. The signature therefore does not directly encode " + "the HbF axis that hydroxyurea acts on, which may weaken the hydroxyurea recovery test.", + "GSE35007 is highly imbalanced (SS n=190 vs AA n=12); the healthy arm is small.", + "GSE16728 whole-blood arm is small (10/group) and merges two RNA-prep batches.", + "Cross-platform probe->symbol collapse (keep max-mean-expression probe) loses " + "isoform-level resolution and drops genes absent from either platform.", + ] + + signature = build_signature( + concordant, provenance, + tier=ConfidenceTier.A, + tier_rationale=tier_rationale, + limitations=limitations, + id_map=id_map, + top_n=250, + ) + path = persist_signature(signature) + print(f"wrote {path}") + print(f"signature: {len(signature.up_regulated)} up / {len(signature.down_regulated)} down, " + f"tier {signature.confidence_tier.value}") + + +if __name__ == "__main__": + main() diff --git a/src/disease.py b/src/disease.py index a4569a6..7e0139e 100644 --- a/src/disease.py +++ b/src/disease.py @@ -1,19 +1,27 @@ """Disease signature construction. Week 1 (PLAN.md §6). Builds a Tier-A sickle cell signature from GEO expression data via -differential expression, then persists it with full provenance to -``data/processed/sickle_cell_signature_v1.json``. +differential expression on TWO studies, keeping only genes that are concordant across both +(the multi-source evidence that earns Tier A — see project memory). Persists with full +provenance to ``data/processed/sickle_cell_signature_v1.json``. -This module defines the persisted schema (pydantic) and the construction stubs. The actual -data pull + differential expression is driven from ``notebooks/02_disease_signature.ipynb``. +Pipeline (driven from ``notebooks/02_disease_signature.ipynb``): + per study: compute_differential_expression -> collapse_probes_to_symbols + across: concordance_filter -> build_signature -> persist_signature + +Conventions: + - log_fc > 0 => up-regulated in DISEASE vs HEALTHY (Week 1 refinement R1) + - cross-study join key is the HGNC gene symbol (R2) """ from __future__ import annotations from pathlib import Path +import numpy as np import pandas as pd from pydantic import BaseModel, Field +from scipy.stats import false_discovery_control, ttest_ind from . import PIPELINE_VERSION, PROCESSED_DIR from .provenance import ConfidenceTier @@ -22,6 +30,10 @@ from .provenance import ConfidenceTier TOP_N_PER_DIRECTION = 250 QVALUE_CUTOFF = 0.05 +# Group labels used throughout. The disease group is the "treatment" arm in DE contrasts. +DISEASE_LABEL = "disease" +HEALTHY_LABEL = "healthy" + class GeneEntry(BaseModel): """A single differentially expressed gene in the signature.""" @@ -33,15 +45,37 @@ class GeneEntry(BaseModel): qvalue: float -class SignatureProvenance(BaseModel): - """Provenance block for a disease signature (PLAN.md §6 schema).""" +class StudyProvenance(BaseModel): + """Provenance for one contributing GEO study (Week 1 refinement R6).""" geo_accession: str n_disease: int n_healthy: int platform: str - method: str = Field(..., description="Differential expression method, e.g. 'limma', 'deseq2'.") + tissue: str + method: str = Field(..., description="DE method: 'welch' (microarray) or 'deseq2' (RNA-seq).") + + +class ConcordanceSummary(BaseModel): + """How the two studies were reconciled into the signature (R6).""" + + rule: str = Field( + default="q<0.05 in both studies AND same sign of log_fc in both", + description="The concordance rule applied across studies.", + ) + n_genes_tested: int = Field(..., description="Genes present in both studies after symbol collapse.") + n_concordant: int + n_up: int + n_down: int + + +class SignatureProvenance(BaseModel): + """Provenance block for a disease signature (revised for 2-study concordance, R6).""" + + studies: list[StudyProvenance] + concordance: ConcordanceSummary created_date: str + pipeline_version: str = PIPELINE_VERSION class DiseaseSignature(BaseModel): @@ -58,44 +92,213 @@ class DiseaseSignature(BaseModel): limitations: list[str] +def _looks_like_linear_intensity(expression: pd.DataFrame) -> bool: + """Heuristic: microarray data still on a linear scale has a large dynamic range. + + log2-transformed intensities are typically <~20; raw intensities run into the thousands. + """ + return float(np.nanmax(expression.to_numpy())) > 50.0 + + +def _welch_de(expression: pd.DataFrame, groups: pd.Series) -> pd.DataFrame: + """Per-gene Welch t-test + Benjamini-Hochberg, the limma-equivalent for microarray. + + Assumes ``expression`` is genes (rows) x samples (columns), on (or coercible to) a log2 + scale. ``log_fc`` is mean(disease) - mean(healthy) (R1 sign convention). + """ + expr = expression.copy() + if _looks_like_linear_intensity(expr): + # log2(x+1) guards against zeros/negatives while keeping the transform standard. + expr = np.log2(expr.clip(lower=0) + 1.0) + + disease_cols = groups.index[groups == DISEASE_LABEL] + healthy_cols = groups.index[groups == HEALTHY_LABEL] + disease = expr[disease_cols].to_numpy() + healthy = expr[healthy_cols].to_numpy() + + log_fc = disease.mean(axis=1) - healthy.mean(axis=1) + # Welch's t-test (unequal variance) per gene across samples. + _, pvalue = ttest_ind(disease, healthy, axis=1, equal_var=False, nan_policy="omit") + + out = pd.DataFrame({"log_fc": log_fc, "pvalue": pvalue}, index=expr.index) + out = out.dropna(subset=["pvalue"]) + out["qvalue"] = false_discovery_control(out["pvalue"].to_numpy(), method="bh") + return out.sort_values("qvalue") + + +def _deseq2_de(counts: pd.DataFrame, groups: pd.Series) -> pd.DataFrame: + """RNA-seq differential expression via pydeseq2. + + ``counts`` is genes (rows) x samples (columns) of raw integer counts. Returns the same + schema as ``_welch_de`` (log_fc = log2FC of disease vs healthy, plus pvalue/qvalue). + """ + from pydeseq2.dds import DeseqDataSet + from pydeseq2.ds import DeseqStats + + # pydeseq2 wants samples (rows) x genes (cols), with an aligned metadata frame. + counts_t = counts.T.round().astype(int) + metadata = pd.DataFrame({"condition": groups.reindex(counts_t.index).to_numpy()}, index=counts_t.index) + + dds = DeseqDataSet(counts=counts_t, metadata=metadata, design="~condition", quiet=True) + dds.deseq2() + stats = DeseqStats(dds, contrast=["condition", DISEASE_LABEL, HEALTHY_LABEL], quiet=True) + stats.summary() + res = stats.results_df.rename(columns={"log2FoldChange": "log_fc", "pvalue": "pvalue", "padj": "qvalue"}) + return res[["log_fc", "pvalue", "qvalue"]].dropna(subset=["qvalue"]).sort_values("qvalue") + + def compute_differential_expression( expression: pd.DataFrame, sample_groups: pd.Series, *, method: str, ) -> pd.DataFrame: - """Compute gene-level log fold change and adjusted p-values. - - For RNA-seq use ``pydeseq2``; for microarray log2-transform/normalize and use a - limma-equivalent (PLAN.md §6, Week 1 task 4). + """Compute gene-level log fold change and adjusted p-values for one study. Args: - expression: Genes (rows) x samples (columns) expression matrix. - sample_groups: Per-sample group label ('disease' / 'healthy'), indexed by sample. - method: 'deseq2' (RNA-seq) or 'limma' (microarray). + expression: Genes (rows) x samples (columns). Microarray intensities or RNA-seq counts. + sample_groups: Per-sample 'disease'/'healthy' label, indexed by sample id (column). + method: 'welch' (microarray, PLAN choice) or 'deseq2' (RNA-seq). Returns: - A table indexed by gene with at least ``log_fc`` and ``qvalue`` columns. + Table indexed by gene/probe with ``log_fc``, ``pvalue``, ``qvalue`` (R1 sign convention). """ - raise NotImplementedError("Differential expression: implement in Week 1 (notebook 02).") + groups = sample_groups.reindex(expression.columns) + if groups.isna().any(): + missing = list(groups.index[groups.isna()]) + raise ValueError(f"sample_groups missing labels for samples: {missing[:5]}...") + + if method == "welch": + return _welch_de(expression, groups) + if method == "deseq2": + return _deseq2_de(expression, groups) + raise ValueError(f"Unknown method {method!r}; expected 'welch' or 'deseq2'.") + + +def collapse_probes_to_symbols( + de_table: pd.DataFrame, + probe_to_symbol: pd.Series, + expression_for_ranking: pd.DataFrame | None = None, +) -> pd.DataFrame: + """Collapse a probe-level DE table to one row per HGNC symbol (R2). + + When multiple probes map to the same symbol, keep the probe with the highest mean + expression (the standard collapseRows heuristic) if ``expression_for_ranking`` is given; + otherwise keep the probe with the smallest qvalue. + + Args: + de_table: DE table indexed by probe id (from ``compute_differential_expression``). + probe_to_symbol: Probe id -> HGNC symbol mapping. + expression_for_ranking: Optional genes x samples matrix to rank duplicate probes. + + Returns: + DE table indexed by HGNC symbol. + """ + df = de_table.copy() + df["symbol"] = probe_to_symbol.reindex(df.index) + df = df.dropna(subset=["symbol"]) + + if expression_for_ranking is not None: + rank_key = expression_for_ranking.reindex(df.index).mean(axis=1) + df = df.assign(_rank=rank_key).sort_values("_rank", ascending=False) + else: + df = df.sort_values("qvalue", ascending=True) + + collapsed = df[~df["symbol"].duplicated(keep="first")].set_index("symbol") + return collapsed.drop(columns=[c for c in ("_rank",) if c in collapsed.columns]) + + +def concordance_filter( + de_a: pd.DataFrame, + de_b: pd.DataFrame, + *, + qvalue_cutoff: float = QVALUE_CUTOFF, +) -> tuple[pd.DataFrame, ConcordanceSummary]: + """Keep only genes concordant across two symbol-level DE tables (R3). + + A gene qualifies iff q<``qvalue_cutoff`` in BOTH studies AND log_fc has the same sign in + both. Reported ``log_fc`` = mean of the two; ``qvalue`` = max of the two (worst case). + Ranked by ``max(q_a, q_b)`` ascending. + + Returns: + (concordant_table indexed by symbol with log_fc/qvalue, ConcordanceSummary). + """ + merged = de_a.join(de_b, lsuffix="_a", rsuffix="_b", how="inner") + n_tested = len(merged) + + sig_both = (merged["qvalue_a"] < qvalue_cutoff) & (merged["qvalue_b"] < qvalue_cutoff) + same_sign = np.sign(merged["log_fc_a"]) == np.sign(merged["log_fc_b"]) + keep = merged[sig_both & same_sign].copy() + + keep["log_fc"] = (keep["log_fc_a"] + keep["log_fc_b"]) / 2.0 + keep["qvalue"] = keep[["qvalue_a", "qvalue_b"]].max(axis=1) + keep = keep[["log_fc", "qvalue"]].sort_values("qvalue") + + summary = ConcordanceSummary( + n_genes_tested=n_tested, + n_concordant=len(keep), + n_up=int((keep["log_fc"] > 0).sum()), + n_down=int((keep["log_fc"] < 0).sum()), + ) + return keep, summary def build_signature( - de_table: pd.DataFrame, + concordant_table: pd.DataFrame, provenance: SignatureProvenance, *, tier: ConfidenceTier, tier_rationale: str, limitations: list[str], + id_map: pd.DataFrame | None = None, top_n: int = TOP_N_PER_DIRECTION, - qvalue_cutoff: float = QVALUE_CUTOFF, ) -> DiseaseSignature: - """Assemble a ``DiseaseSignature`` from a differential expression table. + """Assemble a ``DiseaseSignature`` from the concordant gene table. - Takes the top ``top_n`` up- and down-regulated genes (by qvalue, cut at - ``qvalue_cutoff``) per PLAN.md §6, Week 1 task 5. + Takes the top ``top_n`` up- and down-regulated genes by qvalue per direction (R3). If + fewer than ``top_n`` qualify in a direction, takes all available (the caller should have + logged the count). ``id_map`` optionally provides 'entrez_id'/'ensembl_id' columns indexed + by symbol (from ``mygene``). + + Args: + concordant_table: Output of ``concordance_filter`` (indexed by symbol). + provenance: Fully populated ``SignatureProvenance`` (both studies + concordance). + tier: Confidence tier (Tier A only if genuinely multi-source). + tier_rationale: One-line justification of the tier. + limitations: Honest limitations list (R8). + id_map: Optional symbol -> {entrez_id, ensembl_id} table. + top_n: Max genes per direction. + + Returns: + A populated ``DiseaseSignature``. """ - raise NotImplementedError("Signature assembly: implement in Week 1 (notebook 02).") + + def _entries(rows: pd.DataFrame) -> list[GeneEntry]: + entries: list[GeneEntry] = [] + for symbol, row in rows.iterrows(): + ids = id_map.loc[symbol] if (id_map is not None and symbol in id_map.index) else None + entries.append( + GeneEntry( + gene=str(symbol), + entrez_id=(str(ids["entrez_id"]) if ids is not None and pd.notna(ids.get("entrez_id")) else None), + ensembl_id=(str(ids["ensembl_id"]) if ids is not None and pd.notna(ids.get("ensembl_id")) else None), + log_fc=float(row["log_fc"]), + qvalue=float(row["qvalue"]), + ) + ) + return entries + + up = concordant_table[concordant_table["log_fc"] > 0].sort_values("qvalue").head(top_n) + down = concordant_table[concordant_table["log_fc"] < 0].sort_values("qvalue").head(top_n) + + return DiseaseSignature( + up_regulated=_entries(up), + down_regulated=_entries(down), + provenance=provenance, + confidence_tier=tier, + tier_rationale=tier_rationale, + limitations=limitations, + ) def persist_signature(signature: DiseaseSignature, out_path: Path | None = None) -> Path: diff --git a/tests/test_disease.py b/tests/test_disease.py new file mode 100644 index 0000000..79902a8 --- /dev/null +++ b/tests/test_disease.py @@ -0,0 +1,122 @@ +"""Tests for the Week 1 signature-construction logic on synthetic data. + +These verify the DE / probe-collapse / concordance math without touching the network, so the +pipeline is trustworthy before it is pointed at real GEO studies. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from src.disease import ( + DISEASE_LABEL, + HEALTHY_LABEL, + ConcordanceSummary, + SignatureProvenance, + StudyProvenance, + build_signature, + collapse_probes_to_symbols, + compute_differential_expression, + concordance_filter, +) +from src.provenance import ConfidenceTier + + +def _synthetic_study(seed: int, n_per_group: int = 12) -> tuple[pd.DataFrame, pd.Series]: + """Genes x samples matrix where UP is higher and DOWN is lower in disease.""" + rng = np.random.default_rng(seed) + genes = ["UP1", "UP2", "DOWN1", "DOWN2", "NOISE1", "NOISE2"] + samples = [f"d{i}" for i in range(n_per_group)] + [f"h{i}" for i in range(n_per_group)] + groups = pd.Series([DISEASE_LABEL] * n_per_group + [HEALTHY_LABEL] * n_per_group, index=samples) + + base = rng.normal(8.0, 0.3, size=(len(genes), len(samples))) + df = pd.DataFrame(base, index=genes, columns=samples) + disease = groups == DISEASE_LABEL + df.loc["UP1", disease] += 3.0 + df.loc["UP2", disease] += 2.0 + df.loc["DOWN1", disease] -= 3.0 + df.loc["DOWN2", disease] -= 2.0 + return df, groups + + +def test_welch_de_recovers_direction_and_significance(): + expr, groups = _synthetic_study(seed=1) + de = compute_differential_expression(expr, groups, method="welch") + + assert de.loc["UP1", "log_fc"] > 0 and de.loc["UP1", "qvalue"] < 0.05 + assert de.loc["DOWN1", "log_fc"] < 0 and de.loc["DOWN1", "qvalue"] < 0.05 + # Pure-noise genes should not be significant. + assert de.loc["NOISE1", "qvalue"] > 0.05 + + +def test_compute_de_rejects_unlabelled_samples(): + expr, groups = _synthetic_study(seed=2) + with pytest.raises(ValueError): + compute_differential_expression(expr, groups.iloc[:-3], method="welch") + + +def test_collapse_probes_keeps_highest_mean_expression(): + de = pd.DataFrame( + {"log_fc": [1.0, 2.0, -1.0], "pvalue": [0.1, 0.2, 0.3], "qvalue": [0.1, 0.2, 0.3]}, + index=["probeA1", "probeA2", "probeB1"], + ) + probe_to_symbol = pd.Series({"probeA1": "GENEA", "probeA2": "GENEA", "probeB1": "GENEB"}) + expr = pd.DataFrame( + {"s1": [1.0, 100.0, 5.0], "s2": [1.0, 100.0, 5.0]}, + index=["probeA1", "probeA2", "probeB1"], + ) + collapsed = collapse_probes_to_symbols(de, probe_to_symbol, expression_for_ranking=expr) + + assert set(collapsed.index) == {"GENEA", "GENEB"} + # probeA2 has the higher mean expression, so its log_fc (2.0) should win. + assert collapsed.loc["GENEA", "log_fc"] == 2.0 + + +def test_concordance_filter_keeps_only_agreeing_genes(): + de_a = pd.DataFrame( + {"log_fc": [2.0, -2.0, 1.5, 0.1], "qvalue": [0.001, 0.001, 0.2, 0.001]}, + index=["UP1", "DOWN1", "WEAK", "DISAGREE"], + ) + de_b = pd.DataFrame( + {"log_fc": [1.8, -2.2, 1.4, -0.1], "qvalue": [0.002, 0.002, 0.2, 0.002]}, + index=["UP1", "DOWN1", "WEAK", "DISAGREE"], + ) + keep, summary = concordance_filter(de_a, de_b) + + assert set(keep.index) == {"UP1", "DOWN1"} # WEAK fails q-cut; DISAGREE flips sign + assert keep.loc["UP1", "log_fc"] == pytest.approx(1.9) # mean of the two + assert keep.loc["UP1", "qvalue"] == pytest.approx(0.002) # max of the two + assert isinstance(summary, ConcordanceSummary) + assert summary.n_genes_tested == 4 and summary.n_concordant == 2 + assert summary.n_up == 1 and summary.n_down == 1 + + +def test_build_signature_splits_directions_and_respects_top_n(): + concordant = pd.DataFrame( + { + "log_fc": [3.0, 2.0, 1.0, -1.0, -2.0], + "qvalue": [0.001, 0.002, 0.003, 0.004, 0.005], + }, + index=["UP1", "UP2", "UP3", "DOWN1", "DOWN2"], + ) + prov = SignatureProvenance( + studies=[ + StudyProvenance(geo_accession="GSE1", n_disease=12, n_healthy=12, + platform="P", tissue="whole blood", method="welch"), + StudyProvenance(geo_accession="GSE2", n_disease=15, n_healthy=11, + platform="P", tissue="whole blood", method="welch"), + ], + concordance=ConcordanceSummary(n_genes_tested=100, n_concordant=5, n_up=3, n_down=2), + created_date="2026-06-23", + ) + sig = build_signature( + concordant, prov, tier=ConfidenceTier.A, + tier_rationale="Two-study concordance", limitations=["cell-composition confound"], + top_n=2, + ) + + assert [g.gene for g in sig.up_regulated] == ["UP1", "UP2"] # top 2 up by qvalue + assert [g.gene for g in sig.down_regulated] == ["DOWN1", "DOWN2"] + assert sig.confidence_tier == ConfidenceTier.A