Week 1: Tier-A sickle cell signature via 2-study concordance
Implement and run the Week 1 disease-signature pipeline:
- src/disease.py: Welch t-test + BH DE (microarray), probe->symbol collapse, cross-study concordance filter, 2-study provenance schema
- scripts/week1_explore.py: download GSE35007 + GSE16728, DE + concordance
- scripts/week1_finalize.py: mygene ID mapping + persist signature
- tests/test_disease.py: synthetic-data tests for DE/collapse/concordance
- docs/data_sources.md: chosen datasets, group defs, reproduction steps

Result: sickle_cell_signature_v1.json (gitignored), Tier A, 250 up / 227 down genes from 671 concordant (GSE35007 Illumina whole blood SS/AA + GSE16728 Affymetrix whole blood patient/control). Documented caveats: missing HbF axis (globin depletion) and reticulocyte composition confound.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
122
tests/test_disease.py
Normal file
122
tests/test_disease.py
Normal file
@@ -0,0 +1,122 @@
|
||||
"""Tests for the Week 1 signature-construction logic on synthetic data.
|
||||
|
||||
These verify the DE / probe-collapse / concordance math without touching the network, so the
|
||||
pipeline is trustworthy before it is pointed at real GEO studies.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.disease import (
|
||||
DISEASE_LABEL,
|
||||
HEALTHY_LABEL,
|
||||
ConcordanceSummary,
|
||||
SignatureProvenance,
|
||||
StudyProvenance,
|
||||
build_signature,
|
||||
collapse_probes_to_symbols,
|
||||
compute_differential_expression,
|
||||
concordance_filter,
|
||||
)
|
||||
from src.provenance import ConfidenceTier
|
||||
|
||||
|
||||
def _synthetic_study(seed: int, n_per_group: int = 12) -> tuple[pd.DataFrame, pd.Series]:
    """Build a seeded genes-by-samples matrix with planted disease shifts.

    Six genes are drawn from a normal distribution (mean 8.0, sd 0.3) for
    ``2 * n_per_group`` samples: disease samples ``d0..`` first, then healthy
    samples ``h0..``. In the disease columns only, UP1/UP2 are raised by
    3.0/2.0 and DOWN1/DOWN2 lowered by 3.0/2.0; NOISE1/NOISE2 are untouched.

    Returns:
        ``(expression, groups)`` where ``groups`` is indexed by sample name
        and holds ``DISEASE_LABEL`` or ``HEALTHY_LABEL``.
    """
    rng = np.random.default_rng(seed)

    gene_names = ["UP1", "UP2", "DOWN1", "DOWN2", "NOISE1", "NOISE2"]
    # Offset applied to the disease columns only; genes absent here stay pure noise.
    planted_shift = {"UP1": 3.0, "UP2": 2.0, "DOWN1": -3.0, "DOWN2": -2.0}

    sample_names = [f"{prefix}{i}" for prefix in ("d", "h") for i in range(n_per_group)]
    labels = [label for label in (DISEASE_LABEL, HEALTHY_LABEL) for _ in range(n_per_group)]
    groups = pd.Series(labels, index=sample_names)

    background = rng.normal(8.0, 0.3, size=(len(gene_names), len(sample_names)))
    expression = pd.DataFrame(background, index=gene_names, columns=sample_names)

    is_disease = groups == DISEASE_LABEL
    for gene, shift in planted_shift.items():
        expression.loc[gene, is_disease] += shift
    return expression, groups
|
||||
|
||||
|
||||
def test_welch_de_recovers_direction_and_significance():
    """DE with ``method="welch"`` flags the planted UP1/DOWN1 shifts and spares NOISE1."""
    alpha = 0.05
    expression, labels = _synthetic_study(seed=1)
    result = compute_differential_expression(expression, labels, method="welch")

    # Planted up-shift: positive fold change, significant after correction.
    assert result.loc["UP1", "log_fc"] > 0
    assert result.loc["UP1", "qvalue"] < alpha

    # Planted down-shift: negative fold change, significant after correction.
    assert result.loc["DOWN1", "log_fc"] < 0
    assert result.loc["DOWN1", "qvalue"] < alpha

    # Pure-noise genes should not be significant.
    assert result.loc["NOISE1", "qvalue"] > alpha
|
||||
|
||||
|
||||
def test_compute_de_rejects_unlabelled_samples():
    """A group vector missing labels for the last three samples must raise ``ValueError``."""
    expression, labels = _synthetic_study(seed=2)
    # Drop the final three labels so some expression columns have no group.
    incomplete_labels = labels.iloc[:-3]

    with pytest.raises(ValueError):
        compute_differential_expression(expression, incomplete_labels, method="welch")
|
||||
|
||||
|
||||
def test_collapse_probes_keeps_highest_mean_expression():
    """When two probes map to one symbol, the more highly expressed probe's stats are kept."""
    probes = ["probeA1", "probeA2", "probeB1"]

    probe_stats = pd.DataFrame(
        {
            "log_fc": [1.0, 2.0, -1.0],
            "pvalue": [0.1, 0.2, 0.3],
            "qvalue": [0.1, 0.2, 0.3],
        },
        index=probes,
    )
    symbol_map = pd.Series(["GENEA", "GENEA", "GENEB"], index=probes)
    # probeA2 is expressed far above probeA1 in both samples.
    ranking_expr = pd.DataFrame(
        {sample: [1.0, 100.0, 5.0] for sample in ("s1", "s2")},
        index=probes,
    )

    collapsed = collapse_probes_to_symbols(
        probe_stats, symbol_map, expression_for_ranking=ranking_expr
    )

    assert set(collapsed.index) == {"GENEA", "GENEB"}
    # probeA2 has the higher mean expression, so its log_fc (2.0) should win.
    assert collapsed.loc["GENEA", "log_fc"] == 2.0
|
||||
|
||||
|
||||
def test_concordance_filter_keeps_only_agreeing_genes():
    """Only genes that are significant and same-signed in both studies survive the filter."""
    gene_ids = ["UP1", "DOWN1", "WEAK", "DISAGREE"]

    study_a = pd.DataFrame(
        {
            "log_fc": [2.0, -2.0, 1.5, 0.1],
            "qvalue": [0.001, 0.001, 0.2, 0.001],
        },
        index=gene_ids,
    )
    study_b = pd.DataFrame(
        {
            "log_fc": [1.8, -2.2, 1.4, -0.1],
            "qvalue": [0.002, 0.002, 0.2, 0.002],
        },
        index=gene_ids,
    )

    kept, stats = concordance_filter(study_a, study_b)

    # WEAK fails q-cut; DISAGREE flips sign
    assert set(kept.index) == {"UP1", "DOWN1"}

    up1 = ("UP1",)
    assert kept.loc[up1[0], "log_fc"] == pytest.approx(1.9)  # mean of the two
    assert kept.loc[up1[0], "qvalue"] == pytest.approx(0.002)  # max of the two

    assert isinstance(stats, ConcordanceSummary)
    assert stats.n_genes_tested == 4
    assert stats.n_concordant == 2
    assert stats.n_up == 1
    assert stats.n_down == 1
|
||||
|
||||
|
||||
def test_build_signature_splits_directions_and_respects_top_n():
    """``build_signature`` separates up/down genes and truncates each list to ``top_n``."""
    concordant_genes = pd.DataFrame(
        {
            "log_fc": [3.0, 2.0, 1.0, -1.0, -2.0],
            "qvalue": [0.001, 0.002, 0.003, 0.004, 0.005],
        },
        index=["UP1", "UP2", "UP3", "DOWN1", "DOWN2"],
    )

    # Fields shared by both mock studies.
    shared_fields = {"platform": "P", "tissue": "whole blood", "method": "welch"}
    study_one = StudyProvenance(
        geo_accession="GSE1", n_disease=12, n_healthy=12, **shared_fields
    )
    study_two = StudyProvenance(
        geo_accession="GSE2", n_disease=15, n_healthy=11, **shared_fields
    )
    concordance = ConcordanceSummary(
        n_genes_tested=100, n_concordant=5, n_up=3, n_down=2
    )
    provenance = SignatureProvenance(
        studies=[study_one, study_two],
        concordance=concordance,
        created_date="2026-06-23",
    )

    signature = build_signature(
        concordant_genes,
        provenance,
        tier=ConfidenceTier.A,
        tier_rationale="Two-study concordance",
        limitations=["cell-composition confound"],
        top_n=2,
    )

    up_genes = [entry.gene for entry in signature.up_regulated]
    down_genes = [entry.gene for entry in signature.down_regulated]

    assert up_genes == ["UP1", "UP2"]  # top 2 up by qvalue
    assert down_genes == ["DOWN1", "DOWN2"]
    assert signature.confidence_tier == ConfidenceTier.A
|
||||
Reference in New Issue
Block a user