"""Tests for the Week 1 signature-construction logic on synthetic data. These verify the DE / probe-collapse / concordance math without touching the network, so the pipeline is trustworthy before it is pointed at real GEO studies. """ from __future__ import annotations import numpy as np import pandas as pd import pytest from src.disease import ( DISEASE_LABEL, HEALTHY_LABEL, ConcordanceSummary, SignatureProvenance, StudyProvenance, build_signature, collapse_probes_to_symbols, compute_differential_expression, concordance_filter, ) from src.provenance import ConfidenceTier def _synthetic_study(seed: int, n_per_group: int = 12) -> tuple[pd.DataFrame, pd.Series]: """Genes x samples matrix where UP is higher and DOWN is lower in disease.""" rng = np.random.default_rng(seed) genes = ["UP1", "UP2", "DOWN1", "DOWN2", "NOISE1", "NOISE2"] samples = [f"d{i}" for i in range(n_per_group)] + [f"h{i}" for i in range(n_per_group)] groups = pd.Series([DISEASE_LABEL] * n_per_group + [HEALTHY_LABEL] * n_per_group, index=samples) base = rng.normal(8.0, 0.3, size=(len(genes), len(samples))) df = pd.DataFrame(base, index=genes, columns=samples) disease = groups == DISEASE_LABEL df.loc["UP1", disease] += 3.0 df.loc["UP2", disease] += 2.0 df.loc["DOWN1", disease] -= 3.0 df.loc["DOWN2", disease] -= 2.0 return df, groups def test_welch_de_recovers_direction_and_significance(): expr, groups = _synthetic_study(seed=1) de = compute_differential_expression(expr, groups, method="welch") assert de.loc["UP1", "log_fc"] > 0 and de.loc["UP1", "qvalue"] < 0.05 assert de.loc["DOWN1", "log_fc"] < 0 and de.loc["DOWN1", "qvalue"] < 0.05 # Pure-noise genes should not be significant. assert de.loc["NOISE1", "qvalue"] > 0.05 def test_compute_de_rejects_unlabelled_samples(): expr, groups = _synthetic_study(seed=2) with pytest.raises(ValueError): compute_differential_expression(expr, groups.iloc[:-3], method="welch") def test_collapse_probes_keeps_highest_mean_expression(): de = pd.DataFrame( {"log_fc": [1.0, 2.0, -1.0], "pvalue": [0.1, 0.2, 0.3], "qvalue": [0.1, 0.2, 0.3]}, index=["probeA1", "probeA2", "probeB1"], ) probe_to_symbol = pd.Series({"probeA1": "GENEA", "probeA2": "GENEA", "probeB1": "GENEB"}) expr = pd.DataFrame( {"s1": [1.0, 100.0, 5.0], "s2": [1.0, 100.0, 5.0]}, index=["probeA1", "probeA2", "probeB1"], ) collapsed = collapse_probes_to_symbols(de, probe_to_symbol, expression_for_ranking=expr) assert set(collapsed.index) == {"GENEA", "GENEB"} # probeA2 has the higher mean expression, so its log_fc (2.0) should win. assert collapsed.loc["GENEA", "log_fc"] == 2.0 def test_concordance_filter_keeps_only_agreeing_genes(): de_a = pd.DataFrame( {"log_fc": [2.0, -2.0, 1.5, 0.1], "qvalue": [0.001, 0.001, 0.2, 0.001]}, index=["UP1", "DOWN1", "WEAK", "DISAGREE"], ) de_b = pd.DataFrame( {"log_fc": [1.8, -2.2, 1.4, -0.1], "qvalue": [0.002, 0.002, 0.2, 0.002]}, index=["UP1", "DOWN1", "WEAK", "DISAGREE"], ) keep, summary = concordance_filter(de_a, de_b) assert set(keep.index) == {"UP1", "DOWN1"} # WEAK fails q-cut; DISAGREE flips sign assert keep.loc["UP1", "log_fc"] == pytest.approx(1.9) # mean of the two assert keep.loc["UP1", "qvalue"] == pytest.approx(0.002) # max of the two assert isinstance(summary, ConcordanceSummary) assert summary.n_genes_tested == 4 and summary.n_concordant == 2 assert summary.n_up == 1 and summary.n_down == 1 def test_build_signature_splits_directions_and_respects_top_n(): concordant = pd.DataFrame( { "log_fc": [3.0, 2.0, 1.0, -1.0, -2.0], "qvalue": [0.001, 0.002, 0.003, 0.004, 0.005], }, index=["UP1", "UP2", "UP3", "DOWN1", "DOWN2"], ) prov = SignatureProvenance( studies=[ StudyProvenance(geo_accession="GSE1", n_disease=12, n_healthy=12, platform="P", tissue="whole blood", method="welch"), StudyProvenance(geo_accession="GSE2", n_disease=15, n_healthy=11, platform="P", tissue="whole blood", method="welch"), ], concordance=ConcordanceSummary(n_genes_tested=100, n_concordant=5, n_up=3, n_down=2), created_date="2026-06-23", ) sig = build_signature( concordant, prov, tier=ConfidenceTier.A, tier_rationale="Two-study concordance", limitations=["cell-composition confound"], top_n=2, ) assert [g.gene for g in sig.up_regulated] == ["UP1", "UP2"] # top 2 up by qvalue assert [g.gene for g in sig.down_regulated] == ["DOWN1", "DOWN2"] assert sig.confidence_tier == ConfidenceTier.A