Scaffold Reverso MVP pipeline structure

Set up the project skeleton per PLAN.md §4:
- src/ package: identifiers, disease, drugs, scoring, provenance
  with pydantic schemas and confidence-tier logic (working);
  data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring
  reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-23 20:19:38 +02:00
parent e717cf40ed
commit b731478f5d
25 changed files with 1038 additions and 4 deletions

106
src/disease.py Normal file
View File

@@ -0,0 +1,106 @@
"""Disease signature construction.
Week 1 (PLAN.md §6). Builds a Tier-A sickle cell signature from GEO expression data via
differential expression, then persists it with full provenance to
``data/processed/sickle_cell_signature_v1.json``.
This module defines the persisted schema (pydantic) and the construction stubs. The actual
data pull + differential expression is driven from ``notebooks/02_disease_signature.ipynb``.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
from pydantic import BaseModel, Field
from . import PIPELINE_VERSION, PROCESSED_DIR
from .provenance import ConfidenceTier
# Number of genes to take per direction (PLAN.md §6, Week 1 task 5).
TOP_N_PER_DIRECTION = 250
# Default q-value cutoff for signature genes; used as the ``qvalue_cutoff``
# default of ``build_signature``.
QVALUE_CUTOFF = 0.05
class GeneEntry(BaseModel):
    """A single differentially expressed gene in the signature.

    The numeric fields are validated on construction so that every entry can
    be written to JSON and read back unchanged: non-finite floats (NaN/inf)
    have no JSON number representation, and a q-value outside ``[0, 1]`` is
    never meaningful.
    """

    gene: str = Field(..., description="HGNC gene symbol, e.g. 'HBG2'.")
    # Optional cross-reference identifiers; ``None`` when not supplied.
    entrez_id: str | None = None
    ensembl_id: str | None = None
    # Log fold change for this gene; must be a finite number.
    log_fc: float = Field(..., allow_inf_nan=False)
    # Adjusted p-value; must be finite and lie within [0, 1].
    qvalue: float = Field(..., ge=0.0, le=1.0, allow_inf_nan=False)
class SignatureProvenance(BaseModel):
    """Record of how and from what data a disease signature was derived.

    Follows the provenance schema in PLAN.md §6. Every field is required.
    """

    # Accession of the GEO dataset the signature was built from.
    geo_accession: str = Field(...)
    # Sample counts — presumably the sizes of the 'disease' and 'healthy'
    # groups used for differential expression; confirm against the notebook.
    n_disease: int = Field(...)
    n_healthy: int = Field(...)
    # Platform label for the dataset (free-form string; format not enforced).
    platform: str = Field(...)
    method: str = Field(..., description="Differential expression method, e.g. 'limma', 'deseq2'.")
    # Creation date as a plain string; no particular format is enforced here.
    created_date: str = Field(...)
class DiseaseSignature(BaseModel):
    """Top-level document for the persisted sickle cell signature.

    Follows the signature schema in PLAN.md §6. The identifier fields carry
    defaults; the gene lists, provenance, tier and limitations must always be
    supplied by the caller.
    """

    # Identity of the signature and of the pipeline build that produced it.
    signature_id: str = Field(default="sickle_cell_v1")
    disease_mondo_id: str = Field(default="MONDO:0011382")
    pipeline_version: str = Field(default=PIPELINE_VERSION)
    # Gene entries, split by direction of regulation.
    up_regulated: list[GeneEntry] = Field(...)
    down_regulated: list[GeneEntry] = Field(...)
    # Where the signature came from.
    provenance: SignatureProvenance = Field(...)
    # Confidence tier plus the free-text justification for assigning it.
    confidence_tier: ConfidenceTier = Field(...)
    tier_rationale: str = Field(...)
    # Free-text caveats recorded alongside the signature.
    limitations: list[str] = Field(...)
def compute_differential_expression(
    expression: pd.DataFrame,
    sample_groups: pd.Series,
    *,
    method: str,
) -> pd.DataFrame:
    """Derive per-gene log fold change and adjusted p-values (not yet implemented).

    Planned behaviour (PLAN.md §6, Week 1 task 4): RNA-seq input goes through
    ``pydeseq2``; microarray input is log2-transformed/normalized and handled
    with a limma-equivalent.

    Args:
        expression: Expression matrix with genes as rows and samples as columns.
        sample_groups: Group label for each sample ('disease' / 'healthy'),
            indexed by sample.
        method: Keyword-only. 'deseq2' for RNA-seq or 'limma' for microarray.

    Returns:
        A gene-indexed table with at least ``log_fc`` and ``qvalue`` columns.

    Raises:
        NotImplementedError: Always, until the Week 1 implementation lands.
    """
    # Stub: the real computation is scheduled for Week 1 (notebook 02).
    message = "Differential expression: implement in Week 1 (notebook 02)."
    raise NotImplementedError(message)
def build_signature(
    de_table: pd.DataFrame,
    provenance: SignatureProvenance,
    *,
    tier: ConfidenceTier,
    tier_rationale: str,
    limitations: list[str],
    top_n: int = TOP_N_PER_DIRECTION,
    qvalue_cutoff: float = QVALUE_CUTOFF,
) -> DiseaseSignature:
    """Turn a differential expression table into a ``DiseaseSignature`` (not yet implemented).

    Planned behaviour (PLAN.md §6, Week 1 task 5): keep the top ``top_n``
    up-regulated and the top ``top_n`` down-regulated genes, ranked by qvalue
    and cut at ``qvalue_cutoff``.

    Args:
        de_table: Differential expression results to select genes from.
        provenance: Provenance block to attach to the signature.
        tier: Keyword-only. Confidence tier to record.
        tier_rationale: Keyword-only. Justification for the chosen tier.
        limitations: Keyword-only. Caveats to record with the signature.
        top_n: Keyword-only. Number of genes to keep per direction.
        qvalue_cutoff: Keyword-only. q-value cutoff applied when selecting genes.

    Raises:
        NotImplementedError: Always, until the Week 1 implementation lands.
    """
    # Stub: assembly logic is scheduled for Week 1 (notebook 02).
    message = "Signature assembly: implement in Week 1 (notebook 02)."
    raise NotImplementedError(message)
def persist_signature(signature: DiseaseSignature, out_path: Path | None = None) -> Path:
    """Write a signature to disk as indented JSON and return the file's path.

    Args:
        signature: The signature to persist; serialised with
            ``model_dump_json(indent=2)``.
        out_path: Destination file. When ``None``, the signature is written to
            ``data/processed/sickle_cell_signature_v1.json`` (that is,
            ``PROCESSED_DIR / "sickle_cell_signature_v1.json"``).

    Returns:
        The path the signature was written to.
    """
    if out_path is None:
        out_path = PROCESSED_DIR / "sickle_cell_signature_v1.json"
    else:
        # Accept any path-like value, not only ``Path`` instances.
        out_path = Path(out_path)
    # Create missing parent directories so a fresh checkout can persist directly.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin UTF-8: without an explicit encoding the locale default is used, which
    # can fail or emit non-UTF-8 JSON when free-text fields contain non-ASCII.
    out_path.write_text(signature.model_dump_json(indent=2), encoding="utf-8")
    return out_path