Scaffold Reverso MVP pipeline structure
Set up the project skeleton per PLAN.md §4:

- src/ package: identifiers, disease, drugs, scoring, provenance with pydantic schemas and confidence-tier logic (working); data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
106
src/disease.py
Normal file
106
src/disease.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Disease signature construction.
|
||||
|
||||
Week 1 (PLAN.md §6). Builds a Tier-A sickle cell signature from GEO expression data via
|
||||
differential expression, then persists it with full provenance to
|
||||
``data/processed/sickle_cell_signature_v1.json``.
|
||||
|
||||
This module defines the persisted schema (pydantic) and the construction stubs. The actual
|
||||
data pull + differential expression is driven from ``notebooks/02_disease_signature.ipynb``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from . import PIPELINE_VERSION, PROCESSED_DIR
|
||||
from .provenance import ConfidenceTier
|
||||
|
||||
# Signature size: how many genes are kept for each direction, i.e. separately
# for the up- and the down-regulated lists (PLAN.md §6, Week 1 task 5).
TOP_N_PER_DIRECTION: int = 250

# Default q-value threshold; ``build_signature`` exposes it as its
# ``qvalue_cutoff`` keyword argument.
QVALUE_CUTOFF: float = 0.05
|
||||
|
||||
|
||||
class GeneEntry(BaseModel):
    """One differentially expressed gene as stored inside a disease signature.

    The gene symbol and the two statistics are required; the cross-reference
    identifiers are optional and default to ``None``.
    """

    # Required symbol; a field without a default is mandatory in pydantic.
    gene: str = Field(description="HGNC gene symbol, e.g. 'HBG2'.")

    # Optional identifiers for the same gene in other namespaces.
    entrez_id: str | None = None
    ensembl_id: str | None = None

    # Log fold change for this gene.
    # NOTE(review): direction is presumably disease vs. healthy — confirm.
    log_fc: float

    # Adjusted p-value (q-value) for this gene; no range check is applied here.
    qvalue: float
|
||||
|
||||
|
||||
class SignatureProvenance(BaseModel):
    """Record of where a disease signature came from (PLAN.md §6 schema).

    Every field is required; none of them carries a default value.
    """

    # Accession of the GEO dataset the signature was derived from.
    geo_accession: str

    # NOTE(review): presumably the number of samples in each group — confirm.
    n_disease: int
    n_healthy: int

    # Free-text platform label; nothing in this model restricts its values.
    platform: str

    # Required method name; a field without a default is mandatory in pydantic.
    method: str = Field(description="Differential expression method, e.g. 'limma', 'deseq2'.")

    # Kept as a plain string; no date format is enforced by this model.
    created_date: str
|
||||
|
||||
|
||||
class DiseaseSignature(BaseModel):
    """Persisted form of the sickle cell disease signature (PLAN.md §6 schema).

    The three identity fields have defaults; the gene lists, provenance block
    and confidence annotations must be supplied by the caller.
    """

    # --- Identity (defaulted) ---------------------------------------------
    signature_id: str = "sickle_cell_v1"
    disease_mondo_id: str = "MONDO:0011382"
    # Package-level pipeline version, captured as the field default.
    pipeline_version: str = PIPELINE_VERSION

    # --- Gene lists, one per direction --------------------------------------
    up_regulated: list[GeneEntry]
    down_regulated: list[GeneEntry]

    # --- Provenance of the signature -----------------------------------------
    provenance: SignatureProvenance

    # --- Confidence annotation and known caveats ---------------------------
    confidence_tier: ConfidenceTier
    tier_rationale: str
    limitations: list[str]
|
||||
|
||||
|
||||
def compute_differential_expression(
    expression: pd.DataFrame,
    sample_groups: pd.Series,
    *,
    method: str,
) -> pd.DataFrame:
    """Derive per-gene log fold changes and adjusted p-values (not yet implemented).

    Planned approach (PLAN.md §6, Week 1 task 4): ``pydeseq2`` for RNA-seq
    data; for microarray data, log2-transform/normalize and apply a
    limma-equivalent.

    Args:
        expression: Expression matrix with genes as rows and samples as columns.
        sample_groups: Group label for each sample ('disease' / 'healthy'),
            indexed by sample.
        method: 'deseq2' (RNA-seq) or 'limma' (microarray).

    Returns:
        A gene-indexed table containing at least ``log_fc`` and ``qvalue`` columns.

    Raises:
        NotImplementedError: Always, for now; this function is a stub.
    """
    pending = "Differential expression: implement in Week 1 (notebook 02)."
    raise NotImplementedError(pending)
|
||||
|
||||
|
||||
def build_signature(
    de_table: pd.DataFrame,
    provenance: SignatureProvenance,
    *,
    tier: ConfidenceTier,
    tier_rationale: str,
    limitations: list[str],
    top_n: int = TOP_N_PER_DIRECTION,
    qvalue_cutoff: float = QVALUE_CUTOFF,
) -> DiseaseSignature:
    """Turn a differential expression table into a ``DiseaseSignature`` (not yet implemented).

    Planned selection rule (PLAN.md §6, Week 1 task 5): keep the top ``top_n``
    up-regulated and the top ``top_n`` down-regulated genes, ranked by qvalue
    and cut at ``qvalue_cutoff``.

    Args:
        de_table: Differential expression results to select genes from.
        provenance: Provenance block for the resulting signature.
        tier: Confidence tier for the resulting signature.
        tier_rationale: Explanation of why ``tier`` was chosen.
        limitations: Known limitations to record with the signature.
        top_n: Number of genes to keep per direction.
        qvalue_cutoff: q-value cut applied when selecting genes.

    Raises:
        NotImplementedError: Always, for now; this function is a stub.
    """
    pending = "Signature assembly: implement in Week 1 (notebook 02)."
    raise NotImplementedError(pending)
|
||||
|
||||
|
||||
def persist_signature(signature: DiseaseSignature, out_path: Path | None = None) -> Path:
    """Write a signature to disk as indented JSON and return the file location.

    Args:
        signature: Signature to serialise; its ``model_dump_json(indent=2)``
            output is written verbatim.
        out_path: Destination file. When ``None``, the file
            ``sickle_cell_signature_v1.json`` inside ``PROCESSED_DIR``
            (``data/processed/``) is used.

    Returns:
        The path of the file that was written.
    """
    if out_path is None:
        target = PROCESSED_DIR / "sickle_cell_signature_v1.json"
    else:
        # Tolerate plain string paths as well as ``Path`` objects.
        target = Path(out_path)

    # Create missing parent directories so a fresh checkout can persist directly.
    target.parent.mkdir(parents=True, exist_ok=True)

    # JSON is written as UTF-8 explicitly: without ``encoding`` the platform
    # locale encoding is used, which can fail on (or mis-encode) non-ASCII
    # characters in free-text fields such as the rationale or limitations.
    target.write_text(signature.model_dump_json(indent=2), encoding="utf-8")
    return target
|
||||
Reference in New Issue
Block a user