Scaffold Reverso MVP pipeline structure

Set up the project skeleton per PLAN.md §4:
- src/ package: identifiers, disease, drugs, scoring, provenance
  with pydantic schemas and confidence-tier logic (working);
  data-pull/compute functions stubbed per their build week
- 5 starter notebooks (01-05) with PLAN-referenced steps
- tests/test_scoring.py: tier-assignment tests pass; scoring
  reference test xfail until Week 3
- docs/: recovery_test_report, data_sources, known_limitations skeletons
- pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README
- data/ tree preserved via .gitkeep; raw/processed/results gitignored

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-23 20:19:38 +02:00
parent e717cf40ed
commit b731478f5d
25 changed files with 1038 additions and 4 deletions

36
src/__init__.py Normal file
View File

@@ -0,0 +1,36 @@
"""Reverso MVP — sickle cell drug repurposing pipeline.
A disease-signature + drug-profile matching pipeline using CMap-style connectivity
scoring. See PLAN.md for the full specification.
"""
from __future__ import annotations
from pathlib import Path
# Package release version and the pipeline version tag.
__version__ = "0.1.0"
PIPELINE_VERSION = "v1"

# Reproducibility anchor (PLAN.md §8): all randomness in the pipeline must be
# derived from this one seed.
RANDOM_SEED = 42

# Canonical project layout. This module sits in ``<repo>/src/``, so the repo
# root is two levels above this file; every other directory hangs off the root.
REPO_ROOT = Path(__file__).resolve().parents[1]
DOCS_DIR = REPO_ROOT / "docs"
DATA_DIR = REPO_ROOT / "data"
RAW_DIR, PROCESSED_DIR, RESULTS_DIR = (
    DATA_DIR / leaf for leaf in ("raw", "processed", "results")
)

# Names exported by ``from src import *``.
__all__ = [
    "__version__",
    "PIPELINE_VERSION",
    "RANDOM_SEED",
    "REPO_ROOT",
    "DATA_DIR",
    "RAW_DIR",
    "PROCESSED_DIR",
    "RESULTS_DIR",
    "DOCS_DIR",
]

106
src/disease.py Normal file
View File

@@ -0,0 +1,106 @@
"""Disease signature construction.
Week 1 (PLAN.md §6). Builds a Tier-A sickle cell signature from GEO expression data via
differential expression, then persists it with full provenance to
``data/processed/sickle_cell_signature_v1.json``.
This module defines the persisted schema (pydantic) and the construction stubs. The actual
data pull + differential expression is driven from ``notebooks/02_disease_signature.ipynb``.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
from pydantic import BaseModel, Field
from . import PIPELINE_VERSION, PROCESSED_DIR
from .provenance import ConfidenceTier
# Number of genes to take per direction (PLAN.md §6, Week 1 task 5).
# Serves as the default ``top_n`` of ``build_signature``.
TOP_N_PER_DIRECTION = 250
# Adjusted p-value (q-value) threshold; serves as the default ``qvalue_cutoff``
# of ``build_signature``.
QVALUE_CUTOFF = 0.05
class GeneEntry(BaseModel):
    """One differentially expressed gene as stored in a disease signature.

    The symbol and the two statistics are required; the cross-reference
    identifiers are optional and default to ``None``.
    """

    # Primary identifier of the gene.
    gene: str = Field(default=..., description="HGNC gene symbol, e.g. 'HBG2'.")

    # Optional cross-references into other gene namespaces.
    entrez_id: str | None = None
    ensembl_id: str | None = None

    # Differential-expression statistics for this gene.
    log_fc: float
    qvalue: float
class SignatureProvenance(BaseModel):
    """Where a disease signature came from (PLAN.md §6 schema).

    All fields are required.
    """

    # Source dataset and its group sizes.
    geo_accession: str
    n_disease: int
    n_healthy: int

    # Measurement platform and how the differential expression was computed.
    platform: str
    method: str = Field(
        default=...,
        description="Differential expression method, e.g. 'limma', 'deseq2'.",
    )

    # Creation date of the signature, stored as a plain string.
    created_date: str
class DiseaseSignature(BaseModel):
    """Schema of the persisted sickle cell disease signature (PLAN.md §6).

    The three identifying fields carry defaults; gene lists, provenance, tier
    information and limitations must be supplied by the caller.
    """

    # Identity of the signature; defaults pin the MVP's single signature.
    signature_id: str = "sickle_cell_v1"
    disease_mondo_id: str = "MONDO:0011382"
    pipeline_version: str = PIPELINE_VERSION

    # The signature proper: genes split by direction of change.
    up_regulated: list[GeneEntry]
    down_regulated: list[GeneEntry]

    # Evidence backing the signature and the confidence assigned to it.
    provenance: SignatureProvenance
    confidence_tier: ConfidenceTier
    tier_rationale: str
    limitations: list[str]
def compute_differential_expression(
    expression: pd.DataFrame,
    sample_groups: pd.Series,
    *,
    method: str,
) -> pd.DataFrame:
    """Derive per-gene log fold change and adjusted p-values (not yet implemented).

    Planned approach (PLAN.md §6, Week 1 task 4): ``pydeseq2`` for RNA-seq data;
    for microarray data, log2-transform/normalize and apply a limma-equivalent.

    Args:
        expression: Expression matrix with genes as rows and samples as columns.
        sample_groups: Group label per sample ('disease' / 'healthy'), indexed
            by sample.
        method: 'deseq2' (RNA-seq) or 'limma' (microarray).

    Returns:
        A gene-indexed table containing at least ``log_fc`` and ``qvalue``.

    Raises:
        NotImplementedError: Always, until the Week 1 implementation lands.
    """
    message = "Differential expression: implement in Week 1 (notebook 02)."
    raise NotImplementedError(message)
def build_signature(
    de_table: pd.DataFrame,
    provenance: SignatureProvenance,
    *,
    tier: ConfidenceTier,
    tier_rationale: str,
    limitations: list[str],
    top_n: int = TOP_N_PER_DIRECTION,
    qvalue_cutoff: float = QVALUE_CUTOFF,
) -> DiseaseSignature:
    """Turn a differential expression table into a ``DiseaseSignature`` (not yet implemented).

    Per PLAN.md §6, Week 1 task 5, the signature keeps the top ``top_n`` genes
    in each direction (up and down), ranked by qvalue and cut at
    ``qvalue_cutoff``.

    Args:
        de_table: Differential expression results to select genes from.
        provenance: Provenance block to attach to the signature.
        tier: Confidence tier to record on the signature.
        tier_rationale: Justification for the chosen tier.
        limitations: Known limitations to record on the signature.
        top_n: Number of genes to keep per direction.
        qvalue_cutoff: Maximum qvalue for a gene to be eligible.

    Raises:
        NotImplementedError: Always, until the Week 1 implementation lands.
    """
    message = "Signature assembly: implement in Week 1 (notebook 02)."
    raise NotImplementedError(message)
def persist_signature(signature: DiseaseSignature, out_path: Path | None = None) -> Path:
    """Write a signature to ``data/processed/sickle_cell_signature_v1.json``.

    Args:
        signature: The signature to serialize; it is dumped with
            ``model_dump_json(indent=2)``.
        out_path: Destination file. When omitted, the default file under
            ``PROCESSED_DIR`` is used. Missing parent directories are created.

    Returns:
        The path that was written.
    """
    out_path = out_path or (PROCESSED_DIR / "sickle_cell_signature_v1.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: without it ``write_text`` uses the platform locale, so
    # non-ASCII text (e.g. in rationale/limitations strings) could produce a
    # non-UTF-8 JSON file or fail to encode on some systems.
    out_path.write_text(signature.model_dump_json(indent=2), encoding="utf-8")
    return out_path

85
src/drugs.py Normal file
View File

@@ -0,0 +1,85 @@
"""Drug profile construction.
Week 2 (PLAN.md §6). Curates the ~300-drug set, pulls ChEMBL structure/target data and LINCS
L1000 signatures, and assembles ``data/processed/drug_profiles_v1.parquet``.
The drug set is deliberately composed (PLAN.md §6, Week 2 task 1):
- ground truth (n=2): hydroxyurea, L-glutamine
- related-mechanism (n~50)
- negative controls (n~50)
- general random sample (n~200), fixed seed
"""
from __future__ import annotations
from enum import Enum
from pathlib import Path
import pandas as pd
from pydantic import BaseModel, Field
from . import PROCESSED_DIR, RANDOM_SEED
from .provenance import ConfidenceTier, Provenance
# LINCS L1000 landmark gene count (PLAN.md §6, Week 2 task 3). Matches the
# documented length of ``DrugProfile.lincs_signature`` (a 978-length vector).
# NOTE(review): nothing visible in this module uses the constant to enforce
# that length — confirm whether validation is intended.
LINCS_LANDMARK_GENES = 978
class InclusionReason(str, Enum):
    """Category explaining a drug's presence in the curated set.

    The four categories mirror the deliberate composition of the drug set
    (PLAN.md §6, Week 2 task 1). Members are also ``str`` instances whose
    value is the snake_case label.
    """

    # Known treatments used as ground truth.
    GROUND_TRUTH = "ground_truth"
    # Drugs with a mechanism related to the ground-truth drugs.
    RELATED_MECHANISM = "related_mechanism"
    # Negative controls.
    NEGATIVE_CONTROL = "negative_control"
    # Drugs from the general random sample.
    GENERAL_SAMPLE = "general_sample"
class DrugProfile(BaseModel):
    """Schema for one row of the drug profile table (PLAN.md §6, Week 2 task 4)."""

    # Identity.
    chembl_id: str
    name: str

    # Chemical structure, when available.
    inchikey: str | None = None
    smiles: str | None = None

    # Pharmacology.
    targets: list[str] = Field(default_factory=list)
    mechanism_of_action: str | None = None

    # LINCS landmark z-score vector (978 values), or None when the drug has no
    # signature available.
    lincs_signature: list[float] | None = None

    # Why the drug is in the curated set, plus evidence tracking.
    inclusion_reason: InclusionReason
    provenance: list[Provenance] = Field(default_factory=list)
    confidence_tier: ConfidenceTier
def curate_drug_set(seed: int = RANDOM_SEED) -> pd.DataFrame:
    """Assemble the deliberately-composed ~300-drug set (not yet implemented).

    Args:
        seed: Seed for the random sampling, so the set is reproducible
            (PLAN.md §8).

    Returns:
        A table with at least ``chembl_id``, ``name`` and ``inclusion_reason``
        columns. The notebook writes it to ``data/processed/drug_set_v1.csv``.

    Raises:
        NotImplementedError: Always, until the Week 2 implementation lands.
    """
    message = "Drug-set curation: implement in Week 2 (notebook 03)."
    raise NotImplementedError(message)
def fetch_chembl_profile(chembl_id: str) -> dict:
    """Retrieve one drug's structure, targets and mechanism from ChEMBL (not yet implemented).

    The planned implementation uses ``chembl_webresource_client``
    (PLAN.md §6, Week 2 task 2).

    Args:
        chembl_id: ChEMBL identifier of the drug to look up.

    Raises:
        NotImplementedError: Always, until the Week 2 implementation lands.
    """
    message = "ChEMBL fetch: implement in Week 2 (notebook 03)."
    raise NotImplementedError(message)
def fetch_lincs_signature(chembl_id: str) -> list[float] | None:
    """Retrieve a drug's LINCS L1000 Level-5 consensus (MODZ) signature (not yet implemented).

    PLAN.md §6, Week 2 task 3.

    Args:
        chembl_id: ChEMBL identifier of the drug to look up.

    Returns:
        A 978-length z-score vector, or ``None`` when the drug has no signature
        (e.g. L-glutamine). Such gaps are to be documented in
        docs/known_limitations.md.

    Raises:
        NotImplementedError: Always, until the Week 2 implementation lands.
    """
    message = "LINCS fetch: implement in Week 2 (notebook 03)."
    raise NotImplementedError(message)
def persist_drug_profiles(profiles: pd.DataFrame, out_path: Path | None = None) -> Path:
    """Save the assembled drug profiles as Parquet and return the path written.

    Args:
        profiles: The drug profile table; written without its index.
        out_path: Destination file. When omitted, the file goes to
            ``data/processed/drug_profiles_v1.parquet``. Missing parent
            directories are created.

    Returns:
        The path that was written.
    """
    if not out_path:
        out_path = PROCESSED_DIR / "drug_profiles_v1.parquet"
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    profiles.to_parquet(out_path, index=False)
    return out_path

71
src/identifiers.py Normal file
View File

@@ -0,0 +1,71 @@
"""Canonical identifier resolution and the pinned identifiers for the MVP.
Week 1, task 1 (PLAN.md §6). The disease and causal gene identifiers are pinned constants
so the whole pipeline resolves to the same canonical IDs. ``persist_identifiers`` writes them
to ``data/processed/identifiers.json``.
"""
from __future__ import annotations
import json
from pathlib import Path
from pydantic import BaseModel
from . import PROCESSED_DIR
# --- Pinned identifiers (PLAN.md §6, Week 1 task 1) -------------------------------------
# Disease: sickle cell, keyed by ontology / catalogue.
SICKLE_CELL_IDS: dict[str, str] = {
    "mondo": "MONDO:0011382",
    "orphanet": "Orphanet:232",
    "omim": "OMIM:603903",
}

# Causal gene: HBB, keyed by identifier namespace.
HBB_GENE_IDS: dict[str, str] = {
    "symbol": "HBB",
    "ensembl": "ENSG00000244734",
    "hgnc": "HGNC:4827",
}

# Ground-truth drugs for the recovery test (PLAN.md §6, Week 2 task 1),
# mapping lowercase drug name -> ChEMBL ID.
GROUND_TRUTH_DRUGS: dict[str, str] = {
    "hydroxyurea": "CHEMBL467",
    "l-glutamine": "CHEMBL930",
}
class IdentifierSet(BaseModel):
    """Bundle of pinned identifiers that is persisted when the pipeline starts.

    Each field is a string-to-string mapping and all three are required.
    """

    # Disease identifiers.
    disease: dict[str, str]
    # Identifiers of the causal gene.
    causal_gene: dict[str, str]
    # Ground-truth drugs.
    ground_truth_drugs: dict[str, str]
def build_identifier_set() -> IdentifierSet:
    """Build the MVP's ``IdentifierSet`` from the module's pinned constants."""
    pinned = {
        "disease": SICKLE_CELL_IDS,
        "causal_gene": HBB_GENE_IDS,
        "ground_truth_drugs": GROUND_TRUTH_DRUGS,
    }
    return IdentifierSet(**pinned)
def persist_identifiers(out_path: Path | None = None) -> Path:
    """Write the pinned identifier set to ``data/processed/identifiers.json``.

    Args:
        out_path: Destination file. When omitted, the default file under
            ``PROCESSED_DIR`` is used. Missing parent directories are created.

    Returns:
        The path written.
    """
    out_path = out_path or (PROCESSED_DIR / "identifiers.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the JSON artifact is UTF-8 on every platform rather
    # than whatever the locale default happens to be.
    out_path.write_text(build_identifier_set().model_dump_json(indent=2), encoding="utf-8")
    return out_path
def resolve_drug_to_chembl(name_or_alias: str) -> str:
    """Map a drug name or alias onto its canonical ChEMBL ID (not yet implemented).

    The planned implementation uses ``chembl_webresource_client`` and is
    scheduled for Week 2 (PLAN.md §6, task 2).

    Args:
        name_or_alias: Drug name or alias to resolve.

    Raises:
        NotImplementedError: Always, until the Week 2 implementation lands.
    """
    message = "Drug -> ChEMBL resolution: implement in Week 2 (notebook 03)."
    raise NotImplementedError(message)

72
src/provenance.py Normal file
View File

@@ -0,0 +1,72 @@
"""Provenance and confidence-tier tracking.
The confidence tier is the most commercially important design decision in the pipeline
(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
a tier and the provenance needed to justify it.
Tier A — measured data, peer-reviewed source, n>10 per group, recent
Tier B — measured but small-n, older, or single-source
Tier C — inferred / extrapolated / hypothesis-only
"""
from __future__ import annotations
from datetime import date
from enum import Enum
from pydantic import BaseModel, Field
class ConfidenceTier(str, Enum):
    """Confidence tier attached to a persisted artifact.

    Members are also ``str`` instances whose value is the single tier letter.
    The tier definitions are given in the module docstring.
    """

    # Measured data, peer-reviewed source, n>10 per group, recent.
    A = "A"
    # Measured but small-n, older, or single-source.
    B = "B"
    # Inferred / extrapolated / hypothesis-only.
    C = "C"
class Provenance(BaseModel):
    """Origin of a record: which source it came from and when it was obtained.

    Intended to accompany every persisted artifact. Only ``source`` is
    required; every other field defaults to ``None``.
    """

    # What the source is and how to locate the record within it.
    source: str = Field(
        default=...,
        description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.",
    )
    source_id: str | None = Field(
        default=None,
        description="Accession / identifier within the source, e.g. 'GSE53441'.",
    )
    source_url: str | None = None

    # Versioning and retrieval details.
    source_version: str | None = Field(
        default=None,
        description="Dataset/release version where the source is versioned.",
    )
    download_date: date | None = Field(
        default=None,
        description="Date the underlying data was downloaded (reproducibility).",
    )

    # Usage terms and free-form remarks.
    license: str | None = None
    notes: str | None = None
def assign_tier(
    *,
    is_measured: bool,
    n_per_group: int | None,
    peer_reviewed: bool,
    single_source: bool,
) -> ConfidenceTier:
    """Map evidence characteristics onto a confidence tier.

    Centralizing the PLAN.md §3 tier rules here keeps tier assignment
    consistent and auditable instead of being decided ad hoc in each notebook.

    Args:
        is_measured: True for directly measured values, False for inferred or
            extrapolated ones.
        n_per_group: Sample size per group; None when not meaningful.
        peer_reviewed: Whether the source is peer-reviewed.
        single_source: Whether the evidence rests on a single source.

    Returns:
        Tier C for anything not measured; Tier A for measured, peer-reviewed
        evidence with more than 10 samples per group from more than one
        source; Tier B for all other measured evidence.
    """
    # Anything not directly measured is Tier C, whatever else is true.
    if not is_measured:
        return ConfidenceTier.C

    # Measured evidence falls to Tier B as soon as one Tier-A condition fails.
    if not peer_reviewed:
        return ConfidenceTier.B
    if n_per_group is None or not n_per_group > 10:
        return ConfidenceTier.B
    if single_source:
        return ConfidenceTier.B

    return ConfidenceTier.A

85
src/scoring.py Normal file
View File

@@ -0,0 +1,85 @@
"""CMap-style connectivity scoring — the matching engine.
Week 3 (PLAN.md §6). Scores each drug's LINCS signature against the disease signature using
weighted Kolmogorov-Smirnov enrichment (Lamb 2006 / Subramanian 2017). Strongly *negative*
connectivity = strong reversal of the disease signature = candidate match.
Uses ``cmapPy`` as the reference implementation. ``tests/test_scoring.py`` verifies the
implementation against a known reference.
"""
from __future__ import annotations
from pathlib import Path
import pandas as pd
from pydantic import BaseModel
from . import RESULTS_DIR
class ConnectivityResult(BaseModel):
    """Outcome of scoring one drug against the disease signature."""

    # Which drug this result belongs to.
    chembl_id: str
    drug_name: str

    # Required, but may be None: a drug without a LINCS signature has no score.
    connectivity_score: float | None
    normalized_score: float | None = None
    p_value: float | None = None

    # False means no signature was available and the drug was not scored;
    # such drugs are recorded explicitly rather than skipped silently.
    scored: bool
    n_genes_overlap: int | None = None
def connectivity_score(
    up_genes: list[str],
    down_genes: list[str],
    drug_signature: pd.Series,
) -> float:
    """Score one drug against the disease up/down gene sets (not yet implemented).

    The planned score is a weighted KS connectivity score. Only genes present
    in both the disease signature and the LINCS landmark set are scored, and
    callers must record that overlap count (PLAN.md §6, Week 3 task 2).

    Args:
        up_genes: Disease up-regulated gene identifiers.
        down_genes: Disease down-regulated gene identifiers.
        drug_signature: The drug's expression vector, indexed by gene identifier.

    Returns:
        A connectivity score in roughly [-1, 1]; strongly negative values
        indicate strong reversal of the disease signature.

    Raises:
        NotImplementedError: Always, until the Week 3 implementation lands.
    """
    message = "Connectivity scoring: implement in Week 3 (notebook 04)."
    raise NotImplementedError(message)
def rank_drugs(
    signature_up: list[str],
    signature_down: list[str],
    drug_profiles: pd.DataFrame,
) -> pd.DataFrame:
    """Score every drug against the disease signature and rank them (not yet implemented).

    A drug with no LINCS signature is marked ``scored=False`` and left out of
    the ranking, instead of being dropped silently (PLAN.md §6, Week 3 task 2).

    Args:
        signature_up: Disease up-regulated gene identifiers.
        signature_down: Disease down-regulated gene identifiers.
        drug_profiles: Table of drug profiles to score.

    Returns:
        A ranked table with the PLAN.md §6 columns: rank, drug_name, chembl_id,
        connectivity_score, normalized_score, p_value, inclusion_reason,
        known_targets, mechanism_summary.

    Raises:
        NotImplementedError: Always, until the Week 3 implementation lands.
    """
    message = "Drug ranking: implement in Week 3 (notebook 04)."
    raise NotImplementedError(message)
def mechanistic_prior(targets: list[str]) -> float:
    """Weight a drug by how relevant its targets are to sickle cell biology (not yet implemented).

    Pathways of interest are HbF regulation, hemoglobin, NO signaling,
    inflammation and oxidative stress (PLAN.md §6, Week 3 task 3). The weight
    feeds the secondary, prior-weighted ranking.

    Args:
        targets: The drug's targets.

    Raises:
        NotImplementedError: Always, until the Week 3 implementation lands.
    """
    message = "Mechanistic prior: implement in Week 3 (notebook 04)."
    raise NotImplementedError(message)
def persist_ranking(ranking: pd.DataFrame, out_path: Path | None = None) -> Path:
    """Save the ranked candidate list as CSV and return the path written.

    Args:
        ranking: The ranked candidate table; written without its index.
        out_path: Destination file. When omitted, the file goes to
            ``data/results/ranked_candidates_v1.csv``. Missing parent
            directories are created.

    Returns:
        The path that was written.
    """
    if not out_path:
        out_path = RESULTS_DIR / "ranked_candidates_v1.csv"
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    ranking.to_csv(out_path, index=False)
    return out_path