Scaffold Reverso MVP pipeline structure
Set up the project skeleton per PLAN.md §4: - src/ package: identifiers, disease, drugs, scoring, provenance with pydantic schemas and confidence-tier logic (working); data-pull/compute functions stubbed per their build week - 5 starter notebooks (01-05) with PLAN-referenced steps - tests/test_scoring.py: tier-assignment tests pass; scoring reference test xfail until Week 3 - docs/: recovery_test_report, data_sources, known_limitations skeletons - pyproject.toml (requires-python >=3.11,<3.14), .gitignore, README - data/ tree preserved via .gitkeep; raw/processed/results gitignored Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
36
src/__init__.py
Normal file
36
src/__init__.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""Reverso MVP — sickle cell drug repurposing pipeline.
|
||||
|
||||
A disease-signature + drug-profile matching pipeline using CMap-style connectivity
|
||||
scoring. See PLAN.md for the full specification.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# Package version and pipeline version tag.
__version__ = "0.1.0"
PIPELINE_VERSION = "v1"

# Reproducibility anchor (PLAN.md §8): every random draw in the pipeline
# has to be derived from this one seed.
RANDOM_SEED = 42

# Project directories. The repo root is the parent of the directory that
# contains this file; everything else hangs off it.
REPO_ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = REPO_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
RESULTS_DIR = DATA_DIR.joinpath("results")
DOCS_DIR = REPO_ROOT.joinpath("docs")

# Names re-exported as the package's public surface.
__all__ = [
    "__version__",
    "PIPELINE_VERSION",
    "RANDOM_SEED",
    "REPO_ROOT",
    "DATA_DIR",
    "RAW_DIR",
    "PROCESSED_DIR",
    "RESULTS_DIR",
    "DOCS_DIR",
]
|
||||
106
src/disease.py
Normal file
106
src/disease.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Disease signature construction.
|
||||
|
||||
Week 1 (PLAN.md §6). Builds a Tier-A sickle cell signature from GEO expression data via
|
||||
differential expression, then persists it with full provenance to
|
||||
``data/processed/sickle_cell_signature_v1.json``.
|
||||
|
||||
This module defines the persisted schema (pydantic) and the construction stubs. The actual
|
||||
data pull + differential expression is driven from ``notebooks/02_disease_signature.ipynb``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from . import PIPELINE_VERSION, PROCESSED_DIR
|
||||
from .provenance import ConfidenceTier
|
||||
|
||||
# Signature size: how many genes are kept for each direction (up / down),
# per PLAN.md §6, Week 1 task 5.
TOP_N_PER_DIRECTION: int = 250
# Default q-value cut applied when assembling a signature (see ``build_signature``).
QVALUE_CUTOFF: float = 0.05
|
||||
|
||||
|
||||
class GeneEntry(BaseModel):
    """One differentially expressed gene recorded in a disease signature."""

    # The gene symbol is the only required identifier; the Entrez and Ensembl
    # cross-references are optional and default to None.
    gene: str = Field(description="HGNC gene symbol, e.g. 'HBG2'.")
    entrez_id: str | None = None
    ensembl_id: str | None = None
    # Required statistics for the gene: log fold change and q-value.
    log_fc: float
    qvalue: float
|
||||
|
||||
|
||||
class SignatureProvenance(BaseModel):
    """Where a disease signature came from (provenance block of the PLAN.md §6 schema)."""

    # Source dataset accession and the two per-group sample counts.
    geo_accession: str
    n_disease: int
    n_healthy: int
    # Platform the expression data was generated on.
    platform: str
    method: str = Field(description="Differential expression method, e.g. 'limma', 'deseq2'.")
    # Stored as a plain string; this model does not enforce a date format.
    created_date: str
|
||||
|
||||
|
||||
class DiseaseSignature(BaseModel):
    """Persisted sickle cell disease signature, following the PLAN.md §6 schema."""

    # Identity fields: pre-filled for the v1 sickle cell signature and stamped
    # with the package-level pipeline version.
    signature_id: str = "sickle_cell_v1"
    disease_mondo_id: str = "MONDO:0011382"
    pipeline_version: str = PIPELINE_VERSION

    # The two gene lists that make up the signature.
    up_regulated: list[GeneEntry]
    down_regulated: list[GeneEntry]

    # Provenance, confidence tier and the justification/caveats that accompany it.
    provenance: SignatureProvenance
    confidence_tier: ConfidenceTier
    tier_rationale: str
    limitations: list[str]
|
||||
|
||||
|
||||
def compute_differential_expression(
    expression: pd.DataFrame,
    sample_groups: pd.Series,
    *,
    method: str,
) -> pd.DataFrame:
    """Derive per-gene log fold change and adjusted p-values (not yet implemented).

    Planned approach (PLAN.md §6, Week 1 task 4): ``pydeseq2`` for RNA-seq data; for
    microarray data, log2-transform/normalize and apply a limma-equivalent.

    Args:
        expression: Expression matrix with genes as rows and samples as columns.
        sample_groups: Group label per sample ('disease' / 'healthy'), indexed by sample.
        method: 'deseq2' (RNA-seq) or 'limma' (microarray).

    Returns:
        A gene-indexed table containing at least ``log_fc`` and ``qvalue`` columns.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: the computation is scheduled for Week 1 and driven from notebook 02.
    raise NotImplementedError("Differential expression: implement in Week 1 (notebook 02).")
|
||||
|
||||
|
||||
def build_signature(
    de_table: pd.DataFrame,
    provenance: SignatureProvenance,
    *,
    tier: ConfidenceTier,
    tier_rationale: str,
    limitations: list[str],
    top_n: int = TOP_N_PER_DIRECTION,
    qvalue_cutoff: float = QVALUE_CUTOFF,
) -> DiseaseSignature:
    """Turn a differential expression table into a ``DiseaseSignature`` (not yet implemented).

    Intended behaviour per PLAN.md §6, Week 1 task 5: keep the top ``top_n`` up-regulated
    and top ``top_n`` down-regulated genes, ranked by q-value and cut at ``qvalue_cutoff``.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: assembly is scheduled for Week 1 and driven from notebook 02.
    raise NotImplementedError("Signature assembly: implement in Week 1 (notebook 02).")
|
||||
|
||||
|
||||
def persist_signature(signature: DiseaseSignature, out_path: Path | None = None) -> Path:
    """Write a signature as indented JSON and return the path written.

    Args:
        signature: The signature to serialise via ``model_dump_json(indent=2)``.
        out_path: Destination file. When omitted, defaults to
            ``data/processed/sickle_cell_signature_v1.json``.

    Returns:
        The path the JSON document was written to.
    """
    target = out_path or (PROCESSED_DIR / "sickle_cell_signature_v1.json")
    # Create missing parent directories so a fresh checkout can persist immediately.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: without it ``write_text`` uses the locale encoding, so non-ASCII
    # text (e.g. in rationale/limitations) could fail or garble on non-UTF-8 platforms.
    target.write_text(signature.model_dump_json(indent=2), encoding="utf-8")
    return target
|
||||
85
src/drugs.py
Normal file
85
src/drugs.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Drug profile construction.
|
||||
|
||||
Week 2 (PLAN.md §6). Curates the ~300-drug set, pulls ChEMBL structure/target data and LINCS
|
||||
L1000 signatures, and assembles ``data/processed/drug_profiles_v1.parquet``.
|
||||
|
||||
The drug set is deliberately composed (PLAN.md §6, Week 2 task 1):
|
||||
- ground truth (n=2): hydroxyurea, L-glutamine
|
||||
- related-mechanism (n~50)
|
||||
- negative controls (n~50)
|
||||
- general random sample (n~200), fixed seed
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from . import PROCESSED_DIR, RANDOM_SEED
|
||||
from .provenance import ConfidenceTier, Provenance
|
||||
|
||||
# Size of the LINCS L1000 landmark gene set (PLAN.md §6, Week 2 task 3).
LINCS_LANDMARK_GENES: int = 978
|
||||
|
||||
|
||||
class InclusionReason(str, Enum):
    """Category explaining a drug's presence in the curated set (PLAN.md §6, Week 2 task 1)."""

    # Ground truth for the recovery test (hydroxyurea, L-glutamine).
    GROUND_TRUTH = "ground_truth"
    # Related-mechanism drugs.
    RELATED_MECHANISM = "related_mechanism"
    # Negative controls.
    NEGATIVE_CONTROL = "negative_control"
    # General random sample, drawn with a fixed seed.
    GENERAL_SAMPLE = "general_sample"
|
||||
|
||||
|
||||
class DrugProfile(BaseModel):
    """One row of the drug-profile table (PLAN.md §6, Week 2 task 4)."""

    # Identity: ChEMBL ID and name are required; structure strings are optional.
    chembl_id: str
    name: str
    inchikey: str | None = None
    smiles: str | None = None

    # Pharmacology: targets default to an empty list, mechanism to None.
    targets: list[str] = Field(default_factory=list)
    mechanism_of_action: str | None = None

    # LINCS landmark z-score vector (expected length 978), or None when the drug has no
    # signature available. NOTE(review): the length is not validated by this model.
    lincs_signature: list[float] | None = None

    # Curation category plus the evidence trail and confidence tier for this row.
    inclusion_reason: InclusionReason
    provenance: list[Provenance] = Field(default_factory=list)
    confidence_tier: ConfidenceTier
|
||||
|
||||
|
||||
def curate_drug_set(seed: int = RANDOM_SEED) -> pd.DataFrame:
    """Assemble the deliberately-composed ~300-drug set (not yet implemented).

    The resulting table is expected to carry at least ``chembl_id``, ``name`` and
    ``inclusion_reason`` columns; the notebook writes it to
    ``data/processed/drug_set_v1.csv``.

    Args:
        seed: Seed for the random-sample portion, for reproducibility (PLAN.md §8).

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: curation is scheduled for Week 2 and driven from notebook 03.
    raise NotImplementedError("Drug-set curation: implement in Week 2 (notebook 03).")
|
||||
|
||||
|
||||
def fetch_chembl_profile(chembl_id: str) -> dict:
    """Retrieve one drug's structure, targets and mechanism from ChEMBL (not yet implemented).

    Planned to use ``chembl_webresource_client`` (PLAN.md §6, Week 2 task 2).

    Args:
        chembl_id: ChEMBL identifier of the drug to look up.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: the ChEMBL pull is scheduled for Week 2 and driven from notebook 03.
    raise NotImplementedError("ChEMBL fetch: implement in Week 2 (notebook 03).")
|
||||
|
||||
|
||||
def fetch_lincs_signature(chembl_id: str) -> list[float] | None:
    """Retrieve a drug's LINCS L1000 Level-5 consensus (MODZ) signature (not yet implemented).

    Intended contract (PLAN.md §6, Week 2 task 3): a 978-length z-score vector, or ``None``
    when the drug has no signature (e.g. L-glutamine). Such gaps are to be documented in
    docs/known_limitations.md.

    Args:
        chembl_id: ChEMBL identifier of the drug to look up.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: the LINCS pull is scheduled for Week 2 and driven from notebook 03.
    raise NotImplementedError("LINCS fetch: implement in Week 2 (notebook 03).")
|
||||
|
||||
|
||||
def persist_drug_profiles(profiles: pd.DataFrame, out_path: Path | None = None) -> Path:
    """Save the assembled drug profiles as parquet and return the path written.

    Args:
        profiles: The drug-profile table; written without its index.
        out_path: Destination file. When omitted, defaults to
            ``data/processed/drug_profiles_v1.parquet``.

    Returns:
        The path the parquet file was written to.
    """
    destination = out_path if out_path else PROCESSED_DIR / "drug_profiles_v1.parquet"
    # Make sure the containing directory exists before handing off to pandas.
    destination.parent.mkdir(parents=True, exist_ok=True)
    profiles.to_parquet(destination, index=False)
    return destination
|
||||
71
src/identifiers.py
Normal file
71
src/identifiers.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""Canonical identifier resolution and the pinned identifiers for the MVP.
|
||||
|
||||
Week 1, task 1 (PLAN.md §6). The disease and causal gene identifiers are pinned constants
|
||||
so the whole pipeline resolves to the same canonical IDs. ``persist_identifiers`` writes them
|
||||
to ``data/processed/identifiers.json``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from . import PROCESSED_DIR
|
||||
|
||||
# --- Pinned identifiers (PLAN.md §6, Week 1 task 1) -------------------------------------
# Fixed here so that every stage of the pipeline resolves to the same canonical IDs.

# Disease: sickle cell, keyed by ontology/source name.
SICKLE_CELL_IDS: dict[str, str] = {"mondo": "MONDO:0011382", "orphanet": "Orphanet:232", "omim": "OMIM:603903"}

# Causal gene: HBB, keyed by identifier namespace.
HBB_GENE_IDS: dict[str, str] = {"symbol": "HBB", "ensembl": "ENSG00000244734", "hgnc": "HGNC:4827"}

# Recovery-test ground truth (PLAN.md §6, Week 2 task 1): drug name -> ChEMBL ID.
GROUND_TRUTH_DRUGS: dict[str, str] = {"hydroxyurea": "CHEMBL467", "l-glutamine": "CHEMBL930"}
|
||||
|
||||
|
||||
class IdentifierSet(BaseModel):
    """Bundle of pinned identifiers that is persisted at the start of the pipeline."""

    # Each field is a flat string-to-string mapping.
    disease: dict[str, str]
    causal_gene: dict[str, str]
    ground_truth_drugs: dict[str, str]
|
||||
|
||||
|
||||
def build_identifier_set() -> IdentifierSet:
    """Bundle the module's pinned MVP identifiers into an ``IdentifierSet``."""
    # Map each model field to the module-level constant that supplies it.
    pinned = {
        "disease": SICKLE_CELL_IDS,
        "causal_gene": HBB_GENE_IDS,
        "ground_truth_drugs": GROUND_TRUTH_DRUGS,
    }
    return IdentifierSet(**pinned)
|
||||
|
||||
|
||||
def persist_identifiers(out_path: Path | None = None) -> Path:
    """Write the pinned identifier set as indented JSON.

    Args:
        out_path: Destination file. When omitted, defaults to
            ``data/processed/identifiers.json``.

    Returns:
        The path written.
    """
    target = out_path or (PROCESSED_DIR / "identifiers.json")
    # Create missing parent directories so a fresh checkout can persist immediately.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Pin the encoding: without it ``write_text`` falls back to the locale encoding, which
    # makes the persisted artifact platform-dependent.
    target.write_text(build_identifier_set().model_dump_json(indent=2), encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def resolve_drug_to_chembl(name_or_alias: str) -> str:
    """Map a drug name or alias to its canonical ChEMBL ID (not yet implemented).

    Planned to use ``chembl_webresource_client``; scheduled for Week 2 (PLAN.md §6, task 2).

    Args:
        name_or_alias: Drug name or alias to resolve.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: resolution is scheduled for Week 2 and driven from notebook 03.
    raise NotImplementedError("Drug -> ChEMBL resolution: implement in Week 2 (notebook 03).")
|
||||
72
src/provenance.py
Normal file
72
src/provenance.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Provenance and confidence-tier tracking.
|
||||
|
||||
The confidence tier is the most commercially important design decision in the pipeline
|
||||
(PLAN.md §3). *Every* persisted artifact — signatures and drug profiles alike — must carry
|
||||
a tier and the provenance needed to justify it.
|
||||
|
||||
Tier A — measured data, peer-reviewed source, n>10 per group, recent
|
||||
Tier B — measured but small-n, older, or single-source
|
||||
Tier C — inferred / extrapolated / hypothesis-only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ConfidenceTier(str, Enum):
    """Confidence tier attached to a persisted artifact (rules in the module docstring)."""

    # Measured data, peer-reviewed source, n>10 per group, recent.
    A = "A"
    # Measured, but small-n, older, or single-source.
    B = "B"
    # Inferred / extrapolated / hypothesis-only.
    C = "C"
|
||||
|
||||
|
||||
class Provenance(BaseModel):
    """Origin record (source and download date) carried by every persisted artifact."""

    # Only the source name is required; every other field is optional and defaults to None.
    source: str = Field(description="Human-readable source name, e.g. 'GEO', 'ChEMBL'.")
    source_id: str | None = Field(
        default=None,
        description="Accession / identifier within the source, e.g. 'GSE53441'.",
    )
    source_url: str | None = None
    source_version: str | None = Field(
        default=None,
        description="Dataset/release version where the source is versioned.",
    )
    download_date: date | None = Field(
        default=None,
        description="Date the underlying data was downloaded (reproducibility).",
    )
    license: str | None = None
    notes: str | None = None
|
||||
|
||||
|
||||
def assign_tier(
    *,
    is_measured: bool,
    n_per_group: int | None,
    peer_reviewed: bool,
    single_source: bool,
    is_recent: bool = True,
) -> ConfidenceTier:
    """Assign a confidence tier from the evidence characteristics.

    Encodes the tier rules from PLAN.md §3 so that tier assignment is consistent and
    auditable instead of being decided ad hoc in each notebook:

    - Tier C: the value is not directly measured.
    - Tier A: measured, peer-reviewed, more than 10 samples per group, not single-source,
      and recent.
    - Tier B: measured, but failing any Tier A requirement.

    Args:
        is_measured: True if the value is directly measured (vs inferred/extrapolated).
        n_per_group: Sample size per group, if applicable (None when not meaningful).
            ``None`` never satisfies the Tier A sample-size requirement.
        peer_reviewed: Whether the source is peer-reviewed.
        single_source: Whether the evidence rests on a single source.
        is_recent: Whether the data counts as recent. Defaults to True so existing
            callers keep their previous result; pass False for older data, which the
            tier rules place in Tier B.

    Returns:
        The assigned ``ConfidenceTier``.
    """
    # Anything not directly measured is Tier C regardless of the other characteristics.
    if not is_measured:
        return ConfidenceTier.C

    # Strictly greater than 10: exactly 10 per group does not qualify for Tier A.
    has_adequate_n = n_per_group is not None and n_per_group > 10
    if peer_reviewed and has_adequate_n and not single_source and is_recent:
        return ConfidenceTier.A

    # Measured, but small-n / older / single-source falls to Tier B.
    return ConfidenceTier.B
|
||||
85
src/scoring.py
Normal file
85
src/scoring.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""CMap-style connectivity scoring — the matching engine.
|
||||
|
||||
Week 3 (PLAN.md §6). Scores each drug's LINCS signature against the disease signature using
|
||||
weighted Kolmogorov-Smirnov enrichment (Lamb 2006 / Subramanian 2017). Strongly *negative*
|
||||
connectivity = strong reversal of the disease signature = candidate match.
|
||||
|
||||
Uses ``cmapPy`` as the reference implementation. ``tests/test_scoring.py`` verifies the
|
||||
implementation against a known reference.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
from . import RESULTS_DIR
|
||||
|
||||
|
||||
class ConnectivityResult(BaseModel):
    """Outcome of scoring one drug against the disease signature."""

    chembl_id: str
    drug_name: str
    # Required but nullable: None when the drug has no LINCS signature.
    connectivity_score: float | None
    normalized_score: float | None = None
    p_value: float | None = None
    # False => no signature available, not scored (do not skip silently).
    scored: bool
    n_genes_overlap: int | None = None
|
||||
|
||||
|
||||
def connectivity_score(
    up_genes: list[str],
    down_genes: list[str],
    drug_signature: pd.Series,
) -> float:
    """Score one drug against the disease up/down gene sets with weighted KS (not yet implemented).

    Intended contract (PLAN.md §6, Week 3 task 2): only genes present in both the disease
    signature and the LINCS landmark set are scored, and callers must record the size of
    that overlap.

    Args:
        up_genes: Disease up-regulated gene identifiers.
        down_genes: Disease down-regulated gene identifiers.
        drug_signature: Drug's expression vector indexed by gene identifier.

    Returns:
        Connectivity score in roughly [-1, 1]; strongly negative = strong reversal.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: scoring is scheduled for Week 3 and driven from notebook 04.
    raise NotImplementedError("Connectivity scoring: implement in Week 3 (notebook 04).")
|
||||
|
||||
|
||||
def rank_drugs(
    signature_up: list[str],
    signature_down: list[str],
    drug_profiles: pd.DataFrame,
) -> pd.DataFrame:
    """Score every drug against the disease signature and rank them (not yet implemented).

    Intended contract (PLAN.md §6, Week 3 task 2): a drug lacking a LINCS signature is
    flagged ``scored=False`` and left out of the ranking, never dropped silently.

    The ranked table is to carry the PLAN.md §6 columns: rank, drug_name, chembl_id,
    connectivity_score, normalized_score, p_value, inclusion_reason, known_targets,
    mechanism_summary.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: ranking is scheduled for Week 3 and driven from notebook 04.
    raise NotImplementedError("Drug ranking: implement in Week 3 (notebook 04).")
|
||||
|
||||
|
||||
def mechanistic_prior(targets: list[str]) -> float:
    """Weight a drug by how sickle-cell-relevant its target pathways are (not yet implemented).

    Pathways of interest (PLAN.md §6, Week 3 task 3): HbF regulation, hemoglobin, NO
    signaling, inflammation and oxidative stress. The weight feeds the secondary,
    prior-weighted ranking.

    Args:
        targets: The drug's targets.

    Raises:
        NotImplementedError: Always; this is a scaffold stub.
    """
    # Placeholder: the prior is scheduled for Week 3 and driven from notebook 04.
    raise NotImplementedError("Mechanistic prior: implement in Week 3 (notebook 04).")
|
||||
|
||||
|
||||
def persist_ranking(ranking: pd.DataFrame, out_path: Path | None = None) -> Path:
    """Save the ranked candidate list as CSV and return the path written.

    Args:
        ranking: The ranked table; written without its index.
        out_path: Destination file. When omitted, defaults to
            ``data/results/ranked_candidates_v1.csv``.

    Returns:
        The path the CSV file was written to.
    """
    destination = out_path if out_path else RESULTS_DIR / "ranked_candidates_v1.csv"
    # Make sure the containing directory exists before handing off to pandas.
    destination.parent.mkdir(parents=True, exist_ok=True)
    ranking.to_csv(destination, index=False)
    return destination
|
||||
Reference in New Issue
Block a user