Week 2: 300-drug profiles with LINCS signatures + ChEMBL
Build the drug profile dataset (PLAN §6 Week 2):
- week2_curate_drugset.py: 300-drug set (2 ground-truth + 32 related-mechanism + 26 negative-control + 240 random), restricted to LINCS-scorable compounds, seed=42
- week2_chembl.py: InChIKey->ChEMBL match (145/300), MoA + targets
- week2_lincs_extract.py: cmapPy-slice both Level-5 GCTX phases to 978 landmark genes, mean-aggregate per drug to one consensus signature
- week2_assemble.py: join into drug_profiles_v1.parquet, Tier B (LINCS single-source), scored flag per PLAN §6 Week 3 task 2
- docs/data_sources.md: drug set composition + LINCS/ChEMBL provenance

Results (all gitignored data): 300/300 drugs scored, both ground-truth drugs present (hydroxyurea Phase II = CHEMBL467, L-glutamine Phase I). Key caveat recorded: only 56/477 (12%) of the disease signature genes are LINCS landmarks, so Week-3 scoring uses a 30-up/26-down query.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
91
scripts/week2_assemble.py
Normal file
91
scripts/week2_assemble.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Week 2, task 4: assemble drug_profiles_v1.parquet (PLAN §6).
|
||||
|
||||
Joins the curated drug set + ChEMBL enrichment + LINCS consensus signatures into one profile
|
||||
table. Each drug carries a confidence tier: LINCS is a single source, so signature-backed drugs
|
||||
are Tier B at best (assign_tier with single_source=True); drugs with no signature are Tier C and
|
||||
marked not-scored (not dropped silently — PLAN §6 Week 3 task 2).
|
||||
|
||||
The 978-gene signature order is the column order of lincs_signatures_v1.parquet (landmark
|
||||
symbols); each profile's `lincs_signature` is that vector (or null).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from src.drugs import persist_drug_profiles # noqa: E402
|
||||
from src.provenance import ConfidenceTier, assign_tier # noqa: E402
|
||||
|
||||
# Input/output locations. All of these are relative paths, so they resolve
# against the current working directory at run time.
# Directory holding the processed inputs; main() also writes
# lincs_gene_order.txt here.
PROCESSED = Path("data/processed")
|
||||
# Curated drug set, read by main() with pd.read_csv.
DRUG_SET = PROCESSED / "drug_set_v1.csv"
|
||||
# ChEMBL enrichment table, read by main() with pd.read_parquet and joined on
# pert_iname.
CHEMBL = Path("data/raw/chembl/chembl_enrichment.parquet")
|
||||
# LINCS consensus signatures, read by main() with pd.read_parquet; its column
# order is persisted as the gene order of every signature vector.
LINCS_SIG = PROCESSED / "lincs_signatures_v1.parquet"
|
||||
|
||||
|
||||
def _none_if_missing(value):
    """Return ``value`` unchanged, or ``None`` when pandas treats it as missing."""
    return value if pd.notna(value) else None


def _coerce_targets(raw) -> list:
    """Normalise one raw ``targets`` cell into a plain Python list.

    Handles the shapes a cell can take after the CSV/parquet round-trip and
    the left join:

    * a string holding a Python literal (parsed with ``ast.literal_eval``),
    * an object exposing ``tolist()`` (e.g. a numpy array read from parquet),
    * an actual list/tuple/set,
    * anything else (``None``, NaN from an unmatched join row, a scalar).

    Only sequence-like results are kept. A parsed or converted value that is
    a scalar, a bare string or a mapping yields ``[]`` instead of crashing in
    ``list(...)`` or being exploded into characters/keys.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError):
            # Not a parseable literal: treat as "no targets known".
            return []
    elif hasattr(raw, "tolist"):
        raw = raw.tolist()

    if isinstance(raw, (list, tuple, set, frozenset)):
        return list(raw)
    return []


def _profile_row(r, sigs: pd.DataFrame) -> dict:
    """Build one profile record from a merged drug row and the signature table.

    ``r`` is a row of the drug-set/ChEMBL join as produced by
    ``DataFrame.itertuples()``; ``sigs`` is the signature table indexed by
    ``pert_iname``.
    """
    has_sig = r.pert_iname in sigs.index
    vector = sigs.loc[r.pert_iname].tolist() if has_sig else None

    # LINCS = single source => Tier B max when measured; no signature => Tier C.
    if has_sig:
        tier = assign_tier(
            is_measured=has_sig, n_per_group=None, peer_reviewed=True, single_source=True
        )
    else:
        tier = ConfidenceTier.C

    return {
        "name": r.pert_iname,
        "chembl_id": _none_if_missing(r.chembl_id),
        "pref_name": _none_if_missing(r.pref_name),
        "inchikey": _none_if_missing(r.inchi_key),
        "smiles": _none_if_missing(r.smiles),
        "targets": _coerce_targets(r.targets),
        "mechanism_of_action": _none_if_missing(r.mechanism_of_action),
        "inclusion_reason": r.inclusion_reason,
        "lincs_phase": r.phase,
        "scored": has_sig,
        "lincs_signature": vector,
        "confidence_tier": tier.value,
    }


def main() -> None:
    """Assemble the drug profile table and print a short summary.

    Reads the curated drug set (CSV), the ChEMBL enrichment table and the
    LINCS signature table (parquet), left-joins ChEMBL onto the drug set by
    ``pert_iname`` and builds one profile record per joined row. Drugs
    without a signature are kept and flagged ``scored=False`` rather than
    dropped. The signature column order is written to
    ``lincs_gene_order.txt`` and the profiles are handed to
    ``persist_drug_profiles``.
    """
    drugs = pd.read_csv(DRUG_SET)
    chembl = pd.read_parquet(CHEMBL)[
        ["pert_iname", "chembl_id", "pref_name", "smiles", "mechanism_of_action", "targets"]
    ]
    sigs = pd.read_parquet(LINCS_SIG)  # rows=pert_iname, cols=978 landmark symbols
    gene_order = list(sigs.columns)

    # Left join keeps every curated drug, matched in ChEMBL or not.
    # NOTE(review): a pert_iname repeated in the ChEMBL table would fan out
    # rows here — confirm it is unique upstream.
    df = drugs.merge(chembl, on="pert_iname", how="left")

    profiles = pd.DataFrame([_profile_row(r, sigs) for r in df.itertuples()])

    # Persist the gene order alongside, so Week-3 scoring can align the vectors.
    # Explicit encoding keeps the file independent of the platform locale.
    (PROCESSED / "lincs_gene_order.txt").write_text("\n".join(gene_order), encoding="utf-8")
    path = persist_drug_profiles(profiles)

    print(f"drug_profiles_v1: {len(profiles)} drugs")
    print(f" scored (have LINCS signature): {profiles['scored'].sum()}")
    print(f" not scored: {(~profiles['scored']).sum()}")
    print(" by inclusion reason (scored rate):")
    print(profiles.groupby("inclusion_reason")["scored"].agg(["sum", "count"]).to_string())
    print(" tier split:", profiles["confidence_tier"].value_counts().to_dict())
    for gt in ["hydroxyurea", "glutamine"]:
        row = profiles[profiles["name"] == gt]
        print(f" ground truth '{gt}': scored={bool(row['scored'].iloc[0]) if len(row) else 'ABSENT'}")
    print(f"wrote {path}")
|
||||
|
||||
|
||||
# Script entry point: run the assembly only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user