Week 2: 300-drug profiles with LINCS signatures + ChEMBL
Build the drug profile dataset (PLAN §6 Week 2):

- week2_curate_drugset.py: 300-drug set (2 ground-truth + 32 related-mechanism + 26 negative-control + 240 random), restricted to LINCS-scorable compounds, seed=42
- week2_chembl.py: InChIKey->ChEMBL match (145/300), MoA + targets
- week2_lincs_extract.py: cmapPy-slice both Level-5 GCTX phases to 978 landmark genes, mean-aggregate per drug to one consensus signature
- week2_assemble.py: join into drug_profiles_v1.parquet, Tier B (LINCS single-source), scored flag per PLAN §6 Week 3 task 2
- docs/data_sources.md: drug set composition + LINCS/ChEMBL provenance

Results (all gitignored data): 300/300 drugs scored, both ground-truth drugs present (hydroxyurea Phase II = CHEMBL467, L-glutamine Phase I).

Key caveat recorded: only 56/477 (12%) of the disease signature genes are LINCS landmarks, so Week-3 scoring uses a 30-up/26-down query.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
131
scripts/week2_curate_drugset.py
Normal file
131
scripts/week2_curate_drugset.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""Week 2, task 1: curate the deliberately-composed ~300-drug set (PLAN §6).
|
||||
|
||||
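Composition: 2 ground-truth + curated related-mechanism and negative-control lists (PLAN
targets ~50 each; the actual counts are capped by what LINCS contains) + a random sample
that fills the set up to 300 drugs in total.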
|
||||
The universe is restricted to compounds that actually have a LINCS Level-5 signature (in
|
||||
Phase I and/or Phase II), so every curated drug is scorable. Output: drug_set_v1.csv.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import io
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from src import RANDOM_SEED # noqa: E402
|
||||
|
||||
# Directory holding the downloaded LINCS metadata tables (sig_info / pert_info).
LINCS = Path("data/raw/lincs")
# Destination of the curated drug set written by main().
OUT = Path("data/processed/drug_set_v1.csv")

GROUND_TRUTH = ["hydroxyurea", "glutamine"]  # glutamine == L-glutamine in LINCS

# Curated by mechanism (PLAN §6). Each list is intersected with the LINCS catalog, so
# names without a signature are dropped (and reported on stdout) — we keep whatever
# actually has a signature.
RELATED_MECHANISM = [
    # HbF inducers / epigenetic
    "decitabine",
    "azacitidine",
    "vorinostat",
    "panobinostat",
    "romidepsin",
    "entinostat",
    "mocetinostat",
    "belinostat",
    "pomalidomide",
    "lenalidomide",
    "thalidomide",
    "apicidin",
    "trichostatin-a",
    "scriptaid",
    "valproic-acid",
    # NO / vascular
    "sildenafil",
    "tadalafil",
    "nitroprusside",
    # antioxidants
    "n-acetyl-cysteine",
    "resveratrol",
    "curcumin",
    "quercetin",
    "sulforaphane",
    # anti-inflammatory studied in SCD
    "dexamethasone",
    "prednisolone",
    "hydrocortisone",
    "ibuprofen",
    "indomethacin",
    "sulfasalazine",
    "montelukast",
    "aspirin",
    # iron / heme / SCD-adjacent
    "hemin",
    "deferoxamine",
    "deferasirox",
    "simvastatin",
    "atorvastatin",
    "ticagrelor",
]

NEGATIVE_CONTROL = [
    # antifungals
    "fluconazole",
    "ketoconazole",
    "itraconazole",
    "clotrimazole",
    "terbinafine",
    "miconazole",
    # antihistamines
    "loratadine",
    "cetirizine",
    "fexofenadine",
    "diphenhydramine",
    "chlorpheniramine",
    "astemizole",
    # antibiotics
    "amoxicillin",
    "ciprofloxacin",
    "doxycycline",
    "trimethoprim",
    "azithromycin",
    "tetracycline",
    "nitrofurantoin",
    # hormones / contraceptives
    "levonorgestrel",
    "ethinyl-estradiol",
    "norethindrone",
    "medroxyprogesterone-acetate",
    # misc unrelated
    "omeprazole",
    "ranitidine",
    "loperamide",
    "caffeine",
    "acetaminophen",
    "lidocaine",
]

# Fill the random sample so the total set is ~300 (the denominator the pre-registered
# recovery-test thresholds assume: "top 30 of 300"). Curated mechanism/control drugs are
# capped by what LINCS actually contains, so the random arm absorbs the remainder.
TARGET_TOTAL = 300
|
||||
|
||||
|
||||
def load_catalog() -> pd.DataFrame:
    """Compounds with >=1 Level-5 signature, annotated with phase + inchi/smiles.

    Reads the ``sig_info`` and ``pert_info`` tables of both LINCS releases
    (GSE92742 -> "P1", GSE70138 -> "P2") from the ``LINCS`` directory. A compound
    is part of the universe when it occurs as a ``trt_cp`` perturbagen in at
    least one of the two ``sig_info`` tables.

    Returns:
        DataFrame indexed by ``pert_iname`` with columns ``phase`` ("P1", "P2"
        or "both"), ``pert_id``, ``inchi_key`` and ``canonical_smiles``. The
        three annotation columns are ``None`` for compounds that have no
        ``trt_cp`` row in either ``pert_info`` table. Rows are sorted by name so
        the row order (and anything sampled from it with a fixed seed) is the
        same on every run.
    """

    def read_gz(fn, **kw):
        """Read one gzipped, tab-separated LINCS metadata table."""
        return pd.read_csv(io.BytesIO(gzip.decompress(Path(fn).read_bytes())), sep="\t", **kw)

    def compound_names(sig):
        """Names of the trt_cp perturbagens listed in a sig_info table."""
        # dropna: a missing name cannot be looked up and would break sorting.
        return set(sig.loc[sig["pert_type"] == "trt_cp", "pert_iname"].dropna())

    cp1 = compound_names(read_gz(LINCS / "GSE92742_sig_info.txt.gz", low_memory=False))
    cp2 = compound_names(read_gz(LINCS / "GSE70138_sig_info.txt.gz", low_memory=False))

    pert1 = read_gz(LINCS / "GSE92742_pert_info.txt.gz", low_memory=False)
    pert2 = read_gz(LINCS / "GSE70138_pert_info.txt.gz", low_memory=False)
    info = pd.concat([pert1, pert2], ignore_index=True)
    # One annotation row per compound name; the GSE92742 entry wins on conflicts.
    info = info[info["pert_type"] == "trt_cp"].drop_duplicates("pert_iname", keep="first")
    info = info.set_index("pert_iname")

    annotation_cols = ("pert_id", "inchi_key", "canonical_smiles")
    rows = []
    # Iterate in sorted order: plain set iteration order of strings changes between
    # interpreter runs (hash randomisation), which would make the catalog row order —
    # and therefore the seeded random sample drawn from it — irreproducible.
    for nm in sorted(cp1 | cp2):
        if nm in cp1 and nm in cp2:
            phase = "both"
        elif nm in cp1:
            phase = "P1"
        else:
            phase = "P2"
        rec = info.loc[nm] if nm in info.index else None
        row = {"pert_iname": nm, "phase": phase}
        for col in annotation_cols:
            row[col] = rec[col] if rec is not None else None
        rows.append(row)

    # Explicit columns keep set_index working even when no compound was found.
    columns = ["pert_iname", "phase", *annotation_cols]
    return pd.DataFrame(rows, columns=columns).set_index("pert_iname")
|
||||
|
||||
|
||||
def pick(catalog: pd.DataFrame, names: list[str], reason: str) -> pd.DataFrame:
    """Select the catalog rows for ``names`` and tag them with ``reason``.

    Args:
        catalog: Compound table indexed by compound name.
        names: Requested compound names. Repeated names are honoured once, so a
            duplicated entry in a curated list cannot produce duplicate rows.
        reason: Value stored in the ``inclusion_reason`` column of every
            returned row.

    Returns:
        A copy of the matching rows, in first-occurrence order of ``names``,
        with an added ``inclusion_reason`` column. Names absent from the catalog
        index are left out and reported on stdout.
    """
    # dict.fromkeys de-duplicates while keeping the caller's ordering.
    wanted = list(dict.fromkeys(names))
    present = [n for n in wanted if n in catalog.index]
    missing = [n for n in wanted if n not in catalog.index]
    if missing:
        print(f" [{reason}] {len(present)}/{len(wanted)} in LINCS; dropped: {missing}")
    # .copy() so adding the tag column never writes through to the shared catalog.
    out = catalog.loc[present].copy()
    out["inclusion_reason"] = reason
    return out
|
||||
|
||||
|
||||
def main() -> None:
    """Assemble the curated drug set and write it to ``OUT`` as CSV.

    The three curated arms (ground truth, related mechanism, negative control)
    are looked up in the LINCS catalog; a seeded random sample of the remaining
    compounds then fills the set up to ``TARGET_TOTAL`` rows. A per-arm and
    per-phase summary is printed to stdout.

    Raises:
        ValueError: If the curated arms already exceed ``TARGET_TOTAL`` or the
            catalog has too few remaining compounds to fill the random arm.
    """
    catalog = load_catalog()
    print(f"LINCS scorable compound universe: {len(catalog)}")

    gt = pick(catalog, GROUND_TRUTH, "ground_truth")
    rel = pick(catalog, RELATED_MECHANISM, "related_mechanism")
    neg = pick(catalog, NEGATIVE_CONTROL, "negative_control")

    chosen = pd.concat([gt, rel, neg])
    # A drug listed in more than one curated arm keeps its first (highest
    # priority) label instead of being written to the set twice.
    chosen = chosen[~chosen.index.duplicated(keep="first")]

    # DataFrame.sample is positional: sort first so the seed alone fixes which
    # compounds are drawn, whatever order the catalog rows arrived in.
    remaining = catalog.drop(index=chosen.index).sort_index()
    n_random = TARGET_TOTAL - len(chosen)
    if not 0 <= n_random <= len(remaining):
        raise ValueError(
            f"cannot build a {TARGET_TOTAL}-drug set: {len(chosen)} curated drugs, "
            f"{len(remaining)} remaining LINCS compounds to sample from"
        )
    rand = remaining.sample(n=n_random, random_state=RANDOM_SEED).copy()
    rand["inclusion_reason"] = "general_sample"

    drug_set = pd.concat([chosen, rand]).reset_index()
    OUT.parent.mkdir(parents=True, exist_ok=True)
    drug_set.to_csv(OUT, index=False)

    print(f"\ndrug_set_v1.csv: {len(drug_set)} drugs")
    print(drug_set["inclusion_reason"].value_counts().to_string())
    print(f"phase split:\n{drug_set['phase'].value_counts().to_string()}")
    print(f"wrote {OUT}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user