Week 2: 300-drug profiles with LINCS signatures + ChEMBL
Build the drug profile dataset (PLAN §6 Week 2):

- week2_curate_drugset.py: 300-drug set (2 ground-truth + 32 related-mechanism
  + 26 negative-control + 240 random), restricted to LINCS-scorable compounds,
  seed=42
- week2_chembl.py: InChIKey->ChEMBL match (145/300), MoA + targets
- week2_lincs_extract.py: cmapPy-slice both Level-5 GCTX phases to 978 landmark
  genes, mean-aggregate per drug to one consensus signature
- week2_assemble.py: join into drug_profiles_v1.parquet, Tier B (LINCS
  single-source), scored flag per PLAN §6 Week 3 task 2
- docs/data_sources.md: drug set composition + LINCS/ChEMBL provenance

Results (all gitignored data): 300/300 drugs scored, both ground-truth drugs
present (hydroxyurea Phase II = CHEMBL467, L-glutamine Phase I). Key caveat
recorded: only 56/477 (12%) of the disease signature genes are LINCS landmarks,
so Week-3 scoring uses a 30-up/26-down query.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
99
scripts/week2_chembl.py
Normal file
99
scripts/week2_chembl.py
Normal file
@@ -0,0 +1,99 @@
"""Week 2, task 2: enrich the drug set with ChEMBL structure/target/mechanism data.

Drugs are matched to ChEMBL by the InChIKey already carried from LINCS pert_info (reliable),
then mechanism-of-action and target names are pulled. Compounds absent from ChEMBL (many
research/tool compounds in the random arm) keep null ChEMBL fields — they still have LINCS
signatures for scoring; only the Week-3 mechanistic prior won't apply. Output cached to
data/raw/chembl/chembl_enrichment.parquet.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from chembl_webresource_client.new_client import new_client
|
||||
|
||||
DRUG_SET = Path("data/processed/drug_set_v1.csv")
|
||||
OUT = Path("data/raw/chembl/chembl_enrichment.parquet")
|
||||
BATCH = 40
|
||||
|
||||
|
||||
def chunks(seq, n: int):
    """Split *seq* into consecutive slices of at most *n* items.

    Args:
        seq: Any sliceable object that supports ``len()`` (list, tuple, str, ...).
        n: Maximum slice length; must be at least 1.

    Returns:
        A generator yielding ``seq[0:n]``, ``seq[n:2n]``, ... in order. The last
        slice may be shorter than *n*; an empty *seq* yields nothing.

    Raises:
        ValueError: If *n* is less than 1. A negative step would otherwise make
            ``range()`` empty and silently drop every item.
    """
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    return (seq[start:start + n] for start in range(0, len(seq), n))
|
||||
|
||||
|
||||
def _resolve_molecules(molecule, inchikeys):
    """Map InChIKeys to ChEMBL molecules, one row per InChIKey.

    Args:
        molecule: ChEMBL ``molecule`` endpoint (``new_client.molecule``).
        inchikeys: InChIKey strings to look up; queried ``BATCH`` at a time.

    Returns:
        DataFrame with columns ``chembl_id``, ``pref_name``, ``smiles`` and
        ``inchi_key``. When nothing matches it is empty but still carries
        those columns, so downstream column access and merges keep working.
    """
    columns = ["chembl_id", "pref_name", "smiles", "inchi_key"]
    mol_rows = []
    for i, batch in enumerate(chunks(inchikeys, BATCH)):
        res = molecule.filter(molecule_structures__standard_inchi_key__in=batch).only(
            ["molecule_chembl_id", "pref_name", "molecule_structures"])
        for m in res:
            # The structures record may be missing/None; fall back to an empty dict.
            ms = m.get("molecule_structures") or {}
            mol_rows.append({
                "chembl_id": m["molecule_chembl_id"],
                "pref_name": m.get("pref_name"),
                "smiles": ms.get("canonical_smiles"),
                "inchi_key": ms.get("standard_inchi_key"),
            })
        print(f" molecules batch {i+1}: cumulative {len(mol_rows)} hits", flush=True)

    # Explicit columns: a column-less frame would make drop_duplicates raise KeyError
    # when no molecule was found.
    mols = pd.DataFrame(mol_rows, columns=columns)
    # pandas merges null keys against each other, so a row without an InChIKey would
    # attach to every drug that lacks one; such rows cannot be joined meaningfully.
    mols = mols.dropna(subset=["inchi_key"])
    return mols.drop_duplicates("inchi_key")


def _fetch_mechanisms(mechanism, chembl_ids):
    """Fetch mechanism records for *chembl_ids* from the ChEMBL ``mechanism`` endpoint.

    Returns:
        DataFrame built from the raw records (requested fields:
        ``molecule_chembl_id``, ``mechanism_of_action``, ``target_chembl_id``);
        empty and column-less when no record is returned.
    """
    mech_rows = []
    for batch in chunks(chembl_ids, BATCH):
        for record in mechanism.filter(molecule_chembl_id__in=batch).only(
                ["molecule_chembl_id", "mechanism_of_action", "target_chembl_id"]):
            mech_rows.append(record)
    return pd.DataFrame(mech_rows)


def _fetch_target_names(target, target_ids):
    """Return a ``{target_chembl_id: pref_name}`` dict for *target_ids*."""
    tgt_names = {}
    for batch in chunks(target_ids, BATCH):
        for t in target.filter(target_chembl_id__in=batch).only(["target_chembl_id", "pref_name"]):
            tgt_names[t["target_chembl_id"]] = t.get("pref_name")
    return tgt_names


def _summarise_mechanisms(mech, tgt_names):
    """Collapse mechanism records to one row per molecule.

    Args:
        mech: Non-empty mechanism DataFrame as returned by ``_fetch_mechanisms``.
        tgt_names: Mapping of target ChEMBL id to target name.

    Returns:
        DataFrame with columns ``chembl_id``, ``mechanism_of_action`` (distinct
        values sorted and joined with ``"; "``, or None when there are none) and
        ``targets`` (sorted list of distinct, non-empty target names).
    """
    def agg(df):
        moa = sorted(set(df["mechanism_of_action"].dropna()))
        tns = sorted({name for t in df["target_chembl_id"].dropna() if (name := tgt_names.get(t))})
        return pd.Series({"mechanism_of_action": "; ".join(moa) or None, "targets": tns})

    per_mol = mech.groupby("molecule_chembl_id").apply(agg, include_groups=False).reset_index()
    return per_mol.rename(columns={"molecule_chembl_id": "chembl_id"})


def main() -> None:
    """Enrich the drug set with ChEMBL ids, structures, mechanisms and targets.

    Reads ``DRUG_SET``, resolves each usable InChIKey to a ChEMBL molecule,
    pulls mechanism-of-action and target names for the resolved molecules,
    left-joins the result back onto the drug set by ``inchi_key`` (so unmatched
    drugs keep null ChEMBL fields), writes the table to ``OUT`` as parquet and
    prints match-rate statistics.
    """
    drugs = pd.read_csv(DRUG_SET)
    # Keep only string keys longer than 10 characters (presumably to skip blank or
    # placeholder values; a full InChIKey is 27 characters).
    inchikeys = sorted({k for k in drugs["inchi_key"].dropna() if isinstance(k, str) and len(k) > 10})
    print(f"{len(drugs)} drugs; {len(inchikeys)} usable InChIKeys to resolve")

    # 1) InChIKey -> ChEMBL molecule (id, name, smiles)
    mols = _resolve_molecules(new_client.molecule, inchikeys)
    chembl_ids = sorted(mols["chembl_id"].unique())
    print(f"resolved {len(mols)} molecules -> {len(chembl_ids)} ChEMBL ids")

    # 2) ChEMBL id -> mechanism of action + target ids
    mech = _fetch_mechanisms(new_client.mechanism, chembl_ids)
    print(f"mechanism records: {len(mech)}")

    if mech.empty:
        # No mechanism data at all: still emit the columns so the output schema is stable.
        mols["mechanism_of_action"] = None
        mols["targets"] = None
    else:
        # 3) target id -> name, then aggregate mechanism/targets per molecule
        tids = sorted(mech["target_chembl_id"].dropna().unique())
        tgt_names = _fetch_target_names(new_client.target, tids)
        mols = mols.merge(_summarise_mechanisms(mech, tgt_names), on="chembl_id", how="left")

    # join back to the drug set on inchi_key
    enriched = drugs.merge(mols, on="inchi_key", how="left", suffixes=("", "_chembl"))
    OUT.parent.mkdir(parents=True, exist_ok=True)
    enriched.to_parquet(OUT, index=False)

    n_resolved = enriched["chembl_id"].notna().sum()
    n_moa = enriched["mechanism_of_action"].notna().sum()
    print(f"\nenriched {len(enriched)} drugs: {n_resolved} matched ChEMBL, {n_moa} have MoA")
    print("by reason, ChEMBL match rate:")
    matched = enriched.assign(matched=enriched["chembl_id"].notna())
    print(matched.groupby("inclusion_reason")["matched"].mean().round(2).to_string())
    print(f"wrote {OUT}")
|
||||
|
||||
|
||||
# Script entry point: run the enrichment only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user