"""Week 2, task 2: enrich the drug set with ChEMBL structure/target/mechanism data.

Drugs are matched to ChEMBL by the InChIKey already carried from LINCS
pert_info (reliable), then mechanism-of-action and target names are pulled.
Compounds absent from ChEMBL (many research/tool compounds in the random arm)
keep null ChEMBL fields — they still have LINCS signatures for scoring; only
the Week-3 mechanistic prior won't apply.

Output cached to data/raw/chembl/chembl_enrichment.parquet.
"""
from __future__ import annotations

from collections.abc import Iterator, Sequence
from pathlib import Path
from typing import Any

import pandas as pd

DRUG_SET = Path("data/processed/drug_set_v1.csv")
OUT = Path("data/raw/chembl/chembl_enrichment.parquet")
BATCH = 40  # number of ids sent per ChEMBL ``__in`` query

# Columns of the molecule table; fixed up front so the table has a stable
# schema even when ChEMBL returns no hits at all.
_MOLECULE_COLUMNS = ["chembl_id", "pref_name", "smiles", "inchi_key"]
_MECHANISM_COLUMNS = ["chembl_id", "mechanism_of_action", "targets"]


def chunks(seq: Sequence[Any], n: int) -> Iterator[Sequence[Any]]:
    """Yield consecutive slices of *seq* holding at most *n* items each.

    Raises:
        ValueError: if *n* is smaller than 1 (raised on first iteration,
            because this is a generator). A negative size would otherwise
            silently yield nothing.
    """
    if n < 1:
        raise ValueError(f"chunk size must be >= 1, got {n}")
    for start in range(0, len(seq), n):
        yield seq[start:start + n]


def _fetch_molecules(molecule: Any, inchikeys: Sequence[str]) -> pd.DataFrame:
    """Resolve InChIKeys to ChEMBL molecules; one row per InChIKey."""
    mol_rows: list[dict[str, Any]] = []
    for batch_no, batch in enumerate(chunks(inchikeys, BATCH), start=1):
        res = molecule.filter(
            molecule_structures__standard_inchi_key__in=batch,
        ).only(["molecule_chembl_id", "pref_name", "molecule_structures"])
        for m in res:
            # molecule_structures may be missing/None on a record.
            ms = m.get("molecule_structures") or {}
            mol_rows.append({
                "chembl_id": m["molecule_chembl_id"],
                "pref_name": m.get("pref_name"),
                "smiles": ms.get("canonical_smiles"),
                "inchi_key": ms.get("standard_inchi_key"),
            })
        print(f" molecules batch {batch_no}: cumulative {len(mol_rows)} hits", flush=True)
    return _molecule_table(mol_rows)


def _molecule_table(mol_rows: list[dict[str, Any]]) -> pd.DataFrame:
    """Build the molecule table from raw hit rows, unique on ``inchi_key``.

    Explicit columns keep the frame usable when *mol_rows* is empty. Rows
    without an InChIKey are dropped: pandas matches null merge keys to each
    other, so such a row would be attached to every drug whose own
    ``inchi_key`` is missing when joined back to the drug set.
    """
    mols = pd.DataFrame(mol_rows, columns=_MOLECULE_COLUMNS)
    mols = mols.dropna(subset=["inchi_key"])
    return mols.drop_duplicates("inchi_key")


def _fetch_mechanisms(mechanism: Any, chembl_ids: Sequence[str]) -> pd.DataFrame:
    """Fetch mechanism-of-action records for the given ChEMBL molecule ids."""
    mech_rows = []
    for batch in chunks(chembl_ids, BATCH):
        res = mechanism.filter(molecule_chembl_id__in=batch).only(
            ["molecule_chembl_id", "mechanism_of_action", "target_chembl_id"])
        mech_rows.extend(res)
    return pd.DataFrame(mech_rows)


def _fetch_target_names(target: Any, mech: pd.DataFrame) -> dict[str, Any]:
    """Map each target id referenced in *mech* to its preferred name."""
    tgt_names: dict[str, Any] = {}
    if mech.empty:
        return tgt_names
    tids = sorted(mech["target_chembl_id"].dropna().unique())
    for batch in chunks(tids, BATCH):
        res = target.filter(target_chembl_id__in=batch).only(
            ["target_chembl_id", "pref_name"])
        for t in res:
            tgt_names[t["target_chembl_id"]] = t.get("pref_name")
    return tgt_names


def _aggregate_mechanisms(mech: pd.DataFrame, tgt_names: dict[str, Any]) -> pd.DataFrame:
    """Collapse mechanism records to one row per molecule.

    ``mechanism_of_action`` is the sorted distinct descriptions joined with
    "; " (None when there are none); ``targets`` is the sorted list of
    distinct, non-empty target names.
    """
    records = []
    for chembl_id, grp in mech.groupby("molecule_chembl_id"):
        moa = sorted(set(grp["mechanism_of_action"].dropna()))
        names = (tgt_names.get(t) for t in grp["target_chembl_id"].dropna())
        records.append({
            "chembl_id": chembl_id,
            "mechanism_of_action": "; ".join(moa) or None,
            "targets": sorted({name for name in names if name}),
        })
    return pd.DataFrame(records, columns=_MECHANISM_COLUMNS)


def main() -> None:
    """Enrich the drug set with ChEMBL data and write it to ``OUT``.

    Reads ``DRUG_SET``, resolves its InChIKeys against ChEMBL, attaches
    mechanism-of-action and target names, left-joins the result back onto
    the drug set (unmatched drugs keep null ChEMBL fields), writes the
    parquet file and prints a short match-rate summary.
    """
    # Imported here rather than at module top: importing ``new_client``
    # constructs the client object as a side effect, and deferring it keeps
    # this module importable (e.g. to reuse chunks()) without that setup.
    from chembl_webresource_client.new_client import new_client

    drugs = pd.read_csv(DRUG_SET)
    # Only string keys longer than 10 characters are sent to ChEMBL.
    inchikeys = sorted({
        k for k in drugs["inchi_key"].dropna()
        if isinstance(k, str) and len(k) > 10
    })
    print(f"{len(drugs)} drugs; {len(inchikeys)} usable InChIKeys to resolve")

    molecule = new_client.molecule
    mechanism = new_client.mechanism
    target = new_client.target

    # 1) InChIKey -> ChEMBL molecule (id, name, smiles)
    mols = _fetch_molecules(molecule, inchikeys)
    chembl_ids = sorted(mols["chembl_id"].unique())
    print(f"resolved {len(mols)} molecules -> {len(chembl_ids)} ChEMBL ids")

    # 2) ChEMBL id -> mechanism of action + target ids
    mech = _fetch_mechanisms(mechanism, chembl_ids)
    print(f"mechanism records: {len(mech)}")

    # 3) target id -> name
    tgt_names = _fetch_target_names(target, mech)

    # aggregate mechanism/targets per molecule
    if mech.empty:
        mols = mols.assign(mechanism_of_action=None, targets=None)
    else:
        per_mol = _aggregate_mechanisms(mech, tgt_names)
        mols = mols.merge(per_mol, on="chembl_id", how="left")

    # join back to the drug set on inchi_key
    enriched = drugs.merge(mols, on="inchi_key", how="left", suffixes=("", "_chembl"))
    OUT.parent.mkdir(parents=True, exist_ok=True)
    enriched.to_parquet(OUT, index=False)

    n_resolved = enriched["chembl_id"].notna().sum()
    n_moa = enriched["mechanism_of_action"].notna().sum()
    print(f"\nenriched {len(enriched)} drugs: {n_resolved} matched ChEMBL, {n_moa} have MoA")
    print("by reason, ChEMBL match rate:")
    match_rate = (
        enriched.assign(matched=enriched["chembl_id"].notna())
        .groupby("inclusion_reason")["matched"]
        .mean()
        .round(2)
    )
    print(match_rate.to_string())
    print(f"wrote {OUT}")


if __name__ == "__main__":
    main()