From 817bcda7dc7bb261ad8d603a15880174e612ff95 Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Tue, 23 Jun 2026 23:53:27 +0200 Subject: [PATCH] Structure-binding track: scaffold + ligand-retrieval baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start the structure-based binding branch (PLAN §12), baseline-first. - src/binding.py: validated RDKit ligand retrieval (morgan_fp, tanimoto, retrieve_nearest = the §12.9 engine) + dock() stub documenting the blocked ARM-Mac toolchain - scripts/binding_ligand_baseline.py: 300 drugs vs known binders - docs/structure_binding_notes.md: status, toolchain blocker, next steps - pyproject: [structure] extra (rdkit); data/raw/structures/ for PDBs Step-0 finding: retrieval engine VALIDATED on in-set classes (decitabine->azacitidine 0.62; vorinostat->scriptaid/belinostat) but the distinctive binders voxelotor/mitapivat have no analog in our 300-drug set (Tanimoto ~0.2). Needs (a) bigger library, (b) real docking (§12.3), which is blocked on the ARM-Mac docking toolchain (§12.6 pitfall 4). Structures 5E83 (Hb+voxelotor) and 8XFD (PKR+mitapivat) fetched. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- data/raw/structures/.gitkeep | 0 docs/structure_binding_notes.md | 34 ++++++++++++ pyproject.toml | 6 ++ scripts/binding_ligand_baseline.py | 66 ++++++++++++++++++++++ src/binding.py | 89 ++++++++++++++++++++++++++++++ 5 files changed, 195 insertions(+) create mode 100644 data/raw/structures/.gitkeep create mode 100644 docs/structure_binding_notes.md create mode 100644 scripts/binding_ligand_baseline.py create mode 100644 src/binding.py diff --git a/data/raw/structures/.gitkeep b/data/raw/structures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/structure_binding_notes.md b/docs/structure_binding_notes.md new file mode 100644 index 0000000..c78d9c3 --- /dev/null +++ b/docs/structure_binding_notes.md @@ -0,0 +1,34 @@ +# Structure-based binding track — working notes + +Branch `structure-based-binding`. Implements PLAN §12. Baseline-first, start with the two cleanest +targets (Hemoglobin + PKR), de-risk the harness before scaling. + +## Status (2026-06-23) + +**Toolchain check (PLAN §12.6 pitfall 4, confirmed real):** +- ✅ RDKit installs on ARM Mac — ligand side ready. +- ❌ AutoDock Vina does NOT pip-install on ARM Mac; no docking binary available. Docking (§12.3) + is **blocked on toolchain** — must resolve via conda/micromamba (`vina`/`smina`), a GPU AF3-class + model (Boltz-2/Chai-1/DiffDock), or an x86 Vina binary under Rosetta. + +**Structures obtained:** `5E83` (hemoglobin + voxelotor), `8XFD` (PKR + mitapivat) in +`data/raw/structures/`. + +**Step 0 — ligand-based retrieval baseline (`scripts/binding_ligand_baseline.py`):** +RDKit Tanimoto of our 300 drugs vs known sickle binders. +- Engine VALIDATED on in-set classes: `decitabine`→azacitidine (0.62); `vorinostat`→scriptaid + (0.42), belinostat (0.28). Correctly clusters DNMT1 / HDAC HbF-inducers. +- But voxelotor / mitapivat have **no analog** in our set (max Tanimoto ~0.20–0.26). 
A 300-drug + library is too sparse to contain look-alikes of distinctive scaffolds. + +**Takeaways:** +1. Ligand retrieval works but needs a **bigger drug library** to be useful for distinctive targets. +2. The targets without in-set analogs (Hb, PKR) need **actual docking** (§12.3) — which scores + binding directly, no look-alike required. That is the gating next step, and it needs the + toolchain solved. + +## Next steps +- [ ] Resolve the docking toolchain (recommend: micromamba + smina/vina, CPU, no GPU needed for baseline). +- [ ] Dock the known binders (voxelotor→5E83, mitapivat→8XFD) as positive controls (§12.4 recovery test). +- [ ] Expand the ligand library (full ChEMBL/LINCS) for retrieval to have reach. +- [ ] Only then: AF3-class co-folding (Boltz-2/DiffDock) vs the docking baseline; and §12.9 generative beacon. diff --git a/pyproject.toml b/pyproject.toml index 1216248..7487a83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,12 @@ dev = [ "pytest>=8.0", "ruff>=0.5", ] +# Structure-based binding track (PLAN §12). Docking tool (vina/smina) is NOT pip-installable on +# ARM Mac — install via conda/micromamba or use a GPU AF3-class model; see docs/structure_binding_notes.md. +structure = [ + "rdkit>=2024.3", + "requests>=2.31", +] [tool.setuptools.packages.find] where = ["."] diff --git a/scripts/binding_ligand_baseline.py b/scripts/binding_ligand_baseline.py new file mode 100644 index 0000000..c714a35 --- /dev/null +++ b/scripts/binding_ligand_baseline.py @@ -0,0 +1,66 @@ +"""Structure-based track, step 0: ligand-based retrieval baseline (PLAN §12.9 engine). + +Docking (§12.3) needs a toolchain that doesn't pip-install on ARM Mac (AutoDock Vina) — that's the +next dependency to solve. Meanwhile this runs now with pure RDKit: do any of our 300 drugs sit near +the KNOWN sickle binders (voxelotor, mitapivat, decitabine) in chemical space? 
This is the +retrieval engine §12.9 would point a generative beacon at, and a sanity check on the ligand data. + +NOT docking and NOT a binding claim — chemical similarity only. Similarity != activity (§12.9). +""" + +from __future__ import annotations + +import pandas as pd +import requests +from rdkit import Chem, DataStructs, RDLogger +from rdkit.Chem import rdFingerprintGenerator + +RDLogger.DisableLog("rdApp.*") +MORGAN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) + +# Known sickle binders = positive-control beacons (target in parens). +BINDERS = ["voxelotor", "mitapivat", "decitabine", "vorinostat"] + + +def pubchem_smiles(name: str) -> str | None: + for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"): + try: + u = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/{prop}/JSON" + d = requests.get(u, timeout=30).json()["PropertyTable"]["Properties"][0] + if prop in d: + return d[prop] + except Exception: + continue + return None + + +def fp(smi: str): + if not isinstance(smi, str) or smi in ("-666", ""): + return None + m = Chem.MolFromSmiles(smi) + return MORGAN.GetFingerprint(m) if m else None + + +def main() -> None: + binder_smi = {b: pubchem_smiles(b) for b in BINDERS} + print("known-binder SMILES:", {k: (v[:34] + "..." 
if v else "MISSING") for k, v in binder_smi.items()}) + + drugs = pd.read_csv("data/processed/drug_set_v1.csv")[["pert_iname", "canonical_smiles", "inclusion_reason"]] + reason = dict(zip(drugs.pert_iname, drugs.inclusion_reason)) + drug_fp = {r.pert_iname: fp(r.canonical_smiles) for r in drugs.itertuples()} + drug_fp = {k: v for k, v in drug_fp.items() if v is not None} + print(f"fingerprinted {len(drug_fp)}/{len(drugs)} drugs\n") + + for b, smi in binder_smi.items(): + bfp = fp(smi) + if bfp is None: + print(f"{b}: no SMILES\n"); continue + sims = sorted(((DataStructs.TanimotoSimilarity(bfp, v), k) for k, v in drug_fp.items()), reverse=True) + print(f"nearest drugs to {b}:") + for s, k in sims[:5]: + print(f" {s:.3f} {k:22s} [{reason.get(k,'?')}]") + print() + + +if __name__ == "__main__": + main() diff --git a/src/binding.py b/src/binding.py new file mode 100644 index 0000000..c761dcf --- /dev/null +++ b/src/binding.py @@ -0,0 +1,89 @@ +"""Structure-based binding track (PLAN §12). + +Two capabilities: +- ligand-based retrieval (RDKit, works now): find existing drugs near a query molecule in + chemical space — validated, and the engine behind §12.9 generative-guided retrieval. +- structure-based docking (§12.3): score whether a ligand binds a target pocket. Blocked on an + ARM-Mac docking toolchain (AutoDock Vina does not pip-install); see ``dock`` for options. + +Caveat carried throughout: chemical similarity != activity, and docking != efficacy (§12.6). +""" + +from __future__ import annotations + +from pathlib import Path + +from rdkit import Chem, DataStructs, RDLogger +from rdkit.Chem import rdFingerprintGenerator + +RDLogger.DisableLog("rdApp.*") +_MORGAN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) + +STRUCT_DIR = Path("data/raw/structures") + +# Known sickle small-molecule binders, by target (positive controls for the §12.4 recovery test). 
# Keys are target names, values are drug *names* (not SMILES).
KNOWN_BINDERS = {
    "hemoglobin": "voxelotor",
    "PKR": "mitapivat",
    "DNMT1": "decitabine",
    "HDAC": "vorinostat",
}

# Curated target structures (PLAN §12.2). Add PDB ids as the harness grows.
TARGET_PDB = {
    "hemoglobin": "5E83",  # hemoglobin + voxelotor (GBT440)
    "PKR": "8XFD",  # pyruvate kinase R + mitapivat
}

# Placeholder strings that mean "no structure"; compared after stripping whitespace.
_MISSING_SMILES = frozenset({"", "-666"})


def morgan_fp(smiles: str):
    """Return the Morgan fingerprint of a SMILES string, or None if unusable.

    None is returned for non-string input, for the missing-value placeholders
    ('' and '-666', with or without surrounding whitespace), and for SMILES
    that RDKit cannot parse. Fingerprints come from the module-level
    ``_MORGAN`` generator.
    """
    if not isinstance(smiles, str):
        return None
    # Strip first so padded placeholders from CSV cells (" -666", "  ") are
    # still recognised as missing instead of being handed to the parser.
    cleaned = smiles.strip()
    if cleaned in _MISSING_SMILES:
        return None
    mol = Chem.MolFromSmiles(cleaned)
    return _MORGAN.GetFingerprint(mol) if mol else None


def tanimoto(smiles_a: str, smiles_b: str) -> float | None:
    """Return the Tanimoto similarity of two SMILES strings.

    Returns None when either molecule cannot be fingerprinted (see
    ``morgan_fp``). Chemical similarity only; it is not a binding claim.
    """
    fa, fb = morgan_fp(smiles_a), morgan_fp(smiles_b)
    if fa is None or fb is None:
        return None
    return DataStructs.TanimotoSimilarity(fa, fb)


def retrieve_nearest(
    query_smiles: str,
    library: dict[str, str],
    top_n: int = 5,
) -> list[tuple[float, str]]:
    """Rank a library of {name: smiles} by Tanimoto similarity to a query molecule.

    This is the §12.9 retrieval step: the query may be a known binder (positive-control beacon)
    or a generated idealised binder; the returned existing drugs are repurposing candidates that
    STILL require docking/validation (similarity != activity).

    Args:
        query_smiles: SMILES of the query molecule.
        library: mapping of drug name to SMILES; entries that cannot be
            fingerprinted are skipped silently.
        top_n: maximum number of hits to return; zero or negative yields ``[]``.

    Returns:
        Up to ``top_n`` ``(similarity, name)`` tuples, most similar first.

    Raises:
        ValueError: if ``query_smiles`` cannot be fingerprinted.
    """
    qfp = morgan_fp(query_smiles)
    if qfp is None:
        raise ValueError("invalid query SMILES")
    # A negative top_n would otherwise slice from the end ("all but the last k").
    if top_n <= 0:
        return []
    sims = []
    for name, smi in library.items():
        fp = morgan_fp(smi)
        if fp is not None:
            sims.append((DataStructs.TanimotoSimilarity(qfp, fp), name))
    return sorted(sims, reverse=True)[:top_n]


def dock(target: str, ligand_smiles: str) -> float:
    """Dock a ligand into a target pocket and return a binding score (PLAN §12.3).

    Blocked: AutoDock Vina does not pip-install on ARM Mac and no docking binary is on PATH.
    Resolve the toolchain first (one of):
      - conda/micromamba: ``vina`` (conda-forge) or ``smina`` (bioconda), osx-arm64 builds
      - an AF3-class co-folding model on GPU: Boltz-2 / Chai-1 / DiffDock (also predicts affinity)
      - x86 Vina binary under Rosetta 2
    Then: fetch TARGET_PDB[target], define the pocket box, prep the ligand (Meeko), score.

    Raises:
        NotImplementedError: always, until a docking backend is wired in.
    """
    raise NotImplementedError(
        "Docking toolchain unresolved on ARM Mac (PLAN §12.6 pitfall 4 / §12.8). "
        "See docstring for options."
    )