From 75f53839615fe47d2f54c5afd3d844bfc3ad6866 Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Wed, 24 Jun 2026 00:03:00 +0200 Subject: [PATCH] Docking baseline: toolchain solved, raw affinity is size-biased MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit §12.3-12.4 first binding result on ARM Mac. - Toolchain SOLVED: AutoDock Vina 1.2.5 mac binary (Rosetta) + open-babel (brew). No conda, no MLX. dock_positive_controls.py runs end-to-end. - Cross-dock known binders + negatives into Hb (5E83) and PKR (8XFD), box centered on co-crystal ligands (5L7=voxelotor, WV2=mitapivat). Finding: raw Vina affinity ranks almost perfectly by MOLECULAR SIZE (mitapivat > voxelotor > decitabine/caffeine > hydroxyurea) in both pockets — mitapivat wins even on hemoglobin it doesn't target. Raw score can't distinguish target-specific binding: the docking analog of the connectivity specificity problem. Next: redocking-RMSD validation + ligand-efficiency normalization. Note: machine is 24GB (not 96GB per PLAN §2), capping local AF3-class inference. tools/ gitignored (vina binary). Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 3 + docs/structure_binding_notes.md | 39 +++++++++-- scripts/dock_positive_controls.py | 111 ++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 scripts/dock_positive_controls.py diff --git a/.gitignore b/.gitignore index 1c2879d..e930ef6 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,6 @@ env/ .DS_Store .idea/ .vscode/ + +# Local tool binaries (refetch per docs/structure_binding_notes.md) +tools/ diff --git a/docs/structure_binding_notes.md b/docs/structure_binding_notes.md index c78d9c3..4352cc8 100644 --- a/docs/structure_binding_notes.md +++ b/docs/structure_binding_notes.md @@ -27,8 +27,39 @@ RDKit Tanimoto of our 300 drugs vs known sickle binders. binding directly, no look-alike required. That is the gating next step, and it needs the toolchain solved. +## Step 1 — docking baseline (2026-06-24) + +**Toolchain SOLVED on ARM Mac:** AutoDock Vina 1.2.5 mac binary (`tools/vina`, runs under Rosetta) ++ open-babel (brew) for prep. Docking runs end-to-end (`scripts/dock_positive_controls.py`). +Co-crystal ligands identified: 5L7 = voxelotor (5E83), WV2 = mitapivat (8XFD). + +**Positive-control cross-docking — inconclusive, and instructively so.** Affinities (kcal/mol): + +| ligand | hemoglobin | PKR | +|---|---|---| +| voxelotor | −8.1 | −9.3 | +| mitapivat | −10.0 | −11.2 | +| decitabine | −6.6 | −7.0 | +| hydroxyurea | −3.9 | −3.6 | +| caffeine | −6.1 | −6.4 | + +The scores rank almost perfectly by **molecular size** (mitapivat > voxelotor > decitabine/caffeine +> hydroxyurea) in *both* pockets — mitapivat wins even on hemoglobin, which it doesn't target. So +raw Vina affinity is confounded by ligand size and per-pocket stickiness; it cannot yet +distinguish target-specific binding. This is the **docking analog of the connectivity specificity +problem** — raw scores carry a systematic bias (size here, broadness there) that masquerades as +signal. voxelotor *does* dock to Hb (−8.1, a real score); the cross-target test just isn't the +right validation. + ## Next steps -- [ ] Resolve the docking toolchain (recommend: micromamba + smina/vina, CPU, no GPU needed for baseline). -- [ ] Dock the known binders (voxelotor→5E83, mitapivat→8XFD) as positive controls (§12.4 recovery test). -- [ ] Expand the ligand library (full ChEMBL/LINCS) for retrieval to have reach. -- [ ] Only then: AF3-class co-folding (Boltz-2/DiffDock) vs the docking baseline; and §12.9 generative beacon. +- [ ] **Redocking-RMSD validation** (the gold-standard positive control): redock the crystal ligand + 5L7/WV2 into its own structure, compute pose RMSD vs crystal. <2 Å = geometry validated. This + tests pose accuracy, which size bias doesn't corrupt. +- [ ] **Ligand-efficiency normalization** (affinity / heavy-atom count) to de-bias the size effect, + the docking counterpart of the connectivity calibration work. +- [ ] Expand the ligand library (full ChEMBL/LINCS) for retrieval reach. +- [ ] Only then: AF3-class co-folding (Boltz-2/DiffDock via PyTorch-MPS — note 24 GB ceiling) vs the + docking baseline; and §12.9 generative beacon. + +> **Hardware note:** this machine is **24 GB** unified memory (not the 96 GB PLAN §2 assumed), +> which caps local AF3-class model inference. Classical docking (above) is unaffected. diff --git a/scripts/dock_positive_controls.py b/scripts/dock_positive_controls.py new file mode 100644 index 0000000..21613ea --- /dev/null +++ b/scripts/dock_positive_controls.py @@ -0,0 +1,111 @@ +"""Structure-based track §12.4: dock known binders + negatives into Hb and PKR. + +Positive-control recovery test: voxelotor should dock best into hemoglobin (5E83), mitapivat best +into PKR (8XFD), and unrelated drugs (caffeine, hydroxyurea) should score worse. The box is +centered on each co-crystal ligand (5L7=voxelotor, WV2=mitapivat). AutoDock Vina (mac binary, +Rosetta) + open-babel prep. Affinity in kcal/mol; more negative = stronger predicted binding. + +This is a docking baseline, not an efficacy claim (PLAN §12.6). +""" + +from __future__ import annotations + +import re +import subprocess +import tempfile +from pathlib import Path + +import numpy as np +import requests + +VINA = "./tools/vina" +STRUCT = Path("data/raw/structures") +WORK = Path("data/processed/docking") +WORK.mkdir(parents=True, exist_ok=True) + +TARGETS = {"hemoglobin": ("5E83", "5L7"), "PKR": ("8XFD", "WV2")} +LIGANDS = ["voxelotor", "mitapivat", "decitabine", "hydroxyurea", "caffeine"] +EXPECTED = {"voxelotor": "hemoglobin", "mitapivat": "PKR"} + + +def pubchem_smiles(name: str) -> str | None: + for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"): + try: + d = requests.get(f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/{prop}/JSON", + timeout=30).json()["PropertyTable"]["Properties"][0] + if prop in d: + return d[prop] + except Exception: + continue + return None + + +def prep_receptor_and_box(pdb: str, lig_res: str): + """Receptor pdbqt (protein only) + box center/size from one copy of the co-crystal ligand.""" + atoms, lig, chosen = [], [], None + for ln in (STRUCT / f"{pdb}.pdb").read_text().splitlines(): + if ln.startswith("ATOM"): + atoms.append(ln) + elif ln.startswith("HETATM") and ln[17:20].strip() == lig_res: + key = (ln[21], ln[22:26]) + chosen = chosen or key + if key == chosen: + lig.append(ln) + rec_pdb = WORK / f"{pdb}_rec.pdb" + rec_pdb.write_text("\n".join(atoms) + "\nEND\n") + rec_pdbqt = WORK / f"{pdb}_rec.pdbqt" + subprocess.run(["obabel", str(rec_pdb), "-O", str(rec_pdbqt), "-xr", "-p", "7.4"], + capture_output=True, check=True) + xyz = np.array([[float(l[30:38]), float(l[38:46]), float(l[46:54])] for l in lig]) + center = xyz.mean(0) + size = np.clip((xyz.max(0) - xyz.min(0)) + 9.0, 18.0, 28.0) + return rec_pdbqt, center, size + + +def prep_ligand(name: str, smiles: str) -> Path | None: + out = WORK / f"lig_{name}.pdbqt" + r = subprocess.run(["obabel", f"-:{smiles}", "-O", str(out), "--gen3d", "-p", "7.4"], + capture_output=True, text=True) + return out if out.exists() and out.stat().st_size > 0 else None + + +def dock(rec_pdbqt: Path, lig_pdbqt: Path, center, size) -> float | None: + out = subprocess.run([VINA, "--receptor", str(rec_pdbqt), "--ligand", str(lig_pdbqt), + "--center_x", f"{center[0]:.2f}", "--center_y", f"{center[1]:.2f}", + "--center_z", f"{center[2]:.2f}", "--size_x", f"{size[0]:.1f}", + "--size_y", f"{size[1]:.1f}", "--size_z", f"{size[2]:.1f}", + "--exhaustiveness", "8", "--cpu", "4", + "--out", str(WORK / "o.pdbqt")], capture_output=True, text=True) + m = re.search(r"^\s+1\s+(-?\d+\.\d+)", out.stdout, re.M) + return float(m.group(1)) if m else None + + +def main() -> None: + smis = {n: pubchem_smiles(n) for n in LIGANDS} + ligs = {n: prep_ligand(n, s) for n, s in smis.items() if s} + ligs = {n: p for n, p in ligs.items() if p} + print(f"prepared ligands: {list(ligs)}") + + boxes = {t: prep_receptor_and_box(pdb, lr) for t, (pdb, lr) in TARGETS.items()} + print("prepared receptors:", list(boxes)) + + print(f"\n{'ligand':14s}" + "".join(f"{t:>13s}" for t in TARGETS)) + results = {} + for lname, lpath in ligs.items(): + row = {} + for tname, (rec, center, size) in boxes.items(): + row[tname] = dock(rec, lpath, center, size) + results[lname] = row + cells = "".join(f"{(f'{row[t]:.1f}' if row[t] is not None else 'NA'):>13s}" for t in TARGETS) + print(f" {lname:12s}{cells}") + + print("\n=== positive-control recovery test (§12.4) ===") + for lig, exp_target in EXPECTED.items(): + if lig in results: + best = min(results[lig], key=lambda t: results[lig][t] if results[lig][t] is not None else 99) + ok = best == exp_target + print(f" {lig:12s} best target = {best:11s} (expected {exp_target}) -> {'PASS' if ok else 'FAIL'}") + + +if __name__ == "__main__": + main()