Redocking-RMSD validation fails 3/3: pipeline-quality issue

§12.4 de-biased validation (scripts/dock_validate.py).
Redock each co-crystal ligand into its own structure, RMSD vs crystal:
- voxelotor->Hb: NA (covalent binder, out of scope §12.7)
- mitapivat->PKR: 8.2A (allosteric, cofactors stripped)
- vorinostat->HDAC2 (4LXZ, zinc kept): 7.9A -- a CLASSICAL target that
  should have worked

The clean target also failing => systematic pipeline-quality problem,
not target choice. Cheap Vina + open-babel prep gives scores but doesn't
reproduce known geometry, so affinities aren't trustworthy. Ligand
efficiency over-corrects (ranks tiny hydroxyurea best). Fix needs
production prep (Meeko/AutoDockTools prepare_receptor + reduce) and an
in-place RMSD metric. Consistent with the project theme: the quick
version of every method runs but fails honest validation.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-24 07:28:47 +02:00
parent 75f5383961
commit 51bd90df41
2 changed files with 127 additions and 8 deletions

93
scripts/dock_validate.py Normal file
View File

@@ -0,0 +1,93 @@
"""Structure-binding §12.4: redocking-RMSD validation + ligand-efficiency normalization.
Two de-biased checks of the docking baseline:
A. REDOCKING RMSD — redock each co-crystal ligand into its own structure and measure pose RMSD
vs the crystal pose (open-babel obrms, symmetry-aware). <2 A = geometry validated. This is
the gold-standard positive control and is immune to the molecular-size bias that made the
cross-target affinity test inconclusive.
B. LIGAND EFFICIENCY — affinity / heavy-atom count, to de-bias the size effect in the raw scores.
"""
from __future__ import annotations
import re
import subprocess
from pathlib import Path
from rdkit import Chem, RDLogger
import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from scripts.dock_positive_controls import ( # noqa: E402
STRUCT, VINA, WORK, TARGETS, pubchem_smiles, prep_ligand, prep_receptor_and_box,
)
RDLogger.DisableLog("rdApp.*")
XTAL_LIG = {"hemoglobin": ("5E83", "5L7", "voxelotor"), "PKR": ("8XFD", "WV2", "mitapivat")}
def extract_crystal_ligand(pdb: str, resname: str) -> Path:
lines, chosen = [], None
for ln in (STRUCT / f"{pdb}.pdb").read_text().splitlines():
if ln.startswith("HETATM") and ln[17:20].strip() == resname:
key = (ln[21], ln[22:26])
chosen = chosen or key
if key == chosen:
lines.append(ln)
out = WORK / f"xtal_{resname}.pdb"
out.write_text("\n".join(lines) + "\nEND\n")
return out
def dock_pose(rec, lig, center, size) -> Path | None:
pose = WORK / "redock_out.pdbqt"
subprocess.run([VINA, "--receptor", str(rec), "--ligand", str(lig),
"--center_x", f"{center[0]:.2f}", "--center_y", f"{center[1]:.2f}",
"--center_z", f"{center[2]:.2f}", "--size_x", f"{size[0]:.1f}",
"--size_y", f"{size[1]:.1f}", "--size_z", f"{size[2]:.1f}",
"--exhaustiveness", "16", "--out", str(pose)], capture_output=True, text=True)
if not pose.exists():
return None
top = WORK / "redock_top.pdb" # first model only
subprocess.run(["obabel", str(pose), "-O", str(top), "-f", "1", "-l", "1"], capture_output=True)
return top
def obrms(ref: Path, test: Path) -> float | None:
# strip H to compare heavy-atom poses; obrms is automorphism-aware
r2, t2 = WORK / "ref_noH.sdf", WORK / "test_noH.sdf"
subprocess.run(["obabel", str(ref), "-d", "-O", str(r2)], capture_output=True)
subprocess.run(["obabel", str(test), "-d", "-O", str(t2)], capture_output=True)
out = subprocess.run(["obrms", str(r2), str(t2)], capture_output=True, text=True).stdout
m = re.search(r"(-?\d+\.\d+)", out)
return float(m.group(1)) if m else None
def main() -> None:
print("=== A. Redocking-RMSD validation ===")
for target, (pdb, resname, drug) in XTAL_LIG.items():
rec, center, size = prep_receptor_and_box(pdb, resname)
smi = pubchem_smiles(drug)
lig = prep_ligand(drug, smi)
xtal = extract_crystal_ligand(pdb, resname)
pose = dock_pose(rec, lig, center, size)
rmsd = obrms(xtal, pose) if pose else None
verdict = "PASS (<2A)" if (rmsd is not None and rmsd < 2.0) else \
("MARGINAL (<3A)" if (rmsd is not None and rmsd < 3.0) else "FAIL")
rstr = f"{rmsd:.2f} A" if rmsd is not None else "NA"
print(f" {drug:12s} -> {target:11s} ({pdb}/{resname}): redock RMSD {rstr} {verdict}")
print("\n=== B. Ligand efficiency (affinity / heavy atoms), de-biasing size ===")
# raw affinities from the cross-dock baseline (scripts/dock_positive_controls.py)
aff = {"voxelotor": {"hemoglobin": -8.1, "PKR": -9.3}, "mitapivat": {"hemoglobin": -10.0, "PKR": -11.2},
"decitabine": {"hemoglobin": -6.6, "PKR": -7.0}, "hydroxyurea": {"hemoglobin": -3.9, "PKR": -3.6},
"caffeine": {"hemoglobin": -6.1, "PKR": -6.4}}
print(f" {'ligand':12s}{'heavy':>6s}" + "".join(f"{t+' LE':>14s}" for t in TARGETS))
for lig, row in aff.items():
ha = Chem.MolFromSmiles(pubchem_smiles(lig)).GetNumHeavyAtoms()
cells = "".join(f"{row[t]/ha:>14.3f}" for t in TARGETS)
print(f" {lig:12s}{ha:>6d}{cells}")
if __name__ == "__main__":
main()