Reverso/scripts/dock_positive_controls.py

"""Structure-based track §12.4: dock known binders + negatives into Hb and PKR.

Positive-control recovery test: voxelotor should dock best into hemoglobin (5E83), mitapivat best
into PKR (8XFD), and the unrelated drugs (decitabine, hydroxyurea, caffeine) should score worse.
The box is centered on one copy of each co-crystal ligand (5L7=voxelotor, WV2=mitapivat). AutoDock
Vina (mac binary, Rosetta) + open-babel prep. Affinity in kcal/mol; more negative = stronger
predicted binding.

This is a docking baseline, not an efficacy claim (PLAN §12.6).
"""

from __future__ import annotations

import re
import subprocess
import tempfile
from pathlib import Path

import numpy as np
import requests

# AutoDock Vina executable; a relative path, so it is resolved against the working directory.
VINA = "./tools/vina"

# Directory the input <pdb>.pdb structure files are read from.
STRUCT = Path("data", "raw", "structures")

# Directory all receptor/ligand/pose files are written to; created at import time.
WORK = Path("data", "processed", "docking")
WORK.mkdir(parents=True, exist_ok=True)

# target name -> (PDB id, residue name of the co-crystal ligand that places the box)
TARGETS = {
    "hemoglobin": ("5E83", "5L7"),
    "PKR": ("8XFD", "WV2"),
}

# Compound names that are looked up and docked against every target.
LIGANDS = [
    "voxelotor",
    "mitapivat",
    "decitabine",
    "hydroxyurea",
    "caffeine",
]

# positive control -> target it is expected to score best against
EXPECTED = {
    "voxelotor": "hemoglobin",
    "mitapivat": "PKR",
}


def pubchem_smiles(name: str) -> str | None:
    """Look up a SMILES string for a compound name via the PubChem PUG REST API.

    Several SMILES property names are requested in turn; the first response that
    contains the requested property wins.

    Args:
        name: Compound name to resolve (e.g. ``"caffeine"``).

    Returns:
        The SMILES string, or ``None`` when no request yielded one (network
        error, non-JSON reply, or a reply without the expected structure).
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    # Percent-encode the name so spaces, '/', '?', '#', ... cannot alter the URL path.
    encoded = quote(name, safe="")
    base = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded}/property"

    for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"):
        try:
            reply = requests.get(f"{base}/{prop}/JSON", timeout=30)
            record = reply.json()["PropertyTable"]["Properties"][0]
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Best effort: a transport failure, an undecodable body, or a reply lacking
            # PropertyTable/Properties just means "try the next property name".
            # Anything else (a genuine programming error) is allowed to surface.
            continue
        if isinstance(record, dict) and prop in record:
            return record[prop]
    return None


def prep_receptor_and_box(pdb: str, lig_res: str):
    """Build the receptor PDBQT and a docking box around the co-crystal ligand.

    The ATOM records of ``STRUCT/<pdb>.pdb`` are written to ``WORK/<pdb>_rec.pdb`` and
    converted by ``obabel`` (``-xr -p 7.4``) into ``WORK/<pdb>_rec.pdbqt``. The box is
    derived from the first (chain, residue number) copy of ``lig_res`` encountered among
    the HETATM records; further copies are ignored.

    Args:
        pdb: PDB identifier; also the stem of the input file name.
        lig_res: Residue name of the co-crystal ligand (PDB columns 18-20).

    Returns:
        ``(rec_pdbqt, center, size)``: the receptor PDBQT path, the mean ligand
        coordinate, and the per-axis box edge (ligand extent + 9.0, clipped to
        the range 18.0-28.0).

    Raises:
        ValueError: if the structure has no HETATM record named ``lig_res``.
        subprocess.CalledProcessError: if ``obabel`` exits with a non-zero status.
    """
    atoms, lig, chosen = [], [], None
    for ln in (STRUCT / f"{pdb}.pdb").read_text().splitlines():
        if ln.startswith("ATOM"):
            atoms.append(ln)
        elif ln.startswith("HETATM") and ln[17:20].strip() == lig_res:
            # (chain id, residue number) identifies one ligand copy; keep only the first.
            key = (ln[21], ln[22:26])
            if chosen is None:
                chosen = key
            if key == chosen:
                lig.append(ln)

    # Fail early and clearly: without ligand atoms the NumPy reductions below would
    # raise an opaque "zero-size array" error only after obabel had already run.
    if not lig:
        raise ValueError(
            f"no HETATM records for ligand {lig_res!r} in {pdb}.pdb; cannot place the docking box"
        )

    rec_pdb = WORK / f"{pdb}_rec.pdb"
    rec_pdb.write_text("\n".join(atoms) + "\nEND\n")
    rec_pdbqt = WORK / f"{pdb}_rec.pdbqt"
    subprocess.run(["obabel", str(rec_pdb), "-O", str(rec_pdbqt), "-xr", "-p", "7.4"],
                   capture_output=True, check=True)

    # Fixed-column x/y/z fields of each ligand record.
    xyz = np.array([[float(rec[30:38]), float(rec[38:46]), float(rec[46:54])] for rec in lig])
    center = xyz.mean(0)
    # Pad the ligand extent, then keep every edge within [18, 28].
    size = np.clip((xyz.max(0) - xyz.min(0)) + 9.0, 18.0, 28.0)
    return rec_pdbqt, center, size


def prep_ligand(name: str, smiles: str) -> Path | None:
    """Convert a SMILES string into a 3D ligand PDBQT file with ``obabel``.

    Args:
        name: Ligand name; the output is ``WORK/lig_<name>.pdbqt``.
        smiles: SMILES string handed to ``obabel`` (``--gen3d -p 7.4``).

    Returns:
        The output path if this call produced a non-empty file, else ``None``.
    """
    out = WORK / f"lig_{name}.pdbqt"
    # Drop any file left by an earlier run: otherwise a failed conversion would be
    # reported as success because the stale output still exists and is non-empty.
    if out.exists():
        out.unlink()
    subprocess.run(["obabel", f"-:{smiles}", "-O", str(out), "--gen3d", "-p", "7.4"],
                   capture_output=True, text=True)
    return out if out.exists() and out.stat().st_size > 0 else None


def dock(rec_pdbqt: Path, lig_pdbqt: Path, center, size, *,
         exhaustiveness: int = 8, cpu: int = 4) -> float | None:
    """Dock one ligand into one receptor with Vina and return the top-ranked score.

    Args:
        rec_pdbqt: Receptor PDBQT file.
        lig_pdbqt: Ligand PDBQT file.
        center: Box center, indexable as ``center[0]``..``center[2]`` (x, y, z).
        size: Box edge lengths, indexable as ``size[0]``..``size[2]`` (x, y, z).
        exhaustiveness: Value passed to Vina's ``--exhaustiveness`` option.
        cpu: Value passed to Vina's ``--cpu`` option.

    Returns:
        The score parsed from the result row numbered 1 in Vina's stdout, or
        ``None`` when no such row exists (a diagnostic is then written to stderr).
    """
    import sys  # local import: only needed for the failure diagnostic

    # NOTE: every call writes its poses to the same WORK/o.pdbqt, so only the most
    # recent call's poses remain on disk.
    cmd = [
        VINA, "--receptor", str(rec_pdbqt), "--ligand", str(lig_pdbqt),
        "--center_x", f"{center[0]:.2f}", "--center_y", f"{center[1]:.2f}",
        "--center_z", f"{center[2]:.2f}", "--size_x", f"{size[0]:.1f}",
        "--size_y", f"{size[1]:.1f}", "--size_z", f"{size[2]:.1f}",
        "--exhaustiveness", str(exhaustiveness), "--cpu", str(cpu),
        "--out", str(WORK / "o.pdbqt"),
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)

    # [ \t] rather than \s: under re.M, \s could run across a newline and stitch a
    # match together from two different lines.
    top = re.search(r"^[ \t]+1[ \t]+(-?\d+\.\d+)", proc.stdout, re.M)
    if top is None:
        # Still best effort (callers get None), but say why instead of failing silently.
        detail = proc.stderr.strip()[-300:] or "no result table in output"
        print(f"warning: vina gave no score for {lig_pdbqt} vs {rec_pdbqt} "
              f"(exit {proc.returncode}): {detail}", file=sys.stderr)
        return None
    return float(top.group(1))


def main() -> None:
    """Dock every ligand into every target and report the positive-control test.

    Steps: resolve each name in ``LIGANDS`` to SMILES, prepare ligand files,
    prepare one receptor and box per entry in ``TARGETS``, dock all pairs, print
    the score table, then check that each ligand in ``EXPECTED`` scores best
    (lowest) against its expected target.

    A positive control passes only when it was docked and every target produced
    a score; otherwise an explicit FAIL line states what is missing.
    """
    smis = {n: pubchem_smiles(n) for n in LIGANDS}
    ligs = {n: prep_ligand(n, s) for n, s in smis.items() if s}
    ligs = {n: p for n, p in ligs.items() if p}
    print(f"prepared ligands: {list(ligs)}")

    boxes = {t: prep_receptor_and_box(pdb, lr) for t, (pdb, lr) in TARGETS.items()}
    print("prepared receptors:", list(boxes))

    print(f"\n{'ligand':14s}" + "".join(f"{t:>13s}" for t in TARGETS))
    results = {}
    for lname, lpath in ligs.items():
        row = {tname: dock(rec, lpath, center, size)
               for tname, (rec, center, size) in boxes.items()}
        results[lname] = row
        shown = ("NA" if row[t] is None else f"{row[t]:.1f}" for t in TARGETS)
        cells = "".join(f"{s:>13s}" for s in shown)
        print(f"  {lname:12s}{cells}")

    print("\n=== positive-control recovery test (§12.4) ===")
    for lig, exp_target in EXPECTED.items():
        row = results.get(lig)
        if row is None:
            # Previously skipped without a trace; a control that never ran has not passed.
            print(f"  {lig:12s} not docked (SMILES lookup or ligand preparation failed) -> FAIL")
            continue
        missing = [t for t, score in row.items() if score is None]
        if missing:
            # Ranking with a placeholder score let dict order pick the "best" target,
            # which could print PASS even though docking had failed.
            print(f"  {lig:12s} no docking score for {', '.join(missing)} -> FAIL")
            continue
        best = min(row, key=row.get)  # more negative score = better
        ok = best == exp_target
        print(f"  {lig:12s} best target = {best:11s} (expected {exp_target}) -> {'PASS' if ok else 'FAIL'}")


# Run the docking workflow only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()