From 07705a58843f0f7120a58507bd2bad51a0feee9b Mon Sep 17 00:00:00 2001
From: "Junior B." <junior@bonto.ch>
Date: Wed, 24 Jun 2026 17:16:06 +0200
Subject: [PATCH] GPU Phase 1: co-fold cofactors/metals (the binding-mode
 determinants)

Add metal/cofactor handling to the Boltz-2 YAML as CCD ligand entries -
the modes classical docking couldn't model:
- HDAC2 + catalytic Zn (vorinostat chelates it)
- PKR + FBP + Mg (allosteric activator + metal)
- hemoglobin + heme
Same cofactors present when co-folding negatives into a target (fair test).

build_boltz_yaml() gains a cofactor_ccds arg (emits `ligand: {ccd: ...}`
entries); TARGETS carries per-target cofactors; cofold()/main() thread them
through. Verified locally: YAML builds correctly with Zn / FBP+Mg.

Honest limitation noted: Hb's voxelotor site is at the tetramer centre and
covalent (Schiff base), so single-chain+heme only approximates it - HDAC2
(Zn) and PKR (cofactor) are the real co-folding tests. Ready for
`modal run gpu/modal_app.py`.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/gpu_plan.md |  8 +++---
 gpu/modal_app.py | 66 ++++++++++++++++++++++++++++--------------------
 2 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/docs/gpu_plan.md b/docs/gpu_plan.md
index 063c99c..a49093e 100644
--- a/docs/gpu_plan.md
+++ b/docs/gpu_plan.md
@@ -69,9 +69,11 @@ forgotten box can't bleed money.
 ## What runs on the GPU (in cost order — cheap validation first)
 
 - **Phase 1 — modality validation (~minutes, ~$1):** co-fold each known binder + 2 negative
-  controls (caffeine, hydroxyurea) into each target (Hb, PKR, HDAC2) and check the **known binder
-  has the highest Boltz-2 P(binder)** for its own target — the discrimination Vina couldn't manage
-  on metal/covalent/allosteric modes. (Ranking uses P(binder), not the raw affinity value, whose
+  controls (caffeine, hydroxyurea) into each target (Hb, PKR, HDAC2) **with the binding-mode
+  cofactors co-folded in** — HDAC2 + catalytic **Zn**, PKR + **FBP/Mg**, Hb + **heme** (as CCD
+  ligand entries) — and check the **known binder has the highest Boltz-2 P(binder)** for its own
+  target. This is the discrimination Vina couldn't manage precisely because it can't model
+  Zn-chelation / cofactors. (Ranking uses P(binder), not the raw affinity value, whose
   sign is version-dependent.) Pose-RMSD vs crystal is a deeper check but needs receptor
   superposition (align predicted protein to crystal, transform ligand) — a later refinement. If
   Phase 1 passes, the modality is real; if not, stop before paying for a screen.
diff --git a/gpu/modal_app.py b/gpu/modal_app.py
index 6038d3f..a9c5f4a 100644
--- a/gpu/modal_app.py
+++ b/gpu/modal_app.py
@@ -32,14 +32,23 @@ image = (
 weights = modal.Volume.from_name("reverso-binding-weights", create_if_missing=True)
 WEIGHTS = "/weights"
 
-# target name -> (PDB id, crystal-ligand resname, drug name). Plus negatives co-folded into each.
+# target -> (PDB id, crystal-ligand resname, drug name, cofactor/metal CCD codes to co-fold).
+# The cofactors are the binding-mode determinants Vina couldn't model: HDAC2 needs the catalytic
+# Zn (vorinostat chelates it), PKR the allosteric FBP + Mg, hemoglobin the heme. Co-folding them
+# in as CCD ligands is the whole point of the AF3-class pivot. The same cofactors are present when
+# co-folding the negatives into that target, for a fair comparison.
 TARGETS = {
-    "hemoglobin": ("5E83", "5L7", "voxelotor"),
-    "PKR": ("8XFD", "WV2", "mitapivat"),
-    "HDAC2": ("4LXZ", "SHH", "vorinostat"),
+    "hemoglobin": ("5E83", "5L7", "voxelotor", ["HEM"]),
+    "PKR": ("8XFD", "WV2", "mitapivat", ["FBP", "MG"]),
+    "HDAC2": ("4LXZ", "SHH", "vorinostat", ["ZN"]),
 }
 NEGATIVES = ["caffeine", "hydroxyurea"]
 
+# Honest limitation: hemoglobin's voxelotor site sits at the tetramer centre and the bond is
+# covalent (Schiff base) — a single-chain + heme model only approximates it, so Hb is the weak
+# case. HDAC2 (Zn chelation) and PKR (allosteric + cofactor) are the real tests of whether
+# co-folding handles the modes classical docking could not.
+
 
 # --------------------------------------------------------------------------- local helpers (CPU)
 
@@ -92,28 +101,27 @@ def pubchem_smiles(name: str) -> str:
     raise ValueError(f"no SMILES for {name}")
 
 
-def build_boltz_yaml(protein_seq: str, ligand_smiles: str) -> str:
-    """Boltz-2 YAML for a protein+ligand complex with an affinity prediction on the ligand."""
-    return (
-        "version: 1\n"
-        "sequences:\n"
-        "  - protein:\n"
-        "      id: A\n"
-        f"      sequence: {protein_seq}\n"
-        "  - ligand:\n"
-        "      id: L\n"
-        f"      smiles: '{ligand_smiles}'\n"
-        "properties:\n"
-        "  - affinity:\n"
-        "      binder: L\n"
-    )
+def build_boltz_yaml(protein_seq: str, ligand_smiles: str, cofactor_ccds: list[str] | None = None) -> str:
+    """Boltz-2 YAML: protein + the drug ligand (affinity binder) + any cofactor/metal CCD ligands.
+
+    Cofactors/ions are added as `ligand` entries referencing their CCD code (e.g. ZN, MG, FBP,
+    HEM) so the model places the metal/cofactor that defines the binding mode. Affinity is
+    predicted on the drug ligand L only.
+    """
+    lines = ["version: 1", "sequences:",
+             "  - protein:", "      id: A", f"      sequence: {protein_seq}",
+             "  - ligand:", "      id: L", f"      smiles: '{ligand_smiles}'"]
+    for i, ccd in enumerate(cofactor_ccds or []):
+        lines += ["  - ligand:", f"      id: M{i}", f"      ccd: {ccd}"]
+    lines += ["properties:", "  - affinity:", "      binder: L"]
+    return "\n".join(lines) + "\n"
 
 
 # ------------------------------------------------------------------------------- GPU function
 
 @app.function(gpu="L4", image=image, volumes={WEIGHTS: weights}, timeout=3600)
-def cofold(label: str, protein_seq: str, ligand_smiles: str) -> dict:
-    """Co-fold one complex on the GPU; return predicted affinity + binder probability.
+def cofold(label: str, protein_seq: str, ligand_smiles: str, cofactor_ccds: list[str]) -> dict:
+    """Co-fold one complex (protein + drug + cofactors) on the GPU; return affinity + P(binder).
 
     Weights persist on the mounted Volume (HF_HOME/boltz --cache under /weights), so run 2+ skips
     the download. GPU is released the moment this returns.
@@ -126,7 +134,7 @@ def cofold(label: str, protein_seq: str, ligand_smiles: str) -> dict:
 
     work = Path("/tmp") / label
     work.mkdir(parents=True, exist_ok=True)
-    (work / "in.yaml").write_text(build_boltz_yaml(protein_seq, ligand_smiles))
+    (work / "in.yaml").write_text(build_boltz_yaml(protein_seq, ligand_smiles, cofactor_ccds))
     out = work / "out"
     subprocess.run(
         ["boltz", "predict", str(work / "in.yaml"), "--use_msa_server",
@@ -154,19 +162,21 @@ def cofold(label: str, protein_seq: str, ligand_smiles: str) -> dict:
 @app.local_entrypoint()
 def main() -> None:
     """Fan out one GPU call per (target, ligand) pair; tabulate affinities; positive-control test."""
-    jobs = []  # (target, ligand_name, protein_seq, smiles)
-    for target, (pdb, res, drug) in TARGETS.items():
+    jobs = []  # (label, protein_seq, smiles, cofactor_ccds)
+    for target, (pdb, res, drug, cofactors) in TARGETS.items():
         seq = binding_chain_sequence(pdb, res)
         for lig in [drug, *NEGATIVES]:
-            jobs.append((target, lig, seq, pubchem_smiles(lig)))
-    print(f"co-folding {len(jobs)} complexes ({len(TARGETS)} targets x {1+len(NEGATIVES)} ligands)")
+            jobs.append((f"{target}_{lig}", seq, pubchem_smiles(lig), cofactors))
+    cofactor_summary = {t: c[3] for t, c in TARGETS.items()}
+    print(f"co-folding {len(jobs)} complexes ({len(TARGETS)} targets x {1+len(NEGATIVES)} ligands); "
+          f"cofactors per target: {cofactor_summary}")
 
-    results = list(cofold.starmap([(f"{t}_{l}", s, smi) for t, l, s, smi in jobs]))
+    results = list(cofold.starmap(jobs))
     by = {r["label"]: r for r in results}
 
     print(f"\n{'target':12s}{'ligand':14s}{'affinity':>10s}{'P(binder)':>11s}")
     out_rows = []
-    for target, (pdb, res, drug) in TARGETS.items():
+    for target, (pdb, res, drug, cofactors) in TARGETS.items():
         prob = {}  # rank by P(binder): unambiguous (higher = more likely a binder). Boltz
         for lig in [drug, *NEGATIVES]:   # affinity_pred_value is ~log(IC50) (lower=stronger) -- sign
             r = by.get(f"{target}_{lig}", {})           # is version-dependent, so don't rank on it.