From 07705a58843f0f7120a58507bd2bad51a0feee9b Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Wed, 24 Jun 2026 17:16:06 +0200 Subject: [PATCH] GPU Phase 1: co-fold cofactors/metals (the binding-mode determinants) Add metal/cofactor handling to the Boltz-2 YAML as CCD ligand entries - the modes classical docking couldn't model: - HDAC2 + catalytic Zn (vorinostat chelates it) - PKR + FBP + Mg (allosteric activator + metal) - hemoglobin + heme Same cofactors present when co-folding negatives into a target (fair test). build_boltz_yaml() gains a cofactor_ccds arg (emits `ligand: {ccd: ...}` entries); TARGETS carries per-target cofactors; cofold()/main() thread them through. Verified locally: YAML builds correctly with Zn / FBP+Mg. Honest limitation noted: Hb's voxelotor site is at the tetramer centre and covalent (Schiff base), so single-chain+heme only approximates it - HDAC2 (Zn) and PKR (cofactor) are the real co-folding tests. Ready for `modal run gpu/modal_app.py`. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/gpu_plan.md | 8 +++--- gpu/modal_app.py | 66 ++++++++++++++++++++++++++++-------------------- 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/docs/gpu_plan.md b/docs/gpu_plan.md index 063c99c..a49093e 100644 --- a/docs/gpu_plan.md +++ b/docs/gpu_plan.md @@ -69,9 +69,11 @@ forgotten box can't bleed money. ## What runs on the GPU (in cost order — cheap validation first) - **Phase 1 — modality validation (~minutes, ~$1):** co-fold each known binder + 2 negative - controls (caffeine, hydroxyurea) into each target (Hb, PKR, HDAC2) and check the **known binder - has the highest Boltz-2 P(binder)** for its own target — the discrimination Vina couldn't manage - on metal/covalent/allosteric modes. (Ranking uses P(binder), not the raw affinity value, whose + controls (caffeine, hydroxyurea) into each target (Hb, PKR, HDAC2) **with the binding-mode + cofactors co-folded in** — HDAC2 + catalytic **Zn**, PKR + **FBP/Mg**, Hb + **heme** (as CCD + ligand entries) — and check the **known binder has the highest Boltz-2 P(binder)** for its own + target. This is the discrimination Vina couldn't manage precisely because it can't model + Zn-chelation / cofactors. (Ranking uses P(binder), not the raw affinity value, whose sign is version-dependent.) Pose-RMSD vs crystal is a deeper check but needs receptor superposition (align predicted protein to crystal, transform ligand) — a later refinement. If Phase 1 passes, the modality is real; if not, stop before paying for a screen. diff --git a/gpu/modal_app.py b/gpu/modal_app.py index 6038d3f..a9c5f4a 100644 --- a/gpu/modal_app.py +++ b/gpu/modal_app.py @@ -32,14 +32,23 @@ image = ( weights = modal.Volume.from_name("reverso-binding-weights", create_if_missing=True) WEIGHTS = "/weights" -# target name -> (PDB id, crystal-ligand resname, drug name). Plus negatives co-folded into each. +# target -> (PDB id, crystal-ligand resname, drug name, cofactor/metal CCD codes to co-fold). +# The cofactors are the binding-mode determinants Vina couldn't model: HDAC2 needs the catalytic +# Zn (vorinostat chelates it), PKR the allosteric FBP + Mg, hemoglobin the heme. Co-folding them +# in as CCD ligands is the whole point of the AF3-class pivot. The same cofactors are present when +# co-folding the negatives into that target, for a fair comparison. TARGETS = { - "hemoglobin": ("5E83", "5L7", "voxelotor"), - "PKR": ("8XFD", "WV2", "mitapivat"), - "HDAC2": ("4LXZ", "SHH", "vorinostat"), + "hemoglobin": ("5E83", "5L7", "voxelotor", ["HEM"]), + "PKR": ("8XFD", "WV2", "mitapivat", ["FBP", "MG"]), + "HDAC2": ("4LXZ", "SHH", "vorinostat", ["ZN"]), } NEGATIVES = ["caffeine", "hydroxyurea"] +# Honest limitation: hemoglobin's voxelotor site sits at the tetramer centre and the bond is +# covalent (Schiff base) — a single-chain + heme model only approximates it, so Hb is the weak +# case. HDAC2 (Zn chelation) and PKR (allosteric + cofactor) are the real tests of whether +# co-folding handles the modes classical docking could not. + # --------------------------------------------------------------------------- local helpers (CPU) @@ -92,28 +101,27 @@ def pubchem_smiles(name: str) -> str: raise ValueError(f"no SMILES for {name}") -def build_boltz_yaml(protein_seq: str, ligand_smiles: str) -> str: - """Boltz-2 YAML for a protein+ligand complex with an affinity prediction on the ligand.""" - return ( - "version: 1\n" - "sequences:\n" - " - protein:\n" - " id: A\n" - f" sequence: {protein_seq}\n" - " - ligand:\n" - " id: L\n" - f" smiles: '{ligand_smiles}'\n" - "properties:\n" - " - affinity:\n" - " binder: L\n" - ) +def build_boltz_yaml(protein_seq: str, ligand_smiles: str, cofactor_ccds: list[str] | None = None) -> str: + """Boltz-2 YAML: protein + the drug ligand (affinity binder) + any cofactor/metal CCD ligands. + + Cofactors/ions are added as `ligand` entries referencing their CCD code (e.g. ZN, MG, FBP, + HEM) so the model places the metal/cofactor that defines the binding mode. Affinity is + predicted on the drug ligand L only. + """ + lines = ["version: 1", "sequences:", + " - protein:", " id: A", f" sequence: {protein_seq}", + " - ligand:", " id: L", f" smiles: '{ligand_smiles}'"] + for i, ccd in enumerate(cofactor_ccds or []): + lines += [" - ligand:", f" id: M{i}", f" ccd: {ccd}"] + lines += ["properties:", " - affinity:", " binder: L"] + return "\n".join(lines) + "\n" # ------------------------------------------------------------------------------- GPU function @app.function(gpu="L4", image=image, volumes={WEIGHTS: weights}, timeout=3600) -def cofold(label: str, protein_seq: str, ligand_smiles: str) -> dict: - """Co-fold one complex on the GPU; return predicted affinity + binder probability. +def cofold(label: str, protein_seq: str, ligand_smiles: str, cofactor_ccds: list[str]) -> dict: + """Co-fold one complex (protein + drug + cofactors) on the GPU; return affinity + P(binder). Weights persist on the mounted Volume (HF_HOME/boltz --cache under /weights), so run 2+ skips the download. GPU is released the moment this returns. @@ -126,7 +134,7 @@ def cofold(label: str, protein_seq: str, ligand_smiles: str) -> dict: work = Path("/tmp") / label work.mkdir(parents=True, exist_ok=True) - (work / "in.yaml").write_text(build_boltz_yaml(protein_seq, ligand_smiles)) + (work / "in.yaml").write_text(build_boltz_yaml(protein_seq, ligand_smiles, cofactor_ccds)) out = work / "out" subprocess.run( ["boltz", "predict", str(work / "in.yaml"), "--use_msa_server", @@ -154,19 +162,21 @@ def cofold(label: str, protein_seq: str, ligand_smiles: str) -> dict: @app.local_entrypoint() def main() -> None: """Fan out one GPU call per (target, ligand) pair; tabulate affinities; positive-control test.""" - jobs = [] # (target, ligand_name, protein_seq, smiles) - for target, (pdb, res, drug) in TARGETS.items(): + jobs = [] # (label, protein_seq, smiles, cofactor_ccds) + for target, (pdb, res, drug, cofactors) in TARGETS.items(): seq = binding_chain_sequence(pdb, res) for lig in [drug, *NEGATIVES]: - jobs.append((target, lig, seq, pubchem_smiles(lig))) - print(f"co-folding {len(jobs)} complexes ({len(TARGETS)} targets x {1+len(NEGATIVES)} ligands)") + jobs.append((f"{target}_{lig}", seq, pubchem_smiles(lig), cofactors)) + cofactor_summary = {t: c[3] for t, c in TARGETS.items()} + print(f"co-folding {len(jobs)} complexes ({len(TARGETS)} targets x {1+len(NEGATIVES)} ligands); " + f"cofactors per target: {cofactor_summary}") - results = list(cofold.starmap([(f"{t}_{l}", s, smi) for t, l, s, smi in jobs])) + results = list(cofold.starmap(jobs)) by = {r["label"]: r for r in results} print(f"\n{'target':12s}{'ligand':14s}{'affinity':>10s}{'P(binder)':>11s}") out_rows = [] - for target, (pdb, res, drug) in TARGETS.items(): + for target, (pdb, res, drug, cofactors) in TARGETS.items(): prob = {} # rank by P(binder): unambiguous (higher = more likely a binder). Boltz for lig in [drug, *NEGATIVES]: # affinity_pred_value is ~log(IC50) (lower=stronger) -- sign r = by.get(f"{target}_{lig}", {}) # is version-dependent, so don't rank on it.