From 0535886ce6abc1d1a8eae67a95e3dd9bec1c79e4 Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Wed, 24 Jun 2026 22:23:01 +0200 Subject: [PATCH] Phase 2 screen pilot: HDAC2 recovers the inhibitor class (P~0.99) Add the `screen` entrypoint (parallel ~10-wide, cached weights) and run a 24-drug pilot vs HDAC2 (+Zn), ranked by Boltz-2 P(binder). ~$1.3. Result (recovery test at scale): top 9 are ALL HDAC inhibitors (trichostatin-A/vorinostat/panobinostat/belinostat/scriptaid/mocetinostat/ entinostat >=0.99; apicidin 0.988; valproic-acid 0.91), clean drop-off to hydroxyurea 0.78 and non-HDAC drugs to dexamethasone 0.03. Captures the structure-activity gradient (hydroxamates > weak fatty-acid > non-HDAC). Honest false negative: romidepsin (potent HDAC inhibitor) ranks low (0.43) -- it's a depsipeptide PRODRUG, which co-folding doesn't model. Screen mishandles non-standard chemotypes. Screening pipeline validated; next is the full 300-drug discovery run. max_containers=10 (parallel safe once weights cached). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/results/screen_HDAC2_pilot.csv | 25 +++++++++++++ docs/structure_binding_notes.md | 20 +++++++++++ gpu/modal_app.py | 54 +++++++++++++++++++++++++++-- 3 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 docs/results/screen_HDAC2_pilot.csv diff --git a/docs/results/screen_HDAC2_pilot.csv b/docs/results/screen_HDAC2_pilot.csv new file mode 100644 index 0000000..d362548 --- /dev/null +++ b/docs/results/screen_HDAC2_pilot.csv @@ -0,0 +1,25 @@ +rank,drug,P_binder,affinity,inclusion_reason +1,trichostatin-a,0.9994845986366272,-2.883908271789551,related_mechanism +2,vorinostat,0.9994641542434692,-1.7357702255249023,related_mechanism +3,panobinostat,0.9983962774276733,-2.4892501831054688,related_mechanism +4,belinostat,0.9970909357070923,-2.102327823638916,related_mechanism +5,scriptaid,0.9957252740859985,-1.5838574171066284,related_mechanism +6,mocetinostat,0.9910210371017456,-1.3829972743988037,related_mechanism +7,entinostat,0.9903219938278198,-0.6888977289199829,related_mechanism +8,apicidin,0.9882931113243103,-2.2579169273376465,related_mechanism +9,valproic-acid,0.9134805202484131,0.3317195773124695,related_mechanism +10,hydroxyurea,0.77649986743927,1.2352395057678223,ground_truth +11,curcumin,0.6803375482559204,0.9697343707084656,related_mechanism +12,sulforaphane,0.5829939842224121,0.6017141342163086,related_mechanism +13,resveratrol,0.5800595283508301,1.1647570133209229,related_mechanism +14,tadalafil,0.5426475405693054,0.21663367748260498,related_mechanism +15,glutamine,0.4674023389816284,2.029467821121216,ground_truth +16,quercetin,0.45189985632896423,0.34488344192504883,related_mechanism +17,romidepsin,0.4316568374633789,-0.8200547099113464,related_mechanism +18,decitabine,0.38500306010246277,0.8381982445716858,related_mechanism +19,azacitidine,0.31987687945365906,0.6874549388885498,related_mechanism +20,sildenafil,0.15390564501285553,0.26623794436454773,related_mechanism 
+21,thalidomide,0.1190553605556488,1.9194087982177734,related_mechanism +22,pomalidomide,0.11849743872880936,1.7003729343414307,related_mechanism +23,lenalidomide,0.09446469694375992,1.9872398376464844,related_mechanism +24,dexamethasone,0.031893711537122726,0.5724264979362488,related_mechanism diff --git a/docs/structure_binding_notes.md b/docs/structure_binding_notes.md index d5fea4f..b1e2d59 100644 --- a/docs/structure_binding_notes.md +++ b/docs/structure_binding_notes.md @@ -141,7 +141,27 @@ the exact Zn-chelation case Vina was off by 7.9 A. HDAC2 is now validated on BOT perfect, metal slightly less). The structure-binding modality is comprehensively validated on its decisive metal-coordination case. +## Step 6 — Phase 2 screen pilot (HDAC2): recovers the inhibitor class decisively (2026-06-24) + +`modal run gpu/modal_app.py::screen --limit 24` — co-fold 24 drugs vs HDAC2 (+Zn), parallel +(~10-wide, cached weights), ranked by P(binder). ~$1.3. + +**Top 9 = all HDAC inhibitors** (trichostatin-A, vorinostat, panobinostat, belinostat, scriptaid, +mocetinostat, entinostat all P≥0.99; apicidin 0.988; valproic-acid 0.91), then a clean drop to +hydroxyurea 0.78 and non-HDAC drugs sinking to dexamethasone 0.03. The model captures the +**structure-activity gradient**: potent Zn-chelating hydroxamates ~0.99 vs weak fatty-acid +valproate 0.91 vs DNMT inhibitors 0.3-0.4 and IMiDs / steroid ≤0.12. The screen recovers the right +pharmacological class at scale. + +**Honest false negative:** romidepsin (potent HDAC inhibitor) ranked low (0.43) — it is a cyclic +depsipeptide PRODRUG (must be reduced to expose its Zn-binding thiol), which co-folding doesn't +model. The screen mishandles non-standard chemotypes (prodrugs, macrocycles). + +The screening pipeline is validated. Next: run the full set (incl. the 240 random + negatives) to +hunt for NON-obvious HDAC2 binders (the actual discovery run), ~$15-20. 
+ ## Next steps +- [ ] Full screen (300 drugs) vs HDAC2 — discovery run for non-obvious binders. - [ ] Investigate PKR: allosteric site may need the full assembly / better pocket definition. - [ ] Phase 2 screen: rank the ~300-drug set against HDAC2 (the validated target) by P(binder); positive-control recovery test at screen scale. diff --git a/gpu/modal_app.py b/gpu/modal_app.py index 65dfc83..035278f 100644 --- a/gpu/modal_app.py +++ b/gpu/modal_app.py @@ -120,9 +120,10 @@ def build_boltz_yaml(protein_seq: str, ligand_smiles: str, cofactor_ccds: list[s # ------------------------------------------------------------------------------- GPU function -# max_containers=1: run the inputs serially on one warm container so the weights download ONCE -# (no concurrent-download race that corrupts the checkpoint) and are reused for the rest. -@app.function(gpu="L4", image=image, volumes={WEIGHTS: weights}, timeout=3600, max_containers=1) +# max_containers caps parallel fan-out (cost control). The download race that corrupts the +# checkpoint only happens on a COLD volume; once weights are cached+committed (Phase 1 did this), +# parallel containers just reload them, so a screen can safely run ~10-wide. +@app.function(gpu="L4", image=image, volumes={WEIGHTS: weights}, timeout=3600, max_containers=10) def cofold(label: str, protein_seq: str, ligand_smiles: str, cofactor_ccds: list[str]) -> dict: """Co-fold one complex (protein + drug + cofactors) on the GPU; return affinity + P(binder). @@ -185,6 +186,53 @@ def pose() -> None: print("no structure returned") +@app.local_entrypoint() +def screen(limit: int = 0) -> None: + """Phase 2: co-fold the drug set against the validated target (HDAC2 + Zn), rank by P(binder). + + `modal run gpu/modal_app.py::screen --limit 24` (pilot; omit --limit for the full set). + Recovery check at scale: the known HDAC inhibitors (related_mechanism) should rank top. + Weights are cached, so this fans out ~10-wide. 
+ """ + import csv + import pandas as pd + + target = "HDAC2" + pdb, res, _drug, cofactors = TARGETS[target] + seq = binding_chain_sequence(pdb, res) + + df = pd.read_csv("data/processed/drug_set_v1.csv") + df = df[df["canonical_smiles"].notna() & (df["canonical_smiles"] != "-666")].copy() + if limit: # pilot: prioritise mechanism + controls (incl. the HDAC inhibitors) then fill + pri = df[df["inclusion_reason"].isin(["ground_truth", "related_mechanism", "negative_control"])] + df = pd.concat([pri, df.drop(pri.index)]).head(limit) + jobs = [(f"{target}__{r.pert_iname}", seq, r.canonical_smiles, cofactors) for r in df.itertuples()] + print(f"screening {len(jobs)} drugs vs {target} (+{cofactors})") + + results = list(cofold.starmap(jobs)) + by = {j[0].split("__")[1]: r for j, r in zip(jobs, results)} + reason = dict(zip(df["pert_iname"], df["inclusion_reason"])) + + rows = [{"drug": d, "P_binder": (r or {}).get("prob_binder"), + "affinity": (r or {}).get("affinity"), "inclusion_reason": reason.get(d)} + for d, r in by.items()] + rows = [x for x in rows if x["P_binder"] is not None] + rows.sort(key=lambda x: x["P_binder"], reverse=True) + + out = Path("data/processed/binding"); out.mkdir(parents=True, exist_ok=True) + with (out / f"screen_{target}.csv").open("w", newline="") as f: + w = csv.DictWriter(f, fieldnames=["rank", "drug", "P_binder", "affinity", "inclusion_reason"]) + w.writeheader() + for i, x in enumerate(rows, 1): + w.writerow({"rank": i, **x}) + print(f"\nscreened {len(rows)} drugs vs {target}; top 15 by P(binder):") + for i, x in enumerate(rows[:15], 1): + print(f" {i:2d} {x['drug']:20s} P={x['P_binder']:.3f} [{x['inclusion_reason']}]") + hdac_like = [i for i, x in enumerate(rows, 1) if x["inclusion_reason"] == "related_mechanism"] + if hdac_like: + print(f"\nrelated-mechanism (HDAC inhibitors etc.) 
ranks: {hdac_like[:10]}") + + @app.local_entrypoint() def main() -> None: """Fan out one GPU call per (target, ligand) pair; tabulate affinities; positive-control test."""