From f74a58aa4317119b8d4625c4fa0c3c50660174bb Mon Sep 17 00:00:00 2001 From: "Junior B." Date: Thu, 25 Jun 2026 00:44:32 +0200 Subject: [PATCH] Harden cache_msa: pick valid FASTA a3m + strip null bytes Pilot validated the MSA-reuse architecture (1 server query, cached & reused -- server hammering fixed) but Boltz's INPUT a3m parser crashed on the cached file: KeyError '\x00'. Boltz writes several .a3m files (some binary); the old code took the first one rglob yielded, which was a bad one. Fix: select the largest a3m that looks like FASTA (starts with '>', or has a line starting with '>' within the first 512 bytes), and strip null bytes before caching. Raise with the candidate list if none qualifies (so we can see what's there). Pilot status: MSA caching + reuse confirmed working; this fixes the a3m-format crash. Re-validate with a 2-drug pilot before the full run. Co-Authored-By: Claude Opus 4.8 (1M context) --- gpu/modal_app.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/gpu/modal_app.py b/gpu/modal_app.py index 231e50a..36b09af 100644 --- a/gpu/modal_app.py +++ b/gpu/modal_app.py @@ -148,14 +148,22 @@ def cache_msa(label: str, protein_seq: str, ligand_smiles: str, cofactor_ccds: l out = work / "out" subprocess.run(["boltz", "predict", str(work / "in.yaml"), "--use_msa_server", "--cache", boltz_cache, "--out_dir", str(out), "--output_format", "pdb"], check=True) - a3m = next(out.rglob("*.a3m"), None) - if a3m is None: + # Pick the largest a3m that actually looks like FASTA/a3m text (Boltz writes several files; + # some are binary and crash its INPUT parser with KeyError '\x00'). Strip null bytes too. 
+ candidates = sorted(out.rglob("*.a3m"), key=lambda p: p.stat().st_size, reverse=True) + chosen = None + for c in candidates: + raw = c.read_bytes() + if raw[:1] == b">" or b"\n>" in raw[:512]: + chosen = raw.replace(b"\x00", b"") + break + if chosen is None: weights.commit() - raise RuntimeError("MSA generation produced no .a3m") + raise RuntimeError(f"no valid a3m; candidates={[(str(p), p.stat().st_size) for p in candidates]}") msa_dir = Path(WEIGHTS) / "msa" msa_dir.mkdir(parents=True, exist_ok=True) dest = msa_dir / f"{label}.a3m" - shutil.copy(a3m, dest) + dest.write_bytes(chosen) weights.commit() return str(dest)