Harden cache_msa: pick valid FASTA a3m + strip null bytes
The pilot validated the MSA-reuse architecture (one server query, cached and reused, so the server hammering is fixed), but Boltz's INPUT a3m parser crashed on the cached file with KeyError '\x00'. Boltz writes several .a3m files, some of them binary, and we were grabbing a bad one. Fix: select the largest a3m that looks like FASTA (it starts with '>' or has a '>' header line within its first 512 bytes), and strip null bytes before caching. If no candidate qualifies, raise with the candidate list so we can see what is there. Pilot status: MSA caching and reuse are confirmed working; this commit fixes the a3m-format crash. Re-validate with a 2-drug pilot before the full run.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -148,14 +148,22 @@ def cache_msa(label: str, protein_seq: str, ligand_smiles: str, cofactor_ccds: l
|
|||||||
out = work / "out"
|
out = work / "out"
|
||||||
subprocess.run(["boltz", "predict", str(work / "in.yaml"), "--use_msa_server",
|
subprocess.run(["boltz", "predict", str(work / "in.yaml"), "--use_msa_server",
|
||||||
"--cache", boltz_cache, "--out_dir", str(out), "--output_format", "pdb"], check=True)
|
"--cache", boltz_cache, "--out_dir", str(out), "--output_format", "pdb"], check=True)
|
||||||
a3m = next(out.rglob("*.a3m"), None)
|
# Pick the largest a3m that actually looks like FASTA/a3m text (Boltz writes several files;
|
||||||
if a3m is None:
|
# some are binary and crash its INPUT parser with KeyError '\x00'). Strip null bytes too.
|
||||||
|
candidates = sorted(out.rglob("*.a3m"), key=lambda p: p.stat().st_size, reverse=True)
|
||||||
|
chosen = None
|
||||||
|
for c in candidates:
|
||||||
|
raw = c.read_bytes()
|
||||||
|
if raw[:1] == b">" or b"\n>" in raw[:512]:
|
||||||
|
chosen = raw.replace(b"\x00", b"")
|
||||||
|
break
|
||||||
|
if chosen is None:
|
||||||
weights.commit()
|
weights.commit()
|
||||||
raise RuntimeError("MSA generation produced no .a3m")
|
raise RuntimeError(f"no valid a3m; candidates={[(str(p), p.stat().st_size) for p in candidates]}")
|
||||||
msa_dir = Path(WEIGHTS) / "msa"
|
msa_dir = Path(WEIGHTS) / "msa"
|
||||||
msa_dir.mkdir(parents=True, exist_ok=True)
|
msa_dir.mkdir(parents=True, exist_ok=True)
|
||||||
dest = msa_dir / f"{label}.a3m"
|
dest = msa_dir / f"{label}.a3m"
|
||||||
shutil.copy(a3m, dest)
|
dest.write_bytes(chosen)
|
||||||
weights.commit()
|
weights.commit()
|
||||||
return str(dest)
|
return str(dest)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user