Structure-binding track: scaffold + ligand-retrieval baseline
Start the structure-based binding branch (PLAN §12), baseline-first. - src/binding.py: validated RDKit ligand retrieval (morgan_fp, tanimoto, retrieve_nearest = the §12.9 engine) + dock() stub documenting the blocked ARM-Mac toolchain - scripts/binding_ligand_baseline.py: 300 drugs vs known binders - docs/structure_binding_notes.md: status, toolchain blocker, next steps - pyproject: [structure] extra (rdkit); data/raw/structures/ for PDBs Step-0 finding: retrieval engine VALIDATED on in-set classes (decitabine->azacitidine 0.62; vorinostat->scriptaid/belinostat) but the distinctive binders voxelotor/mitapivat have no analog in our 300-drug set (Tanimoto ~0.2). Needs (a) bigger library, (b) real docking (§12.3), which is blocked on the ARM-Mac docking toolchain (§12.6 pitfall 4). Structures 5E83 (Hb+voxelotor) and 8XFD (PKR+mitapivat) fetched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
66
scripts/binding_ligand_baseline.py
Normal file
66
scripts/binding_ligand_baseline.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Structure-based track, step 0: ligand-based retrieval baseline (PLAN §12.9 engine).
|
||||
|
||||
Docking (§12.3) needs a toolchain that doesn't pip-install on ARM Mac (AutoDock Vina) — that's the
|
||||
next dependency to solve. Meanwhile this runs now with pure RDKit: do any of our 300 drugs sit near
|
||||
the KNOWN sickle binders (voxelotor, mitapivat, decitabine, vorinostat) in chemical space? This is the
|
||||
retrieval engine §12.9 would point a generative beacon at, and a sanity check on the ligand data.
|
||||
|
||||
NOT docking and NOT a binding claim — chemical similarity only. Similarity != activity (§12.9).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from rdkit import Chem, DataStructs, RDLogger
|
||||
from rdkit.Chem import rdFingerprintGenerator
|
||||
|
||||
# Turn off every RDKit log channel matching "rdApp.*" so the script's own output stays readable.
RDLogger.DisableLog("rdApp.*")

# Shared Morgan fingerprint generator: radius 2, 2048-bit vectors. Built once and reused by fp().
MORGAN = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=2048,
)

# Known sickle binders used as positive-control beacons for the retrieval check.
BINDERS = [
    "voxelotor",   # Hb (co-crystal 5E83, per the commit notes)
    "mitapivat",   # PKR (co-crystal 8XFD, per the commit notes)
    "decitabine",  # target not recorded in this file
    "vorinostat",  # target not recorded in this file
]
|
||||
|
||||
|
||||
def pubchem_smiles(name: str) -> str | None:
    """Look up a SMILES string for a compound name via PubChem PUG REST.

    Several SMILES property names are tried in turn; the first response that
    actually contains the requested property wins.

    Args:
        name: Compound name as understood by PubChem (e.g. "voxelotor").

    Returns:
        The SMILES string, or None when the name is blank/not a string or no
        property could be retrieved (network error, unknown compound,
        unexpected response shape). Lookup is best-effort and never raises
        for those expected failure modes.
    """
    # Local import so this function stays self-contained within the script.
    from urllib.parse import quote

    # A blank or non-string name can never resolve; skip the HTTP round-trips.
    if not isinstance(name, str) or not name.strip():
        return None

    # Percent-encode the name: it is spliced into the URL *path*, so spaces,
    # '/', '?', '#' etc. would otherwise corrupt the request.
    encoded = quote(name.strip(), safe="")

    for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"):
        url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
            f"{encoded}/property/{prop}/JSON"
        )
        try:
            record = requests.get(url, timeout=30).json()["PropertyTable"]["Properties"][0]
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Transport failure, non-JSON body, or a payload without the
            # expected PropertyTable/Properties layout: try the next property.
            continue
        # The response may not carry the key under the requested name; only
        # accept an exact match, otherwise fall through to the next property.
        if isinstance(record, dict) and prop in record:
            return record[prop]
    return None
|
||||
|
||||
|
||||
def fp(smi: str):
    """Turn a SMILES string into a Morgan fingerprint, or None if that is not possible.

    None is returned when `smi` is not a string, is empty, is the literal
    "-666" (presumably the dataset's missing-value placeholder — confirm
    against the drug table), or when RDKit cannot parse it into a molecule.
    Otherwise the fingerprint comes from the module-level MORGAN generator.
    """
    usable = isinstance(smi, str) and smi not in ("-666", "")
    if not usable:
        return None

    mol = Chem.MolFromSmiles(smi)
    if not mol:
        # RDKit could not build a molecule from this SMILES.
        return None
    return MORGAN.GetFingerprint(mol)
|
||||
|
||||
|
||||
def main() -> None:
    """Print, for each known binder, the five most similar drugs in the drug set.

    Steps:
      1. Resolve every name in BINDERS to a SMILES string via pubchem_smiles().
      2. Read data/processed/drug_set_v1.csv and fingerprint each drug with fp().
      3. Rank the fingerprinted drugs by Tanimoto similarity to each binder and
         print the top five together with each drug's inclusion reason.

    Output is chemical similarity only; nothing is returned.
    """
    # Step 1: positive-control beacons -> SMILES (None when the lookup failed).
    beacon_smiles = {}
    for beacon in BINDERS:
        beacon_smiles[beacon] = pubchem_smiles(beacon)

    # Short preview of each SMILES so a failed lookup is obvious at a glance.
    preview = {}
    for beacon, smiles in beacon_smiles.items():
        preview[beacon] = smiles[:34] + "..." if smiles else "MISSING"
    print("known-binder SMILES:", preview)

    # Step 2: load only the columns this script needs.
    wanted_columns = ["pert_iname", "canonical_smiles", "inclusion_reason"]
    table = pd.read_csv("data/processed/drug_set_v1.csv")
    table = table[wanted_columns]
    reason_by_drug = dict(zip(table.pert_iname, table.inclusion_reason))

    # Fingerprint every row first (a repeated drug name keeps its last row's
    # result), then drop the drugs whose SMILES could not be fingerprinted.
    all_fps = {}
    for row in table.itertuples():
        all_fps[row.pert_iname] = fp(row.canonical_smiles)
    library = {drug: bits for drug, bits in all_fps.items() if bits is not None}
    print(f"fingerprinted {len(library)}/{len(table)} drugs\n")

    # Step 3: nearest neighbours of each beacon within the drug library.
    for beacon, smiles in beacon_smiles.items():
        query = fp(smiles)
        if query is None:
            print(f"{beacon}: no SMILES\n")
            continue

        ranked = [
            (DataStructs.TanimotoSimilarity(query, bits), drug)
            for drug, bits in library.items()
        ]
        # Highest similarity first; ties fall back to the drug name.
        ranked.sort(reverse=True)

        print(f"nearest drugs to {beacon}:")
        for score, drug in ranked[:5]:
            why = reason_by_drug.get(drug, '?')
            print(f" {score:.3f} {drug:22s} [{why}]")
        print()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user