Start the structure-based binding branch (PLAN §12), baseline-first. - src/binding.py: validated RDKit ligand retrieval (morgan_fp, tanimoto, retrieve_nearest = the §12.9 engine) + dock() stub documenting the blocked ARM-Mac toolchain - scripts/binding_ligand_baseline.py: 300 drugs vs known binders - docs/structure_binding_notes.md: status, toolchain blocker, next steps - pyproject: [structure] extra (rdkit); data/raw/structures/ for PDBs Step-0 finding: retrieval engine VALIDATED on in-set classes (decitabine->azacitidine 0.62; vorinostat->scriptaid/belinostat) but the distinctive binders voxelotor/mitapivat have no analog in our 300-drug set (Tanimoto ~0.2). Needs (a) bigger library, (b) real docking (§12.3), which is blocked on the ARM-Mac docking toolchain (§12.6 pitfall 4). Structures 5E83 (Hb+voxelotor) and 8XFD (PKR+mitapivat) fetched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
67 lines
2.6 KiB
Python
67 lines
2.6 KiB
Python
"""Structure-based track, step 0: ligand-based retrieval baseline (PLAN §12.9 engine).
|
|
|
|
Docking (§12.3) needs a toolchain that doesn't pip-install on ARM Mac (AutoDock Vina) — that's the
|
|
next dependency to solve. Meanwhile this runs now with pure RDKit: do any of our 300 drugs sit near
|
|
the KNOWN sickle binders (voxelotor, mitapivat, decitabine, vorinostat) in chemical space? This is the
|
|
retrieval engine §12.9 would point a generative beacon at, and a sanity check on the ligand data.
|
|
|
|
NOT docking and NOT a binding claim — chemical similarity only. Similarity != activity (§12.9).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from rdkit import Chem, DataStructs, RDLogger
|
|
from rdkit.Chem import rdFingerprintGenerator
|
|
|
|
# Turn off every RDKit "rdApp" log channel so the script's printed report stays readable.
RDLogger.DisableLog("rdApp.*")

# One shared Morgan fingerprint generator (radius 2, 2048-bit vectors) reused for every molecule.
MORGAN = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=2048,
)

# Known sickle binders used as positive-control beacons; main() resolves each name to SMILES.
BINDERS = [
    "voxelotor",
    "mitapivat",
    "decitabine",
    "vorinostat",
]
|
|
|
|
|
|
def pubchem_smiles(name: str) -> str | None:
    """Look up a SMILES string for a compound name via the PubChem PUG REST API.

    Several SMILES property names are requested in turn, and the first one
    that is actually present in the response is returned. The lookup is
    best-effort: a network failure, a non-JSON reply, or a reply without the
    expected property table just moves on to the next property name.

    Args:
        name: Compound name to resolve, e.g. ``"voxelotor"``.

    Returns:
        The SMILES string, or ``None`` when the name is empty or no request
        produced a usable value.
    """
    from urllib.parse import quote

    if not isinstance(name, str) or not name.strip():
        return None

    # Percent-encode the name: an unescaped "/", "?", "#" or space inside a
    # drug name would otherwise change the URL path or query that is sent.
    encoded = quote(name.strip(), safe="")

    for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"):
        u = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded}/property/{prop}/JSON"
        try:
            d = requests.get(u, timeout=30).json()["PropertyTable"]["Properties"][0]
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Transport error, invalid JSON, or an unexpected payload shape
            # (e.g. an error document without "PropertyTable"): try the next name.
            continue
        # The returned record is only trusted if it carries the requested key.
        if isinstance(d, dict) and prop in d:
            return d[prop]
    return None
|
|
|
|
|
|
def fp(smi: str):
    """Return the Morgan fingerprint of a SMILES string, or None if unusable.

    ``None`` is returned when ``smi`` is not a string, when it is one of the
    placeholder values ``""`` / ``"-666"`` (presumably missing-value markers
    in the drug table; confirm against the data source), or when RDKit
    cannot build a molecule from it.
    """
    usable = isinstance(smi, str) and smi not in ("-666", "")
    if usable:
        mol = Chem.MolFromSmiles(smi)
        if mol:
            return MORGAN.GetFingerprint(mol)
    return None
|
|
|
|
|
|
def main() -> None:
    """Print the five most similar drugs in the drug set for each known binder.

    Resolves every name in ``BINDERS`` to SMILES with ``pubchem_smiles``,
    loads ``data/processed/drug_set_v1.csv``, fingerprints each drug with
    ``fp``, and ranks the drugs by Tanimoto similarity to each binder.
    All results are written to stdout; nothing is returned or saved.
    """
    binder_smi = {}
    for binder in BINDERS:
        binder_smi[binder] = pubchem_smiles(binder)

    # Show a truncated SMILES per binder so a failed lookup is visible up front.
    preview = {}
    for binder, smiles in binder_smi.items():
        preview[binder] = (smiles[:34] + "...") if smiles else "MISSING"
    print("known-binder SMILES:", preview)

    columns = ["pert_iname", "canonical_smiles", "inclusion_reason"]
    drugs = pd.read_csv("data/processed/drug_set_v1.csv")[columns]
    reason = dict(zip(drugs.pert_iname, drugs.inclusion_reason))

    # Fingerprint every row first, then drop the drugs that could not be fingerprinted.
    all_fps = {}
    for row in drugs.itertuples():
        all_fps[row.pert_iname] = fp(row.canonical_smiles)
    drug_fp = {name: bits for name, bits in all_fps.items() if bits is not None}
    print(f"fingerprinted {len(drug_fp)}/{len(drugs)} drugs\n")

    for binder, smiles in binder_smi.items():
        query = fp(smiles)
        if query is None:
            print(f"{binder}: no SMILES\n")
            continue

        # (similarity, name) pairs sorted descending: best match first.
        scored = [
            (DataStructs.TanimotoSimilarity(query, bits), name)
            for name, bits in drug_fp.items()
        ]
        scored.sort(reverse=True)

        print(f"nearest drugs to {binder}:")
        for score, name in scored[:5]:
            print(f" {score:.3f} {name:22s} [{reason.get(name,'?')}]")
        print()
|
|
|
|
|
|
# Script entry point: run the baseline only when executed directly, not on import.
if __name__ == "__main__":
    main()
|