"""Structure-based track, step 0: ligand-based retrieval baseline (PLAN §12.9 engine). Docking (§12.3) needs a toolchain that doesn't pip-install on ARM Mac (AutoDock Vina) — that's the next dependency to solve. Meanwhile this runs now with pure RDKit: do any of our 300 drugs sit near the KNOWN sickle binders (voxelotor, mitapivat, decitabine) in chemical space? This is the retrieval engine §12.9 would point a generative beacon at, and a sanity check on the ligand data. NOT docking and NOT a binding claim — chemical similarity only. Similarity != activity (§12.9). """ from __future__ import annotations import pandas as pd import requests from rdkit import Chem, DataStructs, RDLogger from rdkit.Chem import rdFingerprintGenerator RDLogger.DisableLog("rdApp.*") MORGAN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) # Known sickle binders = positive-control beacons (target in parens). BINDERS = ["voxelotor", "mitapivat", "decitabine", "vorinostat"] def pubchem_smiles(name: str) -> str | None: for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"): try: u = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{name}/property/{prop}/JSON" d = requests.get(u, timeout=30).json()["PropertyTable"]["Properties"][0] if prop in d: return d[prop] except Exception: continue return None def fp(smi: str): if not isinstance(smi, str) or smi in ("-666", ""): return None m = Chem.MolFromSmiles(smi) return MORGAN.GetFingerprint(m) if m else None def main() -> None: binder_smi = {b: pubchem_smiles(b) for b in BINDERS} print("known-binder SMILES:", {k: (v[:34] + "..." if v else "MISSING") for k, v in binder_smi.items()}) drugs = pd.read_csv("data/processed/drug_set_v1.csv")[["pert_iname", "canonical_smiles", "inclusion_reason"]] reason = dict(zip(drugs.pert_iname, drugs.inclusion_reason)) drug_fp = {r.pert_iname: fp(r.canonical_smiles) for r in drugs.itertuples()} drug_fp = {k: v for k, v in drug_fp.items() if v is not None} print(f"fingerprinted {len(drug_fp)}/{len(drugs)} drugs\n") for b, smi in binder_smi.items(): bfp = fp(smi) if bfp is None: print(f"{b}: no SMILES\n"); continue sims = sorted(((DataStructs.TanimotoSimilarity(bfp, v), k) for k, v in drug_fp.items()), reverse=True) print(f"nearest drugs to {b}:") for s, k in sims[:5]: print(f" {s:.3f} {k:22s} [{reason.get(k,'?')}]") print() if __name__ == "__main__": main()