# Reverso/scripts/binding_ligand_baseline.py

"""Structure-based track, step 0: ligand-based retrieval baseline (PLAN §12.9 engine).

Docking (§12.3) needs a toolchain that doesn't pip-install on ARM Mac (AutoDock Vina) — that's the
next dependency to solve. Meanwhile this runs now with RDKit alone (plus a PubChem lookup for the
binder SMILES): do any of our 300 drugs sit near the KNOWN sickle binders (voxelotor, mitapivat,
decitabine, vorinostat) in chemical space? This is the retrieval engine §12.9 would point a
generative beacon at, and a sanity check on the ligand data.

NOT docking and NOT a binding claim — chemical similarity only. Similarity != activity (§12.9).
"""

from __future__ import annotations

import pandas as pd
import requests
from rdkit import Chem, DataStructs, RDLogger
from rdkit.Chem import rdFingerprintGenerator

# Disable the RDKit log channels matching "rdApp.*" for the whole process (import-time side effect).
RDLogger.DisableLog("rdApp.*")
# Shared Morgan fingerprint generator (radius 2, 2048-bit). fp() uses this single instance for
# every molecule, so binder and drug fingerprints are built with identical settings.
MORGAN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

# Known sickle binders = positive-control beacons, listed by name; main() resolves each name to a
# SMILES string through pubchem_smiles() at runtime.
# NOTE(review): this comment used to promise "target in parens", but no targets are annotated
# here — add them next to each name or drop the promise.
BINDERS = ["voxelotor", "mitapivat", "decitabine", "vorinostat"]


def pubchem_smiles(name: str) -> str | None:
    """Look up a SMILES string for a compound name via the PubChem PUG REST API.

    Several SMILES property names are requested in turn ("SMILES",
    "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"); the first response
    that actually contains the requested property wins.

    Args:
        name: Compound name to resolve (e.g. a drug name). It is stripped and
            percent-encoded before being placed in the URL path.

    Returns:
        The SMILES string reported by PubChem, or None when the name is empty or
        not a string, or when every request fails (network error, HTTP error
        status, non-JSON body, or a response without the expected fields).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # Nothing to look up: avoid firing four pointless requests.
    if not isinstance(name, str) or not name.strip():
        return None

    # safe="" also encodes "/", which would otherwise be read as a path separator.
    encoded_name = quote(name.strip(), safe="")

    # NOTE(review): presumably several property names are tried because PubChem does not
    # always answer under the same key — confirm against the PUG REST documentation.
    for prop in ("SMILES", "ConnectivitySMILES", "IsomericSMILES", "CanonicalSMILES"):
        url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
            f"{encoded_name}/property/{prop}/JSON"
        )
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            record = response.json()["PropertyTable"]["Properties"][0]
        except (requests.RequestException, ValueError, LookupError, TypeError):
            # Best-effort lookup: transport/HTTP failures, undecodable JSON, or an
            # unexpected payload shape just mean "try the next property name".
            continue
        if prop in record:
            return record[prop]
    return None


def fp(smi: str):
    """Return the Morgan fingerprint of a SMILES string, or None if it is unusable.

    A value is unusable when it is not a string, is empty, is the literal "-666"
    (presumably a missing-value sentinel in the drug table — confirm), or when
    RDKit cannot turn it into a molecule.
    """
    usable = isinstance(smi, str) and smi not in ("-666", "")
    if not usable:
        return None

    mol = Chem.MolFromSmiles(smi)
    if not mol:
        return None
    return MORGAN.GetFingerprint(mol)


def _preview(smi: str | None, width: int = 34) -> str:
    """Return a short printable form of a SMILES; "..." is appended only if it was cut."""
    if not smi:
        return "MISSING"
    return smi if len(smi) <= width else smi[:width] + "..."


def main(drug_csv: str = "data/processed/drug_set_v1.csv", top_k: int = 5) -> None:
    """Print, for each known binder, the drugs closest to it by Tanimoto similarity.

    Steps: resolve every name in BINDERS to a SMILES via pubchem_smiles(), load the
    drug table, fingerprint each drug with fp(), then rank the drugs against each
    binder fingerprint and print the best matches. This is chemical similarity only;
    it makes no binding claim.

    Args:
        drug_csv: Path to the drug table. It must provide the columns
            "pert_iname", "canonical_smiles" and "inclusion_reason".
        top_k: Number of nearest drugs printed per binder (negative values are
            treated as 0).
    """
    binder_smi = {b: pubchem_smiles(b) for b in BINDERS}
    print("known-binder SMILES:", {k: _preview(v) for k, v in binder_smi.items()})

    drugs = pd.read_csv(drug_csv)[["pert_iname", "canonical_smiles", "inclusion_reason"]]
    reason = dict(zip(drugs.pert_iname, drugs.inclusion_reason))
    # Fingerprint every row first, then drop the ones fp() rejected.
    drug_fp = {r.pert_iname: fp(r.canonical_smiles) for r in drugs.itertuples()}
    drug_fp = {k: v for k, v in drug_fp.items() if v is not None}
    print(f"fingerprinted {len(drug_fp)}/{len(drugs)} drugs\n")

    shown = max(top_k, 0)
    for b, smi in binder_smi.items():
        bfp = fp(smi)
        if bfp is None:
            # Covers both a failed PubChem lookup and a SMILES that could not be parsed.
            print(f"{b}: no SMILES\n")
            continue
        # Sorting (similarity, name) tuples in reverse puts the most similar drug first;
        # equal similarities fall back to the name for a deterministic order.
        sims = sorted(
            ((DataStructs.TanimotoSimilarity(bfp, v), k) for k, v in drug_fp.items()),
            reverse=True,
        )
        print(f"nearest drugs to {b}:")
        for s, k in sims[:shown]:
            print(f"    {s:.3f}  {k:22s} [{reason.get(k,'?')}]")
        print()


# Run the retrieval baseline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()