"""Week 2, task 1: curate the deliberately-composed ~300-drug set (PLAN §6). Composition: 2 ground-truth + ~50 related-mechanism + ~50 negative controls + ~200 random. The universe is restricted to compounds that actually have a LINCS Level-5 signature (in Phase I and/or Phase II), so every curated drug is scorable. Output: drug_set_v1.csv. """ from __future__ import annotations import gzip import io from pathlib import Path import pandas as pd import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src import RANDOM_SEED # noqa: E402 LINCS = Path("data/raw/lincs") OUT = Path("data/processed/drug_set_v1.csv") GROUND_TRUTH = ["hydroxyurea", "glutamine"] # glutamine == L-glutamine in LINCS # Curated by mechanism (PLAN §6). Intersected with the LINCS catalog below, so misses are # silently dropped — we keep whatever actually has a signature. RELATED_MECHANISM = [ # HbF inducers / epigenetic "decitabine", "azacitidine", "vorinostat", "panobinostat", "romidepsin", "entinostat", "mocetinostat", "belinostat", "pomalidomide", "lenalidomide", "thalidomide", "apicidin", "trichostatin-a", "scriptaid", "valproic-acid", # NO / vascular "sildenafil", "tadalafil", "nitroprusside", # antioxidants "n-acetyl-cysteine", "resveratrol", "curcumin", "quercetin", "sulforaphane", # anti-inflammatory studied in SCD "dexamethasone", "prednisolone", "hydrocortisone", "ibuprofen", "indomethacin", "sulfasalazine", "montelukast", "aspirin", # iron / heme / SCD-adjacent "hemin", "deferoxamine", "deferasirox", "simvastatin", "atorvastatin", "ticagrelor", ] NEGATIVE_CONTROL = [ # antifungals "fluconazole", "ketoconazole", "itraconazole", "clotrimazole", "terbinafine", "miconazole", # antihistamines "loratadine", "cetirizine", "fexofenadine", "diphenhydramine", "chlorpheniramine", "astemizole", # antibiotics "amoxicillin", "ciprofloxacin", "doxycycline", "trimethoprim", "azithromycin", "tetracycline", "nitrofurantoin", # hormones / contraceptives "levonorgestrel", "ethinyl-estradiol", "norethindrone", "medroxyprogesterone-acetate", # misc unrelated "omeprazole", "ranitidine", "loperamide", "caffeine", "acetaminophen", "lidocaine", ] # Fill the random sample so the total set is ~300 (the denominator the pre-registered # recovery-test thresholds assume: "top 30 of 300"). Curated mechanism/control drugs are # capped by what LINCS actually contains, so the random arm absorbs the remainder. TARGET_TOTAL = 300 def load_catalog() -> pd.DataFrame: """Compounds with >=1 Level-5 signature, annotated with phase + inchi/smiles.""" def read_gz(fn, **kw): return pd.read_csv(io.BytesIO(gzip.decompress(Path(fn).read_bytes())), sep="\t", **kw) sig1 = read_gz(LINCS / "GSE92742_sig_info.txt.gz", low_memory=False) sig2 = read_gz(LINCS / "GSE70138_sig_info.txt.gz", low_memory=False) cp1 = set(sig1[sig1["pert_type"] == "trt_cp"]["pert_iname"]) cp2 = set(sig2[sig2["pert_type"] == "trt_cp"]["pert_iname"]) pert1 = read_gz(LINCS / "GSE92742_pert_info.txt.gz", low_memory=False) pert2 = read_gz(LINCS / "GSE70138_pert_info.txt.gz", low_memory=False) info = pd.concat([pert1, pert2], ignore_index=True) info = info[info["pert_type"] == "trt_cp"].drop_duplicates("pert_iname", keep="first") info = info.set_index("pert_iname") names = cp1 | cp2 rows = [] for nm in names: phase = "both" if nm in cp1 and nm in cp2 else ("P1" if nm in cp1 else "P2") rec = info.loc[nm] if nm in info.index else None rows.append({ "pert_iname": nm, "phase": phase, "pert_id": rec["pert_id"] if rec is not None else None, "inchi_key": rec["inchi_key"] if rec is not None else None, "canonical_smiles": rec["canonical_smiles"] if rec is not None else None, }) return pd.DataFrame(rows).set_index("pert_iname") def pick(catalog: pd.DataFrame, names: list[str], reason: str) -> pd.DataFrame: present = [n for n in names if n in catalog.index] missing = [n for n in names if n not in catalog.index] if missing: print(f" [{reason}] {len(present)}/{len(names)} in LINCS; dropped: {missing}") out = catalog.loc[present].copy() out["inclusion_reason"] = reason return out def main() -> None: catalog = load_catalog() print(f"LINCS scorable compound universe: {len(catalog)}") gt = pick(catalog, GROUND_TRUTH, "ground_truth") rel = pick(catalog, RELATED_MECHANISM, "related_mechanism") neg = pick(catalog, NEGATIVE_CONTROL, "negative_control") chosen = pd.concat([gt, rel, neg]) remaining = catalog.drop(index=chosen.index) n_random = TARGET_TOTAL - len(chosen) rand = remaining.sample(n=n_random, random_state=RANDOM_SEED).copy() rand["inclusion_reason"] = "general_sample" drug_set = pd.concat([gt, rel, neg, rand]).reset_index() OUT.parent.mkdir(parents=True, exist_ok=True) drug_set.to_csv(OUT, index=False) print(f"\ndrug_set_v1.csv: {len(drug_set)} drugs") print(drug_set["inclusion_reason"].value_counts().to_string()) print(f"phase split:\n{drug_set['phase'].value_counts().to_string()}") print(f"wrote {OUT}") if __name__ == "__main__": main()