"""Week 4: formal recovery test against the pre-registered criteria (PLAN §6). Pre-registered criteria (committed in docs/recovery_test_report.md before this run): - hydroxyurea in top 10% (top 30 of 300), AND - L-glutamine in top 25% (top 75) OR documented unscorable due to missing LINCS signature, AND - >=4 of 5 pre-specified negative controls in the bottom half. The 5 negative controls are pre-specified here by a category rule (one per category, alphabetically first available) so the choice does not peek at ranks. Primary ranking = raw connectivity. """ from __future__ import annotations from pathlib import Path import pandas as pd RANKED = Path("data/results/ranked_candidates_v1.csv") # One per unrelated category, alphabetical-first — chosen without looking at ranks. NEG_CONTROL_CATEGORIES = { "antifungal": ["clotrimazole", "fluconazole", "itraconazole", "ketoconazole", "miconazole", "terbinafine"], "antihistamine": ["astemizole", "cetirizine", "diphenhydramine", "fexofenadine", "loratadine"], "antibiotic": ["azithromycin", "ciprofloxacin", "doxycycline", "tetracycline", "trimethoprim"], "hormone": ["ethinyl-estradiol", "levonorgestrel", "medroxyprogesterone-acetate", "norethindrone"], "misc": ["caffeine", "lidocaine", "loperamide", "omeprazole", "ranitidine"], } def main() -> None: df = pd.read_csv(RANKED).set_index("drug_name") n = len(df) top10_cut, top25_cut, half = int(n * 0.10), int(n * 0.25), n // 2 def rk(name): return int(df.loc[name, "rank"]) if name in df.index else None hu, glut = rk("hydroxyurea"), rk("glutamine") # pick negative controls present in the ranking negs = {} for cat, options in NEG_CONTROL_CATEGORIES.items(): pick = next((d for d in options if d in df.index), None) if pick: negs[pick] = (cat, rk(pick)) print("=" * 60) print(f"N = {n}; top10 cut = {top10_cut}, top25 cut = {top25_cut}, bottom-half > {half}") print(f"\nhydroxyurea: rank {hu} (top {100*hu/n:.1f}%) -> top-10%? {hu <= top10_cut}") glut_score = df.loc["glutamine", "connectivity_score"] print(f"L-glutamine: rank {glut} (top {100*glut/n:.1f}%), WTCS={glut_score:.3f} " f"-> top-25%? {glut <= top25_cut} (has signature, so NOT 'missing-signature unscorable')") print("\nnegative controls (pre-specified, 1 per category):") n_bottom = 0 for d, (cat, r) in negs.items(): in_bottom = r > half n_bottom += in_bottom print(f" {d:18s} [{cat:13s}] rank {r:3d} bottom-half? {in_bottom}") print(f" -> {n_bottom}/5 in bottom half (need >=4)") crit_hu = hu <= top10_cut crit_glut = glut <= top25_cut crit_neg = n_bottom >= 4 overall = crit_hu and crit_glut and crit_neg print(f"\nCRITERIA: hydroxyurea={crit_hu}, L-glutamine={crit_glut}, neg-controls={crit_neg}") print(f"OVERALL (raw ranking): {'PASS' if overall else 'FAIL'}") # secondary prior-weighted view (reported, not the primary criterion) hu_b = int(df.loc["hydroxyurea", "blended_rank"]) print(f"\nsecondary (mechanistic-prior) ranking: hydroxyurea blended_rank {hu_b} " f"(top {100*hu_b/n:.1f}%)") print("\n--- TOP 10 (raw connectivity) ---") top10 = df.nsmallest(10, "connectivity_score") for name, r in top10.iterrows(): print(f" {int(r['rank']):2d} {name:18s} {r['connectivity_score']:+.3f} " f"[{r['inclusion_reason']}] {str(r['known_targets'])[:45]}") if __name__ == "__main__": main()