# Reverso/scripts/week2_assemble.py

"""Week 2, task 4: assemble drug_profiles_v1.parquet (PLAN §6).

Joins the curated drug set + ChEMBL enrichment + LINCS consensus signatures into one profile
table. Each drug carries a confidence tier: LINCS is a single source, so signature-backed drugs
are Tier B at best (assign_tier with single_source=True); drugs with no signature are Tier C and
marked not-scored (not dropped silently — PLAN §6 Week 3 task 2).

The 978-gene signature order is the column order of lincs_signatures_v1.parquet (landmark
symbols); each profile's `lincs_signature` is that vector (or null).
"""

from __future__ import annotations

import ast
from pathlib import Path

import pandas as pd

import sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from src.drugs import persist_drug_profiles  # noqa: E402
from src.provenance import ConfidenceTier, assign_tier  # noqa: E402

# Input/output locations. All paths are relative, so they resolve against the
# current working directory (presumably the repository root — confirm with the
# way the script is launched).
PROCESSED = Path("data/processed")
# Curated drug set (CSV) that drives which drugs get a profile row.
DRUG_SET = PROCESSED.joinpath("drug_set_v1.csv")
# ChEMBL enrichment table; lives under the raw-data tree, not under PROCESSED.
CHEMBL = Path("data/raw/chembl/chembl_enrichment.parquet")
# LINCS consensus signatures; its column order defines the gene order.
LINCS_SIG = PROCESSED.joinpath("lincs_signatures_v1.parquet")


def _none_if_missing(value: object) -> object:
    """Return *value* unchanged, or None when pandas treats it as missing (NaN/None/NaT)."""
    return value if pd.notna(value) else None


def _normalize_targets(raw: object) -> list:
    """Coerce a raw `targets` cell into a plain Python list.

    Handles the shapes the cell can take after the CSV/parquet round-trip and the left join:
    a stringified literal, a numpy array, an actual list/tuple, or a missing value. Anything
    that does not end up as a list-like container yields an empty list, so a quoted scalar
    such as "'EGFR'" is not exploded into characters and a numeric literal does not crash.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return []
    elif hasattr(raw, "tolist"):  # numpy ndarray from parquet round-trip
        raw = raw.tolist()
    if isinstance(raw, (list, tuple, set, frozenset)):
        return list(raw)
    return []


def main() -> None:
    """Assemble the drug profile table and persist it.

    Reads the curated drug set, the ChEMBL enrichment and the LINCS signatures, left-joins
    ChEMBL onto the drug set by `pert_iname`, attaches each drug's signature vector (or None),
    assigns a confidence tier, writes the signature gene order to `lincs_gene_order.txt` under
    PROCESSED, hands the table to `persist_drug_profiles`, and prints a short summary.

    Raises:
        ValueError: if a drug in the set matches more than one row of the signature table,
            because its signature vector would be ambiguous.
    """
    drugs = pd.read_csv(DRUG_SET)
    chembl = pd.read_parquet(CHEMBL)[
        ["pert_iname", "chembl_id", "pref_name", "smiles", "mechanism_of_action", "targets"]
    ]
    sigs = pd.read_parquet(LINCS_SIG)  # rows=pert_iname, cols=978 landmark symbols
    gene_order = list(sigs.columns)

    # NOTE(review): assumes chembl has at most one row per pert_iname; otherwise this left
    # join fans out and a drug gets several profile rows — consider validate="many_to_one".
    df = drugs.merge(chembl, on="pert_iname", how="left")

    # With a duplicated index label, `sigs.loc[name]` returns a DataFrame instead of a Series
    # and the vector extraction below would fail obscurely; fail early with the offending names.
    ambiguous = set(sigs.index[sigs.index.duplicated()]) & set(df["pert_iname"])
    if ambiguous:
        raise ValueError(
            "multiple LINCS signatures for: " + ", ".join(sorted(map(str, ambiguous)))
        )

    rows = []
    for r in df.itertuples():
        has_sig = r.pert_iname in sigs.index
        vector = sigs.loc[r.pert_iname].tolist() if has_sig else None
        # LINCS = single source => Tier B max when measured; no signature => Tier C.
        if has_sig:
            tier = assign_tier(
                is_measured=has_sig, n_per_group=None, peer_reviewed=True, single_source=True
            )
        else:
            tier = ConfidenceTier.C
        rows.append({
            "name": r.pert_iname,
            "chembl_id": _none_if_missing(r.chembl_id),
            "pref_name": _none_if_missing(r.pref_name),
            "inchikey": _none_if_missing(r.inchi_key),
            "smiles": _none_if_missing(r.smiles),
            "targets": _normalize_targets(r.targets),
            "mechanism_of_action": _none_if_missing(r.mechanism_of_action),
            "inclusion_reason": r.inclusion_reason,
            "lincs_phase": r.phase,
            # Drugs without a signature are kept but flagged, not dropped.
            "scored": has_sig,
            "lincs_signature": vector,
            "confidence_tier": tier.value,
        })

    profiles = pd.DataFrame(rows)
    # Persist the gene order alongside, so Week-3 scoring can align the vectors.
    (PROCESSED / "lincs_gene_order.txt").write_text("\n".join(gene_order), encoding="utf-8")
    path = persist_drug_profiles(profiles)

    print(f"drug_profiles_v1: {len(profiles)} drugs")
    print(f"  scored (have LINCS signature): {profiles['scored'].sum()}")
    print(f"  not scored: {(~profiles['scored']).sum()}")
    print("  by inclusion reason (scored rate):")
    print(profiles.groupby("inclusion_reason")["scored"].agg(["sum", "count"]).to_string())
    print("  tier split:", profiles["confidence_tier"].value_counts().to_dict())
    # Sanity check on two named ground-truth drugs: report whether each made it in and is scored.
    for gt in ["hydroxyurea", "glutamine"]:
        row = profiles[profiles["name"] == gt]
        status = bool(row["scored"].iloc[0]) if len(row) else "ABSENT"
        print(f"  ground truth '{gt}': scored={status}")
    print(f"wrote {path}")

# Script entry point: run the assembly only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()