"""Week 2, task 4: assemble drug_profiles_v1.parquet (PLAN §6). Joins the curated drug set + ChEMBL enrichment + LINCS consensus signatures into one profile table. Each drug carries a confidence tier: LINCS is a single source, so signature-backed drugs are Tier B at best (assign_tier with single_source=True); drugs with no signature are Tier C and marked not-scored (not dropped silently — PLAN §6 Week 3 task 2). The 978-gene signature order is the column order of lincs_signatures_v1.parquet (landmark symbols); each profile's `lincs_signature` is that vector (or null). """ from __future__ import annotations import ast from pathlib import Path import pandas as pd import sys sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.drugs import persist_drug_profiles # noqa: E402 from src.provenance import ConfidenceTier, assign_tier # noqa: E402 PROCESSED = Path("data/processed") DRUG_SET = PROCESSED / "drug_set_v1.csv" CHEMBL = Path("data/raw/chembl/chembl_enrichment.parquet") LINCS_SIG = PROCESSED / "lincs_signatures_v1.parquet" def main() -> None: drugs = pd.read_csv(DRUG_SET) chembl = pd.read_parquet(CHEMBL)[ ["pert_iname", "chembl_id", "pref_name", "smiles", "mechanism_of_action", "targets"] ] sigs = pd.read_parquet(LINCS_SIG) # rows=pert_iname, cols=978 landmark symbols gene_order = list(sigs.columns) df = drugs.merge(chembl, on="pert_iname", how="left") rows = [] for r in df.itertuples(): has_sig = r.pert_iname in sigs.index vector = sigs.loc[r.pert_iname].tolist() if has_sig else None # LINCS = single source => Tier B max when measured; no signature => Tier C. tier = assign_tier( is_measured=has_sig, n_per_group=None, peer_reviewed=True, single_source=True ) if has_sig else ConfidenceTier.C targets = r.targets if isinstance(targets, str): try: targets = ast.literal_eval(targets) except (ValueError, SyntaxError): targets = [] elif hasattr(targets, "tolist"): # numpy ndarray from parquet round-trip targets = targets.tolist() elif targets is None or (not isinstance(targets, (list, tuple))): targets = [] rows.append({ "name": r.pert_iname, "chembl_id": r.chembl_id if pd.notna(r.chembl_id) else None, "pref_name": r.pref_name if pd.notna(r.pref_name) else None, "inchikey": r.inchi_key if pd.notna(r.inchi_key) else None, "smiles": r.smiles if pd.notna(r.smiles) else None, "targets": list(targets), "mechanism_of_action": r.mechanism_of_action if pd.notna(r.mechanism_of_action) else None, "inclusion_reason": r.inclusion_reason, "lincs_phase": r.phase, "scored": has_sig, "lincs_signature": vector, "confidence_tier": tier.value, }) profiles = pd.DataFrame(rows) # Persist the gene order alongside, so Week-3 scoring can align the vectors. (PROCESSED / "lincs_gene_order.txt").write_text("\n".join(gene_order)) path = persist_drug_profiles(profiles) print(f"drug_profiles_v1: {len(profiles)} drugs") print(f" scored (have LINCS signature): {profiles['scored'].sum()}") print(f" not scored: {(~profiles['scored']).sum()}") print(" by inclusion reason (scored rate):") print(profiles.groupby("inclusion_reason")["scored"].agg(["sum", "count"]).to_string()) print(" tier split:", profiles["confidence_tier"].value_counts().to_dict()) for gt in ["hydroxyurea", "glutamine"]: row = profiles[profiles["name"] == gt] print(f" ground truth '{gt}': scored={bool(row['scored'].iloc[0]) if len(row) else 'ABSENT'}") print(f"wrote {path}") if __name__ == "__main__": main()