"""Week 2, task 3: extract per-drug LINCS L1000 consensus signatures. For each drug in the set, slice its Level-5 MODZ signatures (978 landmark genes x its sig_ids) out of the big GCTX via cmapPy, then aggregate to ONE consensus 978-vector per drug by mean across its sig_ids (the "MODZ aggregation across cell lines/replicates" of PLAN §6). hydroxyurea lives in Phase II, L-glutamine in Phase I, so both phases are processed and merged. Output: data/processed/lincs_signatures_v1.parquet (rows = pert_iname, cols = 978 landmark gene symbols, values = mean MODZ z-score). Usage: python scripts/week2_lincs_extract.py --phase 2 # test on Phase II (already downloaded) python scripts/week2_lincs_extract.py # both phases (needs Phase I gunzipped) """ from __future__ import annotations import argparse import gzip import io from pathlib import Path import pandas as pd from cmapPy.pandasGEXpress.parse import parse LINCS = Path("data/raw/lincs") DRUG_SET = Path("data/processed/drug_set_v1.csv") OUT = Path("data/processed/lincs_signatures_v1.parquet") GCTX = {1: LINCS / "phase1_level5.gctx", 2: LINCS / "phase2_level5.gctx"} SIG_INFO = {1: "GSE92742_sig_info.txt.gz", 2: "GSE70138_sig_info.txt.gz"} def read_gz_tsv(name: str) -> pd.DataFrame: return pd.read_csv(io.BytesIO(gzip.decompress((LINCS / name).read_bytes())), sep="\t", low_memory=False) def landmark_ids_and_symbols() -> tuple[list[str], dict[str, str]]: lm = pd.read_csv(LINCS / "landmark_genes.csv") ids = [str(x) for x in lm["pr_gene_id"]] id_to_symbol = {str(r.pr_gene_id): r.pr_gene_symbol for r in lm.itertuples()} return ids, id_to_symbol def extract_phase(phase: int, drug_names: set[str], landmark_ids: list[str]) -> pd.DataFrame: """Return DataFrame: rows=pert_iname, cols=landmark gene_id (str), one mean vector per drug.""" sig = read_gz_tsv(SIG_INFO[phase]) sig = sig[(sig["pert_type"] == "trt_cp") & (sig["pert_iname"].isin(drug_names))] if sig.empty: return pd.DataFrame() sig_ids = sig["sig_id"].tolist() print(f"[phase {phase}] {sig['pert_iname'].nunique()} drugs, {len(sig_ids)} signatures to slice", flush=True) gctoo = parse(str(GCTX[phase]), rid=landmark_ids, cid=sig_ids) data = gctoo.data_df # rows=gene_id, cols=sig_id sig_to_drug = dict(zip(sig["sig_id"], sig["pert_iname"])) # mean across each drug's sig_ids -> one consensus vector per drug per_drug = data.T.groupby(data.columns.map(sig_to_drug)).mean() print(f"[phase {phase}] aggregated to {len(per_drug)} drug consensus vectors", flush=True) return per_drug # rows=pert_iname, cols=gene_id def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--phase", type=int, choices=[1, 2], default=None, help="single phase (test)") args = ap.parse_args() drugs = pd.read_csv(DRUG_SET) drug_names = set(drugs["pert_iname"]) landmark_ids, id_to_symbol = landmark_ids_and_symbols() phases = [args.phase] if args.phase else [1, 2] frames = [] for ph in phases: if not GCTX[ph].exists(): print(f"[phase {ph}] {GCTX[ph]} missing — skipping") continue frames.append(extract_phase(ph, drug_names, landmark_ids)) frames = [f for f in frames if not f.empty] if not frames: print("no signatures extracted") return # A drug present in both phases: average the two phase consensus vectors. combined = pd.concat(frames).groupby(level=0).mean() combined.columns = [id_to_symbol.get(c, c) for c in combined.columns] # gene_id -> symbol covered = sorted(set(combined.index)) missing = sorted(drug_names - set(covered)) print(f"\nsignatures extracted for {len(covered)}/{len(drug_names)} drugs") for gt in ["hydroxyurea", "glutamine"]: print(f" ground truth '{gt}': {'OK' if gt in covered else 'MISSING'}") if args.phase is None: combined.to_parquet(OUT) print(f"wrote {OUT} ({combined.shape[0]} drugs x {combined.shape[1]} landmark genes)") if missing: print(f"{len(missing)} drugs without signature (will be marked not-scored in Week 3)") if __name__ == "__main__": main()