Build the drug profile dataset (PLAN §6 Week 2): - week2_curate_drugset.py: 300-drug set (2 ground-truth + 32 related- mechanism + 26 negative-control + 240 random), restricted to LINCS-scorable compounds, seed=42 - week2_chembl.py: InChIKey->ChEMBL match (145/300), MoA + targets - week2_lincs_extract.py: cmapPy-slice both Level-5 GCTX phases to 978 landmark genes, mean-aggregate per drug to one consensus signature - week2_assemble.py: join into drug_profiles_v1.parquet, Tier B (LINCS single-source), scored flag per PLAN §6 Week 3 task 2 - docs/data_sources.md: drug set composition + LINCS/ChEMBL provenance Results (all gitignored data): 300/300 drugs scored, both ground-truth drugs present (hydroxyurea Phase II = CHEMBL467, L-glutamine Phase I). Key caveat recorded: only 56/477 (12%) of the disease signature genes are LINCS landmarks, so Week-3 scoring uses a 30-up/26-down query. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
102 lines
4.1 KiB
Python
102 lines
4.1 KiB
Python
"""Week 2, task 3: extract per-drug LINCS L1000 consensus signatures.
|
|
|
|
For each drug in the set, slice its Level-5 MODZ signatures (978 landmark genes x its sig_ids)
|
|
out of the big GCTX via cmapPy, then aggregate to ONE consensus 978-vector per drug by mean
|
|
across its sig_ids (the "MODZ aggregation across cell lines/replicates" of PLAN §6). hydroxyurea
|
|
lives in Phase II, L-glutamine in Phase I, so both phases are processed and merged.
|
|
|
|
Output: data/processed/lincs_signatures_v1.parquet (rows = pert_iname, cols = 978 landmark
|
|
gene symbols, values = mean MODZ z-score).
|
|
|
|
Usage:
|
|
python scripts/week2_lincs_extract.py --phase 2 # test on Phase II (already downloaded)
|
|
python scripts/week2_lincs_extract.py # both phases (needs Phase I gunzipped)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import gzip
|
|
import io
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
from cmapPy.pandasGEXpress.parse import parse
|
|
|
|
LINCS = Path("data/raw/lincs")
|
|
DRUG_SET = Path("data/processed/drug_set_v1.csv")
|
|
OUT = Path("data/processed/lincs_signatures_v1.parquet")
|
|
|
|
GCTX = {1: LINCS / "phase1_level5.gctx", 2: LINCS / "phase2_level5.gctx"}
|
|
SIG_INFO = {1: "GSE92742_sig_info.txt.gz", 2: "GSE70138_sig_info.txt.gz"}
|
|
|
|
|
|
def read_gz_tsv(name: str) -> pd.DataFrame:
|
|
return pd.read_csv(io.BytesIO(gzip.decompress((LINCS / name).read_bytes())), sep="\t", low_memory=False)
|
|
|
|
|
|
def landmark_ids_and_symbols() -> tuple[list[str], dict[str, str]]:
|
|
lm = pd.read_csv(LINCS / "landmark_genes.csv")
|
|
ids = [str(x) for x in lm["pr_gene_id"]]
|
|
id_to_symbol = {str(r.pr_gene_id): r.pr_gene_symbol for r in lm.itertuples()}
|
|
return ids, id_to_symbol
|
|
|
|
|
|
def extract_phase(phase: int, drug_names: set[str], landmark_ids: list[str]) -> pd.DataFrame:
|
|
"""Return DataFrame: rows=pert_iname, cols=landmark gene_id (str), one mean vector per drug."""
|
|
sig = read_gz_tsv(SIG_INFO[phase])
|
|
sig = sig[(sig["pert_type"] == "trt_cp") & (sig["pert_iname"].isin(drug_names))]
|
|
if sig.empty:
|
|
return pd.DataFrame()
|
|
sig_ids = sig["sig_id"].tolist()
|
|
print(f"[phase {phase}] {sig['pert_iname'].nunique()} drugs, {len(sig_ids)} signatures to slice", flush=True)
|
|
|
|
gctoo = parse(str(GCTX[phase]), rid=landmark_ids, cid=sig_ids)
|
|
data = gctoo.data_df # rows=gene_id, cols=sig_id
|
|
sig_to_drug = dict(zip(sig["sig_id"], sig["pert_iname"]))
|
|
# mean across each drug's sig_ids -> one consensus vector per drug
|
|
per_drug = data.T.groupby(data.columns.map(sig_to_drug)).mean()
|
|
print(f"[phase {phase}] aggregated to {len(per_drug)} drug consensus vectors", flush=True)
|
|
return per_drug # rows=pert_iname, cols=gene_id
|
|
|
|
|
|
def main() -> None:
|
|
ap = argparse.ArgumentParser()
|
|
ap.add_argument("--phase", type=int, choices=[1, 2], default=None, help="single phase (test)")
|
|
args = ap.parse_args()
|
|
|
|
drugs = pd.read_csv(DRUG_SET)
|
|
drug_names = set(drugs["pert_iname"])
|
|
landmark_ids, id_to_symbol = landmark_ids_and_symbols()
|
|
|
|
phases = [args.phase] if args.phase else [1, 2]
|
|
frames = []
|
|
for ph in phases:
|
|
if not GCTX[ph].exists():
|
|
print(f"[phase {ph}] {GCTX[ph]} missing — skipping")
|
|
continue
|
|
frames.append(extract_phase(ph, drug_names, landmark_ids))
|
|
|
|
frames = [f for f in frames if not f.empty]
|
|
if not frames:
|
|
print("no signatures extracted")
|
|
return
|
|
# A drug present in both phases: average the two phase consensus vectors.
|
|
combined = pd.concat(frames).groupby(level=0).mean()
|
|
combined.columns = [id_to_symbol.get(c, c) for c in combined.columns] # gene_id -> symbol
|
|
|
|
covered = sorted(set(combined.index))
|
|
missing = sorted(drug_names - set(covered))
|
|
print(f"\nsignatures extracted for {len(covered)}/{len(drug_names)} drugs")
|
|
for gt in ["hydroxyurea", "glutamine"]:
|
|
print(f" ground truth '{gt}': {'OK' if gt in covered else 'MISSING'}")
|
|
if args.phase is None:
|
|
combined.to_parquet(OUT)
|
|
print(f"wrote {OUT} ({combined.shape[0]} drugs x {combined.shape[1]} landmark genes)")
|
|
if missing:
|
|
print(f"{len(missing)} drugs without signature (will be marked not-scored in Week 3)")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|