v1.1: full gene space + specificity z-score; hydroxyurea recovers
Post-hoc improvement after the pre-registered v1 recovery test failed.

Two changes, diagnosing v1's failure:
- score on the full 12,328-gene LINCS space (week2_lincs_extract.py), lifting signature overlap from 12% to 85% (brings erythroid markers in)
- src/scoring.py: KS connectivity + per-drug specificity z-score (spec_z = SDs below a 1,000 random-query null). Primary ranking is now spec_z. (Textbook tau saturated at +/-100 for a coherent query — documented; needs a reference-signature library, a v2 item.)

Supporting changes:
- week3_scoring.py: spec_z primary + WTCS reference + prior-blended
- tests: tau/spec_z calibration test; 19 passing
- scripts/exp_genespace.py: the BING vs all-12,328 comparison

Result: hydroxyurea recovers (rank 40 -> 18, top 6%, passes top-10%), confirming the v1 failure was the landmark bottleneck, not the algorithm.

Overall STILL FAILS: L-glutamine does not reverse (rank 213, metabolite), and negative controls (norethindrone, ciprofloxacin) rank top-3 — connectivity != therapeutic relatedness. v1.1 is post-hoc/exploratory, not a confirmatory test; reported as such in recovery_test_report.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
102
scripts/exp_genespace.py
Normal file
102
scripts/exp_genespace.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Experiment (v1.1): re-score on a larger LINCS gene space and re-run the recovery test.
|
||||
|
||||
v1 used only the 978 landmark genes (12% signature overlap). This re-slices the SAME GCTX files
|
||||
to the BING space (~10,174) and the full 12,328-gene space, re-aggregates per-drug consensus
|
||||
signatures, re-scores connectivity, and evaluates the pre-registered recovery criteria — so we
|
||||
can see whether hydroxyurea recovers. Writes nothing to the committed v1 artifacts.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import io
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from src.scoring import rank_drugs # noqa: E402
|
||||
|
||||
LINCS = Path("data/raw/lincs")
|
||||
PROCESSED = Path("data/processed")
|
||||
GCTX = {1: LINCS / "phase1_level5.gctx", 2: LINCS / "phase2_level5.gctx"}
|
||||
SIG_INFO = {1: "GSE92742_sig_info.txt.gz", 2: "GSE70138_sig_info.txt.gz"}
|
||||
NEG5 = ["clotrimazole", "astemizole", "azithromycin", "ethinyl-estradiol", "caffeine"]
|
||||
|
||||
|
||||
def read_gz(name):
    """Load a gzip-compressed, tab-separated file from the LINCS directory.

    The file ``LINCS / name`` is read fully into memory, decompressed, and
    parsed by pandas with ``low_memory=False``.

    Args:
        name: file name relative to ``LINCS``.

    Returns:
        The parsed table as a ``pandas.DataFrame``.
    """
    compressed = (LINCS / name).read_bytes()
    buffer = io.BytesIO(gzip.decompress(compressed))
    return pd.read_csv(buffer, sep="\t", low_memory=False)
|
||||
|
||||
|
||||
def gene_ids_for_space(space: str):
    """Return the gene row-ids and an id->symbol map for one LINCS gene space.

    Args:
        space: ``"bing"`` keeps only rows of the gene-info table whose
            ``pr_is_bing`` flag equals 1; ``"all"`` keeps every row.

    Returns:
        A ``(ids, id_to_symbol)`` tuple. ``ids`` lists the ``pr_gene_id``
        values as strings, in table order; ``id_to_symbol`` maps each of those
        strings to the row's ``pr_gene_symbol``.

    Raises:
        ValueError: if ``space`` is neither ``"bing"`` nor ``"all"``.
    """
    # Validate before touching the disk: an unrecognised name used to fall
    # through silently to the full space, so a typo would be scored (and
    # reported) under the wrong label.
    if space not in ("bing", "all"):
        raise ValueError(f"unknown gene space {space!r}; expected 'bing' or 'all'")

    g = pd.read_csv(LINCS / "GSE92742_gene_info.txt.gz", sep="\t")
    if space == "bing":
        g = g[g.pr_is_bing == 1]
    # 'all' -> keep everything

    # Ids are stringified so they can be used as row ids / dict keys downstream.
    ids = [str(x) for x in g.pr_gene_id]
    id_to_symbol = dict(zip(ids, g.pr_gene_symbol))
    return ids, id_to_symbol
|
||||
|
||||
|
||||
def extract(space, drug_names, gene_ids, id_to_symbol):
    """Build a per-drug consensus signature matrix for the requested genes.

    For each of the two phases, the signature-info table is filtered to
    compound treatments (``pert_type == "trt_cp"``) whose ``pert_iname`` is in
    ``drug_names``; the matching columns and the ``gene_ids`` rows are sliced
    from that phase's GCTX file and averaged per drug. The per-phase means are
    then averaged per drug across phases.

    Args:
        space: label of the gene space; only used in progress messages and
            error text.
        drug_names: collection of ``pert_iname`` values to keep.
        gene_ids: row ids to slice from the GCTX files.
        id_to_symbol: mapping used to rename gene-id columns to symbols; ids
            without an entry keep their id as the column name.

    Returns:
        A DataFrame indexed by drug name with one column per gene symbol. When
        several ids map to the same symbol only the first column is kept.

    Raises:
        ValueError: if no signature for any requested drug is found in either
            phase.
    """
    from cmapPy.pandasGEXpress.parse import parse

    frames = []
    for ph in (1, 2):
        sig = read_gz(SIG_INFO[ph])
        sig = sig[(sig.pert_type == "trt_cp") & (sig.pert_iname.isin(drug_names))]
        if sig.empty:
            continue
        gct = parse(str(GCTX[ph]), rid=gene_ids, cid=sig.sig_id.tolist())
        data = gct.data_df
        # Columns are signature ids; map each to its drug and average per drug.
        s2d = dict(zip(sig.sig_id, sig.pert_iname))
        frames.append(data.T.groupby(data.columns.map(s2d)).mean())
        print(f" [{space}] phase {ph}: {sig.pert_iname.nunique()} drugs sliced", flush=True)

    # pd.concat([]) fails with an opaque "No objects to concatenate"; say what
    # actually went wrong instead (same exception type as before).
    if not frames:
        raise ValueError(
            f"no trt_cp signatures found for any requested drug in gene space {space!r}"
        )

    # NOTE(review): this is a mean of per-phase means, so a drug present in both
    # phases weights each phase equally regardless of signature count — confirm
    # that is the intended consensus.
    combined = pd.concat(frames).groupby(level=0).mean()
    combined.columns = [id_to_symbol.get(c, c) for c in combined.columns]
    combined = combined.loc[:, ~combined.columns.duplicated()]  # drop dup symbols
    return combined
|
||||
|
||||
|
||||
def evaluate(space, sig_matrix, up, down):
    """Rank drugs against the query and print the recovery-criteria summary.

    The ranking comes from ``rank_drugs(up, down, sig_matrix)`` and is joined
    with each drug's ``inclusion_reason`` from the drug-profiles parquet. The
    printed verdict is PASS only when all three hold:

    * hydroxyurea's rank is within the top 10% (``int(n * 0.10)``),
    * glutamine's rank is within the top 25% (``int(n * 0.25)``),
    * at least 4 of the ``NEG5`` controls have a rank greater than ``n // 2``.

    Args:
        space: gene-space label, used in the printed report and error text.
        sig_matrix: drug x gene matrix passed to ``rank_drugs``.
        up: up-regulated query genes.
        down: down-regulated query genes.

    Returns:
        The ranking DataFrame with the ``inclusion_reason`` column attached.

    Raises:
        KeyError: if hydroxyurea or glutamine is absent from the ranking.
    """
    ranked = rank_drugs(up, down, sig_matrix)
    n = len(ranked)
    top10, top25, half = int(n * 0.10), int(n * 0.25), n // 2
    profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
    ranked = ranked.join(profiles[["inclusion_reason"]])

    # Fail with an explicit message rather than a bare pandas KeyError when a
    # ground-truth drug did not make it into the ranking.
    missing_truth = [d for d in ("hydroxyurea", "glutamine") if d not in ranked.index]
    if missing_truth:
        raise KeyError(f"ground-truth drug(s) missing from the {space} ranking: {missing_truth}")

    hu, glut = int(ranked.loc["hydroxyurea", "rank"]), int(ranked.loc["glutamine", "rank"])
    glut_s = ranked.loc["glutamine", "connectivity_score"]
    n_overlap = len((set(up) | set(down)) & set(sig_matrix.columns))
    negs = {d: int(ranked.loc[d, "rank"]) for d in NEG5 if d in ranked.index}
    # Controls without a signature cannot be in the bottom half; surface them
    # so the denominator below is not silently misleading.
    missing_negs = [d for d in NEG5 if d not in negs]
    n_bottom = sum(r > half for r in negs.values())

    print(f"\n=== gene space: {space.upper()} ({sig_matrix.shape[1]} genes; query overlap {n_overlap}) ===")
    print(f" hydroxyurea: rank {hu}/{n} (top {100*hu/n:.1f}%) top-10%? {hu <= top10}")
    print(f" L-glutamine: rank {glut}/{n} (top {100*glut/n:.1f}%), WTCS={glut_s:.3f} top-25%? {glut <= top25}")
    print(f" neg controls in bottom half: {n_bottom}/{len(NEG5)} {negs}")
    if missing_negs:
        print(f" neg controls absent from ranking (not counted as bottom-half): {missing_negs}")
    crit = (hu <= top10) and (glut <= top25) and (n_bottom >= 4)
    print(f" OVERALL: {'PASS' if crit else 'FAIL'}")
    print(" top 8:")
    for name, r in ranked.nsmallest(8, "connectivity_score").iterrows():
        print(f" {int(r['rank']):2d} {name:18s} {r['connectivity_score']:+.3f} [{r['inclusion_reason']}]")
    return ranked
|
||||
|
||||
|
||||
def main():
    """Run the gene-space experiment for the BING and full gene spaces.

    Reads the disease signature JSON and the drug-set CSV from ``PROCESSED``,
    then, for each space in turn, resolves the gene ids, extracts the per-drug
    signature matrix, and prints the recovery evaluation.
    """
    signature_path = PROCESSED / "sickle_cell_signature_v1.json"
    signature = json.loads(signature_path.read_text())
    up = [entry["gene"] for entry in signature["up_regulated"]]
    down = [entry["gene"] for entry in signature["down_regulated"]]

    drug_table = pd.read_csv(PROCESSED / "drug_set_v1.csv")
    drug_names = set(drug_table.pert_iname)

    for space in ("bing", "all"):
        print(f"\n>>> extracting {space} ...", flush=True)
        gene_ids, symbol_by_id = gene_ids_for_space(space)
        matrix = extract(space, drug_names, gene_ids, symbol_by_id)
        evaluate(space, matrix, up, down)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -36,9 +36,15 @@ def read_gz_tsv(name: str) -> pd.DataFrame:
|
||||
|
||||
|
||||
def landmark_ids_and_symbols() -> tuple[list[str], dict[str, str]]:
|
||||
lm = pd.read_csv(LINCS / "landmark_genes.csv")
|
||||
ids = [str(x) for x in lm["pr_gene_id"]]
|
||||
id_to_symbol = {str(r.pr_gene_id): r.pr_gene_symbol for r in lm.itertuples()}
|
||||
"""Gene row-ids + id->symbol map for the scored gene space.
|
||||
|
||||
v1.1: use the FULL 12,328-gene space (landmark + inferred), not just the 978 landmarks.
|
||||
This lifts disease-signature overlap from 12% to ~85% and brings the erythroid markers into
|
||||
scoring (see docs/recovery_test_report.md). Inferred genes are model-predicted (noisier).
|
||||
"""
|
||||
g = pd.read_csv(LINCS / "GSE92742_gene_info.txt.gz", sep="\t")
|
||||
ids = [str(x) for x in g["pr_gene_id"]]
|
||||
id_to_symbol = {str(r.pr_gene_id): r.pr_gene_symbol for r in g.itertuples()}
|
||||
return ids, id_to_symbol
|
||||
|
||||
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
"""Week 3: run connectivity scoring over all drugs -> ranked_candidates_v1.csv (PLAN §6).
|
||||
"""Week 3 (v1.1): connectivity scoring over the full gene space with tau calibration.
|
||||
|
||||
Loads the disease signature + the 300 drug LINCS signatures, computes the weighted-KS
|
||||
connectivity score per drug, and produces two rankings:
|
||||
1. raw connectivity (most negative = strongest reversal = rank 1)
|
||||
2. a secondary ranking blending connectivity with a mechanistic prior (sickle-relevant
|
||||
target pathways), to temper broad-effect drugs (HDAC/kinase) that dominate raw rankings.
|
||||
Primary ranking is now **spec_z** (KS connectivity expressed as a z-score vs a null of
|
||||
random queries) — this calibrates out broad-effect drugs that connect to random signatures too,
|
||||
the specificity fix. Tau and the weighted connectivity score (WTCS) are retained as reference
|
||||
columns, and a secondary ranking blends spec_z with the sickle-pathway mechanistic prior.
|
||||
|
||||
The formal recovery test (ground-truth + negative-control evaluation against the pre-registered
|
||||
criteria) is Week 4; this script only prints a sanity peek.
|
||||
Output: data/results/ranked_candidates_v1.csv.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -19,10 +17,13 @@ import pandas as pd
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
from src.scoring import mechanistic_prior, persist_ranking, rank_drugs # noqa: E402
|
||||
from src.scoring import ( # noqa: E402
|
||||
connectivity_score, mechanistic_prior, normalize_scores, persist_ranking, tau_calibrate,
|
||||
)
|
||||
|
||||
PROCESSED = Path("data/processed")
|
||||
PRIOR_LAMBDA = 0.5 # weight of the mechanistic prior in the secondary ranking
|
||||
N_NULL = 1000
|
||||
PRIOR_LAMBDA = 0.5 # spec_z credit per matched sickle pathway, for the blended ranking
|
||||
|
||||
|
||||
def main() -> None:
|
||||
@@ -30,52 +31,51 @@ def main() -> None:
|
||||
up = [g["gene"] for g in sig["up_regulated"]]
|
||||
down = [g["gene"] for g in sig["down_regulated"]]
|
||||
|
||||
sig_matrix = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") # drug x 978 symbols
|
||||
sig_matrix = pd.read_parquet(PROCESSED / "lincs_signatures_v1.parquet") # drug x 12,328 genes
|
||||
profiles = pd.read_parquet(PROCESSED / "drug_profiles_v1.parquet").set_index("name")
|
||||
|
||||
landmark = set(sig_matrix.columns)
|
||||
n_up_ov = len(set(up) & landmark)
|
||||
n_down_ov = len(set(down) & landmark)
|
||||
print(f"query overlap with 978 landmarks: {n_up_ov} up + {n_down_ov} down = {n_up_ov + n_down_ov}")
|
||||
print(f"scoring {len(sig_matrix)} drugs (all scored; 0 without signature)")
|
||||
n_up = len(set(up) & set(sig_matrix.columns))
|
||||
n_down = len(set(down) & set(sig_matrix.columns))
|
||||
print(f"gene space: {sig_matrix.shape[1]} genes; query overlap {n_up} up + {n_down} down = {n_up + n_down}")
|
||||
|
||||
ranked = rank_drugs(up, down, sig_matrix)
|
||||
# primary: tau calibration
|
||||
print(f"computing tau over {N_NULL} random-query permutations ...", flush=True)
|
||||
ranked = tau_calibrate(up, down, sig_matrix, n_null=N_NULL)
|
||||
|
||||
# attach metadata + mechanistic prior
|
||||
# reference: weighted connectivity score (WTCS) + NCS
|
||||
wtcs = pd.Series({d: connectivity_score(up, down, sig_matrix.loc[d]) for d in sig_matrix.index},
|
||||
name="connectivity_score")
|
||||
ranked["connectivity_score"] = wtcs
|
||||
ranked["normalized_score"] = normalize_scores(wtcs)
|
||||
|
||||
# metadata + mechanistic prior
|
||||
ranked = ranked.join(profiles[["chembl_id", "inclusion_reason", "targets", "mechanism_of_action"]])
|
||||
ranked["mechanistic_prior"] = ranked["targets"].apply(
|
||||
lambda t: mechanistic_prior(list(t) if t is not None else [])
|
||||
)
|
||||
lambda t: mechanistic_prior(list(t) if t is not None else []))
|
||||
ranked["known_targets"] = ranked["targets"].apply(
|
||||
lambda t: "; ".join(t) if t is not None and len(t) else ""
|
||||
)
|
||||
lambda t: "; ".join(t) if t is not None and len(t) else "")
|
||||
ranked = ranked.rename(columns={"mechanism_of_action": "mechanism_summary"})
|
||||
|
||||
# secondary, prior-weighted ranking: relevant drugs pushed toward better (more negative)
|
||||
ranked["blended_score"] = ranked["normalized_score"] - PRIOR_LAMBDA * ranked["mechanistic_prior"]
|
||||
# secondary, prior-weighted ranking (relevant drugs pushed toward more-negative spec_z)
|
||||
ranked["blended_score"] = ranked["spec_z"] - PRIOR_LAMBDA * ranked["mechanistic_prior"]
|
||||
ranked["blended_rank"] = ranked["blended_score"].rank(method="first").astype(int)
|
||||
|
||||
out = ranked.rename_axis("drug_name").reset_index()[[
|
||||
"rank", "drug_name", "chembl_id", "connectivity_score", "normalized_score",
|
||||
"rank", "drug_name", "chembl_id", "spec_z", "tau", "connectivity_ks", "connectivity_score",
|
||||
"inclusion_reason", "mechanistic_prior", "blended_rank", "known_targets", "mechanism_summary",
|
||||
]]
|
||||
path = persist_ranking(out)
|
||||
print(f"wrote {path} ({len(out)} drugs)")
|
||||
|
||||
# --- sanity peek (formal recovery test is Week 4) ---
|
||||
print("\n--- sanity peek (raw connectivity rank) ---")
|
||||
print("\n--- sanity peek (spec_z ranking) ---")
|
||||
for gt in ["hydroxyurea", "glutamine"]:
|
||||
r = ranked.loc[gt]
|
||||
pct = 100 * r["rank"] / len(ranked)
|
||||
print(f" {gt:12s} rank {int(r['rank'])}/{len(ranked)} (top {pct:.0f}%), "
|
||||
f"score={r['connectivity_score']:.3f}")
|
||||
neg = ranked[ranked["inclusion_reason"] == "negative_control"]
|
||||
print(f" negative controls in bottom half: "
|
||||
f"{(neg['rank'] > len(ranked) / 2).sum()}/{len(neg)}")
|
||||
print("\n top 5 raw candidates:")
|
||||
for name, r in ranked.nsmallest(5, "connectivity_score").iterrows():
|
||||
print(f" {int(r['rank']):3d} {name:18s} {r['connectivity_score']:+.3f} "
|
||||
f"[{r['inclusion_reason']}] {r['known_targets'][:50]}")
|
||||
print(f" {gt:12s} rank {int(r['rank'])}/{len(ranked)} (top {100*r['rank']/len(ranked):.0f}%), "
|
||||
f"spec_z={r['spec_z']:.2f}")
|
||||
print(" top 10 by spec_z:")
|
||||
for name, r in ranked.nsmallest(10, "spec_z").iterrows():
|
||||
print(f" {int(r['rank']):2d} {name:18s} z={r['spec_z']:6.2f} [{r['inclusion_reason']:16s}] "
|
||||
f"{str(r['known_targets'])[:38]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -36,6 +36,7 @@ def main() -> None:
|
||||
return int(df.loc[name, "rank"]) if name in df.index else None
|
||||
|
||||
hu, glut = rk("hydroxyurea"), rk("glutamine")
|
||||
glut_z = df.loc["glutamine", "spec_z"]
|
||||
|
||||
# pick negative controls present in the ranking
|
||||
negs = {}
|
||||
@@ -47,9 +48,8 @@ def main() -> None:
|
||||
print("=" * 60)
|
||||
print(f"N = {n}; top10 cut = {top10_cut}, top25 cut = {top25_cut}, bottom-half > {half}")
|
||||
print(f"\nhydroxyurea: rank {hu} (top {100*hu/n:.1f}%) -> top-10%? {hu <= top10_cut}")
|
||||
glut_score = df.loc["glutamine", "connectivity_score"]
|
||||
print(f"L-glutamine: rank {glut} (top {100*glut/n:.1f}%), WTCS={glut_score:.3f} "
|
||||
f"-> top-25%? {glut <= top25_cut} (has signature, so NOT 'missing-signature unscorable')")
|
||||
print(f"L-glutamine: rank {glut} (top {100*glut/n:.1f}%), spec_z={glut_z:+.2f} "
|
||||
f"-> top-25%? {glut <= top25_cut} (positive z => does not reverse; has a signature)")
|
||||
print("\nnegative controls (pre-specified, 1 per category):")
|
||||
n_bottom = 0
|
||||
for d, (cat, r) in negs.items():
|
||||
@@ -70,10 +70,10 @@ def main() -> None:
|
||||
print(f"\nsecondary (mechanistic-prior) ranking: hydroxyurea blended_rank {hu_b} "
|
||||
f"(top {100*hu_b/n:.1f}%)")
|
||||
|
||||
print("\n--- TOP 10 (raw connectivity) ---")
|
||||
top10 = df.nsmallest(10, "connectivity_score")
|
||||
print("\n--- TOP 10 (primary spec_z ranking) ---")
|
||||
top10 = df.sort_values("rank").head(10)
|
||||
for name, r in top10.iterrows():
|
||||
print(f" {int(r['rank']):2d} {name:18s} {r['connectivity_score']:+.3f} "
|
||||
print(f" {int(r['rank']):2d} {name:18s} z={r['spec_z']:+.2f} "
|
||||
f"[{r['inclusion_reason']}] {str(r['known_targets'])[:45]}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user