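"""Compare 4096-D cosine neighbours against neighbours inside a t-SNE 2-D circle,
to see which of the two discriminates more accurately.

Reuses cache_emb/. t-SNE is fitted on the 1000 golden + 200 ruler points together
(1200 points) so that both sets share one consistent projection.
"""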
| import argparse |
| import json |
| from pathlib import Path |
|
|
| import numpy as np |
| import pandas as pd |
| from sklearn.manifold import TSNE |
|
|
|
|
# Default values for the command-line options; main() uses them as argparse
# defaults, so each one can be overridden on the command line.
DEFAULTS = {
    # Directory searched for the cached embedding .npy files.
    "cache_dir": "cache_emb",
    # Golden-set CSV; its "label" column supplies the ground truth.
    "csv": "/mnt/bn/tns-algo-ue-my/biaowu/aipf_dm_metric/example/yss_ruler_eval/data/aipf_golden_set.csv",
    # Ruler items JSON; every item is read for its "rank" and "score".
    "ruler": "/mnt/bn/tns-algo-ue-my/biaowu/aipf_dm_metric/ranking_moderation/data/dm/youth_sexual_and_physical_abuse_aigt_v009/ranking_bucket/ruler_items.json",
}
|
|
|
|
def load_npy_pair(cache_dir, n_csv, n_ruler, max_length=4096):
    """Load the cached CSV and ruler embedding arrays from ``cache_dir``.

    Cache files are located by glob: ``csv_*_n{n_csv}_L{max_length}.npy`` and
    ``ruler_*_n{n_ruler}_L{max_length}.npy``. When several files match one
    pattern, the lexicographically first path is used so that repeated runs
    always pick the same file (``Path.glob`` itself promises no ordering).

    Args:
        cache_dir: Directory that contains the ``.npy`` cache files.
        n_csv: Count encoded after ``_n`` in the CSV cache file name.
        n_ruler: Count encoded after ``_n`` in the ruler cache file name.
        max_length: Value encoded after ``_L`` in both file names.

    Returns:
        Tuple ``(csv_array, ruler_array)`` as returned by ``np.load``.

    Raises:
        FileNotFoundError: If either pattern matches no file. The message
            names the pattern that actually failed.
    """
    cd = Path(cache_dir)
    csv_pattern = f"csv_*_n{n_csv}_L{max_length}.npy"
    ruler_pattern = f"ruler_*_n{n_ruler}_L{max_length}.npy"

    # Sort so the choice among multiple matches is deterministic.
    csvs = sorted(cd.glob(csv_pattern))
    rulers = sorted(cd.glob(ruler_pattern))

    # Report the pattern that is really missing instead of always blaming
    # the CSV cache.
    for matches, pattern in ((csvs, csv_pattern), (rulers, ruler_pattern)):
        if not matches:
            raise FileNotFoundError(f"找不到缓存。期望 {cd}/{pattern}")

    return np.load(csvs[0]), np.load(rulers[0])
|
|
|
|
def load_ruler_meta(path):
    """Read ruler items from a JSON file and return their ranks and scores.

    The JSON root may be a list of items, or an object that holds the list
    under the first non-empty key among ``"items"``, ``"ruler_items"`` and
    ``"data"``. An object with none of those keys yields no items.

    Args:
        path: Path of the JSON file.

    Returns:
        Tuple ``(ranks, scores)``: an integer array built from each item's
        ``"rank"`` and a float array built from each item's ``"score"``,
        both in file order.

    Raises:
        KeyError: If an item has no ``"rank"`` or ``"score"`` entry.
        ValueError: If a rank or score cannot be converted to a number.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, list):
        items = data
    else:
        items = data.get("items") or data.get("ruler_items") or data.get("data") or []

    # Explicit dtypes keep the result types stable even when `items` is empty
    # (a bare np.array([]) would otherwise be float64 for the ranks too).
    ranks = np.array([int(it["rank"]) for it in items], dtype=int)
    scores = np.array([float(it["score"]) for it in items], dtype=float)
    return ranks, scores
|
|
|
|
def metrics(preds, gts):
    """Compute binary-classification counts and summary metrics.

    Only the values 1 (positive) and 0 (negative) are counted; any other
    value in either input falls into none of the four confusion cells.

    Args:
        preds: Predicted labels (array or sequence of 0/1).
        gts: Ground-truth labels, aligned element-wise with ``preds``.

    Returns:
        Tuple ``(tp, fp, tn, fn, precision, recall, f1, accuracy)``. Each
        ratio is ``0.0`` when its denominator is zero, including accuracy
        for empty input.
    """
    # Accept plain sequences as well as arrays.
    preds = np.asarray(preds)
    gts = np.asarray(gts)

    pred_pos, pred_neg = preds == 1, preds == 0
    gt_pos, gt_neg = gts == 1, gts == 0

    tp = int((pred_pos & gt_pos).sum())
    fp = int((pred_pos & gt_neg).sum())
    tn = int((pred_neg & gt_neg).sum())
    fn = int((pred_neg & gt_pos).sum())

    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    # Guard the empty case the same way the other ratios are guarded.
    total = len(preds)
    a = (tp + tn) / total if total else 0.0
    return tp, fp, tn, fn, p, r, f, a
|
|
|
|
def best_threshold(scores, gts):
    """Sweep every distinct score as a cut-off and keep the best-F1 one.

    A sample is predicted positive when ``score >= threshold``. Candidate
    thresholds are tried in ascending order and replace the current best
    only on a strict F1 improvement, so ties keep the lowest threshold.

    Args:
        scores: Array of per-sample scores.
        gts: Array of 0/1 ground-truth labels aligned with ``scores``.

    Returns:
        Tuple ``(f1, threshold, precision, recall)`` for the best cut-off,
        or ``(-1.0, None, None, None)`` when ``scores`` has no values.
    """
    top_f1, top_thr, top_prec, top_rec = -1.0, None, None, None
    for thr in sorted(set(scores.tolist())):
        predicted = (scores >= thr).astype(int)
        # metrics() returns (tp, fp, tn, fn, p, r, f1, acc); take p, r, f1.
        prec, rec, f1 = metrics(predicted, gts)[4:7]
        if f1 > top_f1:
            top_f1, top_thr, top_prec, top_rec = f1, thr, prec, rec
    return top_f1, top_thr, top_prec, top_rec
|
|
|
|
def topk_neighbors(query_xy, ruler_xy, k, *, return_dist=False):
    """Find, for every query point, the ``k`` nearest ruler points.

    Distances are Euclidean, computed over the last axis of the inputs.

    Args:
        query_xy: 2-D array indexed as (query, coordinate).
        ruler_xy: 2-D array indexed as (ruler, coordinate).
        k: Number of neighbours per query; must satisfy
            ``1 <= k <= len(ruler_xy)``.
        return_dist: When true, also return the matching distances.

    Returns:
        ``idx``: integer array of shape ``(len(query_xy), k)`` holding ruler
        indices ordered from nearest to farthest. With ``return_dist=True``
        the result is ``(idx, dist)`` where ``dist`` has the same shape and
        ordering.

    Raises:
        ValueError: If ``k`` is outside ``1..len(ruler_xy)``.
    """
    n_ruler = len(ruler_xy)
    if not 1 <= k <= n_ruler:
        raise ValueError(
            f"k must be between 1 and the number of ruler points ({n_ruler}), got {k}"
        )

    diffs = query_xy[:, None, :] - ruler_xy[None, :, :]
    dists = np.linalg.norm(diffs, axis=-1)

    # argpartition only guarantees that the k smallest distances occupy the
    # first k columns, in no particular order; sort those k afterwards.
    idx = np.argpartition(dists, k - 1, axis=1)[:, :k]
    selected = np.take_along_axis(dists, idx, axis=1)
    order = np.argsort(selected, axis=1)
    idx = np.take_along_axis(idx, order, axis=1)

    if return_dist:
        return idx, np.take_along_axis(selected, order, axis=1)
    return idx
|
|
|
|
def main():
    """Compare top-K neighbour scoring in embedding space and in t-SNE 2-D.

    Steps:
      1. Load the labelled CSV, the ruler metadata and both cached
         embedding arrays.
      2. Baseline: for each CSV row take the K ruler items with the largest
         dot-product similarity and derive three scores from them.
      3. Fit one t-SNE projection on CSV and ruler embeddings stacked
         together.
      4. In the 2-D projection take the K nearest ruler items (Euclidean)
         and derive three scores from them.
      5. Report how much the two neighbour sets overlap.
      6. For every scoring method, sweep thresholds and print the best F1.

    Results are printed to stdout; nothing is returned.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--cache-dir", default=DEFAULTS["cache_dir"])
    p.add_argument("--csv", default=DEFAULTS["csv"])
    p.add_argument("--ruler", default=DEFAULTS["ruler"])
    p.add_argument("--positive-label", default="Y")
    p.add_argument("--boundary-rank", type=int, default=106)
    p.add_argument("--max-length", type=int, default=4096)
    p.add_argument("--perplexity", type=float, default=30.0)
    p.add_argument("--k", type=int, default=100)
    p.add_argument("--seed", type=int, default=42)
    args = p.parse_args()

    print("[1] load")
    # keep_default_na=False keeps label text such as "NA" as a literal string.
    df = pd.read_csv(args.csv, keep_default_na=False)
    # Ground truth: 1 where the label equals the positive label, ignoring case.
    gts = df["label"].astype(str).str.upper().eq(args.positive_label.upper()).astype(int).values
    ruler_rank, ruler_score = load_ruler_meta(args.ruler)
    n_csv, n_ruler = len(gts), len(ruler_rank)
    csv_emb, ruler_emb = load_npy_pair(args.cache_dir, n_csv, n_ruler, args.max_length)

    K = args.k
    # Fail early with a readable message instead of an argpartition
    # "kth out of bounds" error (K too large) or a division by zero (K == 0).
    if not 1 <= K <= n_ruler:
        p.error(f"--k must be between 1 and the number of ruler items ({n_ruler}), got {K}")
    methods = {}

    print(f"[2] baseline: 4096D cosine top-{K}")
    # NOTE(review): a plain dot product equals cosine similarity only when the
    # cached embeddings are L2-normalised — confirm against the cache writer.
    sims = csv_emb @ ruler_emb.T
    # Negate so that argpartition selects the K largest similarities.
    top_idx = np.argpartition(-sims, K - 1, axis=1)[:, :K]
    top_sims = np.take_along_axis(sims, top_idx, axis=1)
    top_score_4096 = ruler_score[top_idx]
    # Similarity-weighted mean of neighbour scores; the denominator is clamped
    # to avoid dividing by zero.
    raw_w = (top_sims * top_score_4096).sum(axis=1) / np.maximum(top_sims.sum(axis=1), 1e-12)
    raw_mean = top_score_4096.mean(axis=1)
    # Number of neighbours whose rank is below the boundary rank.
    raw_vote = (ruler_rank[top_idx] < args.boundary_rank).sum(axis=1)
    methods["4096D cosine | weighted_score"] = raw_w
    methods["4096D cosine | mean(score)"] = raw_mean
    methods["4096D cosine | vote_count"] = raw_vote.astype(float)

    # Report the real number of projected points rather than a fixed figure.
    print(f"[3] t-SNE on {len(csv_emb) + len(ruler_emb)} points (perplexity={args.perplexity})")
    # Both sets go through one fit so they share the same 2-D projection.
    all_emb = np.vstack([csv_emb, ruler_emb])
    tsne = TSNE(n_components=2, perplexity=args.perplexity,
                init="pca", random_state=args.seed,
                metric="cosine", learning_rate="auto")
    xy = tsne.fit_transform(all_emb)
    csv_xy, ruler_xy = xy[:n_csv], xy[n_csv:]

    print(f"[4] 2D Euclidean top-{K} (in t-SNE space)")
    top_idx_2d = topk_neighbors(csv_xy, ruler_xy, K)
    top_score_2d = ruler_score[top_idx_2d]
    rank_2d = ruler_rank[top_idx_2d]
    methods["t-SNE 2D | mean(score)"] = top_score_2d.mean(axis=1)
    methods["t-SNE 2D | vote_count"] = (rank_2d < args.boundary_rank).sum(axis=1).astype(float)

    # Distances to the selected neighbours only; the full query-by-ruler
    # distance matrix is not needed a second time.
    selected_dist = np.linalg.norm(csv_xy[:, None, :] - ruler_xy[top_idx_2d], axis=-1)
    # Inverse-distance weights; the epsilon avoids division by zero for
    # coincident points.
    weights = 1.0 / (selected_dist + 1e-6)
    weighted_2d = (weights * top_score_2d).sum(axis=1) / weights.sum(axis=1)
    methods["t-SNE 2D | inv_dist weighted"] = weighted_2d

    # Per-row share of neighbours common to both top-K sets.
    overlap = [
        len(set(hi_dim.tolist()) & set(lo_dim.tolist())) / K
        for hi_dim, lo_dim in zip(top_idx, top_idx_2d)
    ]
    print(f"\n[5] 邻居重叠率(4096D vs 2D 各取 top-{K}):")
    print(f" 平均 = {np.mean(overlap):.2%}")
    print(f" 中位数 = {np.median(overlap):.2%}")
    print(f" p10 / p90 = {np.percentile(overlap, 10):.2%} / {np.percentile(overlap, 90):.2%}")

    print(f"\n[6] best F1 by sweeping threshold (K={K})")
    print(f"{'method':<35}{'F1':>9}{'thr':>10}{'P':>9}{'R':>9}")
    print("-" * 75)
    for name, scores in methods.items():
        f1, thr, prec, rec = best_threshold(scores, gts)
        print(f"{name:<35}{f1:>9.4f}{thr:>10.4f}{prec:>9.4f}{rec:>9.4f}")


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|