From 819380102331f95a236506d9dc5abad1be09e6d9 Mon Sep 17 00:00:00 2001 From: weixin_43297441 Date: Tue, 2 Dec 2025 11:56:01 +0800 Subject: [PATCH] up --- SV_alpaca_7B_AdvBench/res/9/5/log.txt | 23 +++++++------------- steer_vector.py | 30 ++++++++++++++++----------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/SV_alpaca_7B_AdvBench/res/9/5/log.txt b/SV_alpaca_7B_AdvBench/res/9/5/log.txt index 226dfe9..f88614f 100644 --- a/SV_alpaca_7B_AdvBench/res/9/5/log.txt +++ b/SV_alpaca_7B_AdvBench/res/9/5/log.txt @@ -1,16 +1,7 @@ -2025-12-02 11:13:28,208 - INFO - Starting training -2025-12-02 11:13:28,208 - INFO - component=res, str_layer=9 -2025-12-02 11:13:48,802 - INFO - Epoch [1/20], Loss: 0.3692 -2025-12-02 11:13:48,802 - INFO - Best test AUROC: 1.0000, at epoch: 0 -2025-12-02 11:13:48,803 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy -2025-12-02 11:13:48,803 - INFO - Epoch [1/20], Train Loss: 0.3692, -2025-12-02 11:13:48,803 - INFO - Test AUROC: 1.0000 -2025-12-02 11:14:08,725 - INFO - Epoch [2/20], Loss: 0.0742 -2025-12-02 11:14:08,726 - INFO - Epoch [2/20], Train Loss: 0.0742, -2025-12-02 11:14:08,726 - INFO - Test AUROC: 1.0000 -2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Loss: 0.0150 -2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Train Loss: 0.0150, -2025-12-02 11:14:28,751 - INFO - Test AUROC: 1.0000 -2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Loss: 0.0036 -2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Train Loss: 0.0036, -2025-12-02 11:14:48,784 - INFO - Test AUROC: 1.0000 +2025-12-02 11:50:33,933 - INFO - Starting training +2025-12-02 11:50:33,934 - INFO - component=res, str_layer=9 +2025-12-02 11:50:54,534 - INFO - Epoch [1/20], Loss: 0.3692 +2025-12-02 11:50:54,535 - INFO - Best test AUROC: 1.0000, at epoch: 0 +2025-12-02 11:50:54,535 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy +2025-12-02 11:50:54,535 - INFO - Epoch [1/20], Train Loss: 0.3692, +2025-12-02 11:50:54,535 - INFO - Test AUROC: 1.0000 diff --git a/steer_vector.py b/steer_vector.py index 7bf6312..136b4fb 100644 --- a/steer_vector.py +++ b/steer_vector.py @@ -12,6 +12,7 @@ from sklearn.metrics import roc_auc_score from torch.amp import autocast, GradScaler import torch.nn.functional as F import logging +from copy import deepcopy @@ -175,6 +176,7 @@ def train_model(model, optimizer, device, prompts, labels, args): test_predictions, test_labels= test_model( model, centroids, test_prompts, test_labels_, device, batch_size, layer_number ) + test_auroc = roc_auc_score( test_labels.cpu().numpy(), test_predictions.cpu().numpy() @@ -265,11 +267,7 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size, val_predictions = torch.cat(val_predictions) val_labels = torch.cat(val_labels) - # Debug: print predictions and labels distribution - print(f"[DEBUG] test_model: {len(val_predictions)} samples") - print(f"[DEBUG] Predictions min/max/mean: {val_predictions.min():.4f}/{val_predictions.max():.4f}/{val_predictions.mean():.4f}") - print(f"[DEBUG] Labels distribution: {torch.sum(val_labels == 0)} zeros, {torch.sum(val_labels == 1)} ones") - + return val_predictions, val_labels @@ -382,14 +380,22 @@ def main(): qa_pairs.append({'Question': question, 'Answer': clean}) categories.append(1) # 1 = benign - train_index, val_index, test_index=split_indices(len(prompts_), args.train_ratio, args.val_ratio) + # 检查数据划分 + train_index, val_index, test_index = split_indices(len(prompts_), args.train_ratio, args.val_ratio) - # Convert numpy arrays to lists for Python list indexing - test_index_list = test_index.tolist() - train_index_list = train_index.tolist() - - labels = [[categories[i] for i in test_index_list], [categories[i] for i in train_index_list]] - prompts = [[prompts_[i] for i in test_index_list], [prompts_[i] for i in train_index_list]] + + train_index = train_index.tolist() + val_index = val_index.tolist() + test_index = test_index.tolist() + + prompts = [ + [prompts_[i] for i in test_index], # test + [prompts_[i] for i in train_index] # train + ] + labels = [ + [categories[i] for i in test_index], + [categories[i] for i in train_index] + ]