up

2025-12-02 11:56:01 +08:00 · 2025-12-02 11:56:01 +08:00 · 8193801023
parent 67139e4279
commit 8193801023
2 changed files with 25 additions and 28 deletions
--- a/SV_alpaca_7B_AdvBench/res/9/5/log.txt
+++ b/SV_alpaca_7B_AdvBench/res/9/5/log.txt
@@ -1,16 +1,7 @@
-2025-12-02 11:13:28,208 - INFO - Starting training
+2025-12-02 11:50:33,933 - INFO - Starting training
-2025-12-02 11:13:28,208 - INFO - component=res, str_layer=9
+2025-12-02 11:50:33,934 - INFO - component=res, str_layer=9
-2025-12-02 11:13:48,802 - INFO - Epoch [1/20], Loss: 0.3692
+2025-12-02 11:50:54,534 - INFO - Epoch [1/20], Loss: 0.3692
-2025-12-02 11:13:48,802 - INFO - Best test AUROC: 1.0000, at epoch: 0
+2025-12-02 11:50:54,535 - INFO - Best test AUROC: 1.0000, at epoch: 0
-2025-12-02 11:13:48,803 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy
+2025-12-02 11:50:54,535 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy
-2025-12-02 11:13:48,803 - INFO - Epoch [1/20], Train Loss: 0.3692, 
+2025-12-02 11:50:54,535 - INFO - Epoch [1/20], Train Loss: 0.3692, 
-2025-12-02 11:13:48,803 - INFO - Test AUROC: 1.0000
+2025-12-02 11:50:54,535 - INFO - Test AUROC: 1.0000
 2025-12-02 11:14:08,725 - INFO - Epoch [2/20], Loss: 0.0742
 2025-12-02 11:14:08,726 - INFO - Epoch [2/20], Train Loss: 0.0742, 
 2025-12-02 11:14:08,726 - INFO - Test AUROC: 1.0000
 2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Loss: 0.0150
 2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Train Loss: 0.0150, 
 2025-12-02 11:14:28,751 - INFO - Test AUROC: 1.0000
 2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Loss: 0.0036
 2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Train Loss: 0.0036, 
 2025-12-02 11:14:48,784 - INFO - Test AUROC: 1.0000
--- a/steer_vector.py
+++ b/steer_vector.py
@@ -12,6 +12,7 @@ from sklearn.metrics import roc_auc_score
 from torch.amp import autocast, GradScaler
 import torch.nn.functional as F
 import logging
 from copy import deepcopy
@@ -175,6 +176,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
            test_predictions, test_labels= test_model(
                model, centroids, test_prompts, test_labels_, device, batch_size, layer_number
            )
            test_auroc = roc_auc_score(
                test_labels.cpu().numpy(), test_predictions.cpu().numpy()
@@ -265,11 +267,7 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size,
    val_predictions = torch.cat(val_predictions)
    val_labels = torch.cat(val_labels)
-    # Debug: print predictions and labels distribution
+
    print(f"[DEBUG] test_model: {len(val_predictions)} samples")
    print(f"[DEBUG] Predictions min/max/mean: {val_predictions.min():.4f}/{val_predictions.max():.4f}/{val_predictions.mean():.4f}")
    print(f"[DEBUG] Labels distribution: {torch.sum(val_labels == 0)} zeros, {torch.sum(val_labels == 1)} ones")
    return val_predictions, val_labels
@@ -382,14 +380,22 @@ def main():
            qa_pairs.append({'Question': question, 'Answer': clean})
            categories.append(1)  # 1 = benign
-        train_index, val_index, test_index=split_indices(len(prompts_), args.train_ratio, args.val_ratio)
+        # Check the data split
        train_index, val_index, test_index = split_indices(len(prompts_), args.train_ratio, args.val_ratio)
-        # Convert numpy arrays to lists for Python list indexing
+
-        test_index_list = test_index.tolist()
+        train_index = train_index.tolist()
-        train_index_list = train_index.tolist()
+        val_index   = val_index.tolist()
-        
+        test_index  = test_index.tolist()
-        labels = [[categories[i] for i in test_index_list], [categories[i] for i in train_index_list]]
+
-        prompts = [[prompts_[i] for i in test_index_list], [prompts_[i] for i in train_index_list]]
+        prompts = [
            [prompts_[i] for i in test_index],  # test
            [prompts_[i] for i in train_index]  # train
        ]
        labels = [
            [categories[i] for i in test_index],
            [categories[i] for i in train_index]
        ]