From 819380102331f95a236506d9dc5abad1be09e6d9 Mon Sep 17 00:00:00 2001
From: weixin_43297441 <weixin_43297441@noreply.gitcode.com>
Date: Tue, 2 Dec 2025 11:56:01 +0800
Subject: [PATCH] Remove debug prints from test_model and tidy split-index handling in main

---
 SV_alpaca_7B_AdvBench/res/9/5/log.txt | 23 +++++++-------------
 steer_vector.py                       | 30 ++++++++++++++++-----------
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/SV_alpaca_7B_AdvBench/res/9/5/log.txt b/SV_alpaca_7B_AdvBench/res/9/5/log.txt
index 226dfe9..f88614f 100644
--- a/SV_alpaca_7B_AdvBench/res/9/5/log.txt
+++ b/SV_alpaca_7B_AdvBench/res/9/5/log.txt
@@ -1,16 +1,7 @@
-2025-12-02 11:13:28,208 - INFO - Starting training
-2025-12-02 11:13:28,208 - INFO - component=res, str_layer=9
-2025-12-02 11:13:48,802 - INFO - Epoch [1/20], Loss: 0.3692
-2025-12-02 11:13:48,802 - INFO - Best test AUROC: 1.0000, at epoch: 0
-2025-12-02 11:13:48,803 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy
-2025-12-02 11:13:48,803 - INFO - Epoch [1/20], Train Loss: 0.3692, 
-2025-12-02 11:13:48,803 - INFO - Test AUROC: 1.0000
-2025-12-02 11:14:08,725 - INFO - Epoch [2/20], Loss: 0.0742
-2025-12-02 11:14:08,726 - INFO - Epoch [2/20], Train Loss: 0.0742, 
-2025-12-02 11:14:08,726 - INFO - Test AUROC: 1.0000
-2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Loss: 0.0150
-2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Train Loss: 0.0150, 
-2025-12-02 11:14:28,751 - INFO - Test AUROC: 1.0000
-2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Loss: 0.0036
-2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Train Loss: 0.0036, 
-2025-12-02 11:14:48,784 - INFO - Test AUROC: 1.0000
+2025-12-02 11:50:33,933 - INFO - Starting training
+2025-12-02 11:50:33,934 - INFO - component=res, str_layer=9
+2025-12-02 11:50:54,534 - INFO - Epoch [1/20], Loss: 0.3692
+2025-12-02 11:50:54,535 - INFO - Best test AUROC: 1.0000, at epoch: 0
+2025-12-02 11:50:54,535 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy
+2025-12-02 11:50:54,535 - INFO - Epoch [1/20], Train Loss: 0.3692, 
+2025-12-02 11:50:54,535 - INFO - Test AUROC: 1.0000
diff --git a/steer_vector.py b/steer_vector.py
index 7bf6312..136b4fb 100644
--- a/steer_vector.py
+++ b/steer_vector.py
@@ -12,6 +12,7 @@ from sklearn.metrics import roc_auc_score
 from torch.amp import autocast, GradScaler
 import torch.nn.functional as F
 import logging
+from copy import deepcopy
 
 
 
@@ -175,6 +176,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
             test_predictions, test_labels= test_model(
                 model, centroids, test_prompts, test_labels_, device, batch_size, layer_number
             )
+            
 
             test_auroc = roc_auc_score(
                 test_labels.cpu().numpy(), test_predictions.cpu().numpy()
@@ -265,11 +267,7 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size,
     val_predictions = torch.cat(val_predictions)
     val_labels = torch.cat(val_labels)
     
-    # Debug: print predictions and labels distribution
-    print(f"[DEBUG] test_model: {len(val_predictions)} samples")
-    print(f"[DEBUG] Predictions min/max/mean: {val_predictions.min():.4f}/{val_predictions.max():.4f}/{val_predictions.mean():.4f}")
-    print(f"[DEBUG] Labels distribution: {torch.sum(val_labels == 0)} zeros, {torch.sum(val_labels == 1)} ones")
-    
+
     return val_predictions, val_labels
 
 
@@ -382,14 +380,22 @@ def main():
             qa_pairs.append({'Question': question, 'Answer': clean})
             categories.append(1)  # 1 = benign
             
-        train_index, val_index, test_index=split_indices(len(prompts_), args.train_ratio, args.val_ratio)
+        # Split the samples into train/val/test index sets
+        train_index, val_index, test_index = split_indices(len(prompts_), args.train_ratio, args.val_ratio)
         
-        # Convert numpy arrays to lists for Python list indexing
-        test_index_list = test_index.tolist()
-        train_index_list = train_index.tolist()
-        
-        labels = [[categories[i] for i in test_index_list], [categories[i] for i in train_index_list]]
-        prompts = [[prompts_[i] for i in test_index_list], [prompts_[i] for i in train_index_list]]
+
+        train_index = train_index.tolist()
+        val_index   = val_index.tolist()
+        test_index  = test_index.tolist()
+
+        prompts = [
+            [prompts_[i] for i in test_index],  # test
+            [prompts_[i] for i in train_index]  # train
+        ]
+        labels = [
+            [categories[i] for i in test_index],
+            [categories[i] for i in train_index]
+        ]