From 67139e42798b158497404d0115d54a1ffb8f6003 Mon Sep 17 00:00:00 2001 From: weixin_43297441 Date: Tue, 2 Dec 2025 11:27:03 +0800 Subject: [PATCH] add --- SV_alpaca_7B_AdvBench/res/9/5/log.txt | 16 ++++ a.sh | 2 +- cache_utils.py | 112 ++++++++++++++++++++++++++ llm_layers.py | 2 +- steer_vector.py | 22 +++-- train_utils.py | 1 + 6 files changed, 147 insertions(+), 8 deletions(-) create mode 100644 SV_alpaca_7B_AdvBench/res/9/5/log.txt create mode 100644 cache_utils.py diff --git a/SV_alpaca_7B_AdvBench/res/9/5/log.txt b/SV_alpaca_7B_AdvBench/res/9/5/log.txt new file mode 100644 index 0000000..226dfe9 --- /dev/null +++ b/SV_alpaca_7B_AdvBench/res/9/5/log.txt @@ -0,0 +1,16 @@ +2025-12-02 11:13:28,208 - INFO - Starting training +2025-12-02 11:13:28,208 - INFO - component=res, str_layer=9 +2025-12-02 11:13:48,802 - INFO - Epoch [1/20], Loss: 0.3692 +2025-12-02 11:13:48,802 - INFO - Best test AUROC: 1.0000, at epoch: 0 +2025-12-02 11:13:48,803 - INFO - Saved best centroids to SV_alpaca_7B_AdvBench/res/9/5/best_centroids_epoch_0.npy +2025-12-02 11:13:48,803 - INFO - Epoch [1/20], Train Loss: 0.3692, +2025-12-02 11:13:48,803 - INFO - Test AUROC: 1.0000 +2025-12-02 11:14:08,725 - INFO - Epoch [2/20], Loss: 0.0742 +2025-12-02 11:14:08,726 - INFO - Epoch [2/20], Train Loss: 0.0742, +2025-12-02 11:14:08,726 - INFO - Test AUROC: 1.0000 +2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Loss: 0.0150 +2025-12-02 11:14:28,751 - INFO - Epoch [3/20], Train Loss: 0.0150, +2025-12-02 11:14:28,751 - INFO - Test AUROC: 1.0000 +2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Loss: 0.0036 +2025-12-02 11:14:48,784 - INFO - Epoch [4/20], Train Loss: 0.0036, +2025-12-02 11:14:48,784 - INFO - Test AUROC: 1.0000 diff --git a/a.sh b/a.sh index b0cefda..d88039f 100644 --- a/a.sh +++ b/a.sh @@ -1,2 +1,2 @@ export HF_ENDPOINT=https://hf-mirror.com -CUDA_VISIBLE_DEVICES=9 python hal_generate.py \ No newline at end of file +CUDA_VISIBLE_DEVICES=9 python steer_vector.py \ No newline at end of file diff --git a/cache_utils.py b/cache_utils.py new file mode 100644 index 0000000..f59f2af --- /dev/null +++ b/cache_utils.py @@ -0,0 +1,112 @@ +import copy +import importlib.metadata +import json +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from packaging import version + +# from .configuration_utils import PretrainedConfig +# from .utils import ( +# is_hqq_available, +# is_optimum_quanto_available, +# is_torchdynamo_compiling, +# logging, +# ) +# from .utils.deprecation import deprecate_kwarg + + +# if is_hqq_available(): +# from hqq.core.quantize import Quantizer as HQQQuantizer + +# logger = logging.get_logger(__name__) + + +class Cache(torch.nn.Module): + """ + Base, abstract class for all caches. The actual data structure is specific to each subclass. + """ + + def __init__(self): + super().__init__() + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. These are specific to each subclass and allow new types of + cache to be created. + + Return: + A tuple containing the updated key and value states. + """ + raise NotImplementedError("Make sure to implement `update` in a subclass.") + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # TODO: deprecate this function in favor of `cache_position` + raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.") + + # Deprecate in favor of max-cache-shape because we want to be specifc by what we mean with "max_length" + # Prev some cache objects didn't have "max_length" (SlidingWindowCache or SinkCache) because the cache object technically handles + # infinite amount of tokens. In the codebase what we really need to check is the max capacity of certain cache instances, so + # we change naming to be more explicit + def get_max_length(self) -> Optional[int]: + # logger.warning_once( + # "`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. " + # "Calling `get_max_cache()` will raise error from v4.48" + # ) + return self.get_max_cache_shape() + + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length (i.e. max capacity) of the cache object""" + raise NotImplementedError("Make sure to implement `get_max_cache_shape` in a subclass.") + + def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int: + """Given the sequence length of the new inputs, returns the usable length of the cache.""" + # Cache without size limit -> all cache is usable + # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache + # length, we will need to evict part of the cache (and thus not all cache is usable) + max_length = self.get_max_cache_shape() + previous_seq_length = self.get_seq_length(layer_idx) + if max_length is not None and previous_seq_length + new_seq_length > max_length: + return max_length - new_seq_length + return previous_seq_length + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + if self.key_cache[layer_idx] != []: + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + if self.value_cache[layer_idx] != []: + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + @property + def seen_tokens(self): + # logger.warning_once( + # "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` " + # "model input instead." + # ) + if hasattr(self, "_seen_tokens"): + return self._seen_tokens + else: + return None + diff --git a/llm_layers.py b/llm_layers.py index cdd9fd6..502d24f 100644 --- a/llm_layers.py +++ b/llm_layers.py @@ -91,7 +91,7 @@ class SVLayer(nn.Module): self.lam = lam def forward(self, x): - if self.tv is not None: + if self.sv is not None: x = x.half() y = self.lam[0] * self.sv.repeat(1,x.shape[1],1) diff --git a/steer_vector.py b/steer_vector.py index cb0f5bb..7bf6312 100644 --- a/steer_vector.py +++ b/steer_vector.py @@ -48,8 +48,7 @@ def train_model(model, optimizer, device, prompts, labels, args): # ========= 日志 & 结果保存目录 ========= dir_name = f"SV_{args.model_name}_{args.dataset_name}/{args.component}/{args.str_layer}/{args.lam}" - log_dir = f"/{dir_name}/" - log_file = os.path.join(log_dir, f"log.txt") + log_file = os.path.join(dir_name, f"log.txt") os.makedirs(dir_name, exist_ok=True) logging.basicConfig( @@ -74,7 +73,7 @@ def train_model(model, optimizer, device, prompts, labels, args): scaler = GradScaler('cuda') # 混合精度的梯度缩放器 - num_trains = args.num_train + num_trains = len(train_prompts) @@ -207,7 +206,7 @@ def train_model(model, optimizer, device, prompts, labels, args): - return best_test_auroc + return best_test_auroc @@ -265,6 +264,12 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size, val_predictions = torch.cat(val_predictions) val_labels = torch.cat(val_labels) + + # Debug: print predictions and labels distribution + print(f"[DEBUG] test_model: {len(val_predictions)} samples") + print(f"[DEBUG] Predictions min/max/mean: {val_predictions.min():.4f}/{val_predictions.max():.4f}/{val_predictions.mean():.4f}") + print(f"[DEBUG] Labels distribution: {torch.sum(val_labels == 0)} zeros, {torch.sum(val_labels == 1)} ones") + return val_predictions, val_labels @@ -287,6 +292,7 @@ def main(): # 1. 解析命令行参数 # ====================== parser = argparse.ArgumentParser() + parser.add_argument('--model_prefix', type=str, default='') parser.add_argument('--model_name', type=str, default='alpaca_7B') parser.add_argument('--num_gene', type=int, default=1) # 每个问题生成多少个答案 parser.add_argument('--train_sv', type=bool, default=True) # 是否执行“生成答案”阶段(1=生成+保存答案,0=不生成) @@ -378,8 +384,12 @@ def main(): train_index, val_index, test_index=split_indices(len(prompts_), args.train_ratio, args.val_ratio) - labels = [categories[test_index], categories[train_index]] - prompts = [prompts_[test_index], prompts_[train_index]] + # Convert numpy arrays to lists for Python list indexing + test_index_list = test_index.tolist() + train_index_list = train_index.tolist() + + labels = [[categories[i] for i in test_index_list], [categories[i] for i in train_index_list]] + prompts = [[prompts_[i] for i in test_index_list], [prompts_[i] for i in train_index_list]] diff --git a/train_utils.py b/train_utils.py index fbcdc68..97097e2 100644 --- a/train_utils.py +++ b/train_utils.py @@ -3,6 +3,7 @@ import torch from tqdm import tqdm from torch.amp import autocast import torch.nn.functional as F +import numpy as np