This commit is contained in:
weixin_43297441 2025-11-30 13:35:28 +08:00
parent f29b0c5286
commit fd6d6b8cec
4 changed files with 192 additions and 1270 deletions

View File

@ -58,6 +58,7 @@ def main():
parser.add_argument('--model_name', type=str, default='alpaca_7B') parser.add_argument('--model_name', type=str, default='alpaca_7B')
parser.add_argument('--judge_model', type=str, default='gpt-4') parser.add_argument('--judge_model', type=str, default='gpt-4')
parser.add_argument('--dataset_name', type=str, default='AdvBench') parser.add_argument('--dataset_name', type=str, default='AdvBench')
parser.add_argument('--test_ratio', type=float, default=0.2)
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data') parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
@ -94,10 +95,13 @@ def main():
adverse_embed_generated_loc1=[] #head-wise representations adverse_embed_generated_loc1=[] #head-wise representations
adverse_embed_generated_loc2=[] #mlp-wise representations adverse_embed_generated_loc2=[] #mlp-wise representations
length = int(len(dataset)*0.55) length = len(dataset)
for i in tqdm(range(length)): for i in tqdm(range(length)):
question = dataset[i]['query'] question = dataset[i]['query']
adversary = dataset[i]['target'] adversary = dataset[i]['target']
@ -142,8 +146,8 @@ def main():
benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32) benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32)
adverse_embed_generated = np.asarray(np.stack(adverse_embed_generated), dtype=np.float32) adverse_embed_generated = np.asarray(np.stack(adverse_embed_generated), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_layer_wise.npy', benign_embed_generated) np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_layer_wise.npy', benign_embed_generated)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_layer_wise.npy', adverse_embed_generated) np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_layer_wise.npy', adverse_embed_generated)
benign_embed_generated_loc2 = np.asarray(np.stack(benign_embed_generated_loc2), dtype=np.float32) benign_embed_generated_loc2 = np.asarray(np.stack(benign_embed_generated_loc2), dtype=np.float32)
benign_embed_generated_loc1 = np.asarray(np.stack(benign_embed_generated_loc1), dtype=np.float32) benign_embed_generated_loc1 = np.asarray(np.stack(benign_embed_generated_loc1), dtype=np.float32)
@ -151,10 +155,10 @@ def main():
adverse_embed_generated_loc1 = np.asarray(np.stack(adverse_embed_generated_loc1), dtype=np.float32) adverse_embed_generated_loc1 = np.asarray(np.stack(adverse_embed_generated_loc1), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_head_wise.npy', benign_embed_generated_loc2) np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_head_wise.npy', benign_embed_generated_loc2)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1) np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2) np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1) np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1)

View File

@ -13,6 +13,7 @@ from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F import torch.nn.functional as F
from sinkhorn_knopp import SinkhornKnopp_imb from sinkhorn_knopp import SinkhornKnopp_imb
import logging import logging
from utils import load_npy_shapes,split_indices
def seed_everything(seed: int): def seed_everything(seed: int):
@ -47,7 +48,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
layer_number = -1 # 使用最后一层 hidden state (这里 -1 当索引) layer_number = -1 # 使用最后一层 hidden state (这里 -1 当索引)
# ========= 日志 & 结果保存目录 ========= # ========= 日志 & 结果保存目录 =========
dir_name = f"TSV_{args.model_name}_{args.dataset_name}/exemplar_num_{args.num_exemplars}_num_selected_data_{args.num_selected_data}/{args.component}/{args.str_layer}/{args.lam}" dir_name = f"SV_{args.model_name}_{args.dataset_name}/{args.component}/{args.str_layer}/{args.lam}"
log_dir = f"/{dir_name}/" log_dir = f"/{dir_name}/"
log_file = os.path.join(log_dir, f"log.txt") log_file = os.path.join(log_dir, f"log.txt")
os.makedirs(dir_name, exist_ok=True) os.makedirs(dir_name, exist_ok=True)
@ -61,14 +62,12 @@ def train_model(model, optimizer, device, prompts, labels, args):
logging.info("Starting training") logging.info("Starting training")
logging.info( logging.info(
f"Training parameters: few_shot_size={args.num_exemplars}, "
f"num_selected_data={args.num_selected_data}, "
f"component={args.component}, str_layer={args.str_layer}" f"component={args.component}, str_layer={args.str_layer}"
) )
# 解包数据: test / wild train / exemplar # 解包数据: test / wild train / exemplar
test_prompts, train_prompts, exemplar_prompts = prompts[0], prompts[1], prompts[2] test_prompts, train_prompts = prompts[0], prompts[1]
test_labels, train_labels, exemplar_labels = labels[0], labels[1], labels[2] test_labels, train_labels = labels[0], labels[1]
batch_size = args.batch_size batch_size = args.batch_size
losses = [] losses = []
@ -76,11 +75,9 @@ def train_model(model, optimizer, device, prompts, labels, args):
scaler = GradScaler() # 混合精度的梯度缩放器 scaler = GradScaler() # 混合精度的梯度缩放器
num_exemplars = args.num_exemplars num_trains = args.num_train
# ========= Sinkhorn OT 相关初始化(伪标签阶段要用)=========
args.num_iters_sk = 3 # Sinkhorn 迭代次数 (论文 Eq.(8)(9))
args.epsilon_sk = 0.05 # OT 熵正则 ε (论文中也设为 0.05)
# 用 exemplar 的标签估计类分布 w (truthful/hallu 的先验) # 用 exemplar 的标签估计类分布 w (truthful/hallu 的先验)
# ex_hallu = P(hallucinated), ex_true = P(truthful) # ex_hallu = P(hallucinated), ex_true = P(truthful)
@ -98,8 +95,8 @@ def train_model(model, optimizer, device, prompts, labels, args):
centroids = F.normalize(centroids, p=2, dim=1) centroids = F.normalize(centroids, p=2, dim=1)
# 把 exemplar 的 prompt/label 整理成一个大 batch (padding) # 把 exemplar 的 prompt/label 整理成一个大 batch (padding)
exemplar_prompts_, exemplar_labels_ = exemplar_prompts, exemplar_labels train_prompts_, train_labels_ = train_prompts, train_labels
exemplar_prompts, exemplar_labels = collate_fn(exemplar_prompts, exemplar_labels) train_prompts, train_labels = collate_fn(train_prompts, train_labels)
# ========= Phase 1: Initial training on exemplars ========= # ========= Phase 1: Initial training on exemplars =========
num_epochs = args.init_num_epochs num_epochs = args.init_num_epochs
@ -107,7 +104,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
for epoch in range(num_epochs): for epoch in range(num_epochs):
running_loss = 0.0 running_loss = 0.0
total = 0 total = 0
num_samples = num_exemplars num_samples = num_trains
# 在 exemplar 上分 batch 训练(对应论文中 D_E 只含人工标注数据) # 在 exemplar 上分 batch 训练(对应论文中 D_E 只含人工标注数据)
for batch_start in tqdm( for batch_start in tqdm(
@ -115,8 +112,8 @@ def train_model(model, optimizer, device, prompts, labels, args):
desc=f"Epoch {epoch+1}/{num_epochs} Batches", desc=f"Epoch {epoch+1}/{num_epochs} Batches",
leave=False, leave=False,
): ):
batch_prompts = exemplar_prompts[batch_start: batch_start + batch_size] batch_prompts = train_prompts[batch_start: batch_start + batch_size]
batch_labels = exemplar_labels[batch_start: batch_start + batch_size] batch_labels = train_labels[batch_start: batch_start + batch_size]
# attention_mask: 1 = valid token, 0 = padding # attention_mask: 1 = valid token, 0 = padding
attention_mask = (batch_prompts != 0).half() attention_mask = (batch_prompts != 0).half()
@ -137,7 +134,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
hidden_states = torch.stack(hidden_states, dim=0).squeeze() hidden_states = torch.stack(hidden_states, dim=0).squeeze()
last_layer_hidden_state = hidden_states[layer_number] # [B, T, H] last_layer_hidden_state = hidden_states[layer_number] # [B, T, H]
# 对应论文里 r_v (最后非 padding token 的 hidden, 包含 TSV steer) # 对应论文里 r_v (最后非 padding token 的 hidden, 包含 SV steer)
last_token_rep = get_last_non_padded_token_rep( last_token_rep = get_last_non_padded_token_rep(
last_layer_hidden_state, attention_mask.squeeze() last_layer_hidden_state, attention_mask.squeeze()
) )
@ -405,363 +402,126 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size,
HF_NAMES = { HF_NAMES = {
'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B', 'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B',
'qwen2.5-7B': 'Qwen/Qwen2.5-7B' 'qwen2.5-7B': 'Qwen/Qwen2.5-7B' ,
'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
'honest_llama_7B': 'validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15',
'alpaca_7B': 'circulus/alpaca-7b',
'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
'llama2_chat_7B': 'models/Llama-2-7b-chat-hf',
'llama2_chat_13B': 'models/Llama-2-13b-chat-hf',
'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
} }
def main(): def main():
# ======================
# 1. 解析命令行参数
# ======================
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='llama3.1-8B') parser.add_argument('--model_name', type=str, default='alpaca_7B')
parser.add_argument('--model_prefix', type=str, default='', help='prefix of model name') parser.add_argument('--num_gene', type=int, default=1) # 每个问题生成多少个答案
parser.add_argument('--num_gene', type=int, default=1) parser.add_argument('--train_sv', type=bool, default=True) # 是否执行“生成答案”阶段 (1=生成+保存答案, 0=不生成)
parser.add_argument('--gene', type=int, default=0) parser.add_argument('--steer_place', type=str, default='layer', help='where to steer (layer, mlp and head)') # 是否执行“生成答案”阶段 (1=生成+保存答案, 0=不生成)
parser.add_argument('--generate_gt', type=int, default=0) parser.add_argument('--dataset_name', type=str, default='AdvBench') # 使用哪个数据集
parser.add_argument('--dataset_name', type=str, default='tqa') parser.add_argument('--device', type=int, default=0) # GPU id (没怎么用, device_map="auto" 为主)
parser.add_argument('--device', type=int, default=0)
parser.add_argument('--wild_ratio', type=float, default=0.75)
parser.add_argument('--thres_gt', type=float, default=0.5)
parser.add_argument('--most_likely', type=int, default=0)
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data') parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
parser.add_argument("--batch_size", type=int, default=128) parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--cos_temp", type=float, default=0.1) parser.add_argument("--cos_temp", type=float, default=0.1) # 余弦相似度的 softmax 温度
parser.add_argument("--ema_decay", type=float, default=0.99) parser.add_argument("--ema_decay", type=float, default=0.99) # 原型向量 EMA 衰减系数
parser.add_argument("--lr", type=float, default=0.005) parser.add_argument("--lr", type=float, default=0.005) # SV 的学习率
parser.add_argument("--str_layer", type=int, default=9) parser.add_argument("--str_layer", type=int, default=9) # 插 SV 的层号 (第几层 hidden)
parser.add_argument("--component", type=str, default='res') parser.add_argument("--component", type=str, default='res') # SV 以何种方式注入(残差等)
parser.add_argument("--lam", type=float, default=5) parser.add_argument("--lam", type=float, default=5) # SV 强度 λ
parser.add_argument("--init_num_epochs", type=int, default=20) parser.add_argument("--init_num_epochs", type=int, default=20) # SV训练轮数
parser.add_argument("--aug_num_epochs", type=int, default=20)
parser.add_argument("--num_exemplars", type=int, default=32)
parser.add_argument("--num_selected_data", type=int, default=128)
parser.add_argument("--cls_dist", type=str, default='proxy')
parser.add_argument("--optimizer", type=str, default='AdamW') parser.add_argument("--optimizer", type=str, default='AdamW')
parser.add_argument("--num_iters_sk", type=int, default=3) parser.add_argument('--train_ratio', type=float, default=0.8)
parser.add_argument("--epsilon_sk", type=float, default=0.05) parser.add_argument('--val_ratio', type=float, default=0.1)
args = parser.parse_args() args = parser.parse_args()
# ======================
# 2. 确定 HuggingFace 模型名
# ======================
# HF_NAMES 在文件前面定义:
# HF_NAMES = {'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B', 'qwen2.5-7B': 'Qwen/Qwen2.5-7B'}
# model_prefix 可以拼接一些前缀,这里默认是空
model_name_or_path = HF_NAMES[args.model_prefix + args.model_name] model_name_or_path = HF_NAMES[args.model_prefix + args.model_name]
if args.dataset_name == "tqa": # ======================
dataset = load_dataset("truthful_qa", 'generation')['validation'] # 3. 加载不同的数据集
# ======================
elif args.dataset_name == 'triviaqa': if args.dataset_name == "AdvBench":
dataset = load_dataset("trivia_qa", "rc.nocontext", split="validation") dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test']
id_mem = set() elif args.dataset_name == 'HarmBench':
dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard']
def remove_dups(batch): elif args.dataset_name == 'JBB':
if batch['question_id'][0] in id_mem: dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test']
return {_: [] for _ in batch.keys()}
id_mem.add(batch['question_id'][0])
return batch
dataset = dataset.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False)
elif args.dataset_name == 'sciq':
dataset = load_dataset("allenai/sciq", split="validation")
elif args.dataset_name == 'nq_open':
dataset = load_dataset("google-research-datasets/nq_open", split="validation")
else: else:
raise ValueError("Invalid dataset name") assert "Not supported dataset name!"
target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak"
benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place)
y_zero=np.zeros(len(benign_data))
y_one=np.ones(len(adverse_data))
data_embedding = np.concatenate((benign_data, adverse_data), axis=0)
gts = np.concatenate((y_zero, y_one), axis=0)
train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio)
if args.gene:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '') # ============================================================
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '') # 6. 模式三: SV 训练阶段 (gene=0 且 generate_gt=0)
device = torch.device("cuda") # ============================================================
all_decoded_answers = [] if args.train_sv:
begin_index = 0
end_index = len(dataset)
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/')
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/answers'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/answers')
period_token_id = [tokenizer(_)['input_ids'][-1] for _ in ['\n']]
period_token_id += [tokenizer.eos_token_id]
for i in range(begin_index, end_index):
answers = [None] * args.num_gene
answers_ = [None] * args.num_gene
question = dataset[i]['question']
prompt = tokenizer(f"Answer the question concisely. Q: {question}" + " A:", return_tensors='pt').input_ids.cuda()
for gen_iter in range(args.num_gene):
if args.most_likely:
generated = model.generate(prompt,
num_beams=5,
num_return_sequences=1,
do_sample=False,
max_new_tokens=64,
)
else:
generated = model.generate(prompt,
do_sample=True,
num_return_sequences=1,
num_beams=1,
max_new_tokens=64,
temperature=0.5,
top_p=1.0)
decoded = tokenizer.decode(generated[0, prompt.shape[-1]:],
skip_special_tokens=True)
# answers[gen_iter] = decoded
# Cleaning
if '\nAnswer the question concisely.' in decoded:
print('#####error')
print(decoded.split('\nAnswer the question concisely.')[1])
print('#####error')
decoded = decoded.split('\nAnswer the question concisely.')[0]
if 'Answer the question concisely' in decoded:
print('#####error')
print(decoded.split('Answer the question concisely')[1])
print('#####error')
decoded = decoded.split('Answer the question concisely')[0]
if 'The answer to the question' in decoded:
print('#####error')
print(decoded.split('The answer to the question')[1])
print('#####error')
decoded = decoded.split('The answer to the question')[0]
if 'How to Write a Concise Statement' in decoded:
print('#####error')
print(decoded.split('How to Write a Concise Statement')[1])
print('#####error')
decoded = decoded.split('How to Write a Concise Statement')[0]
if 'Q:' in decoded:
print('#####error')
print(decoded.split('Q:')[1])
print('#####error')
decoded = decoded.split('Q:')[0]
if '\nYou are an AI assistant' in decoded:
print('#####error')
print(decoded.split('\nYou are an AI assistant')[1])
print('#####error')
decoded = decoded.split('\nYou are an AI assistant')[0]
if 'You are an AI assistant' in decoded:
print('#####error')
print(decoded.split('You are an AI assistant')[1])
print('#####error')
decoded = decoded.split('You are an AI assistant')[0]
if 'A:' in decoded:
print('#####error')
print(decoded.split('A:')[1])
print('#####error')
decoded = decoded.split('A:')[0]
if 'B:' in decoded:
print('#####error')
print(decoded.split('B:')[1])
print('#####error')
decoded = decoded.split('B:')[0]
if 'C:' in decoded:
print('#####error')
print(decoded.split('C:')[1])
print('#####error')
decoded = decoded.split('C:')[0]
if 'D:' in decoded:
print('#####error')
print(decoded.split('D:')[1])
print('#####error')
decoded = decoded.split('D:')[0]
print(f'Cleaned Answer: {decoded}')
answers[gen_iter] = decoded
print('sample: ', i)
if args.most_likely:
info = 'most_likely_'
else:
info = 'batch_generations_'
print("Saving answers")
print(decoded)
np.save(f'./save_for_eval/{args.dataset_name}_hal_det/answers/' + info + f'hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy',
answers)
elif args.generate_gt:
from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer
model = BleurtForSequenceClassification.from_pretrained('lucadiliello/BLEURT-20').cuda()
tokenizer = BleurtTokenizer.from_pretrained('lucadiliello/BLEURT-20')
model.eval()
gts = np.zeros(0)
length = len(dataset)
for i in range(length):
if args.dataset_name == 'tqa':
best_answer = dataset[i]['best_answer']
correct_answer = dataset[i]['correct_answers']
all_answers = [best_answer] + correct_answer
question = dataset[i]['question']
elif args.dataset_name == 'triviaqa':
all_answers = dataset[i]['answer']['aliases']
if args.most_likely:
# answers = np.load(
# f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
else:
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/batch_generations_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
# get the gt.
predictions = answers
all_results = np.zeros((len(all_answers), len(predictions)))
with torch.no_grad():
for anw in range(len(all_answers)):
inputs = tokenizer(predictions.tolist(), [all_answers[anw]] * len(predictions),
padding='longest', return_tensors='pt')
for key in list(inputs.keys()):
inputs[key] = inputs[key].cuda()
res = np.asarray(model(**inputs).logits.flatten().tolist())
all_results[anw] = res
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
if i % 10 == 0:
print("samples passed: ", i)
if args.most_likely:
# np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
else:
np.save(f'./bg_{args.dataset_name}_bleurt_score.npy', gts)
else:
device = torch.device("cuda") device = torch.device("cuda")
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '') # 加载基础 LLM, 之后会在其上插 TSV
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '') model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
device_map="auto",
token = ''
)
prompts = [] train_data = [data_embedding[i] for i in train_index]
qa_pairs = [] train_labels = [gts[i] for i in train_index]
categories = []
length = len(dataset) test_data = [data_embedding[i] for i in test_index]
test_labels = [gts[i] for i in test_index]
for i in tqdm(range(length)):
question = dataset[i]['question']
if args.dataset_name == 'tqa':
categories.append(dataset[i]['category'])
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
for anw in answers:
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + anw,
return_tensors='pt').input_ids.cuda()
prompts.append(prompt)
qa_pairs.append({'Question': question, 'Answer': anw})
gts = np.load(f'./ml_{args.dataset_name}_bleurt_score.npy')
length = len(dataset)
if args.dataset_name == 'tqa' or args.dataset_name == 'triviaqa':
args.thres_gt = 0.5
else:
args.thres_gt = 0.2
gt_label = np.asarray(gts> args.thres_gt, dtype=np.int32)
# index = np.random.permutation(length)
# exemplar_index = index[:args.num_exemplars]
# wild_q_indices = index[:int(args.wild_ratio * length)]
index = np.load(f'data_indices/data_index_{args.dataset_name}.npy')
exemplar_index = np.load(f'data_indices/exemplar_idx_{args.dataset_name}.npy')
wild_q_indices = index[:int(args.wild_ratio * length)]
wild_q_indices1 = wild_q_indices[:len(wild_q_indices) - 100]
args.num_exemplars = len(exemplar_index)
gt_label_test = []
gt_label_wild = []
gt_label_exemplar = []
test_prompts = []
train_prompts = []
exemplar_prompts = []
for i in range(length):
if i not in wild_q_indices:
gt_label_test.extend(gt_label[i: i+1])
test_prompts.extend(prompts[i:i+1])
elif i in exemplar_index:
gt_label_exemplar.extend(gt_label[i: i+1])
exemplar_prompts.extend(prompts[i:i+1])
elif i in wild_q_indices1:
gt_label_wild.extend(gt_label[i: i+1])
train_prompts.extend(prompts[i:i+1])
gt_label_test = np.asarray(gt_label_test)
gt_label_exemplar = np.asarray(gt_label_exemplar)
gt_label_wild = np.asarray(gt_label_wild)
labels = [ gt_label_test, gt_label_wild, gt_label_exemplar]
prompts = [ test_prompts, train_prompts, exemplar_prompts]
# ====== 6.4 冻结 LLM, 只训练 SV 参数 ======
num_layers = model.config.num_hidden_layers num_layers = model.config.num_hidden_layers
hidden_size = model.config.hidden_size hidden_size = model.config.hidden_size
for param in model.parameters(): for param in model.parameters():
param.requires_grad = False param.requires_grad = False
tsv = nn.ParameterList( # 每一层一个 SV 向量(虽然实际只在某几层生效)
[nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)]) sv = nn.ParameterList(
[nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)]
)
sv.to(device)
tsv.to(device) # 在模型对应层里插入 SV (add_sv_layers 会改 forward)
add_sv_layers(model, sv, [args.lam], args)
add_tsv_layers(model, tsv, [args.lam], args)
optimizer = torch.optim.AdamW(list(tsv.parameters()), lr=args.lr) # 只把 SV 的参数丢给优化器
optimizer = torch.optim.AdamW(list(sv.parameters()), lr=args.lr)
# ====== 6.5 调用 train_model, 进入两阶段训练流程 ======
train_model(model, optimizer, device, train_data, train_labels, args=args)
else:
print("Skip training steer vectors.")
raise NotImplementedError("Only training SV is implemented in this script.")
train_model(model, optimizer, device, prompts, labels, args=args)
if __name__ == '__main__': if __name__ == '__main__':
# 为了结果可复现,固定随机种子
seed_everything(42) seed_everything(42)
main() main()

983
utils.py

File diff suppressed because it is too large Load Diff

View File

@ -13,7 +13,8 @@ from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
def check_npy_shapes(directory_path):
def load_npy_shapes(directory_path, steer_place="layer"):
""" """
Check shapes of all .npy files in the specified directory Check shapes of all .npy files in the specified directory
and load them as benign vs adverse pairs. and load them as benign vs adverse pairs.
@ -29,8 +30,8 @@ def check_npy_shapes(directory_path):
print("-" * 50) print("-" * 50)
# Separate files into benign and adverse # Separate files into benign and adverse
benign_files = [f for f in npy_files if "benign" in f.lower()] benign_files = [f for f in npy_files if "benign" in f.lower() and steer_place in f.lower()]
adverse_files = [f for f in npy_files if "adverse" in f.lower()] adverse_files = [f for f in npy_files if "adverse" in f.lower() and steer_place in f.lower()]
# Check shapes and load data # Check shapes and load data
benign_data = {} benign_data = {}
@ -75,12 +76,6 @@ def check_npy_shapes(directory_path):
if benign_array.shape == adverse_array.shape: if benign_array.shape == adverse_array.shape:
print(f" ✓ Shapes match") print(f" ✓ Shapes match")
# Calculate some basic statistics for comparison
diff = np.abs(benign_array - adverse_array)
print(f" Mean absolute difference: {np.mean(diff):.6f}")
print(f" Max absolute difference: {np.max(diff):.6f}")
print(f" Min absolute difference: {np.min(diff):.6f}")
else: else:
print(f" ✗ Shapes don't match!") print(f" ✗ Shapes don't match!")
@ -300,7 +295,7 @@ def main():
return return
# Check and load the .npy files # Check and load the .npy files
benign_data, adverse_data = check_npy_shapes(target_directory) benign_data, adverse_data = load_npy_shapes(target_directory)
# Perform PCA analysis to identify discriminative layers # Perform PCA analysis to identify discriminative layers
if benign_data and adverse_data: if benign_data and adverse_data: