This commit is contained in:
weixin_43297441 2025-12-01 20:52:56 +08:00
parent fd6d6b8cec
commit 9b8eca3c38
3 changed files with 34 additions and 16 deletions

Binary file not shown.

View File

@@ -460,18 +460,6 @@ def main():
assert "Not supported dataset name!"
target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak"
benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place)
y_zero=np.zeros(len(benign_data))
y_one=np.ones(len(adverse_data))
data_embedding = np.concatenate((benign_data, adverse_data), axis=0)
gts = np.concatenate((y_zero, y_one), axis=0)
train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio)
# ============================================================
# 6. 模式三SV 训练阶段gene=0 且 generate_gt=0
@@ -487,12 +475,42 @@ def main():
device_map="auto",
token = ''
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '')
prompts = []
qa_pairs = []
categories = []
length = len(dataset)
for i in tqdm(range(length)):
question = dataset[i]['query']
adversary = dataset[i]['target']
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + adversary,
return_tensors='pt'
).input_ids.cuda()
prompts.append(prompt)
qa_pairs.append({'Question': question, 'Answer': adversary})
categories.append(1) # 1 = adverse
for i in tqdm(range(length)):
question = dataset[i]['query']
clean= dataset[i]['response']
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + clean,
return_tensors='pt'
).input_ids.cuda()
prompts.append(prompt)
qa_pairs.append({'Question': question, 'Answer': clean})
categories.append(0) # 0 = benign
train_index, val_index, test_index=split_indices(len(prompts), args.train_ratio, args.val_ratio)
train_data = [data_embedding[i] for i in train_index]
train_labels = [gts[i] for i in train_index]
test_data = [data_embedding[i] for i in test_index]
test_labels = [gts[i] for i in test_index]
# ====== 6.4 冻结 LLM只训练 SV 参数 ======
num_layers = model.config.num_hidden_layers

Binary file not shown.