new
This commit is contained in:
parent
fd6d6b8cec
commit
9b8eca3c38
Binary file not shown.
|
|
@ -460,18 +460,6 @@ def main():
|
|||
assert "Not supported dataset name!"
|
||||
|
||||
|
||||
target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak"
|
||||
benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place)
|
||||
y_zero=np.zeros(len(benign_data))
|
||||
y_one=np.ones(len(adverse_data))
|
||||
|
||||
data_embedding = np.concatenate((benign_data, adverse_data), axis=0)
|
||||
gts = np.concatenate((y_zero, y_one), axis=0)
|
||||
|
||||
train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio)
|
||||
|
||||
|
||||
|
||||
|
||||
# ============================================================
|
||||
# 6. 模式三:SV 训练阶段(gene=0 且 generate_gt=0)
|
||||
|
|
@ -487,12 +475,42 @@ def main():
|
|||
device_map="auto",
|
||||
token = ''
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '')
|
||||
|
||||
prompts = []
|
||||
qa_pairs = []
|
||||
categories = []
|
||||
|
||||
length = len(dataset)
|
||||
|
||||
for i in tqdm(range(length)):
|
||||
question = dataset[i]['query']
|
||||
adversary = dataset[i]['target']
|
||||
prompt = tokenizer(
|
||||
f"Answer the question concisely. Q: {question}" + " A:" + adversary,
|
||||
return_tensors='pt'
|
||||
).input_ids.cuda()
|
||||
|
||||
prompts.append(prompt)
|
||||
qa_pairs.append({'Question': question, 'Answer': adversary})
|
||||
categories.append(1) # 1 = adverse
|
||||
|
||||
for i in tqdm(range(length)):
|
||||
question = dataset[i]['query']
|
||||
clean= dataset[i]['response']
|
||||
prompt = tokenizer(
|
||||
f"Answer the question concisely. Q: {question}" + " A:" + clean,
|
||||
return_tensors='pt'
|
||||
).input_ids.cuda()
|
||||
|
||||
prompts.append(prompt)
|
||||
qa_pairs.append({'Question': question, 'Answer': clean})
|
||||
categories.append(0) # 0 = benign
|
||||
|
||||
train_index, val_index, test_index=split_indices(len(prompts), args.train_ratio, args.val_ratio)
|
||||
|
||||
|
||||
train_data = [data_embedding[i] for i in train_index]
|
||||
train_labels = [gts[i] for i in train_index]
|
||||
|
||||
test_data = [data_embedding[i] for i in test_index]
|
||||
test_labels = [gts[i] for i in test_index]
|
||||
|
||||
# ====== 6.4 冻结 LLM,只训练 SV 参数 ======
|
||||
num_layers = model.config.num_hidden_layers
|
||||
|
|
|
|||
BIN
tqa_score.mat
BIN
tqa_score.mat
Binary file not shown.
Loading…
Reference in New Issue