Update data: change the dataset repository passed to load_dataset and the source of the benign responses.

This commit is contained in:
weixin_43297441 2025-11-27 21:39:32 +08:00
parent e44ad1700f
commit 3641dba78e
2 changed files with 36 additions and 36 deletions

View File

View File

@ -69,11 +69,11 @@ def main():
if args.dataset_name == "AdvBench":
dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "advbench")['test']
dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test']
elif args.dataset_name == 'HarmBench':
dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "harmbench")['standard']
dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard']
elif args.dataset_name == 'JBB':
dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "JBB")['test']
dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test']
else:
assert "Not supported dataset name!"
@ -96,20 +96,20 @@ def main():
with open('benign.json') as f:
benign_answers = json.load(f)
length = int(len(dataset)*0.55)
for i in tqdm(range(length)):
question = dataset[i]['query']
adversary = dataset[i]['target']
for anw in benign_answers:
clean= dataset[i]['response']
benign_prompt = tokenizer(
f" Q: {question}" + " A:" + anw['response'],
f" Q: {question}" + " A:" + clean,
return_tensors='pt').input_ids.cuda()
adverse_prompt = tokenizer(
f" Q: {question}" + " A:" + adversary,
return_tensors='pt').input_ids.cuda()
with torch.no_grad():
benign_hidden_states = model(benign_prompt, output_hidden_states=True).hidden_states
benign_hidden_states = torch.stack(benign_hidden_states, dim=0).squeeze()