update data.
This commit is contained in:
parent
e44ad1700f
commit
3641dba78e
|
|
@ -69,11 +69,11 @@ def main():
|
|||
|
||||
|
||||
if args.dataset_name == "AdvBench":
|
||||
dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "advbench")['test']
|
||||
dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test']
|
||||
elif args.dataset_name == 'HarmBench':
|
||||
dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "harmbench")['standard']
|
||||
dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard']
|
||||
elif args.dataset_name == 'JBB':
|
||||
dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "JBB")['test']
|
||||
dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test']
|
||||
else:
|
||||
assert "Not supported dataset name!"
|
||||
|
||||
|
|
@ -96,48 +96,48 @@ def main():
|
|||
|
||||
|
||||
|
||||
with open('benign.json') as f:
|
||||
benign_answers = json.load(f)
|
||||
|
||||
length = int(len(dataset)*0.55)
|
||||
for i in tqdm(range(length)):
|
||||
question = dataset[i]['query']
|
||||
adversary = dataset[i]['target']
|
||||
for anw in benign_answers:
|
||||
benign_prompt = tokenizer(
|
||||
f" Q: {question}" + " A:" + anw['response'],
|
||||
clean= dataset[i]['response']
|
||||
benign_prompt = tokenizer(
|
||||
f" Q: {question}" + " A:" + clean,
|
||||
return_tensors='pt').input_ids.cuda()
|
||||
adverse_prompt = tokenizer(
|
||||
adverse_prompt = tokenizer(
|
||||
f" Q: {question}" + " A:" + adversary,
|
||||
return_tensors='pt').input_ids.cuda()
|
||||
|
||||
|
||||
with torch.no_grad():
|
||||
benign_hidden_states = model(benign_prompt, output_hidden_states=True).hidden_states
|
||||
benign_hidden_states = torch.stack(benign_hidden_states, dim=0).squeeze()
|
||||
benign_hidden_states = benign_hidden_states.detach().cpu().numpy()[:, -1, :]
|
||||
benign_embed_generated.append(benign_hidden_states)
|
||||
with torch.no_grad():
|
||||
benign_hidden_states = model(benign_prompt, output_hidden_states=True).hidden_states
|
||||
benign_hidden_states = torch.stack(benign_hidden_states, dim=0).squeeze()
|
||||
benign_hidden_states = benign_hidden_states.detach().cpu().numpy()[:, -1, :]
|
||||
benign_embed_generated.append(benign_hidden_states)
|
||||
|
||||
adverse_hidden_states = model(adverse_prompt, output_hidden_states=True).hidden_states
|
||||
adverse_hidden_states = torch.stack(adverse_hidden_states, dim=0).squeeze()
|
||||
adverse_hidden_states = adverse_hidden_states.detach().cpu().numpy()[:, -1, :]
|
||||
adverse_embed_generated.append(adverse_hidden_states)
|
||||
with torch.no_grad():
|
||||
with TraceDict(model, HEADS + MLPS) as ret:
|
||||
output = model(benign_prompt, output_hidden_states=True)
|
||||
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
|
||||
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
|
||||
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
benign_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :])
|
||||
benign_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :])
|
||||
with torch.no_grad():
|
||||
with TraceDict(model, HEADS + MLPS) as ret:
|
||||
output = model(adverse_prompt, output_hidden_states=True)
|
||||
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
|
||||
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
|
||||
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
adverse_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :])
|
||||
adverse_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :])
|
||||
adverse_hidden_states = model(adverse_prompt, output_hidden_states=True).hidden_states
|
||||
adverse_hidden_states = torch.stack(adverse_hidden_states, dim=0).squeeze()
|
||||
adverse_hidden_states = adverse_hidden_states.detach().cpu().numpy()[:, -1, :]
|
||||
adverse_embed_generated.append(adverse_hidden_states)
|
||||
with torch.no_grad():
|
||||
with TraceDict(model, HEADS + MLPS) as ret:
|
||||
output = model(benign_prompt, output_hidden_states=True)
|
||||
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
|
||||
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
|
||||
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
benign_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :])
|
||||
benign_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :])
|
||||
with torch.no_grad():
|
||||
with TraceDict(model, HEADS + MLPS) as ret:
|
||||
output = model(adverse_prompt, output_hidden_states=True)
|
||||
head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
|
||||
head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
|
||||
mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy()
|
||||
adverse_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :])
|
||||
adverse_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :])
|
||||
|
||||
|
||||
benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32)
|
||||
|
|
|
|||
Loading…
Reference in New Issue