diff --git a/generate_data.py b/generate_data.py deleted file mode 100644 index e69de29..0000000 diff --git a/jailbreak_llama.py b/jailbreak_llama.py index 65f21db..27efc69 100644 --- a/jailbreak_llama.py +++ b/jailbreak_llama.py @@ -69,11 +69,11 @@ def main(): if args.dataset_name == "AdvBench": - dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "advbench")['test'] + dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test'] elif args.dataset_name == 'HarmBench': - dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "harmbench")['standard'] + dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard'] elif args.dataset_name == 'JBB': - dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "JBB")['test'] + dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test'] else: assert "Not supported dataset name!" @@ -96,48 +96,48 @@ def main(): - with open('benign.json') as f: - benign_answers = json.load(f) + length = int(len(dataset)*0.55) for i in tqdm(range(length)): question = dataset[i]['query'] adversary = dataset[i]['target'] - for anw in benign_answers: - benign_prompt = tokenizer( - f" Q: {question}" + " A:" + anw['response'], + clean= dataset[i]['response'] + benign_prompt = tokenizer( + f" Q: {question}" + " A:" + clean, return_tensors='pt').input_ids.cuda() - adverse_prompt = tokenizer( + adverse_prompt = tokenizer( f" Q: {question}" + " A:" + adversary, return_tensors='pt').input_ids.cuda() + - with torch.no_grad(): - benign_hidden_states = model(benign_prompt, output_hidden_states=True).hidden_states - benign_hidden_states = torch.stack(benign_hidden_states, dim=0).squeeze() - benign_hidden_states = benign_hidden_states.detach().cpu().numpy()[:, -1, :] - benign_embed_generated.append(benign_hidden_states) + with torch.no_grad(): + benign_hidden_states = model(benign_prompt, output_hidden_states=True).hidden_states + benign_hidden_states = torch.stack(benign_hidden_states, dim=0).squeeze() + benign_hidden_states = benign_hidden_states.detach().cpu().numpy()[:, -1, :] + benign_embed_generated.append(benign_hidden_states) - adverse_hidden_states = model(adverse_prompt, output_hidden_states=True).hidden_states - adverse_hidden_states = torch.stack(adverse_hidden_states, dim=0).squeeze() - adverse_hidden_states = adverse_hidden_states.detach().cpu().numpy()[:, -1, :] - adverse_embed_generated.append(adverse_hidden_states) - with torch.no_grad(): - with TraceDict(model, HEADS + MLPS) as ret: - output = model(benign_prompt, output_hidden_states=True) - head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS] - head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy() - mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS] - mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy() - benign_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :]) - benign_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :]) - with torch.no_grad(): - with TraceDict(model, HEADS + MLPS) as ret: - output = model(adverse_prompt, output_hidden_states=True) - head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS] - head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy() - mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS] - mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy() - adverse_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :]) - adverse_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :]) + adverse_hidden_states = model(adverse_prompt, output_hidden_states=True).hidden_states + adverse_hidden_states = torch.stack(adverse_hidden_states, dim=0).squeeze() + adverse_hidden_states = adverse_hidden_states.detach().cpu().numpy()[:, -1, :] + adverse_embed_generated.append(adverse_hidden_states) + with torch.no_grad(): + with TraceDict(model, HEADS + MLPS) as ret: + output = model(benign_prompt, output_hidden_states=True) + head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS] + head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy() + mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS] + mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy() + benign_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :]) + benign_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :]) + with torch.no_grad(): + with TraceDict(model, HEADS + MLPS) as ret: + output = model(adverse_prompt, output_hidden_states=True) + head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS] + head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim=0).squeeze().numpy() + mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS] + mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim=0).squeeze().numpy() + adverse_embed_generated_loc2.append(mlp_wise_hidden_states[:, -1, :]) + adverse_embed_generated_loc1.append(head_wise_hidden_states[:, -1, :]) benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32)