# (scrape metadata, kept as a comment so the file parses: 175 lines, 8.2 KiB, Python)
import os
|
|
import torch
|
|
import torch.nn.functional as F
|
|
# import evaluate
|
|
from datasets import load_metric
|
|
from datasets import load_dataset
|
|
# import datasets
|
|
from tqdm import tqdm
|
|
import numpy as np
|
|
import pickle
|
|
# from utils import get_llama_activations_bau, tokenized_tqa, tokenized_tqa_gen, tokenized_tqa_gen_end_q
|
|
from utils import mahalanobis_distance
|
|
from scipy.io import savemat
|
|
import llama_iti
|
|
import pickle
|
|
import argparse
|
|
import matplotlib.pyplot as plt
|
|
from pprint import pprint
|
|
from baukit import Trace, TraceDict
|
|
from metric_utils import get_measures, print_measures
|
|
import re
|
|
from torch.autograd import Variable
|
|
from scipy.spatial import distance
|
|
from sklearn.linear_model import Perceptron
|
|
from sklearn.ensemble import GradientBoostingRegressor
|
|
import json
|
|
|
|
|
|
|
|
def seed_everything(seed: int):
    """Seed every RNG used by this script for reproducible runs.

    Seeds Python's ``random``, ``PYTHONHASHSEED``, NumPy, torch CPU and CUDA
    generators, and configures cuDNN for deterministic kernel selection.

    Args:
        seed: Seed value applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # BUG FIX: this was `True`, which lets cuDNN auto-tune and pick kernels
    # non-deterministically, defeating the `deterministic = True` line above.
    torch.backends.cudnn.benchmark = False
|
|
|
|
# Short model aliases -> Hugging Face hub IDs or local checkpoint directories.
# Looked up in main() via --model_name.
HF_NAMES = dict(
    llama_7B='baffo32/decapoda-research-llama-7B-hf',
    honest_llama_7B='validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15',
    alpaca_7B='circulus/alpaca-7b',
    vicuna_7B='AlekseyKorshuk/vicuna-7b',
    llama2_chat_7B='models/Llama-2-7b-chat-hf',
    llama2_chat_13B='models/Llama-2-13b-chat-hf',
    llama2_chat_70B='meta-llama/Llama-2-70b-chat-hf',
)
|
|
|
|
|
|
def _last_token_layer_states(model, prompt_ids):
    """Per-layer hidden states of the final prompt token.

    Args:
        model: A causal LM supporting ``output_hidden_states=True``.
        prompt_ids: Tokenized prompt tensor of shape (1, seq_len).

    Returns:
        numpy array of shape (num_layers + 1, hidden_dim) — one row per
        hidden-state layer (including the embedding layer), last token only.
    """
    with torch.no_grad():
        hidden = model(prompt_ids, output_hidden_states=True).hidden_states
    hidden = torch.stack(hidden, dim=0).squeeze()
    return hidden.detach().cpu().numpy()[:, -1, :]


def _last_token_module_states(model, prompt_ids, heads, mlps):
    """Final-token activations traced at each attention head_out and MLP module.

    Args:
        model: The causal LM to trace.
        prompt_ids: Tokenized prompt tensor of shape (1, seq_len).
        heads: Module names of the per-layer attention ``head_out`` hooks.
        mlps: Module names of the per-layer MLP modules.

    Returns:
        (head_feats, mlp_feats): two numpy arrays, each (num_layers, dim),
        taken at the last token position.
    """
    with torch.no_grad():
        with TraceDict(model, heads + mlps) as ret:
            model(prompt_ids, output_hidden_states=True)
        head_feats = torch.stack(
            [ret[h].output.squeeze().detach().cpu() for h in heads],
            dim=0).squeeze().numpy()
        mlp_feats = torch.stack(
            [ret[m].output.squeeze().detach().cpu() for m in mlps],
            dim=0).squeeze().numpy()
    return head_feats[:, -1, :], mlp_feats[:, -1, :]


def main():
    """Extract layer-, head-, and MLP-wise last-token embeddings for benign and
    adversarial (question, answer) pairs and save them as .npy files under
    ``save_for_eval/<dataset>/<model>_jailbreak/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='alpaca_7B')
    parser.add_argument('--judge_model', type=str, default='gpt-4')
    parser.add_argument('--dataset_name', type=str, default='AdvBench')
    parser.add_argument("--model_dir", type=str, default=None,
                        help='local directory with model data')
    args = parser.parse_args()

    # BUG FIX: the model path was hard-coded to 'circulus/alpaca-7b' below,
    # silently ignoring --model_name and --model_dir. The default model_name
    # (alpaca_7B) still resolves to 'circulus/alpaca-7b'.
    model_path = args.model_dir if args.model_dir is not None else HF_NAMES[args.model_name]

    if args.dataset_name == "AdvBench":
        dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "advbench")['test']
    elif args.dataset_name == 'HarmBench':
        dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "harmbench")['standard']
    elif args.dataset_name == 'JBB':
        dataset = load_dataset("thu-coai/AISafetyLab_Datasets", "JBB")['test']
    else:
        # BUG FIX: was `assert "Not supported dataset name!"`, which is a
        # truthy string and never fires, leaving `dataset` unbound.
        raise ValueError(f"Not supported dataset name: {args.dataset_name!r}")

    tokenizer = llama_iti.LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # BUG FIX: the trailing .cuda() was removed — device_map="auto" already
    # dispatches the weights, and .cuda() on an accelerate-dispatched model
    # errors on recent transformers versions.
    model = llama_iti.LlamaForCausalLM.from_pretrained(model_path,
                                                       low_cpu_mem_usage=True,
                                                       torch_dtype=torch.float16,
                                                       device_map="auto")

    num_layers = model.config.num_hidden_layers
    HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(num_layers)]
    MLPS = [f"model.layers.{i}.mlp" for i in range(num_layers)]

    # Accumulators: layer-wise hidden states, head-wise and MLP-wise traces,
    # separately for benign and adversarial completions.
    benign_embed_generated = []
    benign_head_feats = []   # was `..._loc1` — attention head_out features
    benign_mlp_feats = []    # was `..._loc2` — MLP features
    adverse_embed_generated = []
    adverse_head_feats = []
    adverse_mlp_feats = []

    with open('benign.json') as f:
        benign_answers = json.load(f)

    # Only the first 45% of the dataset is embedded (train split convention
    # of the original script — TODO confirm against the eval code).
    length = int(len(dataset) * 0.45)
    for i in tqdm(range(length)):
        question = dataset[i]['query']
        adversary = dataset[i]['target']
        # NOTE: the adversarial pair is intentionally re-embedded once per
        # benign answer so both classes end up with the same number of rows.
        for anw in benign_answers:
            benign_prompt = tokenizer(
                f" Q: {question}" + " A:" + anw['response'],
                return_tensors='pt').input_ids.cuda()
            adverse_prompt = tokenizer(
                f" Q: {question}" + " A:" + adversary,
                return_tensors='pt').input_ids.cuda()

            benign_embed_generated.append(_last_token_layer_states(model, benign_prompt))
            adverse_embed_generated.append(_last_token_layer_states(model, adverse_prompt))

            h, m = _last_token_module_states(model, benign_prompt, HEADS, MLPS)
            benign_head_feats.append(h)
            benign_mlp_feats.append(m)

            h, m = _last_token_module_states(model, adverse_prompt, HEADS, MLPS)
            adverse_head_feats.append(h)
            adverse_mlp_feats.append(m)

    out_dir = f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak'
    # BUG FIX: np.save does not create directories; the original crashed if
    # the output folder did not already exist.
    os.makedirs(out_dir, exist_ok=True)

    benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32)
    adverse_embed_generated = np.asarray(np.stack(adverse_embed_generated), dtype=np.float32)
    np.save(f'{out_dir}/{args.model_name}_benign_embeddings_layer_wise.npy', benign_embed_generated)
    np.save(f'{out_dir}/{args.model_name}_adverse_embeddings_layer_wise.npy', adverse_embed_generated)

    benign_head_feats = np.asarray(np.stack(benign_head_feats), dtype=np.float32)
    benign_mlp_feats = np.asarray(np.stack(benign_mlp_feats), dtype=np.float32)
    adverse_head_feats = np.asarray(np.stack(adverse_head_feats), dtype=np.float32)
    adverse_mlp_feats = np.asarray(np.stack(adverse_mlp_feats), dtype=np.float32)

    # BUG FIX: the original saved the MLP features (`loc2`) under the
    # *_head_wise.npy name and the head features (`loc1`) under
    # *_mlp_wise.npy. Filenames now match the feature source.
    np.save(f'{out_dir}/{args.model_name}_benign_embeddings_head_wise.npy', benign_head_feats)
    np.save(f'{out_dir}/{args.model_name}_benign_embeddings_mlp_wise.npy', benign_mlp_feats)
    np.save(f'{out_dir}/{args.model_name}_adverse_embeddings_head_wise.npy', adverse_head_feats)
    np.save(f'{out_dir}/{args.model_name}_adverse_embeddings_mlp_wise.npy', adverse_mlp_feats)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Entry point: fix all RNG seeds before running the embedding extraction.
if __name__ == '__main__':
    seed_everything(42)
    main()