diff --git a/jailbreak_llama.py b/jailbreak_llama.py index 58b1e0d..7fab64d 100644 --- a/jailbreak_llama.py +++ b/jailbreak_llama.py @@ -58,6 +58,7 @@ def main(): parser.add_argument('--model_name', type=str, default='alpaca_7B') parser.add_argument('--judge_model', type=str, default='gpt-4') parser.add_argument('--dataset_name', type=str, default='AdvBench') + parser.add_argument('--test_ratio', type=float, default=0.2) parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data') @@ -94,10 +95,13 @@ def main(): adverse_embed_generated_loc1=[] #head-wise representations adverse_embed_generated_loc2=[] #mlp-wise representations + + + - length = int(len(dataset)*0.55) + length = len(dataset) for i in tqdm(range(length)): question = dataset[i]['query'] adversary = dataset[i]['target'] @@ -142,8 +146,8 @@ def main(): benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32) adverse_embed_generated = np.asarray(np.stack(adverse_embed_generated), dtype=np.float32) - np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_layer_wise.npy', benign_embed_generated) - np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_layer_wise.npy', adverse_embed_generated) + np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_layer_wise.npy', benign_embed_generated) + np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_layer_wise.npy', adverse_embed_generated) benign_embed_generated_loc2 = np.asarray(np.stack(benign_embed_generated_loc2), dtype=np.float32) benign_embed_generated_loc1 = np.asarray(np.stack(benign_embed_generated_loc1), dtype=np.float32) @@ -151,10 +155,10 @@ def main(): adverse_embed_generated_loc1 = np.asarray(np.stack(adverse_embed_generated_loc1), dtype=np.float32) - np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_head_wise.npy', benign_embed_generated_loc2) - np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1) - np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2) - np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1) + np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_head_wise.npy', benign_embed_generated_loc2) + np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1) + np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2) + np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1) diff --git a/steer_vector.py b/steer_vector.py index 37d7a0a..fad1700 100644 --- a/steer_vector.py +++ b/steer_vector.py @@ -13,6 +13,7 @@ from torch.cuda.amp import autocast, GradScaler import torch.nn.functional as F from sinkhorn_knopp import SinkhornKnopp_imb import logging +from utils import load_npy_shapes,split_indices def seed_everything(seed: int): @@ -47,7 +48,7 @@ def train_model(model, optimizer, device, prompts, labels, args): layer_number = -1 # 使用最后一层 hidden state(这里 -1 当索引) # ========= 日志 & 结果保存目录 ========= - dir_name = f"TSV_{args.model_name}_{args.dataset_name}/exemplar_num_{args.num_exemplars}_num_selected_data_{args.num_selected_data}/{args.component}/{args.str_layer}/{args.lam}" + dir_name = f"SV_{args.model_name}_{args.dataset_name}/{args.component}/{args.str_layer}/{args.lam}" log_dir = f"/{dir_name}/" log_file = os.path.join(log_dir, f"log.txt") os.makedirs(dir_name, exist_ok=True) @@ -61,14 +62,12 @@ def train_model(model, optimizer, device, prompts, labels, args): logging.info("Starting training") logging.info( - f"Training parameters: few_shot_size={args.num_exemplars}, " - f"num_selected_data={args.num_selected_data}, " f"component={args.component}, str_layer={args.str_layer}" ) # 解包数据:test / wild train / exemplar - test_prompts, train_prompts, exemplar_prompts = prompts[0], prompts[1], prompts[2] - test_labels, train_labels, exemplar_labels = labels[0], labels[1], labels[2] + test_prompts, train_prompts = prompts[0], prompts[1] + test_labels, train_labels = labels[0], labels[1] batch_size = args.batch_size losses = [] @@ -76,11 +75,9 @@ def train_model(model, optimizer, device, prompts, labels, args): scaler = GradScaler() # 混合精度的梯度缩放器 - num_exemplars = args.num_exemplars + num_trains = args.num_train - # ========= Sinkhorn OT 相关初始化(伪标签阶段要用)========= - args.num_iters_sk = 3 # Sinkhorn 迭代次数(论文 Eq.(8)(9)) - args.epsilon_sk = 0.05 # OT 熵正则 ε(论文中也设为 0.05) + # 用 exemplar 的标签估计类分布 w(truthful/hallu 的先验) # ex_hallu = P(hallucinated), ex_true = P(truthful) @@ -98,8 +95,8 @@ def train_model(model, optimizer, device, prompts, labels, args): centroids = F.normalize(centroids, p=2, dim=1) # 把 exemplar 的 prompt/label 整理成一个大 batch(padding) - exemplar_prompts_, exemplar_labels_ = exemplar_prompts, exemplar_labels - exemplar_prompts, exemplar_labels = collate_fn(exemplar_prompts, exemplar_labels) + train_prompts_, train_labels_ = train_prompts, train_labels + train_prompts, train_labels = collate_fn(train_prompts, train_labels) # ========= Phase 1: Initial training on exemplars ========= num_epochs = args.init_num_epochs @@ -107,7 +104,7 @@ def train_model(model, optimizer, device, prompts, labels, args): for epoch in range(num_epochs): running_loss = 0.0 total = 0 - num_samples = num_exemplars + num_samples = num_trains # 在 exemplar 上分 batch 训练(对应论文中 D_E 只含人工标注数据) for batch_start in tqdm( @@ -115,8 +112,8 @@ def train_model(model, optimizer, device, prompts, labels, args): desc=f"Epoch {epoch+1}/{num_epochs} Batches", leave=False, ): - batch_prompts = exemplar_prompts[batch_start: batch_start + batch_size] - batch_labels = exemplar_labels[batch_start: batch_start + batch_size] + batch_prompts = train_prompts[batch_start: batch_start + batch_size] + batch_labels = train_labels[batch_start: batch_start + batch_size] # attention_mask: 1 = valid token, 0 = padding attention_mask = (batch_prompts != 0).half() @@ -137,7 +134,7 @@ def train_model(model, optimizer, device, prompts, labels, args): hidden_states = torch.stack(hidden_states, dim=0).squeeze() last_layer_hidden_state = hidden_states[layer_number] # [B, T, H] - # 对应论文里 r_v:最后非 padding token 的 hidden(包含 TSV steer) + # 对应论文里 r_v:最后非 padding token 的 hidden(包含 SV steer) last_token_rep = get_last_non_padded_token_rep( last_layer_hidden_state, attention_mask.squeeze() ) @@ -405,363 +402,126 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size, HF_NAMES = { 'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B', - 'qwen2.5-7B': 'Qwen/Qwen2.5-7B' + 'qwen2.5-7B': 'Qwen/Qwen2.5-7B' , + 'llama_7B': 'baffo32/decapoda-research-llama-7B-hf', + 'honest_llama_7B': 'validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15', + 'alpaca_7B': 'circulus/alpaca-7b', + 'vicuna_7B': 'AlekseyKorshuk/vicuna-7b', + 'llama2_chat_7B': 'models/Llama-2-7b-chat-hf', + 'llama2_chat_13B': 'models/Llama-2-13b-chat-hf', + 'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf', } def main(): + # ====================== + # 1. 解析命令行参数 + # ====================== parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='llama3.1-8B') - parser.add_argument('--model_prefix', type=str, default='', help='prefix of model name') - parser.add_argument('--num_gene', type=int, default=1) - parser.add_argument('--gene', type=int, default=0) - parser.add_argument('--generate_gt', type=int, default=0) - parser.add_argument('--dataset_name', type=str, default='tqa') - parser.add_argument('--device', type=int, default=0) - parser.add_argument('--wild_ratio', type=float, default=0.75) - parser.add_argument('--thres_gt', type=float, default=0.5) - parser.add_argument('--most_likely', type=int, default=0) + parser.add_argument('--model_name', type=str, default='alpaca_7B') + parser.add_argument('--num_gene', type=int, default=1) # 每个问题生成多少个答案 + parser.add_argument('--train_sv', type=bool, default=True) # 是否执行“生成答案”阶段(1=生成+保存答案,0=不生成) + parser.add_argument('--steer_place', type=str, default='layer', help='where to steer (layer, mlp and head)') # 是否执行“生成答案”阶段(1=生成+保存答案,0=不生成) + parser.add_argument('--dataset_name', type=str, default='AdvBench') # 使用哪个数据集 + parser.add_argument('--device', type=int, default=0) # GPU id(没怎么用,device_map="auto" 为主) parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data') - parser.add_argument("--batch_size", type=int, default=128) - parser.add_argument("--cos_temp", type=float, default=0.1) - parser.add_argument("--ema_decay", type=float, default=0.99) - parser.add_argument("--lr", type=float, default=0.005) - parser.add_argument("--str_layer", type=int, default=9) - parser.add_argument("--component", type=str, default='res') - parser.add_argument("--lam", type=float, default=5) - parser.add_argument("--init_num_epochs", type=int, default=20) - parser.add_argument("--aug_num_epochs", type=int, default=20) - parser.add_argument("--num_exemplars", type=int, default=32) - parser.add_argument("--num_selected_data", type=int, default=128) - parser.add_argument("--cls_dist", type=str, default='proxy') + parser.add_argument("--batch_size", type=int, default=64) + parser.add_argument("--cos_temp", type=float, default=0.1) # 余弦相似度的 softmax 温度 + parser.add_argument("--ema_decay", type=float, default=0.99) # 原型向量 EMA 衰减系数 + parser.add_argument("--lr", type=float, default=0.005) # SV 的学习率 + parser.add_argument("--str_layer", type=int, default=9) # 插 SV 的层号(第几层 hidden) + parser.add_argument("--component", type=str, default='res') # SV 以何种方式注入(残差等) + parser.add_argument("--lam", type=float, default=5) # SV 强度 λ + parser.add_argument("--init_num_epochs", type=int, default=20) # SV训练轮数 parser.add_argument("--optimizer", type=str, default='AdamW') - parser.add_argument("--num_iters_sk", type=int, default=3) - parser.add_argument("--epsilon_sk", type=float, default=0.05) + parser.add_argument('--train_ratio', type=float, default=0.8) + parser.add_argument('--val_ratio', type=float, default=0.1) args = parser.parse_args() + # ====================== + # 2. 确定 HuggingFace 模型名 + # ====================== + # HF_NAMES 在文件前面定义: + # HF_NAMES = {'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B', 'qwen2.5-7B': 'Qwen/Qwen2.5-7B'} + # model_prefix 可以拼接一些前缀,这里默认是空 model_name_or_path = HF_NAMES[args.model_prefix + args.model_name] - if args.dataset_name == "tqa": - dataset = load_dataset("truthful_qa", 'generation')['validation'] - - elif args.dataset_name == 'triviaqa': - dataset = load_dataset("trivia_qa", "rc.nocontext", split="validation") - id_mem = set() - - def remove_dups(batch): - if batch['question_id'][0] in id_mem: - return {_: [] for _ in batch.keys()} - id_mem.add(batch['question_id'][0]) - return batch - - dataset = dataset.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False) - - - elif args.dataset_name == 'sciq': - dataset = load_dataset("allenai/sciq", split="validation") - - elif args.dataset_name == 'nq_open': - dataset = load_dataset("google-research-datasets/nq_open", split="validation") - - + # ====================== + # 3. 加载不同的数据集 + # ====================== + if args.dataset_name == "AdvBench": + dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test'] + elif args.dataset_name == 'HarmBench': + dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard'] + elif args.dataset_name == 'JBB': + dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test'] else: - raise ValueError("Invalid dataset name") + assert "Not supported dataset name!" + + + target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak" + benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place) + y_zero=np.zeros(len(benign_data)) + y_one=np.ones(len(adverse_data)) + + data_embedding = np.concatenate((benign_data, adverse_data), axis=0) + gts = np.concatenate((y_zero, y_one), axis=0) + + train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio) + - if args.gene: - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '') - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '') - device = torch.device("cuda") - all_decoded_answers = [] - begin_index = 0 - end_index = len(dataset) - - if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/'): - os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/') - - if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/answers'): - os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/answers') - - period_token_id = [tokenizer(_)['input_ids'][-1] for _ in ['\n']] - period_token_id += [tokenizer.eos_token_id] - - for i in range(begin_index, end_index): - answers = [None] * args.num_gene - answers_ = [None] * args.num_gene - - question = dataset[i]['question'] - prompt = tokenizer(f"Answer the question concisely. Q: {question}" + " A:", return_tensors='pt').input_ids.cuda() - - for gen_iter in range(args.num_gene): - if args.most_likely: - generated = model.generate(prompt, - num_beams=5, - num_return_sequences=1, - do_sample=False, - max_new_tokens=64, - ) - else: - generated = model.generate(prompt, - do_sample=True, - num_return_sequences=1, - num_beams=1, - max_new_tokens=64, - temperature=0.5, - top_p=1.0) - - - decoded = tokenizer.decode(generated[0, prompt.shape[-1]:], - skip_special_tokens=True) - # answers[gen_iter] = decoded - - # Cleaning - if '\nAnswer the question concisely.' in decoded: - print('#####error') - print(decoded.split('\nAnswer the question concisely.')[1]) - print('#####error') - decoded = decoded.split('\nAnswer the question concisely.')[0] - - if 'Answer the question concisely' in decoded: - print('#####error') - print(decoded.split('Answer the question concisely')[1]) - print('#####error') - decoded = decoded.split('Answer the question concisely')[0] - - if 'The answer to the question' in decoded: - print('#####error') - print(decoded.split('The answer to the question')[1]) - print('#####error') - decoded = decoded.split('The answer to the question')[0] - - if 'How to Write a Concise Statement' in decoded: - print('#####error') - print(decoded.split('How to Write a Concise Statement')[1]) - print('#####error') - decoded = decoded.split('How to Write a Concise Statement')[0] - - if 'Q:' in decoded: - print('#####error') - print(decoded.split('Q:')[1]) - print('#####error') - decoded = decoded.split('Q:')[0] - - if '\nYou are an AI assistant' in decoded: - print('#####error') - print(decoded.split('\nYou are an AI assistant')[1]) - print('#####error') - decoded = decoded.split('\nYou are an AI assistant')[0] - - if 'You are an AI assistant' in decoded: - print('#####error') - print(decoded.split('You are an AI assistant')[1]) - print('#####error') - decoded = decoded.split('You are an AI assistant')[0] - - if 'A:' in decoded: - print('#####error') - print(decoded.split('A:')[1]) - print('#####error') - decoded = decoded.split('A:')[0] - - if 'B:' in decoded: - print('#####error') - print(decoded.split('B:')[1]) - print('#####error') - decoded = decoded.split('B:')[0] - - if 'C:' in decoded: - print('#####error') - print(decoded.split('C:')[1]) - print('#####error') - decoded = decoded.split('C:')[0] - - if 'D:' in decoded: - print('#####error') - print(decoded.split('D:')[1]) - print('#####error') - decoded = decoded.split('D:')[0] - - print(f'Cleaned Answer: {decoded}') - answers[gen_iter] = decoded - - - - print('sample: ', i) - if args.most_likely: - info = 'most_likely_' - else: - info = 'batch_generations_' - - print("Saving answers") - print(decoded) - - np.save(f'./save_for_eval/{args.dataset_name}_hal_det/answers/' + info + f'hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy', - answers) - - elif args.generate_gt: - from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer - - model = BleurtForSequenceClassification.from_pretrained('lucadiliello/BLEURT-20').cuda() - tokenizer = BleurtTokenizer.from_pretrained('lucadiliello/BLEURT-20') - model.eval() - - gts = np.zeros(0) - length = len(dataset) - - for i in range(length): - - if args.dataset_name == 'tqa': - best_answer = dataset[i]['best_answer'] - correct_answer = dataset[i]['correct_answers'] - all_answers = [best_answer] + correct_answer - question = dataset[i]['question'] - - elif args.dataset_name == 'triviaqa': - all_answers = dataset[i]['answer']['aliases'] - - if args.most_likely: - # answers = np.load( - # f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy') - answers = np.load( - f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy') - - else: - answers = np.load( - f'./save_for_eval/{args.dataset_name}_hal_det/answers/batch_generations_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy') - - # get the gt. - predictions = answers - all_results = np.zeros((len(all_answers), len(predictions))) - with torch.no_grad(): - for anw in range(len(all_answers)): - inputs = tokenizer(predictions.tolist(), [all_answers[anw]] * len(predictions), - padding='longest', return_tensors='pt') - for key in list(inputs.keys()): - inputs[key] = inputs[key].cuda() - res = np.asarray(model(**inputs).logits.flatten().tolist()) - all_results[anw] = res - gts = np.concatenate([gts, np.max(all_results, axis=0)], 0) - if i % 10 == 0: - print("samples passed: ", i) - - if args.most_likely: - # np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts) - np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts) - - else: - np.save(f'./bg_{args.dataset_name}_bleurt_score.npy', gts) - - - - else: + # ============================================================ + # 6. 模式三:SV 训练阶段(gene=0 且 generate_gt=0) + # ============================================================ + if args.train_sv: device = torch.device("cuda") - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '') - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '') + # 加载基础 LLM(之后会在其上插 TSV) + model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + low_cpu_mem_usage=True, + torch_dtype=torch.float16, + device_map="auto", + token = '' + ) - prompts = [] - qa_pairs = [] - categories = [] + train_data = [data_embedding[i] for i in train_index] + train_labels = [gts[i] for i in train_index] - length = len(dataset) - - - for i in tqdm(range(length)): - - question = dataset[i]['question'] - if args.dataset_name == 'tqa': - categories.append(dataset[i]['category']) - - answers = np.load( - f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy') - - - for anw in answers: - - prompt = tokenizer( - f"Answer the question concisely. Q: {question}" + " A:" + anw, - return_tensors='pt').input_ids.cuda() - - prompts.append(prompt) - qa_pairs.append({'Question': question, 'Answer': anw}) - - gts = np.load(f'./ml_{args.dataset_name}_bleurt_score.npy') - - - length = len(dataset) - - if args.dataset_name == 'tqa' or args.dataset_name == 'triviaqa': - args.thres_gt = 0.5 - - else: - args.thres_gt = 0.2 - - gt_label = np.asarray(gts> args.thres_gt, dtype=np.int32) - - # index = np.random.permutation(length) - - # exemplar_index = index[:args.num_exemplars] - - # wild_q_indices = index[:int(args.wild_ratio * length)] - - index = np.load(f'data_indices/data_index_{args.dataset_name}.npy') - - exemplar_index = np.load(f'data_indices/exemplar_idx_{args.dataset_name}.npy') - - wild_q_indices = index[:int(args.wild_ratio * length)] - - wild_q_indices1 = wild_q_indices[:len(wild_q_indices) - 100] - - args.num_exemplars = len(exemplar_index) - - gt_label_test = [] - gt_label_wild = [] - gt_label_exemplar = [] - - test_prompts = [] - train_prompts = [] - exemplar_prompts = [] - - - for i in range(length): - if i not in wild_q_indices: - gt_label_test.extend(gt_label[i: i+1]) - test_prompts.extend(prompts[i:i+1]) - - elif i in exemplar_index: - gt_label_exemplar.extend(gt_label[i: i+1]) - exemplar_prompts.extend(prompts[i:i+1]) - - elif i in wild_q_indices1: - gt_label_wild.extend(gt_label[i: i+1]) - train_prompts.extend(prompts[i:i+1]) - - gt_label_test = np.asarray(gt_label_test) - gt_label_exemplar = np.asarray(gt_label_exemplar) - gt_label_wild = np.asarray(gt_label_wild) - - labels = [ gt_label_test, gt_label_wild, gt_label_exemplar] - prompts = [ test_prompts, train_prompts, exemplar_prompts] + test_data = [data_embedding[i] for i in test_index] + test_labels = [gts[i] for i in test_index] + # ====== 6.4 冻结 LLM,只训练 SV 参数 ====== num_layers = model.config.num_hidden_layers hidden_size = model.config.hidden_size for param in model.parameters(): - param.requires_grad = False + param.requires_grad = False - tsv = nn.ParameterList( - [nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)]) + # 每一层一个 SV 向量(虽然实际只在某几层生效) + sv = nn.ParameterList( + [nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)] + ) + sv.to(device) - tsv.to(device) - - add_tsv_layers(model, tsv, [args.lam], args) + # 在模型对应层里插入 SV(add_sv_layers 会改 forward) + add_sv_layers(model, sv, [args.lam], args) - optimizer = torch.optim.AdamW(list(tsv.parameters()), lr=args.lr) + # 只把 SV 的参数丢给优化器 + optimizer = torch.optim.AdamW(list(sv.parameters()), lr=args.lr) + + # ====== 6.5 调用 train_model,进入两阶段训练流程 ====== + train_model(model, optimizer, device, train_data, train_labels, args=args) + + else: + print("Skip training steer vectors.") + raise NotImplementedError("Only training SV is implemented in this script.") - train_model(model, optimizer, device, prompts, labels, args=args) if __name__ == '__main__': + # 为了结果可复现,固定随机种子 seed_everything(42) - main() \ No newline at end of file + main() diff --git a/utils.py b/utils.py index 60d6b2c..bf9e580 100644 --- a/utils.py +++ b/utils.py @@ -20,6 +20,7 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc from sklearn.linear_model import LogisticRegression import pickle from functools import partial +import glob # from truthfulqa import utilities, models, metrics # import openai @@ -34,928 +35,90 @@ ENGINE_MAP = { 'llama2_chat_13B': 'meta-llama/Llama-2-13b-chat-hf', 'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf', } - -# from truthfulqa.utilities import ( -# format_prompt, -# format_prompt_with_answer_strings, -# split_multi_answer, -# format_best, -# find_start, -# ) -# from truthfulqa.presets import preset_map, COMPARE_PRIMER -# from truthfulqa.models import find_subsequence, set_columns, MC_calcs -# from truthfulqa.evaluate import format_frame, data_to_dict - - -############# CCS ############# -class MLPProbe(nn.Module): - def __init__(self, d): - super().__init__() - self.linear1 = nn.Linear(d, 100) - self.linear2 = nn.Linear(100, 1) - - def forward(self, x): - h = F.relu(self.linear1(x)) - o = self.linear2(h) - return torch.sigmoid(o) - - -class CCS(object): - def __init__(self, x0, x1, nepochs=1000, ntries=10, lr=1e-3, batch_size=-1, - verbose=False, device="cuda", linear=True, weight_decay=0.01, var_normalize=False): - # data - self.var_normalize = var_normalize - self.x0 = self.normalize(x0) - self.x1 = self.normalize(x1) - self.d = self.x0.shape[-1] - - # training - self.nepochs = nepochs - self.ntries = ntries - self.lr = lr - self.verbose = verbose - self.device = device - self.batch_size = batch_size - self.weight_decay = weight_decay - - # probe - self.linear = linear - self.probe = self.initialize_probe() - self.best_probe = copy.deepcopy(self.probe) - - def initialize_probe(self): - if self.linear: - self.probe = nn.Sequential(nn.Linear(self.d, 1), nn.Sigmoid()) - else: - self.probe = MLPProbe(self.d) - self.probe.to(self.device) - - def normalize(self, x): - """ - Mean-normalizes the data x (of shape (n, d)) - If self.var_normalize, also divides by the standard deviation - """ - normalized_x = x - x.mean(axis=0, keepdims=True) - if self.var_normalize: - normalized_x /= normalized_x.std(axis=0, keepdims=True) - - return normalized_x - - def get_tensor_data(self): - """ - Returns x0, x1 as appropriate tensors (rather than np arrays) - """ - x0 = torch.tensor(self.x0, dtype=torch.float, requires_grad=False, device=self.device) - x1 = torch.tensor(self.x1, dtype=torch.float, requires_grad=False, device=self.device) - return x0, x1 - - def get_loss(self, p0, p1): - """ - Returns the CCS loss for two probabilities each of shape (n,1) or (n,) - """ - informative_loss = (torch.min(p0, p1) ** 2).mean(0) - consistent_loss = ((p0 - (1 - p1)) ** 2).mean(0) - return informative_loss + consistent_loss - - def get_acc(self, x0_test, x1_test, y_test, return_conf=False): - """ - Computes accuracy for the current parameters on the given test inputs - """ - x0 = torch.tensor(self.normalize(x0_test), dtype=torch.float, requires_grad=False, device=self.device) - x1 = torch.tensor(self.normalize(x1_test), dtype=torch.float, requires_grad=False, device=self.device) - with torch.no_grad(): - p0, p1 = self.best_probe(x0), self.best_probe(x1) - avg_confidence = 0.5 * (p0 + (1 - p1)) - predictions = (avg_confidence.detach().cpu().numpy() < 0.5).astype(int)[:, 0] - # breakpoint() - acc = np.asarray((predictions == y_test), dtype=np.int32).mean() - acc = max(acc, 1 - acc) - - if return_conf: - return avg_confidence - else: - return acc - - def train(self): - """ - Does a single training run of nepochs epochs - """ - x0, x1 = self.get_tensor_data() - permutation = torch.randperm(len(x0)) - x0, x1 = x0[permutation], x1[permutation] - - # set up optimizer - optimizer = torch.optim.AdamW(self.probe.parameters(), lr=self.lr, weight_decay=self.weight_decay) - - batch_size = len(x0) if self.batch_size == -1 else self.batch_size - nbatches = len(x0) // batch_size - - # Start training (full batch) - for epoch in range(self.nepochs): - # breakpoint() - for j in range(nbatches): - x0_batch = x0[j * batch_size:(j + 1) * batch_size] - x1_batch = x1[j * batch_size:(j + 1) * batch_size] - - # probe - p0, p1 = self.probe(x0_batch), self.probe(x1_batch) - - # get the corresponding loss - loss = self.get_loss(p0, p1) - - # update the parameters - optimizer.zero_grad() - loss.backward() - # print(loss.item()) - optimizer.step() - - return loss.detach().cpu().item() - - def repeated_train(self): - best_loss = np.inf - for train_num in range(self.ntries): - self.initialize_probe() - loss = self.train() - if loss < best_loss: - self.best_probe = copy.deepcopy(self.probe) - best_loss = loss - - return best_loss -def load_nq(): - dataset = load_dataset("OamPatel/iti_nq_open_val")["validation"] - df = pd.DataFrame(columns=["question", "answer", "false_answer"]) - for row in dataset: - new_row = pd.DataFrame({"question": [row["question"]], "answer": [[_ for _ in row["answer"]]], "false_answer": [row["false_answer"]]}) - df = pd.concat([df, new_row], ignore_index=True) - return df - -def load_triviaqa(): - dataset = load_dataset("OamPatel/iti_trivia_qa_val")["validation"] - df = pd.DataFrame(columns=["question", "answer", "false_answer"]) - for row in dataset: - new_row = pd.DataFrame({"question": [row["question"]], "answer": [[_ for _ in row["answer"]['aliases']]], "false_answer": [row["false_answer"]]}) - df = pd.concat([df, new_row], ignore_index=True) - return df - -def format_truthfulqa(question, choice, args): - if args.q_only: - return f"Q: {question}" - elif args.append_same_token: - return f"Q: {question} A: {choice}." + " The answer to the question is right" - else: - return f"Q: {question} A: {choice}" - -def format_truthfulqa_end_q(question, choice, rand_question, args): - return f"Q: {question} A: {choice} Q: {rand_question}" - - -def tokenized_tqa(dataset, tokenizer, args): - - all_prompts = [] - all_labels = [] - for i in range(len(dataset)): - question = dataset[i]['question'] - choices = dataset[i]['mc2_targets']['choices'] - labels = dataset[i]['mc2_targets']['labels'] - - assert len(choices) == len(labels), (len(choices), len(labels)) - - for j in range(len(choices)): - choice = choices[j] - label = labels[j] - prompt = format_truthfulqa(question, choice, args) - if i == 0 and j == 0: - print(prompt) - prompt = tokenizer(prompt, return_tensors = 'pt').input_ids - all_prompts.append(prompt) - all_labels.append(label) - - return all_prompts, all_labels - -def tokenized_tqa_gen_end_q(dataset, tokenizer, args): - - all_prompts = [] - all_labels = [] - all_categories = [] - for i in range(len(dataset)): - question = dataset[i]['question'] - category = dataset[i]['category'] - rand_idx = np.random.randint(len(dataset)) - rand_question = dataset[rand_idx]['question'] - - for j in range(len(dataset[i]['correct_answers'])): - answer = dataset[i]['correct_answers'][j] - # breakpoint() - prompt = format_truthfulqa_end_q(question, answer, rand_question, args) - prompt = tokenizer(prompt, return_tensors = 'pt').input_ids - all_prompts.append(prompt) - all_labels.append(1) - all_categories.append(category) - - for j in range(len(dataset[i]['incorrect_answers'])): - answer = dataset[i]['incorrect_answers'][j] - prompt = format_truthfulqa_end_q(question, answer, rand_question, args) - # breakpoint() - prompt = tokenizer(prompt, return_tensors = 'pt').input_ids - all_prompts.append(prompt) - all_labels.append(0) - all_categories.append(category) - - return all_prompts, all_labels, all_categories - -def tokenized_tqa_gen(dataset, tokenizer, args): - - all_prompts = [] - all_labels = [] - all_categories = [] - all_answer_length = [] - for i in range(len(dataset)): - question = dataset[i]['question'] - category = dataset[i]['category'] - - - for j in range(len(dataset[i]['correct_answers'])): - answer = dataset[i]['correct_answers'][j] - prompt = format_truthfulqa(question, answer, args) - prompt = tokenizer(prompt, return_tensors = 'pt').input_ids - if args.average: - all_answer_length.append(len(tokenizer(f"{answer}", return_tensors = 'pt').input_ids[0]) - 1) - # print(tokenizer(f"{answer}", return_tensors = 'pt').input_ids) - # print(prompt) - # breakpoint() - all_prompts.append(prompt) - all_labels.append(1) - all_categories.append(category) - - for j in range(len(dataset[i]['incorrect_answers'])): - answer = dataset[i]['incorrect_answers'][j] - prompt = format_truthfulqa(question, answer, args) - prompt = tokenizer(prompt, return_tensors = 'pt').input_ids - if args.average: - all_answer_length.append(len(tokenizer(f"{answer}", return_tensors = 'pt').input_ids[0]) - 1) - # print(tokenizer(f"{answer}", return_tensors='pt').input_ids) - # print(prompt) - all_prompts.append(prompt) - all_labels.append(0) - all_categories.append(category) - # breakpoint() - return all_prompts, all_labels, all_categories, all_answer_length - - -def get_llama_activations_bau(model, prompt, device): - - HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(model.config.num_hidden_layers)] - MLPS = [f"model.layers.{i}.mlp" for i in range(model.config.num_hidden_layers)] - - with torch.no_grad(): - prompt = prompt.to(device) - with TraceDict(model, HEADS+MLPS) as ret: - output = model(prompt, output_hidden_states = True) - hidden_states = output.hidden_states - hidden_states = torch.stack(hidden_states, dim = 0).squeeze() - hidden_states = hidden_states.detach().cpu().numpy() - head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS] - # breakpoint() - head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim = 0).squeeze().numpy() - mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS] - mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim = 0).squeeze().numpy() - - return hidden_states, head_wise_hidden_states, mlp_wise_hidden_states - - -def get_llama_logits(model, prompt, device): - - model.eval() - with torch.no_grad(): - prompt = prompt.to(device) - logits = model(prompt).logits - logits = logits.detach().cpu() - return logits - -def save_probes(probes, path): - """takes in a list of sklearn lr probes and saves them to path""" - with open(path, 'wb') as f: - pickle.dump(probes, f) - -def load_probes(path): - """loads a list of sklearn lr probes from path""" - with open(path, 'rb') as f: - probes = pickle.load(f) - return probes - -# -- TruthfulQA helper functions -- # - -def tqa_run_answers(frame, engine, tag, preset, model=None, tokenizer=None, verbose=True, device=None, cache_dir=None, interventions={}, intervention_fn=None, instruction_prompt=True, many_shot_prefix=None): - - """Stores answers from autoregressive HF models (GPT-2, GPT-Neo)""" - - if tag not in frame.columns: - frame[tag] = '' - - frame[tag].fillna('', inplace=True) - frame[tag] = frame[tag].astype(str) - - # get tokens for ending sequence - seq_start = np.array(tokenizer('A:')['input_ids']) - seq_end = np.array(tokenizer('Q:')['input_ids']) - - tokens = [] - for idx in frame.index: - if pd.isnull(frame.loc[idx, tag]) or not len(frame.loc[idx, tag]): - prompt = format_prompt(frame.loc[idx], preset, format='general') - prefix = '' - if instruction_prompt: # from Ouyang et al. (2022) Figure 17, followed by LLaMA evaluation, and then followed by us - prefix += 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' - if many_shot_prefix is not None: - prefix += many_shot_prefix + '\n\n' - prompt = prefix + prompt - input_ids = tokenizer(prompt, return_tensors='pt').input_ids - tokens.append(input_ids) - - # --- intervention code --- # - def id(head_output, layer_name): - return head_output - - if interventions == {}: - intervene = id - layers_to_intervene = [] - else: - intervene = partial(intervention_fn, start_edit_location='lt') - layers_to_intervene = list(interventions.keys()) - # --- intervention code --- # - - sequences = [] - with torch.no_grad(): - for idx, input_ids in enumerate(tqdm(tokens)): - max_len = input_ids.shape[-1] + 50 - - # --- intervention code --- # - - with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret: - input_ids = input_ids.to(device) - model_gen_tokens = model.generate(input_ids, top_k=1, max_length=max_len, num_return_sequences=1,)[:, input_ids.shape[-1]:] - - model_gen_str = tokenizer.decode(model_gen_tokens[0], skip_special_tokens=True) - model_gen_str = model_gen_str.strip() - - try: - # remove everything after 'Q:' - model_gen_str = model_gen_str.split("Q:")[0].strip() - # keep everything after A: - model_gen_str = model_gen_str.split("A:")[1].strip() - except: - pass - - if verbose: - print("MODEL_OUTPUT: ", model_gen_str) - - frame.loc[idx, tag] = model_gen_str - sequences.append(model_gen_str) - - # --- intervention code --- # - - if device: - torch.cuda.empty_cache() - - return frame - -def tqa_run_probs(frame, engine, tag, preset, model=None, tokenizer=None, verbose=True, device=None, cache_dir=None, interventions={}, intervention_fn=None, instruction_prompt=True, many_shot_prefix=None): - - """Runs multiple-choice metrics for autoregressive HuggingFace models (GPT-2, GPT-Neo)""" - - set_columns(tag, frame) - - if model is None: - model = AutoModelForCausalLM.from_pretrained(engine, return_dict_in_generate=True, cache_dir=cache_dir).to(device) - model.eval() - if tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(engine, cache_dir=cache_dir) - - with torch.no_grad(): - for idx in tqdm(frame.index): - if pd.isnull(frame.loc[idx, '{0} lprob max'.format(tag)]): - - # check that answer exists - if pd.isnull(frame.loc[idx, INCORRECT_COL]): - warnings.warn("References missing for {0}!".format(idx), stacklevel=2) - continue - if not len(frame.loc[idx, INCORRECT_COL]): - warnings.warn("References missing for {0}!".format(idx), stacklevel=2) - continue - - # reference answers - ref_best = format_best(frame.loc[idx, BEST_COL]) - ref_true = split_multi_answer(frame.loc[idx, ANSWER_COL]) - ref_false = split_multi_answer(frame.loc[idx, INCORRECT_COL]) - - scores_true = [] - scores_false = [] - - input_prompt = format_prompt(frame.loc[idx], preset, format='general') - if many_shot_prefix is not None: - input_prompt = many_shot_prefix + input_prompt - if instruction_prompt: - input_prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + input_prompt - - # --- intervention code --- # - def id(head_output, layer_name): - return head_output - - if interventions == {}: - layers_to_intervene = [] - else: - layers_to_intervene = list(interventions.keys()) - # --- intervention code --- # - - for temp_ans in ref_true: - # append the current answer choice to the prompt - prompt = format_prompt_with_answer_strings(frame.loc[idx, 'Question'], - temp_ans, - preset, - format='general') - if many_shot_prefix is not None: - prompt = many_shot_prefix + prompt - if instruction_prompt: - prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + prompt - - input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to(device) - prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) - start_edit_location = input_ids.shape[-1] + 4 # account for the "lnA: " which is 4 tokens. Don't have to worry about BOS token because already in prompt - - if interventions == {}: - intervene = id - else: - intervene = partial(intervention_fn, start_edit_location=start_edit_location) - - with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret: - outputs = model(prompt_ids)[0].squeeze(0) - - outputs = outputs.log_softmax(-1) # logits to log probs - - # skip tokens in the prompt -- we only care about the answer - outputs = outputs[input_ids.shape[-1] - 1: -1, :] - prompt_ids = prompt_ids[0, input_ids.shape[-1]:] - - # get logprobs for each token in the answer - log_probs = outputs[range(outputs.shape[0]), prompt_ids.squeeze(0)] - log_probs = log_probs[3:] # drop the '\nA:' prefix - - scores_true.append(log_probs.sum().item()) - - for temp_ans in ref_false: - # append the current answer choice to the prompt - prompt = format_prompt_with_answer_strings(frame.loc[idx, 'Question'], - temp_ans, - preset, - format='general') - if many_shot_prefix is not None: - prompt = many_shot_prefix + prompt - if instruction_prompt: - prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + prompt - - input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to(device) - prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) - start_edit_location = input_ids.shape[-1] + 4 # account for the "lnA: " which is 4 tokens. Don't have to worry about BOS token because already in prompt - - if interventions == {}: - intervene = id - else: - intervene = partial(intervention_fn, start_edit_location=start_edit_location) - - with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret: - outputs = model(prompt_ids)[0].squeeze(0) - - outputs = outputs.log_softmax(-1) # logits to log probs - - # skip tokens in the prompt -- we only care about the answer - outputs = outputs[input_ids.shape[-1] - 1: -1, :] - prompt_ids = prompt_ids[0, input_ids.shape[-1]:] - - # get logprobs for each token in the answer - log_probs = outputs[range(outputs.shape[0]), prompt_ids.squeeze(0)] - log_probs = log_probs[3:] # drop the '\nA:' prefix - - scores_false.append(log_probs.sum().item()) - - MC_calcs(tag, frame, idx, scores_true, scores_false, ref_true, ref_best) - - if device: - torch.cuda.empty_cache() - - return frame - -def run_ce_loss(model_key, model=None, tokenizer=None, device='cuda', interventions={}, intervention_fn=None, num_samples=100): - - # load owt text - # note this is tokenized with llama tokenizer - dataset = load_dataset("stas/openwebtext-10k")['train'] - dataset = dataset.shuffle() - dataset = dataset.select(range(num_samples)) - - # tokenize - owt = dataset.map(lambda x: {'input_ids': torch.tensor(tokenizer(x['text'], return_tensors='pt')['input_ids'][:,:128])}) - owt.set_format(type='torch', columns=['input_ids']) - - # define intervention - def id(head_output, layer_name): - return head_output - - if interventions == {}: - layers_to_intervene = [] - intervention_fn = id - else: - layers_to_intervene = list(interventions.keys()) - intervention_fn = partial(intervention_fn, start_edit_location=0) - - losses = [] - rand_idxs = np.random.choice(len(owt), num_samples, replace=False).tolist() - with torch.no_grad(): - for i in tqdm(rand_idxs): - - input_ids = owt[i]['input_ids'][:, :128].to(device) - - with TraceDict(model, layers_to_intervene, edit_output=intervention_fn) as ret: - loss = model(input_ids, labels=input_ids).loss - - losses.append(loss.item()) - - return np.mean(losses) - -def run_kl_wrt_orig(model_key, model=None, tokenizer=None, device='cuda', interventions={}, intervention_fn=None, num_samples=100, separate_kl_device=None): - - assert 'llama' in model_key or 'alpaca' in model_key or 'vicuna' in model_key, 'model must be llama model' - - # load owt text - # note this is tokenized with llama tokenizer - dataset = load_dataset("stas/openwebtext-10k")['train'] - dataset = dataset.shuffle() - dataset = dataset.select(range(num_samples)) - - # tokenize - owt = dataset.map(lambda x: {'input_ids': torch.tensor(tokenizer(x['text'], return_tensors='pt')['input_ids'][:,:128])}) - owt.set_format(type='torch', columns=['input_ids']) - - # define intervention - def id(head_output, layer_name): - return head_output - - if interventions == {}: - layers_to_intervene = [] - intervention_fn = id - else: - layers_to_intervene = list(interventions.keys()) - intervention_fn = partial(intervention_fn, start_edit_location=0) - - kl_divs = [] - rand_idxs = np.random.choice(len(owt), num_samples, replace=False).tolist() - - if separate_kl_device is not None: - orig_model = llama_iti.LLaMAForCausalLM.from_pretrained(ENGINE_MAP[model_key], torch_dtype=torch.float16, low_cpu_mem_usage=True) - orig_model.to('cuda') - - with torch.no_grad(): - for i in tqdm(rand_idxs): - input_ids = owt[i]['input_ids'][:, :128].to(device) - - if separate_kl_device is not None: - orig_logits = orig_model(input_ids.to('cuda')).logits.cpu().type(torch.float32) - else: - orig_logits = model(input_ids).logits.cpu().type(torch.float32) - - orig_probs = F.softmax(orig_logits, dim=-1) - - with TraceDict(model, layers_to_intervene, edit_output=intervention_fn) as ret: - logits = model(input_ids).logits.cpu().type(torch.float32) - probs = F.softmax(logits, dim=-1) - - kl_div = (orig_probs * (orig_probs / probs).log()).sum() / (input_ids.shape[-1] * input_ids.shape[-2]) - kl_divs.append(kl_div.item()) - - return np.mean(kl_divs) - -def alt_tqa_evaluate(models, metric_names, input_path, output_path, summary_path, device='cpu', verbose=False, preset='qa', interventions={}, intervention_fn=None, cache_dir=None, separate_kl_device=None, instruction_prompt=True, many_shot_prefix=None, judge_name=None, info_name=None): +def load_npy_shapes(directory_path, steer_place="layer"): """ - Inputs: - models: a dictionary of the form {model_name: model} where model is a HF transformer # TODO: doesn't work with models other than llama right now - metric_names: a list of metric names to evaluate (ex: ['mc', 'judge', 'info', 'bleu']) - input_path: where to draw TruthfulQA questions from - output_path: where to store model outputs and full metric outputs - summary_path: where to store metric summaries - interventions: a dictionary of the form {layer_name: [(head, direction, projected_mean, projected_std)]} - intervention_fn: a function that takes in a head output and a layer name and returns the intervened output - - Outputs a pd dataframe with summary values + Check shapes of all .npy files in the specified directory + and load them as benign vs adverse pairs. """ - - questions = utilities.load_questions(filename=input_path) - - print("ASSUMES OPENAI_API_KEY ENVIRONMENT VARIABLE IS SET") - import os - openai.api_key = os.environ.get('OPENAI_API_KEY') + # Get all .npy files in the directory + npy_files = glob.glob(os.path.join(directory_path, "*.npy")) - for mdl in models.keys(): - - # gpt-3 - if mdl in ['ada', 'babbage', 'curie', 'davinci']: # gpt-3 models - try: - models.run_GPT3(questions, mdl, mdl, preset) - utilities.save_questions(questions, output_path) - if 'mc' in metric_names: - models.run_probs_GPT3(questions, mdl, mdl, preset=preset) - utilities.save_questions(questions, output_path) - except Exception as err: - print(err) - - # gpt-2 - if mdl in ['gpt2', 'gpt2-xl']: - try: - print(questions) - questions = models.run_answers(questions, mdl, mdl, preset, device=device, cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - if 'mc' in metric_names: - models.run_probs(questions, mdl, mdl, preset=preset, device=device, cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - except Exception as err: - print(err) - - # llama - if mdl in ['llama_7B', 'alpaca_7B', 'vicuna_7B', 'llama2_chat_7B', 'llama2_chat_13B', 'llama2_chat_70B']: - - assert models[mdl] is not None, 'must provide llama model' - llama_model = models[mdl] - llama_tokenizer = llama_iti.LlamaTokenizer.from_pretrained(ENGINE_MAP[mdl]) + if not npy_files: + print(f"No .npy files found in {directory_path}") + return + + print(f"Found {len(npy_files)} .npy files:") + print("-" * 50) + + # Separate files into benign and adverse + benign_files = [f for f in npy_files if "benign" in f.lower() and steer_place in f.lower()] + adverse_files = [f for f in npy_files if "adverse" in f.lower() and steer_place in f.lower()] + + # Check shapes and load data + benign_data = {} + adverse_data = {} + + # Process benign files + print("BENIGN FILES:") + for file_path in benign_files: + try: + data = np.load(file_path) + file_name = os.path.basename(file_path) + benign_data[file_name] = data + print(f" {file_name}: shape {data.shape}, dtype {data.dtype}") + except Exception as e: + print(f" Error loading {file_path}: {e}") + + print("\nADVERSE FILES:") + for file_path in adverse_files: + try: + data = np.load(file_path) + file_name = os.path.basename(file_path) + adverse_data[file_name] = data + print(f" {file_name}: shape {data.shape}, dtype {data.dtype}") + except Exception as e: + print(f" Error loading {file_path}: {e}") + + print("\n" + "="*50) + print("COMPARISON SUMMARY:") + print("="*50) + + # Compare corresponding benign and adverse files + for benign_name in benign_data: + # Find corresponding adverse file + adverse_name = benign_name.replace("benign", "adverse") + if adverse_name in adverse_data: + benign_array = benign_data[benign_name] + adverse_array = adverse_data[adverse_name] - if 'judge' in metric_names or 'info' in metric_names: - questions = tqa_run_answers(questions, ENGINE_MAP[mdl], mdl, preset, model=llama_model, tokenizer=llama_tokenizer, - device=device, cache_dir=cache_dir, verbose=verbose, - interventions=interventions, intervention_fn=intervention_fn, instruction_prompt=instruction_prompt, many_shot_prefix=many_shot_prefix) - - utilities.save_questions(questions, output_path) - - if 'mc' in metric_names: - questions = tqa_run_probs(questions, ENGINE_MAP[mdl], mdl, model=llama_model, tokenizer=llama_tokenizer, preset=preset, device=device, cache_dir=cache_dir, verbose=False, interventions=interventions, intervention_fn=intervention_fn, instruction_prompt=instruction_prompt, many_shot_prefix=many_shot_prefix) - utilities.save_questions(questions, output_path) - - # gpt-neo - if mdl in ['neo-small', 'neo-med', 'neo-large']: - try: - models.run_answers(questions, ENGINE_MAP[mdl], mdl, preset, - device=device, cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - if 'mc' in metric_names: - models.run_probs(questions, ENGINE_MAP[mdl], mdl, preset=preset, device=device, - cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - except Exception as err: - print("ERROR") - print(err) - - # unifiedqa - if mdl in ['uqa-small', 'uqa-base', 'uqa-large', 'uqa-3b']: - try: - models.run_UnifQA(questions, ENGINE_MAP[mdl], mdl, preset, device=device, cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - if 'mc' in metric_names: - models.run_probs_T5(questions, ENGINE_MAP[mdl], mdl, preset, device=device, cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - except Exception as err: - print(err) - - for model_key in models.keys(): - - for metric in metric_names: - if metric == 'mc': - continue - if metric == 'bleurt': - try: - questions = metrics.run_BLEURT(model_key, questions, cache_dir=cache_dir) - utilities.save_questions(questions, output_path) - except Exception as err: - print(err) - elif metric in ['bleu', 'rouge']: - try: - questions = metrics.run_bleu_and_rouge(model_key, questions) - utilities.save_questions(questions, output_path) - except Exception as err: - print(err) - elif metric in ['judge', 'info']: - try: - if metric == 'judge': - questions = metrics.run_end2end_GPT3(model_key, 'GPT-judge', judge_name, questions, info=False) - utilities.save_questions(questions, output_path) - else: - questions = metrics.run_end2end_GPT3(model_key, 'GPT-info', info_name, questions, info=True) - utilities.save_questions(questions, output_path) - except Exception as err: - print(err) + print(f"\nComparing {benign_name} vs {adverse_name}:") + print(f" Shapes: {benign_array.shape} vs {adverse_array.shape}") + print(f" Dtypes: {benign_array.dtype} vs {adverse_array.dtype}") + + if benign_array.shape == adverse_array.shape: + print(f" ✓ Shapes match") else: - warnings.warn("Metric {0} not known, skipping!".format(metric), stacklevel=2) - - # save all - utilities.save_questions(questions, output_path) - - # format and print basic results - results = format_frame(questions) - results = results.mean(axis=0) - results = results.reset_index().rename(columns={'level_0': 'Model', - 'level_1': 'Metric', - 0: 'Value'}) - - # filter to most informative metrics - results = results[results['Metric'].isin(['MC1', 'MC2', - 'bleu acc', - 'rouge1 acc', - 'BLEURT acc', - 'GPT-judge acc', - 'GPT-info acc'])] - results = pd.pivot_table(results, 'Value', 'Model', 'Metric') - - # calculate cross entropy loss on owt and kl wrt to original unedited on owt - results['CE Loss'] = np.nan - results['KL wrt Orig'] = np.nan - - for model_key in models.keys(): - # if model_key not in questions.columns: - # warnings.warn("Answers missing for {0}!".format(model_key), stacklevel=2) - # continue - if 'llama' in model_key or 'alpaca' in model_key or 'vicuna' in model_key: - ce_loss = run_ce_loss(model_key, model=llama_model, tokenizer=llama_tokenizer, device=device, interventions=interventions, intervention_fn=intervention_fn) - kl_wrt_orig = run_kl_wrt_orig(model_key, model=llama_model, tokenizer=llama_tokenizer, device=device, interventions=interventions, intervention_fn=intervention_fn, separate_kl_device=separate_kl_device) - - results.loc[model_key, 'CE Loss'] = ce_loss - results.loc[model_key, 'KL wrt Orig'] = kl_wrt_orig - - # save results - results.to_csv(summary_path, index=False) + print(f" ✗ Shapes don't match!") + + print("-" * 30) - return results + return benign_data, adverse_data -def flattened_idx_to_layer_head(flattened_idx, num_heads): - return flattened_idx // num_heads, flattened_idx % num_heads -def layer_head_to_flattened_idx(layer, head, num_heads): - return layer * num_heads + head -def train_probes(seed, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels, num_layers, num_heads): +def split_indices(n, train_ratio=0.8, val_ratio=0.1, seed=42): - all_head_accs = [] - probes = [] - all_X_train = np.concatenate([separated_head_wise_activations[i] for i in train_set_idxs], axis = 0) - all_X_val = np.concatenate([separated_head_wise_activations[i] for i in val_set_idxs], axis = 0) - y_train = np.concatenate([separated_labels[i] for i in train_set_idxs], axis = 0) - y_val = np.concatenate([separated_labels[i] for i in val_set_idxs], axis = 0) - for layer in tqdm(range(num_layers)): - for head in range(num_heads): - X_train = all_X_train[:,layer,head,:] - X_val = all_X_val[:,layer,head,:] - - clf = LogisticRegression(random_state=seed, max_iter=1000).fit(X_train, y_train) - y_pred = clf.predict(X_train) - y_val_pred = clf.predict(X_val) - all_head_accs.append(accuracy_score(y_val, y_val_pred)) - probes.append(clf) + rng = np.random.default_rng(seed) + indices = np.arange(n) + rng.shuffle(indices) - all_head_accs_np = np.array(all_head_accs) + n_train = int(n * train_ratio) + n_val = int(n * val_ratio) - return probes, all_head_accs_np + train_index = indices[:n_train] + val_index = indices[n_train:n_train + n_val] + test_index = indices[n_train + n_val:] -def get_top_heads(train_idxs, val_idxs, separated_activations, separated_labels, num_layers, num_heads, seed, num_to_intervene, use_random_dir=False): - - probes, all_head_accs_np = train_probes(seed, train_idxs, val_idxs, separated_activations, separated_labels, num_layers=num_layers, num_heads=num_heads) - all_head_accs_np = all_head_accs_np.reshape(num_layers, num_heads) - - top_heads = [] - - top_accs = np.argsort(all_head_accs_np.reshape(num_heads*num_layers))[::-1][:num_to_intervene] - top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in top_accs] - if use_random_dir: - # overwrite top heads with random heads, no replacement - random_idxs = np.random.choice(num_heads*num_layers, num_heads*num_layers, replace=False) - top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in random_idxs[:num_to_intervene]] - - return top_heads, probes - -def get_interventions_dict(top_heads, probes, tuning_activations, num_heads, use_center_of_mass, use_random_dir, com_directions): - - interventions = {} - for layer, head in top_heads: - interventions[f"model.layers.{layer}.self_attn.head_out"] = [] - for layer, head in top_heads: - if use_center_of_mass: - direction = com_directions[layer_head_to_flattened_idx(layer, head, num_heads)] - elif use_random_dir: - direction = np.random.normal(size=(128,)) - else: - direction = probes[layer_head_to_flattened_idx(layer, head, num_heads)].coef_ - direction = direction / np.linalg.norm(direction) - activations = tuning_activations[:,layer,head,:] # batch x 128 - proj_vals = activations @ direction.T - proj_val_std = np.std(proj_vals) - interventions[f"model.layers.{layer}.self_attn.head_out"].append((head, direction.squeeze(), proj_val_std)) - for layer, head in top_heads: - interventions[f"model.layers.{layer}.self_attn.head_out"] = sorted(interventions[f"model.layers.{layer}.self_attn.head_out"], key = lambda x: x[0]) - - return interventions - -def get_separated_activations(labels, head_wise_activations): - - # separate activations by question - dataset=load_dataset('truthful_qa', 'multiple_choice')['validation'] - actual_labels = [] - for i in range(len(dataset)): - actual_labels.append(dataset[i]['mc2_targets']['labels']) - - idxs_to_split_at = np.cumsum([len(x) for x in actual_labels]) - - labels = list(labels) - separated_labels = [] - for i in range(len(idxs_to_split_at)): - if i == 0: - separated_labels.append(labels[:idxs_to_split_at[i]]) - else: - separated_labels.append(labels[idxs_to_split_at[i-1]:idxs_to_split_at[i]]) - assert separated_labels == actual_labels - - separated_head_wise_activations = np.split(head_wise_activations, idxs_to_split_at) - - return separated_head_wise_activations, separated_labels, idxs_to_split_at - -def get_com_directions(num_layers, num_heads, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels): - - com_directions = [] - - for layer in range(num_layers): - for head in range(num_heads): - usable_idxs = np.concatenate([train_set_idxs, val_set_idxs], axis=0) - usable_head_wise_activations = np.concatenate([separated_head_wise_activations[i][:,layer,head,:] for i in usable_idxs], axis=0) - usable_labels = np.concatenate([separated_labels[i] for i in usable_idxs], axis=0) - true_mass_mean = np.mean(usable_head_wise_activations[usable_labels == 1], axis=0) - false_mass_mean = np.mean(usable_head_wise_activations[usable_labels == 0], axis=0) - com_directions.append(true_mass_mean - false_mass_mean) - com_directions = np.array(com_directions) - - return com_directions - -def get_hal_prompt(knowledge, question, instruction): - if isinstance(instruction, str): - message = [ - {"role": "user", "content": instruction + - "\n\n#Knowledge#: " + knowledge + - "\n#Question#: " + question + - # "\n#Right Answer#: " + answer + - "\n#Hallucinated Answer#: "} - ] - elif isinstance(instruction, list): - mes = [{"role": "user", - "content": "You are now a mature hallucination generator. Please generate hallucinated answer for the following question. You can use any method you have learned that is suitable for the given question." + - "\n\n#Knowledge#: " + knowledge + - "\n#Question#: " + question + - # "\n#Right Answer#: " + answer + - "\n#Hallucinated Answer#: "}] - message = instruction + mes - else: - raise TypeError("The instruction must be str or list!") - return message - -def get_qa_prompt(knowledge, question): - return [ - {"role": "user", "content": "Answer the question concisely." + - "\n\n#Knowledge#: " + knowledge + - "\n#Question#: " + question + - # "\n#Right Answer#: " + answer + - "\n#Answer#: "} - ] - -def get_truth_prompt(knowledge, question): - return [ - {'role': 'system', 'content': """ -You are an AI assistant. You'll provide helpful, harmless, and detailed responses to all user inquiries. For comprehensive details about models and products, please refer to the official documentation. - -# Key Guidelines: -1. **Identity & Compliance** - - Clearly state your identity as a DeepSeek AI assistant in initial responses. - - Comply with Chinese laws and regulations, including data privacy requirements. - -2. **Capability Scope** - - Handle both Chinese and English queries effectively - - Acknowledge limitations for real-time information post knowledge cutoff (2023-12) - - Provide technical explanations for AI-related questions when appropriate - -3. **Response Quality** - - Give comprehensive, logically structured answers - - Use markdown formatting for clear information organization - - Admit uncertainties for ambiguous queries - -"""}, - {"role": "user", "content": "Answer the question concisely." + - "\n\n#Knowledge#: " + knowledge + - "\n#Question#: " + question + - # "\n#Right Answer#: " + answer + - "\n#Answer#: "} - ] -def mahalanobis_distance(x, mean, cov): - cov_inv = torch.inverse(cov) - delta = x - mean - return torch.sqrt(torch.einsum('bi,ij,bj->b', delta, cov_inv, delta)) \ No newline at end of file + return train_index, val_index, test_index \ No newline at end of file diff --git a/visualization.py b/visualization.py index 69c0b15..19a8c4e 100644 --- a/visualization.py +++ b/visualization.py @@ -13,7 +13,8 @@ from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt -def check_npy_shapes(directory_path): + +def load_npy_shapes(directory_path, steer_place="layer"): """ Check shapes of all .npy files in the specified directory and load them as benign vs adverse pairs. @@ -29,8 +30,8 @@ def check_npy_shapes(directory_path): print("-" * 50) # Separate files into benign and adverse - benign_files = [f for f in npy_files if "benign" in f.lower()] - adverse_files = [f for f in npy_files if "adverse" in f.lower()] + benign_files = [f for f in npy_files if "benign" in f.lower() and steer_place in f.lower()] + adverse_files = [f for f in npy_files if "adverse" in f.lower() and steer_place in f.lower()] # Check shapes and load data benign_data = {} @@ -75,12 +76,6 @@ def check_npy_shapes(directory_path): if benign_array.shape == adverse_array.shape: print(f" ✓ Shapes match") - - # Calculate some basic statistics for comparison - diff = np.abs(benign_array - adverse_array) - print(f" Mean absolute difference: {np.mean(diff):.6f}") - print(f" Max absolute difference: {np.max(diff):.6f}") - print(f" Min absolute difference: {np.min(diff):.6f}") else: print(f" ✗ Shapes don't match!") @@ -300,7 +295,7 @@ def main(): return # Check and load the .npy files - benign_data, adverse_data = check_npy_shapes(target_directory) + benign_data, adverse_data = load_npy_shapes(target_directory) # Perform PCA analysis to identify discriminative layers if benign_data and adverse_data: