diff --git a/ml_triviaqa_bleurt_score.npy b/ml_triviaqa_bleurt_score.npy deleted file mode 100644 index d721980..0000000 Binary files a/ml_triviaqa_bleurt_score.npy and /dev/null differ diff --git a/steer_vector.py b/steer_vector.py index fad1700..6dca5ba 100644 --- a/steer_vector.py +++ b/steer_vector.py @@ -459,18 +459,6 @@ def main(): else: assert "Not supported dataset name!" - - target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak" - benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place) - y_zero=np.zeros(len(benign_data)) - y_one=np.ones(len(adverse_data)) - - data_embedding = np.concatenate((benign_data, adverse_data), axis=0) - gts = np.concatenate((y_zero, y_one), axis=0) - - train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio) - - # ============================================================ @@ -487,12 +475,42 @@ def main(): device_map="auto", token = '' ) + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '') + + prompts = [] + qa_pairs = [] + categories = [] + + length = len(dataset) + + for i in tqdm(range(length)): + question = dataset[i]['query'] + adversary = dataset[i]['target'] + prompt = tokenizer( + f"Answer the question concisely. Q: {question}" + " A:" + adversary, + return_tensors='pt' + ).input_ids.cuda() + + prompts.append(prompt) + qa_pairs.append({'Question': question, 'Answer': adversary}) + categories.append(1) # 1 = adverse + + for i in tqdm(range(length)): + question = dataset[i]['query'] + clean= dataset[i]['response'] + prompt = tokenizer( + f"Answer the question concisely. Q: {question}" + " A:" + clean, + return_tensors='pt' + ).input_ids.cuda() + + prompts.append(prompt) + qa_pairs.append({'Question': question, 'Answer': clean}) + categories.append(0) # 0 = benign + + train_index, val_index, test_index=split_indices(len(prompts), args.train_ratio, args.val_ratio) + - train_data = [data_embedding[i] for i in train_index] - train_labels = [gts[i] for i in train_index] - test_data = [data_embedding[i] for i in test_index] - test_labels = [gts[i] for i in test_index] # ====== 6.4 冻结 LLM,只训练 SV 参数 ====== num_layers = model.config.num_hidden_layers diff --git a/tqa_score.mat b/tqa_score.mat deleted file mode 100644 index c8be060..0000000 Binary files a/tqa_score.mat and /dev/null differ