This commit is contained in:
weixin_43297441 2025-11-30 13:35:28 +08:00
parent f29b0c5286
commit fd6d6b8cec
4 changed files with 192 additions and 1270 deletions

View File

@ -58,6 +58,7 @@ def main():
parser.add_argument('--model_name', type=str, default='alpaca_7B')
parser.add_argument('--judge_model', type=str, default='gpt-4')
parser.add_argument('--dataset_name', type=str, default='AdvBench')
parser.add_argument('--test_ratio', type=float, default=0.2)
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
@ -97,7 +98,10 @@ def main():
length = int(len(dataset)*0.55)
length = len(dataset)
for i in tqdm(range(length)):
question = dataset[i]['query']
adversary = dataset[i]['target']
@ -142,8 +146,8 @@ def main():
benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32)
adverse_embed_generated = np.asarray(np.stack(adverse_embed_generated), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_layer_wise.npy', benign_embed_generated)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_layer_wise.npy', adverse_embed_generated)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_layer_wise.npy', benign_embed_generated)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_layer_wise.npy', adverse_embed_generated)
benign_embed_generated_loc2 = np.asarray(np.stack(benign_embed_generated_loc2), dtype=np.float32)
benign_embed_generated_loc1 = np.asarray(np.stack(benign_embed_generated_loc1), dtype=np.float32)
@ -151,10 +155,10 @@ def main():
adverse_embed_generated_loc1 = np.asarray(np.stack(adverse_embed_generated_loc1), dtype=np.float32)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_head_wise.npy', benign_embed_generated_loc2)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_head_wise.npy', benign_embed_generated_loc2)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2)
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1)

View File

@ -13,6 +13,7 @@ from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
from sinkhorn_knopp import SinkhornKnopp_imb
import logging
from utils import load_npy_shapes,split_indices
def seed_everything(seed: int):
@ -47,7 +48,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
layer_number = -1 # 使用最后一层 hidden state这里 -1 当索引)
# ========= 日志 & 结果保存目录 =========
dir_name = f"TSV_{args.model_name}_{args.dataset_name}/exemplar_num_{args.num_exemplars}_num_selected_data_{args.num_selected_data}/{args.component}/{args.str_layer}/{args.lam}"
dir_name = f"SV_{args.model_name}_{args.dataset_name}/{args.component}/{args.str_layer}/{args.lam}"
log_dir = f"/{dir_name}/"
log_file = os.path.join(log_dir, f"log.txt")
os.makedirs(dir_name, exist_ok=True)
@ -61,14 +62,12 @@ def train_model(model, optimizer, device, prompts, labels, args):
logging.info("Starting training")
logging.info(
f"Training parameters: few_shot_size={args.num_exemplars}, "
f"num_selected_data={args.num_selected_data}, "
f"component={args.component}, str_layer={args.str_layer}"
)
# 解包数据test / wild train / exemplar
test_prompts, train_prompts, exemplar_prompts = prompts[0], prompts[1], prompts[2]
test_labels, train_labels, exemplar_labels = labels[0], labels[1], labels[2]
test_prompts, train_prompts = prompts[0], prompts[1]
test_labels, train_labels = labels[0], labels[1]
batch_size = args.batch_size
losses = []
@ -76,11 +75,9 @@ def train_model(model, optimizer, device, prompts, labels, args):
scaler = GradScaler() # 混合精度的梯度缩放器
num_exemplars = args.num_exemplars
num_trains = args.num_train
# ========= Sinkhorn OT 相关初始化(伪标签阶段要用)=========
args.num_iters_sk = 3 # Sinkhorn 迭代次数(论文 Eq.(8)(9)
args.epsilon_sk = 0.05 # OT 熵正则 ε(论文中也设为 0.05
# 用 exemplar 的标签估计类分布 wtruthful/hallu 的先验)
# ex_hallu = P(hallucinated), ex_true = P(truthful)
@ -98,8 +95,8 @@ def train_model(model, optimizer, device, prompts, labels, args):
centroids = F.normalize(centroids, p=2, dim=1)
# 把 exemplar 的 prompt/label 整理成一个大 batchpadding
exemplar_prompts_, exemplar_labels_ = exemplar_prompts, exemplar_labels
exemplar_prompts, exemplar_labels = collate_fn(exemplar_prompts, exemplar_labels)
train_prompts_, train_labels_ = train_prompts, train_labels
train_prompts, train_labels = collate_fn(train_prompts, train_labels)
# ========= Phase 1: Initial training on exemplars =========
num_epochs = args.init_num_epochs
@ -107,7 +104,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
for epoch in range(num_epochs):
running_loss = 0.0
total = 0
num_samples = num_exemplars
num_samples = num_trains
# 在 exemplar 上分 batch 训练(对应论文中 D_E 只含人工标注数据)
for batch_start in tqdm(
@ -115,8 +112,8 @@ def train_model(model, optimizer, device, prompts, labels, args):
desc=f"Epoch {epoch+1}/{num_epochs} Batches",
leave=False,
):
batch_prompts = exemplar_prompts[batch_start: batch_start + batch_size]
batch_labels = exemplar_labels[batch_start: batch_start + batch_size]
batch_prompts = train_prompts[batch_start: batch_start + batch_size]
batch_labels = train_labels[batch_start: batch_start + batch_size]
# attention_mask: 1 = valid token, 0 = padding
attention_mask = (batch_prompts != 0).half()
@ -137,7 +134,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
hidden_states = torch.stack(hidden_states, dim=0).squeeze()
last_layer_hidden_state = hidden_states[layer_number] # [B, T, H]
# 对应论文里 r_v最后非 padding token 的 hidden包含 TSV steer
# 对应论文里 r_v最后非 padding token 的 hidden包含 SV steer
last_token_rep = get_last_non_padded_token_rep(
last_layer_hidden_state, attention_mask.squeeze()
)
@ -405,363 +402,126 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size,
HF_NAMES = {
'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B',
'qwen2.5-7B': 'Qwen/Qwen2.5-7B'
'qwen2.5-7B': 'Qwen/Qwen2.5-7B' ,
'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
'honest_llama_7B': 'validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15',
'alpaca_7B': 'circulus/alpaca-7b',
'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
'llama2_chat_7B': 'models/Llama-2-7b-chat-hf',
'llama2_chat_13B': 'models/Llama-2-13b-chat-hf',
'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
}
def main():
# ======================
# 1. 解析命令行参数
# ======================
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='llama3.1-8B')
parser.add_argument('--model_prefix', type=str, default='', help='prefix of model name')
parser.add_argument('--num_gene', type=int, default=1)
parser.add_argument('--gene', type=int, default=0)
parser.add_argument('--generate_gt', type=int, default=0)
parser.add_argument('--dataset_name', type=str, default='tqa')
parser.add_argument('--device', type=int, default=0)
parser.add_argument('--wild_ratio', type=float, default=0.75)
parser.add_argument('--thres_gt', type=float, default=0.5)
parser.add_argument('--most_likely', type=int, default=0)
parser.add_argument('--model_name', type=str, default='alpaca_7B')
parser.add_argument('--num_gene', type=int, default=1) # 每个问题生成多少个答案
parser.add_argument('--train_sv', type=bool, default=True) # 是否执行“生成答案”阶段1=生成+保存答案0=不生成)
parser.add_argument('--steer_place', type=str, default='layer', help='where to steer (layer, mlp and head)') # 是否执行“生成答案”阶段1=生成+保存答案0=不生成)
parser.add_argument('--dataset_name', type=str, default='AdvBench') # 使用哪个数据集
parser.add_argument('--device', type=int, default=0) # GPU id没怎么用device_map="auto" 为主)
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
parser.add_argument("--batch_size", type=int, default=128)
parser.add_argument("--cos_temp", type=float, default=0.1)
parser.add_argument("--ema_decay", type=float, default=0.99)
parser.add_argument("--lr", type=float, default=0.005)
parser.add_argument("--str_layer", type=int, default=9)
parser.add_argument("--component", type=str, default='res')
parser.add_argument("--lam", type=float, default=5)
parser.add_argument("--init_num_epochs", type=int, default=20)
parser.add_argument("--aug_num_epochs", type=int, default=20)
parser.add_argument("--num_exemplars", type=int, default=32)
parser.add_argument("--num_selected_data", type=int, default=128)
parser.add_argument("--cls_dist", type=str, default='proxy')
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--cos_temp", type=float, default=0.1) # 余弦相似度的 softmax 温度
parser.add_argument("--ema_decay", type=float, default=0.99) # 原型向量 EMA 衰减系数
parser.add_argument("--lr", type=float, default=0.005) # SV 的学习率
parser.add_argument("--str_layer", type=int, default=9) # 插 SV 的层号(第几层 hidden
parser.add_argument("--component", type=str, default='res') # SV 以何种方式注入(残差等)
parser.add_argument("--lam", type=float, default=5) # SV 强度 λ
parser.add_argument("--init_num_epochs", type=int, default=20) # SV训练轮数
parser.add_argument("--optimizer", type=str, default='AdamW')
parser.add_argument("--num_iters_sk", type=int, default=3)
parser.add_argument("--epsilon_sk", type=float, default=0.05)
parser.add_argument('--train_ratio', type=float, default=0.8)
parser.add_argument('--val_ratio', type=float, default=0.1)
args = parser.parse_args()
# ======================
# 2. 确定 HuggingFace 模型名
# ======================
# HF_NAMES 在文件前面定义:
# HF_NAMES = {'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B', 'qwen2.5-7B': 'Qwen/Qwen2.5-7B'}
# model_prefix 可以拼接一些前缀,这里默认是空
model_name_or_path = HF_NAMES[args.model_prefix + args.model_name]
if args.dataset_name == "tqa":
dataset = load_dataset("truthful_qa", 'generation')['validation']
elif args.dataset_name == 'triviaqa':
dataset = load_dataset("trivia_qa", "rc.nocontext", split="validation")
id_mem = set()
def remove_dups(batch):
if batch['question_id'][0] in id_mem:
return {_: [] for _ in batch.keys()}
id_mem.add(batch['question_id'][0])
return batch
dataset = dataset.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False)
elif args.dataset_name == 'sciq':
dataset = load_dataset("allenai/sciq", split="validation")
elif args.dataset_name == 'nq_open':
dataset = load_dataset("google-research-datasets/nq_open", split="validation")
# ======================
# 3. 加载不同的数据集
# ======================
if args.dataset_name == "AdvBench":
dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test']
elif args.dataset_name == 'HarmBench':
dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard']
elif args.dataset_name == 'JBB':
dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test']
else:
raise ValueError("Invalid dataset name")
assert "Not supported dataset name!"
if args.gene:
target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak"
benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place)
y_zero=np.zeros(len(benign_data))
y_one=np.ones(len(adverse_data))
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '')
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '')
device = torch.device("cuda")
all_decoded_answers = []
begin_index = 0
end_index = len(dataset)
data_embedding = np.concatenate((benign_data, adverse_data), axis=0)
gts = np.concatenate((y_zero, y_one), axis=0)
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/')
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/answers'):
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/answers')
period_token_id = [tokenizer(_)['input_ids'][-1] for _ in ['\n']]
period_token_id += [tokenizer.eos_token_id]
for i in range(begin_index, end_index):
answers = [None] * args.num_gene
answers_ = [None] * args.num_gene
question = dataset[i]['question']
prompt = tokenizer(f"Answer the question concisely. Q: {question}" + " A:", return_tensors='pt').input_ids.cuda()
for gen_iter in range(args.num_gene):
if args.most_likely:
generated = model.generate(prompt,
num_beams=5,
num_return_sequences=1,
do_sample=False,
max_new_tokens=64,
)
else:
generated = model.generate(prompt,
do_sample=True,
num_return_sequences=1,
num_beams=1,
max_new_tokens=64,
temperature=0.5,
top_p=1.0)
decoded = tokenizer.decode(generated[0, prompt.shape[-1]:],
skip_special_tokens=True)
# answers[gen_iter] = decoded
# Cleaning
if '\nAnswer the question concisely.' in decoded:
print('#####error')
print(decoded.split('\nAnswer the question concisely.')[1])
print('#####error')
decoded = decoded.split('\nAnswer the question concisely.')[0]
if 'Answer the question concisely' in decoded:
print('#####error')
print(decoded.split('Answer the question concisely')[1])
print('#####error')
decoded = decoded.split('Answer the question concisely')[0]
if 'The answer to the question' in decoded:
print('#####error')
print(decoded.split('The answer to the question')[1])
print('#####error')
decoded = decoded.split('The answer to the question')[0]
if 'How to Write a Concise Statement' in decoded:
print('#####error')
print(decoded.split('How to Write a Concise Statement')[1])
print('#####error')
decoded = decoded.split('How to Write a Concise Statement')[0]
if 'Q:' in decoded:
print('#####error')
print(decoded.split('Q:')[1])
print('#####error')
decoded = decoded.split('Q:')[0]
if '\nYou are an AI assistant' in decoded:
print('#####error')
print(decoded.split('\nYou are an AI assistant')[1])
print('#####error')
decoded = decoded.split('\nYou are an AI assistant')[0]
if 'You are an AI assistant' in decoded:
print('#####error')
print(decoded.split('You are an AI assistant')[1])
print('#####error')
decoded = decoded.split('You are an AI assistant')[0]
if 'A:' in decoded:
print('#####error')
print(decoded.split('A:')[1])
print('#####error')
decoded = decoded.split('A:')[0]
if 'B:' in decoded:
print('#####error')
print(decoded.split('B:')[1])
print('#####error')
decoded = decoded.split('B:')[0]
if 'C:' in decoded:
print('#####error')
print(decoded.split('C:')[1])
print('#####error')
decoded = decoded.split('C:')[0]
if 'D:' in decoded:
print('#####error')
print(decoded.split('D:')[1])
print('#####error')
decoded = decoded.split('D:')[0]
print(f'Cleaned Answer: {decoded}')
answers[gen_iter] = decoded
train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio)
print('sample: ', i)
if args.most_likely:
info = 'most_likely_'
else:
info = 'batch_generations_'
print("Saving answers")
print(decoded)
np.save(f'./save_for_eval/{args.dataset_name}_hal_det/answers/' + info + f'hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy',
answers)
elif args.generate_gt:
from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer
model = BleurtForSequenceClassification.from_pretrained('lucadiliello/BLEURT-20').cuda()
tokenizer = BleurtTokenizer.from_pretrained('lucadiliello/BLEURT-20')
model.eval()
gts = np.zeros(0)
length = len(dataset)
for i in range(length):
if args.dataset_name == 'tqa':
best_answer = dataset[i]['best_answer']
correct_answer = dataset[i]['correct_answers']
all_answers = [best_answer] + correct_answer
question = dataset[i]['question']
elif args.dataset_name == 'triviaqa':
all_answers = dataset[i]['answer']['aliases']
if args.most_likely:
# answers = np.load(
# f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
else:
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/batch_generations_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
# get the gt.
predictions = answers
all_results = np.zeros((len(all_answers), len(predictions)))
with torch.no_grad():
for anw in range(len(all_answers)):
inputs = tokenizer(predictions.tolist(), [all_answers[anw]] * len(predictions),
padding='longest', return_tensors='pt')
for key in list(inputs.keys()):
inputs[key] = inputs[key].cuda()
res = np.asarray(model(**inputs).logits.flatten().tolist())
all_results[anw] = res
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
if i % 10 == 0:
print("samples passed: ", i)
if args.most_likely:
# np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
else:
np.save(f'./bg_{args.dataset_name}_bleurt_score.npy', gts)
else:
# ============================================================
# 6. 模式三SV 训练阶段gene=0 且 generate_gt=0
# ============================================================
if args.train_sv:
device = torch.device("cuda")
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '')
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '')
# 加载基础 LLM之后会在其上插 TSV
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
device_map="auto",
token = ''
)
prompts = []
qa_pairs = []
categories = []
train_data = [data_embedding[i] for i in train_index]
train_labels = [gts[i] for i in train_index]
length = len(dataset)
for i in tqdm(range(length)):
question = dataset[i]['question']
if args.dataset_name == 'tqa':
categories.append(dataset[i]['category'])
answers = np.load(
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
for anw in answers:
prompt = tokenizer(
f"Answer the question concisely. Q: {question}" + " A:" + anw,
return_tensors='pt').input_ids.cuda()
prompts.append(prompt)
qa_pairs.append({'Question': question, 'Answer': anw})
gts = np.load(f'./ml_{args.dataset_name}_bleurt_score.npy')
length = len(dataset)
if args.dataset_name == 'tqa' or args.dataset_name == 'triviaqa':
args.thres_gt = 0.5
else:
args.thres_gt = 0.2
gt_label = np.asarray(gts> args.thres_gt, dtype=np.int32)
# index = np.random.permutation(length)
# exemplar_index = index[:args.num_exemplars]
# wild_q_indices = index[:int(args.wild_ratio * length)]
index = np.load(f'data_indices/data_index_{args.dataset_name}.npy')
exemplar_index = np.load(f'data_indices/exemplar_idx_{args.dataset_name}.npy')
wild_q_indices = index[:int(args.wild_ratio * length)]
wild_q_indices1 = wild_q_indices[:len(wild_q_indices) - 100]
args.num_exemplars = len(exemplar_index)
gt_label_test = []
gt_label_wild = []
gt_label_exemplar = []
test_prompts = []
train_prompts = []
exemplar_prompts = []
for i in range(length):
if i not in wild_q_indices:
gt_label_test.extend(gt_label[i: i+1])
test_prompts.extend(prompts[i:i+1])
elif i in exemplar_index:
gt_label_exemplar.extend(gt_label[i: i+1])
exemplar_prompts.extend(prompts[i:i+1])
elif i in wild_q_indices1:
gt_label_wild.extend(gt_label[i: i+1])
train_prompts.extend(prompts[i:i+1])
gt_label_test = np.asarray(gt_label_test)
gt_label_exemplar = np.asarray(gt_label_exemplar)
gt_label_wild = np.asarray(gt_label_wild)
labels = [ gt_label_test, gt_label_wild, gt_label_exemplar]
prompts = [ test_prompts, train_prompts, exemplar_prompts]
test_data = [data_embedding[i] for i in test_index]
test_labels = [gts[i] for i in test_index]
# ====== 6.4 冻结 LLM只训练 SV 参数 ======
num_layers = model.config.num_hidden_layers
hidden_size = model.config.hidden_size
for param in model.parameters():
param.requires_grad = False
param.requires_grad = False
tsv = nn.ParameterList(
[nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)])
# 每一层一个 SV 向量(虽然实际只在某几层生效)
sv = nn.ParameterList(
[nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)]
)
sv.to(device)
tsv.to(device)
# 在模型对应层里插入 SVadd_sv_layers 会改 forward
add_sv_layers(model, sv, [args.lam], args)
add_tsv_layers(model, tsv, [args.lam], args)
# 只把 SV 的参数丢给优化器
optimizer = torch.optim.AdamW(list(sv.parameters()), lr=args.lr)
optimizer = torch.optim.AdamW(list(tsv.parameters()), lr=args.lr)
# ====== 6.5 调用 train_model进入两阶段训练流程 ======
train_model(model, optimizer, device, train_data, train_labels, args=args)
else:
print("Skip training steer vectors.")
raise NotImplementedError("Only training SV is implemented in this script.")
train_model(model, optimizer, device, prompts, labels, args=args)
if __name__ == '__main__':
# 为了结果可复现,固定随机种子
seed_everything(42)
main()

965
utils.py
View File

@ -20,6 +20,7 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc
from sklearn.linear_model import LogisticRegression
import pickle
from functools import partial
import glob
# from truthfulqa import utilities, models, metrics
# import openai
@ -34,928 +35,90 @@ ENGINE_MAP = {
'llama2_chat_13B': 'meta-llama/Llama-2-13b-chat-hf',
'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
}
# from truthfulqa.utilities import (
# format_prompt,
# format_prompt_with_answer_strings,
# split_multi_answer,
# format_best,
# find_start,
# )
# from truthfulqa.presets import preset_map, COMPARE_PRIMER
# from truthfulqa.models import find_subsequence, set_columns, MC_calcs
# from truthfulqa.evaluate import format_frame, data_to_dict
############# CCS #############
class MLPProbe(nn.Module):
    """Small two-layer MLP probe: d -> 100 -> 1 with a sigmoid output.

    Used by CCS as the non-linear alternative to a plain linear probe.
    """

    def __init__(self, d):
        super().__init__()
        self.linear1 = nn.Linear(d, 100)
        self.linear2 = nn.Linear(100, 1)

    def forward(self, x):
        # ReLU hidden layer followed by a sigmoid "probability" head.
        hidden = F.relu(self.linear1(x))
        return torch.sigmoid(self.linear2(hidden))
class CCS(object):
    """Contrast-Consistent Search probe.

    Fits a small probe on paired representations of contradictory statements
    (x0, x1) without labels, by minimizing an informative + consistent loss.

    Args:
        x0, x1: np.ndarray of shape (n, d) — paired hidden states.
        nepochs: epochs per single training run.
        ntries: random restarts in repeated_train().
        lr: AdamW learning rate.
        batch_size: minibatch size; -1 means full batch.
        verbose: kept for API compatibility (currently unused).
        device: torch device string.
        linear: if True use Linear+Sigmoid probe, else MLPProbe.
        weight_decay: AdamW weight decay.
        var_normalize: if True, normalize() also divides by the std.
    """

    def __init__(self, x0, x1, nepochs=1000, ntries=10, lr=1e-3, batch_size=-1,
                 verbose=False, device="cuda", linear=True, weight_decay=0.01, var_normalize=False):
        # data
        self.var_normalize = var_normalize
        self.x0 = self.normalize(x0)
        self.x1 = self.normalize(x1)
        self.d = self.x0.shape[-1]

        # training hyper-parameters
        self.nepochs = nepochs
        self.ntries = ntries
        self.lr = lr
        self.verbose = verbose
        self.device = device
        self.batch_size = batch_size
        self.weight_decay = weight_decay

        # probe
        self.linear = linear
        self.probe = self.initialize_probe()
        self.best_probe = copy.deepcopy(self.probe)

    def initialize_probe(self):
        """(Re)create the probe network on self.device and return it.

        BUGFIX: this method previously returned None, so the assignment
        `self.probe = self.initialize_probe()` in __init__ left both
        self.probe and self.best_probe as None until repeated_train() was
        called; get_acc() before training would then crash. It now returns
        the freshly built probe.
        """
        if self.linear:
            self.probe = nn.Sequential(nn.Linear(self.d, 1), nn.Sigmoid())
        else:
            self.probe = MLPProbe(self.d)
        self.probe.to(self.device)
        return self.probe

    def normalize(self, x):
        """
        Mean-normalizes the data x (of shape (n, d))
        If self.var_normalize, also divides by the standard deviation
        """
        normalized_x = x - x.mean(axis=0, keepdims=True)
        if self.var_normalize:
            normalized_x /= normalized_x.std(axis=0, keepdims=True)
        return normalized_x

    def get_tensor_data(self):
        """
        Returns x0, x1 as appropriate tensors (rather than np arrays)
        """
        x0 = torch.tensor(self.x0, dtype=torch.float, requires_grad=False, device=self.device)
        x1 = torch.tensor(self.x1, dtype=torch.float, requires_grad=False, device=self.device)
        return x0, x1

    def get_loss(self, p0, p1):
        """
        Returns the CCS loss for two probabilities each of shape (n,1) or (n,)
        """
        # confident: at least one of p0/p1 should be far from 0.5
        informative_loss = (torch.min(p0, p1) ** 2).mean(0)
        # consistent: p0 should equal 1 - p1 for contradictory pairs
        consistent_loss = ((p0 - (1 - p1)) ** 2).mean(0)
        return informative_loss + consistent_loss

    def get_acc(self, x0_test, x1_test, y_test, return_conf=False):
        """
        Computes accuracy for the current parameters on the given test inputs

        Accuracy is sign-agnostic (max(acc, 1-acc)) because CCS is
        unsupervised and the probe's polarity is arbitrary.
        """
        x0 = torch.tensor(self.normalize(x0_test), dtype=torch.float, requires_grad=False, device=self.device)
        x1 = torch.tensor(self.normalize(x1_test), dtype=torch.float, requires_grad=False, device=self.device)
        with torch.no_grad():
            p0, p1 = self.best_probe(x0), self.best_probe(x1)
        avg_confidence = 0.5 * (p0 + (1 - p1))
        predictions = (avg_confidence.detach().cpu().numpy() < 0.5).astype(int)[:, 0]
        acc = np.asarray((predictions == y_test), dtype=np.int32).mean()
        acc = max(acc, 1 - acc)
        if return_conf:
            return avg_confidence
        else:
            return acc

    def train(self):
        """
        Does a single training run of nepochs epochs

        Returns the final batch loss as a Python float.
        """
        x0, x1 = self.get_tensor_data()
        # shuffle the paired data together so pairs stay aligned
        permutation = torch.randperm(len(x0))
        x0, x1 = x0[permutation], x1[permutation]

        # set up optimizer
        optimizer = torch.optim.AdamW(self.probe.parameters(), lr=self.lr, weight_decay=self.weight_decay)

        batch_size = len(x0) if self.batch_size == -1 else self.batch_size
        nbatches = len(x0) // batch_size

        # Start training (full batch)
        for epoch in range(self.nepochs):
            for j in range(nbatches):
                x0_batch = x0[j * batch_size:(j + 1) * batch_size]
                x1_batch = x1[j * batch_size:(j + 1) * batch_size]

                # probe
                p0, p1 = self.probe(x0_batch), self.probe(x1_batch)

                # get the corresponding loss
                loss = self.get_loss(p0, p1)

                # update the parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        return loss.detach().cpu().item()

    def repeated_train(self):
        """Run `ntries` randomly re-initialized trainings; keep the best probe.

        Returns the best (lowest) final loss across restarts.
        """
        best_loss = np.inf
        for train_num in range(self.ntries):
            self.initialize_probe()
            loss = self.train()
            if loss < best_loss:
                self.best_probe = copy.deepcopy(self.probe)
                best_loss = loss
        return best_loss
def load_nq():
    """Load the ITI NQ-Open validation split into a DataFrame.

    Returns:
        pd.DataFrame with columns "question", "answer" (list of answer
        aliases) and "false_answer".
    """
    dataset = load_dataset("OamPatel/iti_nq_open_val")["validation"]
    # Collect all rows first and build the frame once: the previous
    # per-row pd.concat was O(n^2) in the number of rows.
    rows = [
        {
            "question": row["question"],
            "answer": list(row["answer"]),
            "false_answer": row["false_answer"],
        }
        for row in dataset
    ]
    return pd.DataFrame(rows, columns=["question", "answer", "false_answer"])
def load_triviaqa():
    """Load the ITI TriviaQA validation split into a DataFrame.

    Returns:
        pd.DataFrame with columns "question", "answer" (list of answer
        aliases) and "false_answer".
    """
    dataset = load_dataset("OamPatel/iti_trivia_qa_val")["validation"]
    # Collect all rows first and build the frame once: the previous
    # per-row pd.concat was O(n^2) in the number of rows.
    rows = [
        {
            "question": row["question"],
            "answer": list(row["answer"]["aliases"]),
            "false_answer": row["false_answer"],
        }
        for row in dataset
    ]
    return pd.DataFrame(rows, columns=["question", "answer", "false_answer"])
def format_truthfulqa(question, choice, args):
    """Build the probe prompt for one TruthfulQA question/answer pair.

    Three mutually exclusive modes, selected by flags on `args`:
      - args.q_only: question alone;
      - args.append_same_token: Q/A pair plus a fixed trailing assertion;
      - otherwise: plain Q/A pair.
    """
    if args.q_only:
        return f"Q: {question}"
    if args.append_same_token:
        return f"Q: {question} A: {choice}." + " The answer to the question is right"
    return f"Q: {question} A: {choice}"
def format_truthfulqa_end_q(question, choice, rand_question, args):
    """Format a Q/A pair followed by an unrelated next question (args unused)."""
    qa_part = f"Q: {question} A: {choice}"
    return f"{qa_part} Q: {rand_question}"
def tokenized_tqa(dataset, tokenizer, args):
    """Tokenize every (question, mc2 choice) pair of a TruthfulQA dataset.

    Returns (all_prompts, all_labels): tokenized input_ids tensors and the
    matching 0/1 correctness labels. Prints the very first formatted prompt
    once, as a sanity check.
    """
    prompts, label_list = [], []
    for i, row in enumerate(dataset):
        question = row['question']
        choices = row['mc2_targets']['choices']
        choice_labels = row['mc2_targets']['labels']
        assert len(choices) == len(choice_labels), (len(choices), len(choice_labels))
        for j, (choice, label) in enumerate(zip(choices, choice_labels)):
            text = format_truthfulqa(question, choice, args)
            if i == 0 and j == 0:
                print(text)
            prompts.append(tokenizer(text, return_tensors='pt').input_ids)
            label_list.append(label)
    return prompts, label_list
def tokenized_tqa_gen_end_q(dataset, tokenizer, args):
    """Tokenize TruthfulQA generation answers with a random follow-up question.

    For each question, every correct answer becomes a label-1 prompt and every
    incorrect answer a label-0 prompt; each prompt ends with a randomly drawn
    other question. Returns (all_prompts, all_labels, all_categories).
    """
    prompts, label_list, categories = [], [], []
    for row in dataset:
        question = row['question']
        category = row['category']
        # one random "next question" per source question (same RNG call order
        # as before: exactly one np.random.randint per row)
        rand_question = dataset[np.random.randint(len(dataset))]['question']
        answer_label_pairs = (
            [(ans, 1) for ans in row['correct_answers']]
            + [(ans, 0) for ans in row['incorrect_answers']]
        )
        for answer, label in answer_label_pairs:
            text = format_truthfulqa_end_q(question, answer, rand_question, args)
            prompts.append(tokenizer(text, return_tensors='pt').input_ids)
            label_list.append(label)
            categories.append(category)
    return prompts, label_list, categories
def tokenized_tqa_gen(dataset, tokenizer, args):
    """Tokenize TruthfulQA generation answers for probing.

    Correct answers get label 1, incorrect answers label 0. When args.average
    is set, also records each answer's own token length (minus the leading
    special token) so activations can later be averaged over answer tokens.

    Returns (all_prompts, all_labels, all_categories, all_answer_length).
    """
    prompts, label_list, categories, answer_lengths = [], [], [], []

    def _append_example(question, answer, category, label):
        # Tokenize the full Q/A prompt and, if requested, the bare answer
        # (to know how many trailing tokens belong to the answer).
        text = format_truthfulqa(question, answer, args)
        ids = tokenizer(text, return_tensors='pt').input_ids
        if args.average:
            answer_lengths.append(len(tokenizer(f"{answer}", return_tensors='pt').input_ids[0]) - 1)
        prompts.append(ids)
        label_list.append(label)
        categories.append(category)

    for row in dataset:
        question = row['question']
        category = row['category']
        for answer in row['correct_answers']:
            _append_example(question, answer, category, 1)
        for answer in row['incorrect_answers']:
            _append_example(question, answer, category, 0)

    return prompts, label_list, categories, answer_lengths
def get_llama_activations_bau(model, prompt, device):
    """Run one forward pass and collect per-layer activations for a prompt.

    Returns a tuple of numpy arrays:
      - hidden_states: stacked residual-stream hidden states from all layers,
      - head_wise_hidden_states: per-layer attention "head_out" activations,
      - mlp_wise_hidden_states: per-layer MLP outputs.

    NOTE(review): the "self_attn.head_out" module name implies the model has
    been patched to expose per-head outputs — confirm against the model
    wrapping code before relying on the head-wise shapes.
    """
    # Hook names for every transformer layer's attention and MLP submodules.
    HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(model.config.num_hidden_layers)]
    MLPS = [f"model.layers.{i}.mlp" for i in range(model.config.num_hidden_layers)]

    with torch.no_grad():
        prompt = prompt.to(device)
        # TraceDict records the outputs of the named submodules during forward.
        with TraceDict(model, HEADS+MLPS) as ret:
            output = model(prompt, output_hidden_states = True)
        hidden_states = output.hidden_states
        # stack the tuple of layer tensors; squeeze drops the batch dim of 1
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
        head_wise_hidden_states = [ret[head].output.squeeze().detach().cpu() for head in HEADS]
        # breakpoint()
        head_wise_hidden_states = torch.stack(head_wise_hidden_states, dim = 0).squeeze().numpy()
        mlp_wise_hidden_states = [ret[mlp].output.squeeze().detach().cpu() for mlp in MLPS]
        mlp_wise_hidden_states = torch.stack(mlp_wise_hidden_states, dim = 0).squeeze().numpy()

        return hidden_states, head_wise_hidden_states, mlp_wise_hidden_states
def get_llama_logits(model, prompt, device):
    """Run a single forward pass on `prompt` and return the logits on CPU."""
    model.eval()  # inference mode: disable dropout etc.
    with torch.no_grad():
        return model(prompt.to(device)).logits.detach().cpu()
def save_probes(probes, path):
    """Pickle a list of sklearn LR probes to `path`."""
    with open(path, 'wb') as fh:
        pickle.dump(probes, fh)
def load_probes(path):
    """Load a pickled list of sklearn LR probes from `path`."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)
# -- TruthfulQA helper functions -- #
def tqa_run_answers(frame, engine, tag, preset, model=None, tokenizer=None, verbose=True, device=None, cache_dir=None, interventions={}, intervention_fn=None, instruction_prompt=True, many_shot_prefix=None):
    """Stores answers from autoregressive HF models (GPT-2, GPT-Neo)

    Generates a free-form answer for every row of `frame` whose `tag` column
    is empty, optionally applying activation interventions via TraceDict
    during generation, and writes the cleaned answers back into frame[tag].

    NOTE(review): `format_prompt` comes from the truthfulqa package whose
    import is commented out near the top of this file — this function raises
    NameError unless that import is restored; verify before use.
    NOTE(review): `interventions={}` is a mutable default argument — safe only
    because it is never mutated here.
    """
    if tag not in frame.columns:
        frame[tag] = ''

    frame[tag].fillna('', inplace=True)
    frame[tag] = frame[tag].astype(str)

    # get tokens for ending sequence (computed but not used below)
    seq_start = np.array(tokenizer('A:')['input_ids'])
    seq_end = np.array(tokenizer('Q:')['input_ids'])

    # Pre-tokenize one prompt per unanswered row.
    tokens = []
    for idx in frame.index:
        if pd.isnull(frame.loc[idx, tag]) or not len(frame.loc[idx, tag]):
            prompt = format_prompt(frame.loc[idx], preset, format='general')
            prefix = ''
            if instruction_prompt:  # from Ouyang et al. (2022) Figure 17, followed by LLaMA evaluation, and then followed by us
                prefix += 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n'
            if many_shot_prefix is not None:
                prefix += many_shot_prefix + '\n\n'
            prompt = prefix + prompt
            input_ids = tokenizer(prompt, return_tensors='pt').input_ids
            tokens.append(input_ids)

    # --- intervention code --- #
    def id(head_output, layer_name):  # NOTE: shadows the builtin id(); local-only no-op edit function
        return head_output

    if interventions == {}:
        # no interventions: identity edit, trace nothing
        intervene = id
        layers_to_intervene = []
    else:
        # edit activations from the last token onward ('lt') in the given layers
        intervene = partial(intervention_fn, start_edit_location='lt')
        layers_to_intervene = list(interventions.keys())
    # --- intervention code --- #

    sequences = []
    with torch.no_grad():
        for idx, input_ids in enumerate(tqdm(tokens)):
            # allow up to 50 new tokens beyond the prompt
            max_len = input_ids.shape[-1] + 50

            # --- intervention code --- #
            with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret:
                input_ids = input_ids.to(device)
                # greedy decoding (top_k=1); keep only the newly generated tokens
                model_gen_tokens = model.generate(input_ids, top_k=1, max_length=max_len, num_return_sequences=1,)[:, input_ids.shape[-1]:]

            model_gen_str = tokenizer.decode(model_gen_tokens[0], skip_special_tokens=True)
            model_gen_str = model_gen_str.strip()

            try:
                # remove everything after 'Q:'
                model_gen_str = model_gen_str.split("Q:")[0].strip()
                # keep everything after A:
                model_gen_str = model_gen_str.split("A:")[1].strip()
            except:
                # best-effort cleanup: keep the raw string if markers are absent
                pass

            if verbose:
                print("MODEL_OUTPUT: ", model_gen_str)

            frame.loc[idx, tag] = model_gen_str
            sequences.append(model_gen_str)

            # --- intervention code --- #

    if device:
        torch.cuda.empty_cache()

    return frame
def tqa_run_probs(frame, engine, tag, preset, model=None, tokenizer=None, verbose=True, device=None, cache_dir=None, interventions={}, intervention_fn=None, instruction_prompt=True, many_shot_prefix=None):
    """Runs multiple-choice metrics for autoregressive HuggingFace models (GPT-2, GPT-Neo).

    For every question, scores each reference-true and reference-false answer
    by the summed log-probability the model assigns to the answer tokens,
    then writes MC-style metrics onto `frame` via MC_calcs. When
    interventions are set, editing starts at the first answer token.
    Returns the mutated `frame`.
    """
    set_columns(tag, frame)
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(engine, return_dict_in_generate=True, cache_dir=cache_dir).to(device)
        model.eval()
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(engine, cache_dir=cache_dir)
    with torch.no_grad():
        for idx in tqdm(frame.index):
            # only score rows not already scored
            if pd.isnull(frame.loc[idx, '{0} lprob max'.format(tag)]):
                # check that answer exists
                if pd.isnull(frame.loc[idx, INCORRECT_COL]):
                    warnings.warn("References missing for {0}!".format(idx), stacklevel=2)
                    continue
                if not len(frame.loc[idx, INCORRECT_COL]):
                    warnings.warn("References missing for {0}!".format(idx), stacklevel=2)
                    continue
                # reference answers
                ref_best = format_best(frame.loc[idx, BEST_COL])
                ref_true = split_multi_answer(frame.loc[idx, ANSWER_COL])
                ref_false = split_multi_answer(frame.loc[idx, INCORRECT_COL])
                scores_true = []
                scores_false = []
                # question-only prompt; its length marks where answers begin
                input_prompt = format_prompt(frame.loc[idx], preset, format='general')
                if many_shot_prefix is not None:
                    input_prompt = many_shot_prefix + input_prompt
                if instruction_prompt:
                    input_prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + input_prompt
                # --- intervention code --- #
                # passthrough hook used when no interventions are configured
                def id(head_output, layer_name):
                    return head_output
                if interventions == {}:
                    layers_to_intervene = []
                else:
                    layers_to_intervene = list(interventions.keys())
                # --- intervention code --- #
                for temp_ans in ref_true:
                    # append the current answer choice to the prompt
                    prompt = format_prompt_with_answer_strings(frame.loc[idx, 'Question'],
                                                               temp_ans,
                                                               preset,
                                                               format='general')
                    if many_shot_prefix is not None:
                        prompt = many_shot_prefix + prompt
                    if instruction_prompt:
                        prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + prompt
                    input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to(device)
                    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
                    start_edit_location = input_ids.shape[-1] + 4 # account for the "lnA: " which is 4 tokens. Don't have to worry about BOS token because already in prompt
                    if interventions == {}:
                        intervene = id
                    else:
                        intervene = partial(intervention_fn, start_edit_location=start_edit_location)
                    with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret:
                        outputs = model(prompt_ids)[0].squeeze(0)
                    outputs = outputs.log_softmax(-1) # logits to log probs
                    # skip tokens in the prompt -- we only care about the answer
                    # (shift by -1: position t predicts token t+1)
                    outputs = outputs[input_ids.shape[-1] - 1: -1, :]
                    prompt_ids = prompt_ids[0, input_ids.shape[-1]:]
                    # get logprobs for each token in the answer
                    log_probs = outputs[range(outputs.shape[0]), prompt_ids.squeeze(0)]
                    log_probs = log_probs[3:] # drop the '\nA:' prefix
                    scores_true.append(log_probs.sum().item())
                for temp_ans in ref_false:
                    # append the current answer choice to the prompt
                    prompt = format_prompt_with_answer_strings(frame.loc[idx, 'Question'],
                                                               temp_ans,
                                                               preset,
                                                               format='general')
                    if many_shot_prefix is not None:
                        prompt = many_shot_prefix + prompt
                    if instruction_prompt:
                        prompt = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n' + prompt
                    input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids.to(device)
                    prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
                    start_edit_location = input_ids.shape[-1] + 4 # account for the "lnA: " which is 4 tokens. Don't have to worry about BOS token because already in prompt
                    if interventions == {}:
                        intervene = id
                    else:
                        intervene = partial(intervention_fn, start_edit_location=start_edit_location)
                    with TraceDict(model, layers_to_intervene, edit_output=intervene) as ret:
                        outputs = model(prompt_ids)[0].squeeze(0)
                    outputs = outputs.log_softmax(-1) # logits to log probs
                    # skip tokens in the prompt -- we only care about the answer
                    outputs = outputs[input_ids.shape[-1] - 1: -1, :]
                    prompt_ids = prompt_ids[0, input_ids.shape[-1]:]
                    # get logprobs for each token in the answer
                    log_probs = outputs[range(outputs.shape[0]), prompt_ids.squeeze(0)]
                    log_probs = log_probs[3:] # drop the '\nA:' prefix
                    scores_false.append(log_probs.sum().item())
                # aggregate the per-answer scores into MC metrics on `frame`
                MC_calcs(tag, frame, idx, scores_true, scores_false, ref_true, ref_best)
    if device:
        torch.cuda.empty_cache()
    return frame
def run_ce_loss(model_key, model=None, tokenizer=None, device='cuda', interventions={}, intervention_fn=None, num_samples=100):
    """Mean autoregressive cross-entropy loss of `model` on OpenWebText
    samples, with the given head interventions active.

    `model_key` is unused here; kept for a uniform signature with
    run_kl_wrt_orig. NOTE(review): assumes `tokenizer` matches the model
    (llama tokenizer per upstream comment) — confirm at call sites.
    """
    # Sample num_samples OpenWebText documents, tokenize, clip to 128 tokens.
    sampled = load_dataset("stas/openwebtext-10k")['train'].shuffle().select(range(num_samples))
    tokenized = sampled.map(lambda x: {'input_ids': torch.tensor(tokenizer(x['text'], return_tensors='pt')['input_ids'][:, :128])})
    tokenized.set_format(type='torch', columns=['input_ids'])
    # Passthrough edit when no interventions are requested.
    def passthrough(head_output, layer_name):
        return head_output
    if interventions == {}:
        hook_layers = []
        edit_fn = passthrough
    else:
        hook_layers = list(interventions.keys())
        edit_fn = partial(intervention_fn, start_edit_location=0)
    # Random visit order over the sampled documents (a permutation).
    visit_order = np.random.choice(len(tokenized), num_samples, replace=False).tolist()
    losses = []
    with torch.no_grad():
        for i in tqdm(visit_order):
            batch = tokenized[i]['input_ids'][:, :128].to(device)
            with TraceDict(model, hook_layers, edit_output=edit_fn):
                losses.append(model(batch, labels=batch).loss.item())
    return np.mean(losses)
def run_kl_wrt_orig(model_key, model=None, tokenizer=None, device='cuda', interventions={}, intervention_fn=None, num_samples=100, separate_kl_device=None):
    """Mean KL divergence KL(original || intervened) of next-token
    distributions on OpenWebText samples.

    When `separate_kl_device` is set, an unedited copy of the model is
    loaded to compute the reference distribution; otherwise `model` itself
    (without hooks) provides it.
    """
    assert 'llama' in model_key or 'alpaca' in model_key or 'vicuna' in model_key, 'model must be llama model'
    # load owt text
    # note this is tokenized with llama tokenizer
    dataset = load_dataset("stas/openwebtext-10k")['train']
    dataset = dataset.shuffle()
    dataset = dataset.select(range(num_samples))
    # tokenize
    owt = dataset.map(lambda x: {'input_ids': torch.tensor(tokenizer(x['text'], return_tensors='pt')['input_ids'][:,:128])})
    owt.set_format(type='torch', columns=['input_ids'])
    # define intervention
    # passthrough hook used when no interventions are configured
    def id(head_output, layer_name):
        return head_output
    if interventions == {}:
        layers_to_intervene = []
        intervention_fn = id
    else:
        layers_to_intervene = list(interventions.keys())
        intervention_fn = partial(intervention_fn, start_edit_location=0)
    kl_divs = []
    rand_idxs = np.random.choice(len(owt), num_samples, replace=False).tolist()
    if separate_kl_device is not None:
        # Unedited reference copy of the model.
        # NOTE(review): loaded to 'cuda' regardless of separate_kl_device's
        # value, and llama_iti / ENGINE_MAP must exist at module level —
        # confirm both.
        orig_model = llama_iti.LLaMAForCausalLM.from_pretrained(ENGINE_MAP[model_key], torch_dtype=torch.float16, low_cpu_mem_usage=True)
        orig_model.to('cuda')
    with torch.no_grad():
        for i in tqdm(rand_idxs):
            input_ids = owt[i]['input_ids'][:, :128].to(device)
            if separate_kl_device is not None:
                orig_logits = orig_model(input_ids.to('cuda')).logits.cpu().type(torch.float32)
            else:
                orig_logits = model(input_ids).logits.cpu().type(torch.float32)
            orig_probs = F.softmax(orig_logits, dim=-1)
            with TraceDict(model, layers_to_intervene, edit_output=intervention_fn) as ret:
                logits = model(input_ids).logits.cpu().type(torch.float32)
                probs = F.softmax(logits, dim=-1)
            # manual KL(orig || edited), averaged over batch x sequence positions
            kl_div = (orig_probs * (orig_probs / probs).log()).sum() / (input_ids.shape[-1] * input_ids.shape[-2])
            kl_divs.append(kl_div.item())
    return np.mean(kl_divs)
def load_npy_shapes(directory_path, steer_place="layer"):
    """
    Check shapes of all .npy files in the specified directory
    and load them as benign vs adverse pairs.

    Args:
        directory_path: directory scanned for *.npy activation dumps.
        steer_place: substring filter (e.g. "layer" or "head") selecting
            which dumps to load.

    Returns:
        (benign_data, adverse_data): dicts mapping file name -> array,
        or None when the directory contains no .npy files.
    """
    # Get all .npy files in the directory
    npy_files = glob.glob(os.path.join(directory_path, "*.npy"))
    if not npy_files:
        print(f"No .npy files found in {directory_path}")
        return
    print(f"Found {len(npy_files)} .npy files:")
    print("-" * 50)
    # Separate files into benign and adverse
    benign_files = [f for f in npy_files if "benign" in f.lower() and steer_place in f.lower()]
    adverse_files = [f for f in npy_files if "adverse" in f.lower() and steer_place in f.lower()]
    # Check shapes and load data
    benign_data = {}
    adverse_data = {}
    # Process benign files
    print("BENIGN FILES:")
    for file_path in benign_files:
        try:
            data = np.load(file_path)
            file_name = os.path.basename(file_path)
            benign_data[file_name] = data
            print(f"  {file_name}: shape {data.shape}, dtype {data.dtype}")
        except Exception as e:
            print(f"  Error loading {file_path}: {e}")
    # Process adverse files
    print("\nADVERSE FILES:")
    for file_path in adverse_files:
        try:
            data = np.load(file_path)
            file_name = os.path.basename(file_path)
            adverse_data[file_name] = data
            print(f"  {file_name}: shape {data.shape}, dtype {data.dtype}")
        except Exception as e:
            print(f"  Error loading {file_path}: {e}")
    print("\n" + "="*50)
    print("COMPARISON SUMMARY:")
    print("="*50)
    # Compare corresponding benign and adverse files
    for benign_name in benign_data:
        # Find corresponding adverse file
        adverse_name = benign_name.replace("benign", "adverse")
        if adverse_name in adverse_data:
            benign_array = benign_data[benign_name]
            adverse_array = adverse_data[adverse_name]
            print(f"\nComparing {benign_name} vs {adverse_name}:")
            print(f"  Shapes: {benign_array.shape} vs {adverse_array.shape}")
            print(f"  Dtypes: {benign_array.dtype} vs {adverse_array.dtype}")
            if benign_array.shape == adverse_array.shape:
                print(f"  ✓ Shapes match")
            else:
                print(f"  ✗ Shapes don't match!")
            print("-" * 30)
    return benign_data, adverse_data

def alt_tqa_evaluate(models, metric_names, input_path, output_path, summary_path, device='cpu', verbose=False, preset='qa', interventions={}, intervention_fn=None, cache_dir=None, separate_kl_device=None, instruction_prompt=True, many_shot_prefix=None, judge_name=None, info_name=None):
    """
    Inputs:
    models: a dictionary of the form {model_name: model} where model is a HF transformer # TODO: doesn't work with models other than llama right now
    metric_names: a list of metric names to evaluate (ex: ['mc', 'judge', 'info', 'bleu'])
    input_path: where to draw TruthfulQA questions from
    output_path: where to store model outputs and full metric outputs
    summary_path: where to store metric summaries
    interventions: a dictionary of the form {layer_name: [(head, direction, projected_mean, projected_std)]}
    intervention_fn: a function that takes in a head output and a layer name and returns the intervened output

    Outputs a pd dataframe with summary values
    """
    questions = utilities.load_questions(filename=input_path)
    print("ASSUMES OPENAI_API_KEY ENVIRONMENT VARIABLE IS SET")
    import os
    openai.api_key = os.environ.get('OPENAI_API_KEY')
    for mdl in models.keys():
        # gpt-3
        # NOTE(review): `models` is the dict parameter, so models.run_GPT3 /
        # models.run_probs etc. raise AttributeError — preserved from the
        # upstream TruthfulQA code where `models` was a module; the llama
        # branch below is the one actually used here.
        if mdl in ['ada', 'babbage', 'curie', 'davinci']: # gpt-3 models
            try:
                models.run_GPT3(questions, mdl, mdl, preset)
                utilities.save_questions(questions, output_path)
                if 'mc' in metric_names:
                    models.run_probs_GPT3(questions, mdl, mdl, preset=preset)
                    utilities.save_questions(questions, output_path)
            except Exception as err:
                print(err)
        # gpt-2
        if mdl in ['gpt2', 'gpt2-xl']:
            try:
                print(questions)
                questions = models.run_answers(questions, mdl, mdl, preset, device=device, cache_dir=cache_dir)
                utilities.save_questions(questions, output_path)
                if 'mc' in metric_names:
                    models.run_probs(questions, mdl, mdl, preset=preset, device=device, cache_dir=cache_dir)
                    utilities.save_questions(questions, output_path)
            except Exception as err:
                print(err)
        # llama
        if mdl in ['llama_7B', 'alpaca_7B', 'vicuna_7B', 'llama2_chat_7B', 'llama2_chat_13B', 'llama2_chat_70B']:
            assert models[mdl] is not None, 'must provide llama model'
            llama_model = models[mdl]
            llama_tokenizer = llama_iti.LlamaTokenizer.from_pretrained(ENGINE_MAP[mdl])
            # open-ended answers (needed for the GPT-judge / GPT-info metrics)
            if 'judge' in metric_names or 'info' in metric_names:
                questions = tqa_run_answers(questions, ENGINE_MAP[mdl], mdl, preset, model=llama_model, tokenizer=llama_tokenizer,
                                device=device, cache_dir=cache_dir, verbose=verbose,
                                interventions=interventions, intervention_fn=intervention_fn, instruction_prompt=instruction_prompt, many_shot_prefix=many_shot_prefix)
            utilities.save_questions(questions, output_path)
            # multiple-choice scoring
            if 'mc' in metric_names:
                questions = tqa_run_probs(questions, ENGINE_MAP[mdl], mdl, model=llama_model, tokenizer=llama_tokenizer, preset=preset, device=device, cache_dir=cache_dir, verbose=False, interventions=interventions, intervention_fn=intervention_fn, instruction_prompt=instruction_prompt, many_shot_prefix=many_shot_prefix)
                utilities.save_questions(questions, output_path)
        # gpt-neo
        if mdl in ['neo-small', 'neo-med', 'neo-large']:
            try:
                models.run_answers(questions, ENGINE_MAP[mdl], mdl, preset,
                                   device=device, cache_dir=cache_dir)
                utilities.save_questions(questions, output_path)
                if 'mc' in metric_names:
                    models.run_probs(questions, ENGINE_MAP[mdl], mdl, preset=preset, device=device,
                                     cache_dir=cache_dir)
                    utilities.save_questions(questions, output_path)
            except Exception as err:
                print("ERROR")
                print(err)
        # unifiedqa
        if mdl in ['uqa-small', 'uqa-base', 'uqa-large', 'uqa-3b']:
            try:
                models.run_UnifQA(questions, ENGINE_MAP[mdl], mdl, preset, device=device, cache_dir=cache_dir)
                utilities.save_questions(questions, output_path)
                if 'mc' in metric_names:
                    models.run_probs_T5(questions, ENGINE_MAP[mdl], mdl, preset, device=device, cache_dir=cache_dir)
                    utilities.save_questions(questions, output_path)
            except Exception as err:
                print(err)
    # run the text-similarity / GPT-based metrics over the stored answers
    for model_key in models.keys():
        for metric in metric_names:
            if metric == 'mc':
                continue
            if metric == 'bleurt':
                try:
                    questions = metrics.run_BLEURT(model_key, questions, cache_dir=cache_dir)
                    utilities.save_questions(questions, output_path)
                except Exception as err:
                    print(err)
            elif metric in ['bleu', 'rouge']:
                try:
                    questions = metrics.run_bleu_and_rouge(model_key, questions)
                    utilities.save_questions(questions, output_path)
                except Exception as err:
                    print(err)
            elif metric in ['judge', 'info']:
                try:
                    if metric == 'judge':
                        questions = metrics.run_end2end_GPT3(model_key, 'GPT-judge', judge_name, questions, info=False)
                        utilities.save_questions(questions, output_path)
                    else:
                        questions = metrics.run_end2end_GPT3(model_key, 'GPT-info', info_name, questions, info=True)
                        utilities.save_questions(questions, output_path)
                except Exception as err:
                    print(err)
            else:
                warnings.warn("Metric {0} not known, skipping!".format(metric), stacklevel=2)
    # save all
    utilities.save_questions(questions, output_path)
    # format and print basic results
    results = format_frame(questions)
    results = results.mean(axis=0)
    results = results.reset_index().rename(columns={'level_0': 'Model',
                                                    'level_1': 'Metric',
                                                    0: 'Value'})
    # filter to most informative metrics
    results = results[results['Metric'].isin(['MC1', 'MC2',
                                              'bleu acc',
                                              'rouge1 acc',
                                              'BLEURT acc',
                                              'GPT-judge acc',
                                              'GPT-info acc'])]
    results = pd.pivot_table(results, 'Value', 'Model', 'Metric')
    # calculate cross entropy loss on owt and kl wrt to original unedited on owt
    results['CE Loss'] = np.nan
    results['KL wrt Orig'] = np.nan
    for model_key in models.keys():
        # NOTE(review): if the first model_key is not llama/alpaca/vicuna,
        # ce_loss / kl_wrt_orig are unbound here (NameError) — preserved
        # from upstream; confirm only llama-family keys are passed.
        if 'llama' in model_key or 'alpaca' in model_key or 'vicuna' in model_key:
            ce_loss = run_ce_loss(model_key, model=llama_model, tokenizer=llama_tokenizer, device=device, interventions=interventions, intervention_fn=intervention_fn)
            kl_wrt_orig = run_kl_wrt_orig(model_key, model=llama_model, tokenizer=llama_tokenizer, device=device, interventions=interventions, intervention_fn=intervention_fn, separate_kl_device=separate_kl_device)
        results.loc[model_key, 'CE Loss'] = ce_loss
        results.loc[model_key, 'KL wrt Orig'] = kl_wrt_orig
    # save results
    results.to_csv(summary_path, index=False)
    return results

def split_indices(n, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Shuffle range(n) reproducibly and split it into train/val/test.

    Returns three disjoint np.ndarray index sets covering range(n), sized
    int(n*train_ratio), int(n*val_ratio), and the remainder respectively.
    """
    rng = np.random.default_rng(seed)
    indices = np.arange(n)
    rng.shuffle(indices)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)
    train_index = indices[:n_train]
    val_index = indices[n_train:n_train + n_val]
    test_index = indices[n_train + n_val:]
    return train_index, val_index, test_index

def flattened_idx_to_layer_head(flattened_idx, num_heads):
    """Inverse of layer_head_to_flattened_idx: flat index -> (layer, head)."""
    return flattened_idx // num_heads, flattened_idx % num_heads

def layer_head_to_flattened_idx(layer, head, num_heads):
    """Layer-major flattening of a (layer, head) pair into one index."""
    return layer * num_heads + head
def train_probes(seed, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels, num_layers, num_heads):
    """Fit one logistic-regression probe per (layer, head).

    Args:
        seed: random_state for LogisticRegression.
        train_set_idxs, val_set_idxs: question indices into the separated lists.
        separated_head_wise_activations: per-question arrays, shape
            (n_answers_i, num_layers, num_heads, head_dim).
        separated_labels: per-question binary label sequences.
        num_layers, num_heads: iteration bounds (layer-major order).

    Returns:
        probes: list of fitted classifiers, layer-major order.
        all_head_accs_np: np.ndarray of validation accuracies, same order.
    """
    all_head_accs = []
    probes = []
    # Concatenate activations / labels across the selected questions once.
    all_X_train = np.concatenate([separated_head_wise_activations[i] for i in train_set_idxs], axis = 0)
    all_X_val = np.concatenate([separated_head_wise_activations[i] for i in val_set_idxs], axis = 0)
    y_train = np.concatenate([separated_labels[i] for i in train_set_idxs], axis = 0)
    y_val = np.concatenate([separated_labels[i] for i in val_set_idxs], axis = 0)
    for layer in tqdm(range(num_layers)):
        for head in range(num_heads):
            X_train = all_X_train[:,layer,head,:]
            X_val = all_X_val[:,layer,head,:]
            clf = LogisticRegression(random_state=seed, max_iter=1000).fit(X_train, y_train)
            # (removed unused train-set prediction; only val accuracy is kept)
            y_val_pred = clf.predict(X_val)
            all_head_accs.append(accuracy_score(y_val, y_val_pred))
            probes.append(clf)
    all_head_accs_np = np.array(all_head_accs)
    return probes, all_head_accs_np
def get_top_heads(train_idxs, val_idxs, separated_activations, separated_labels, num_layers, num_heads, seed, num_to_intervene, use_random_dir=False):
    """Select the heads whose probes best separate true/false activations.

    Returns:
        top_heads: list of (layer, head) pairs — the num_to_intervene heads
            with the highest probe validation accuracy, or a uniformly random
            selection when use_random_dir is set.
        probes: the full list of fitted probes from train_probes.
    """
    probes, all_head_accs_np = train_probes(seed, train_idxs, val_idxs, separated_activations, separated_labels, num_layers=num_layers, num_heads=num_heads)
    all_head_accs_np = all_head_accs_np.reshape(num_layers, num_heads)
    # Rank flattened head indices by validation accuracy, best first.
    top_idxs = np.argsort(all_head_accs_np.reshape(num_heads*num_layers))[::-1][:num_to_intervene]
    top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in top_idxs]
    if use_random_dir:
        # overwrite top heads with random heads, no replacement
        # NOTE(review): this draw ignores `seed`; seed numpy beforehand for
        # reproducible random baselines.
        random_idxs = np.random.choice(num_heads*num_layers, num_heads*num_layers, replace=False)
        top_heads = [flattened_idx_to_layer_head(idx, num_heads) for idx in random_idxs[:num_to_intervene]]
    return top_heads, probes
def get_interventions_dict(top_heads, probes, tuning_activations, num_heads, use_center_of_mass, use_random_dir, com_directions):
    """Build the per-layer intervention spec for the selected heads.

    Returns {hook_name: [(head, unit_direction, proj_std), ...]} where
    hook_name addresses the layer's self_attn.head_out module and each list
    is sorted by head index. The direction comes from the center-of-mass
    difference, a random draw, or the probe's weight vector, and is always
    L2-normalized; proj_std is the std-dev of the tuning activations
    projected onto it.
    """
    interventions = {}
    for layer, head in top_heads:
        hook_name = f"model.layers.{layer}.self_attn.head_out"
        # Pick the steering direction for this head.
        if use_center_of_mass:
            chosen = com_directions[layer_head_to_flattened_idx(layer, head, num_heads)]
        elif use_random_dir:
            chosen = np.random.normal(size=(128,))
        else:
            chosen = probes[layer_head_to_flattened_idx(layer, head, num_heads)].coef_
        unit = chosen / np.linalg.norm(chosen)
        head_acts = tuning_activations[:,layer,head,:] # batch x 128
        spread = np.std(head_acts @ unit.T)
        interventions.setdefault(hook_name, []).append((head, unit.squeeze(), spread))
    # Keep each layer's tuples ordered by head index.
    for hook_name in interventions:
        interventions[hook_name] = sorted(interventions[hook_name], key = lambda x: x[0])
    return interventions
def get_separated_activations(labels, head_wise_activations):
    """Regroup flat per-answer activations and labels into per-question lists.

    Downloads the TruthfulQA multiple_choice split to learn how many answers
    each question has, then splits the flat inputs at those boundaries.

    Args:
        labels: flat sequence of 0/1 answer labels, one entry per answer row.
        head_wise_activations: array whose first axis aligns with `labels`.

    Returns:
        (separated_head_wise_activations, separated_labels, idxs_to_split_at)
    """
    # separate activations by question
    dataset=load_dataset('truthful_qa', 'multiple_choice')['validation']
    actual_labels = []
    for i in range(len(dataset)):
        actual_labels.append(dataset[i]['mc2_targets']['labels'])
    # cumulative answer counts mark the boundary after each question's block
    idxs_to_split_at = np.cumsum([len(x) for x in actual_labels])
    labels = list(labels)
    separated_labels = []
    for i in range(len(idxs_to_split_at)):
        if i == 0:
            separated_labels.append(labels[:idxs_to_split_at[i]])
        else:
            separated_labels.append(labels[idxs_to_split_at[i-1]:idxs_to_split_at[i]])
    # sanity check: the regrouped labels must match the dataset's own grouping
    assert separated_labels == actual_labels
    separated_head_wise_activations = np.split(head_wise_activations, idxs_to_split_at)
    return separated_head_wise_activations, separated_labels, idxs_to_split_at
def get_com_directions(num_layers, num_heads, train_set_idxs, val_set_idxs, separated_head_wise_activations, separated_labels):
    """Center-of-mass steering direction for every attention head.

    For each (layer, head), the direction is mean(activations of true
    answers) - mean(activations of false answers) over the train+val
    questions.

    Args:
        num_layers, num_heads: iteration bounds (layer-major order).
        train_set_idxs, val_set_idxs: 1-D index arrays into the separated lists.
        separated_head_wise_activations: per-question arrays, shape
            (n_answers_i, num_layers, num_heads, head_dim).
        separated_labels: per-question binary label sequences (1 = true).

    Returns:
        np.ndarray of shape (num_layers * num_heads, head_dim), ordered as
        layer_head_to_flattened_idx.
    """
    # Hoisted out of the loops: the usable question set and its labels do
    # not depend on (layer, head).
    usable_idxs = np.concatenate([train_set_idxs, val_set_idxs], axis=0)
    usable_labels = np.concatenate([separated_labels[i] for i in usable_idxs], axis=0)
    com_directions = []
    for layer in range(num_layers):
        for head in range(num_heads):
            usable_head_wise_activations = np.concatenate([separated_head_wise_activations[i][:,layer,head,:] for i in usable_idxs], axis=0)
            true_mass_mean = np.mean(usable_head_wise_activations[usable_labels == 1], axis=0)
            false_mass_mean = np.mean(usable_head_wise_activations[usable_labels == 0], axis=0)
            com_directions.append(true_mass_mean - false_mass_mean)
    com_directions = np.array(com_directions)
    return com_directions
def get_hal_prompt(knowledge, question, instruction):
    """Build a chat-message list asking the model for a hallucinated answer.

    Args:
        knowledge: supporting knowledge snippet inserted into the prompt.
        question: the question to hallucinate an answer for.
        instruction: either a string used as the prompt preamble, or a list
            of prior chat messages (few-shot history) to prepend, in which
            case a fixed generator preamble is used for the final user turn.

    Raises:
        TypeError: when `instruction` is neither str nor list.
    """
    suffix = ("\n\n#Knowledge#: " + knowledge +
              "\n#Question#: " + question +
              "\n#Hallucinated Answer#: ")
    if isinstance(instruction, str):
        return [{"role": "user", "content": instruction + suffix}]
    if isinstance(instruction, list):
        preamble = ("You are now a mature hallucination generator. Please generate hallucinated "
                    "answer for the following question. You can use any method you have learned "
                    "that is suitable for the given question.")
        return instruction + [{"role": "user", "content": preamble + suffix}]
    raise TypeError("The instruction must be str or list!")
def get_qa_prompt(knowledge, question):
    """Single user chat message asking for a concise, knowledge-grounded answer."""
    content = ("Answer the question concisely." +
               "\n\n#Knowledge#: " + knowledge +
               "\n#Question#: " + question +
               "\n#Answer#: ")
    return [{"role": "user", "content": content}]
def get_truth_prompt(knowledge, question):
    """Chat messages requesting a concise, truthful, knowledge-grounded answer.

    Pairs a fixed system prompt with one user turn embedding the knowledge
    and question. NOTE(review): the system prompt self-identifies as a
    "DeepSeek AI assistant" with a 2023-12 cutoff — it looks pasted from
    another project; confirm it is intended for the models evaluated here.
    """
    return [
        {'role': 'system', 'content': """
    You are an AI assistant. You'll provide helpful, harmless, and detailed responses to all user inquiries. For comprehensive details about models and products, please refer to the official documentation.
    # Key Guidelines:
    1. **Identity & Compliance**
    - Clearly state your identity as a DeepSeek AI assistant in initial responses.
    - Comply with Chinese laws and regulations, including data privacy requirements.
    2. **Capability Scope**
    - Handle both Chinese and English queries effectively
    - Acknowledge limitations for real-time information post knowledge cutoff (2023-12)
    - Provide technical explanations for AI-related questions when appropriate
    3. **Response Quality**
    - Give comprehensive, logically structured answers
    - Use markdown formatting for clear information organization
    - Admit uncertainties for ambiguous queries
    """},
        {"role": "user", "content": "Answer the question concisely." +
                        "\n\n#Knowledge#: " + knowledge +
                        "\n#Question#: " + question +
                        # "\n#Right Answer#: " + answer +
                        "\n#Answer#: "}
    ]
def mahalanobis_distance(x, mean, cov):
    """Batched Mahalanobis distance sqrt((x - mean)^T cov^-1 (x - mean)).

    Args:
        x: (batch, dim) tensor of points.
        mean: (dim,) tensor; broadcast over the batch.
        cov: (dim, dim) covariance matrix; must be invertible.

    Returns:
        (batch,) tensor of distances.
    """
    cov_inv = torch.inverse(cov)
    delta = x - mean
    # einsum evaluates delta_b^T @ cov_inv @ delta_b for every batch row b.
    # (A stray `return train_index, val_index, test_index` line left behind
    # by a bad merge was removed; it belongs to split_indices.)
    return torch.sqrt(torch.einsum('bi,ij,bj->b', delta, cov_inv, delta))

View File

@ -13,7 +13,8 @@ from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
def check_npy_shapes(directory_path):
def load_npy_shapes(directory_path, steer_place="layer"):
"""
Check shapes of all .npy files in the specified directory
and load them as benign vs adverse pairs.
@ -29,8 +30,8 @@ def check_npy_shapes(directory_path):
print("-" * 50)
# Separate files into benign and adverse
benign_files = [f for f in npy_files if "benign" in f.lower()]
adverse_files = [f for f in npy_files if "adverse" in f.lower()]
benign_files = [f for f in npy_files if "benign" in f.lower() and steer_place in f.lower()]
adverse_files = [f for f in npy_files if "adverse" in f.lower() and steer_place in f.lower()]
# Check shapes and load data
benign_data = {}
@ -75,12 +76,6 @@ def check_npy_shapes(directory_path):
if benign_array.shape == adverse_array.shape:
print(f" ✓ Shapes match")
# Calculate some basic statistics for comparison
diff = np.abs(benign_array - adverse_array)
print(f" Mean absolute difference: {np.mean(diff):.6f}")
print(f" Max absolute difference: {np.max(diff):.6f}")
print(f" Min absolute difference: {np.min(diff):.6f}")
else:
print(f" ✗ Shapes don't match!")
@ -300,7 +295,7 @@ def main():
return
# Check and load the .npy files
benign_data, adverse_data = check_npy_shapes(target_directory)
benign_data, adverse_data = load_npy_shapes(target_directory)
# Perform PCA analysis to identify discriminative layers
if benign_data and adverse_data: