124 lines
4.0 KiB
Python
124 lines
4.0 KiB
Python
import os
|
|
import sys
|
|
sys.path.insert(0, "TruthfulQA")
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
import llama_iti
|
|
from datasets import load_dataset
|
|
from tqdm import tqdm
|
|
import numpy as np
|
|
import llama_iti
|
|
import pandas as pd
|
|
import warnings
|
|
from einops import rearrange
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
from baukit import Trace, TraceDict
|
|
import sklearn
|
|
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
|
from sklearn.linear_model import LogisticRegression
|
|
import pickle
|
|
from functools import partial
|
|
import glob
|
|
|
|
# from truthfulqa import utilities, models, metrics
|
|
# import openai
|
|
# from truthfulqa.configs import BEST_COL, ANSWER_COL, INCORRECT_COL
|
|
import copy
|
|
|
|
# Mapping from the short model aliases used in this repo to the
# corresponding Hugging Face Hub model identifiers passed to
# AutoTokenizer/AutoModelForCausalLM.from_pretrained.
ENGINE_MAP = {
    'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
    'alpaca_7B': 'circulus/alpaca-7b',
    'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
    'llama2_chat_7B': 'meta-llama/Llama-2-7b-chat-hf',
    'llama2_chat_13B': 'meta-llama/Llama-2-13b-chat-hf',
    'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
}
|
|
def load_npy_shapes(directory_path, steer_place="layer"):
    """
    Check shapes of all .npy files in the specified directory
    and load them as benign vs adverse pairs.

    Files are selected when their (lowercased) name contains both the
    "benign"/"adverse" tag and the ``steer_place`` substring.  Each benign
    file is paired with its adverse counterpart by replacing "benign" with
    "adverse" in the basename, and their shapes/dtypes are compared.

    Parameters
    ----------
    directory_path : str
        Directory scanned (non-recursively) for ``*.npy`` files.
    steer_place : str, optional
        Substring a filename must contain to be included (default "layer").

    Returns
    -------
    tuple[dict, dict]
        ``(benign_data, adverse_data)`` mapping basename -> loaded array.
        Both dicts are empty when the directory holds no .npy files.
    """
    npy_files = glob.glob(os.path.join(directory_path, "*.npy"))

    if not npy_files:
        print(f"No .npy files found in {directory_path}")
        # Keep the return type consistent with the success path so callers
        # can always unpack two dicts (previously this returned None,
        # which crashed any caller doing `b, a = load_npy_shapes(...)`).
        return {}, {}

    print(f"Found {len(npy_files)} .npy files:")
    print("-" * 50)

    # Partition candidates by the benign/adverse tag in the filename.
    benign_files = [f for f in npy_files
                    if "benign" in f.lower() and steer_place in f.lower()]
    adverse_files = [f for f in npy_files
                     if "adverse" in f.lower() and steer_place in f.lower()]

    print("BENIGN FILES:")
    benign_data = _load_npy_group(benign_files)

    print("\nADVERSE FILES:")
    adverse_data = _load_npy_group(adverse_files)

    print("\n" + "="*50)
    print("COMPARISON SUMMARY:")
    print("="*50)

    # Compare each benign file with its adverse counterpart (matched by
    # substituting "benign" -> "adverse" in the basename).
    for benign_name, benign_array in benign_data.items():
        adverse_name = benign_name.replace("benign", "adverse")
        if adverse_name not in adverse_data:
            continue
        adverse_array = adverse_data[adverse_name]

        print(f"\nComparing {benign_name} vs {adverse_name}:")
        print(f"  Shapes: {benign_array.shape} vs {adverse_array.shape}")
        print(f"  Dtypes: {benign_array.dtype} vs {adverse_array.dtype}")

        if benign_array.shape == adverse_array.shape:
            print(f"  ✓ Shapes match")
        else:
            print(f"  ✗ Shapes don't match!")

        print("-" * 30)

    return benign_data, adverse_data


def _load_npy_group(file_paths):
    """Load each .npy file into ``{basename: array}``, printing shape/dtype.

    Files that fail to load are reported and skipped rather than raising,
    matching the original best-effort behavior.
    """
    loaded = {}
    for file_path in file_paths:
        try:
            data = np.load(file_path)
        except Exception as e:
            print(f"  Error loading {file_path}: {e}")
            continue
        file_name = os.path.basename(file_path)
        loaded[file_name] = data
        print(f"  {file_name}: shape {data.shape}, dtype {data.dtype}")
    return loaded
|
|
|
|
|
|
|
|
def split_indices(n, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Randomly partition ``range(n)`` into train/val/test index arrays.

    The train split takes ``int(n * train_ratio)`` indices, validation
    takes ``int(n * val_ratio)``, and whatever remains goes to test.
    A seeded generator makes the split reproducible.

    Returns a tuple ``(train_index, val_index, test_index)`` of 1-D
    integer arrays.
    """
    # Generator.permutation(n) shuffles np.arange(n) with the same
    # algorithm as rng.shuffle, so the split is seed-stable.
    perm = np.random.default_rng(seed).permutation(n)

    train_end = int(n * train_ratio)
    val_end = train_end + int(n * val_ratio)

    train_index, val_index, test_index = np.split(perm, [train_end, val_end])
    return train_index, val_index, test_index