haloscope/utils.py

109 lines
3.6 KiB
Python

import os
import sys
sys.path.insert(0, "TruthfulQA")
import torch
import torch.nn as nn
import torch.nn.functional as F
import llama_iti
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import llama_iti
import pandas as pd
import warnings
from einops import rearrange
from transformers import AutoTokenizer, AutoModelForCausalLM
from baukit import Trace, TraceDict
import sklearn
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import pickle
from functools import partial
import glob
# from truthfulqa import utilities, models, metrics
# import openai
# from truthfulqa.configs import BEST_COL, ANSWER_COL, INCORRECT_COL
import copy
# Short model alias -> model identifier string.
# NOTE(review): the values look like Hugging Face Hub repo ids; where they are
# consumed is outside this view — confirm against the callers.
ENGINE_MAP = {
    "llama_7B": "baffo32/decapoda-research-llama-7B-hf",
    "alpaca_7B": "circulus/alpaca-7b",
    "vicuna_7B": "AlekseyKorshuk/vicuna-7b",
    "llama2_chat_7B": "meta-llama/Llama-2-7b-chat-hf",
    "llama2_chat_13B": "meta-llama/Llama-2-13b-chat-hf",
    "llama2_chat_70B": "meta-llama/Llama-2-70b-chat-hf",
}
def _load_npy_group(file_paths):
    """Load each ``.npy`` file in *file_paths*, printing its shape and dtype.

    Returns a dict mapping file base name -> loaded array. A file that fails
    to load is reported and skipped, so one bad file does not abort the scan.
    """
    loaded = {}
    for file_path in file_paths:
        try:
            data = np.load(file_path)
        except Exception as e:  # deliberate best-effort: report and continue
            print(f" Error loading {file_path}: {e}")
            continue
        file_name = os.path.basename(file_path)
        loaded[file_name] = data
        print(f" {file_name}: shape {data.shape}, dtype {data.dtype}")
    return loaded


def load_npy_shapes(directory_path, steer_place="layer"):
    """Load benign/adverse ``.npy`` files from a directory and report shapes.

    Files directly inside *directory_path* whose base name contains
    *steer_place* are split into a "benign" group and an "adverse" group
    according to which of those words the base name contains. All matching is
    case-insensitive and looks at the file name only, never at the directory
    part of the path. Each file is loaded with ``np.load`` and its shape and
    dtype are printed; then every benign file is compared with the adverse
    file whose name is obtained by replacing "benign" with "adverse".

    Args:
        directory_path: Directory to scan for ``*.npy`` files (non-recursive).
        steer_place: Substring that a file name must contain to be considered.

    Returns:
        A tuple ``(benign_data, adverse_data)`` of dicts mapping file base
        name -> loaded array. Both dicts are empty when the directory holds
        no ``.npy`` files, so callers can always unpack the result.
    """
    # Sorted so the printed report is deterministic; glob order is OS-dependent.
    npy_files = sorted(glob.glob(os.path.join(directory_path, "*.npy")))
    if not npy_files:
        print(f"No .npy files found in {directory_path}")
        return {}, {}

    print(f"Found {len(npy_files)} .npy files:")
    print("-" * 50)

    # Classify on the lower-cased base name only: a directory called e.g.
    # "benign_runs" must not make every file inside it count as benign.
    place = steer_place.lower()
    named = [(os.path.basename(path).lower(), path) for path in npy_files]
    benign_files = [
        path for name, path in named if "benign" in name and place in name
    ]
    adverse_files = [
        path for name, path in named if "adverse" in name and place in name
    ]

    print("BENIGN FILES:")
    benign_data = _load_npy_group(benign_files)

    print("\nADVERSE FILES:")
    adverse_data = _load_npy_group(adverse_files)

    print("\n" + "=" * 50)
    print("COMPARISON SUMMARY:")
    print("=" * 50)

    # Case-insensitive fallback index, because the filters above are
    # case-insensitive too ("Benign_x.npy" should still pair up).
    adverse_by_lower = {name.lower(): name for name in adverse_data}

    for benign_name, benign_array in benign_data.items():
        # Prefer the exact counterpart name; fall back to a case-insensitive one.
        adverse_name = benign_name.replace("benign", "adverse")
        if adverse_name not in adverse_data:
            adverse_name = adverse_by_lower.get(
                benign_name.lower().replace("benign", "adverse")
            )
        if adverse_name is None:
            continue

        adverse_array = adverse_data[adverse_name]
        print(f"\nComparing {benign_name} vs {adverse_name}:")
        print(f" Shapes: {benign_array.shape} vs {adverse_array.shape}")
        print(f" Dtypes: {benign_array.dtype} vs {adverse_array.dtype}")
        if benign_array.shape == adverse_array.shape:
            print(" ✓ Shapes match")
        else:
            print(" ✗ Shapes don't match!")
        print("-" * 30)

    return benign_data, adverse_data