update
This commit is contained in:
parent
f29b0c5286
commit
fd6d6b8cec
|
|
@ -58,6 +58,7 @@ def main():
|
|||
parser.add_argument('--model_name', type=str, default='alpaca_7B')
|
||||
parser.add_argument('--judge_model', type=str, default='gpt-4')
|
||||
parser.add_argument('--dataset_name', type=str, default='AdvBench')
|
||||
parser.add_argument('--test_ratio', type=float, default=0.2)
|
||||
|
||||
|
||||
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
|
||||
|
|
@ -94,10 +95,13 @@ def main():
|
|||
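adverse_embed_generated_loc1=[] #head-wise representations -- NOTE(review): this list is later saved as *_adverse_embeddings_mlp_wise.npy; confirm whether this label or the file name is correct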
|
||||
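adverse_embed_generated_loc2=[] #mlp-wise representations -- NOTE(review): this list is later saved as *_adverse_embeddings_head_wise.npy; confirm whether this label or the file name is correct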
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
length = int(len(dataset)*0.55)
|
||||
length = len(dataset)
|
||||
for i in tqdm(range(length)):
|
||||
question = dataset[i]['query']
|
||||
adversary = dataset[i]['target']
|
||||
|
|
@ -142,8 +146,8 @@ def main():
|
|||
|
||||
benign_embed_generated = np.asarray(np.stack(benign_embed_generated), dtype=np.float32)
|
||||
adverse_embed_generated = np.asarray(np.stack(adverse_embed_generated), dtype=np.float32)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_layer_wise.npy', benign_embed_generated)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_layer_wise.npy', adverse_embed_generated)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_layer_wise.npy', benign_embed_generated)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_layer_wise.npy', adverse_embed_generated)
|
||||
|
||||
benign_embed_generated_loc2 = np.asarray(np.stack(benign_embed_generated_loc2), dtype=np.float32)
|
||||
benign_embed_generated_loc1 = np.asarray(np.stack(benign_embed_generated_loc1), dtype=np.float32)
|
||||
|
|
@ -151,10 +155,10 @@ def main():
|
|||
adverse_embed_generated_loc1 = np.asarray(np.stack(adverse_embed_generated_loc1), dtype=np.float32)
|
||||
|
||||
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_head_wise.npy', benign_embed_generated_loc2)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + f'{args.model_name}_adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_head_wise.npy', benign_embed_generated_loc2)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'benign_embeddings_mlp_wise.npy', benign_embed_generated_loc1)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_head_wise.npy', adverse_embed_generated_loc2)
|
||||
np.save(f'save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak/' + 'adverse_embeddings_mlp_wise.npy', adverse_embed_generated_loc1)
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
446
steer_vector.py
446
steer_vector.py
|
|
@ -13,6 +13,7 @@ from torch.cuda.amp import autocast, GradScaler
|
|||
import torch.nn.functional as F
|
||||
from sinkhorn_knopp import SinkhornKnopp_imb
|
||||
import logging
|
||||
from utils import load_npy_shapes,split_indices
|
||||
|
||||
|
||||
def seed_everything(seed: int):
|
||||
|
|
@ -47,7 +48,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
layer_number = -1 # 使用最后一层 hidden state(这里 -1 当索引)
|
||||
|
||||
# ========= 日志 & 结果保存目录 =========
|
||||
dir_name = f"TSV_{args.model_name}_{args.dataset_name}/exemplar_num_{args.num_exemplars}_num_selected_data_{args.num_selected_data}/{args.component}/{args.str_layer}/{args.lam}"
|
||||
dir_name = f"SV_{args.model_name}_{args.dataset_name}/{args.component}/{args.str_layer}/{args.lam}"
|
||||
log_dir = f"/{dir_name}/"
|
||||
log_file = os.path.join(log_dir, f"log.txt")
|
||||
os.makedirs(dir_name, exist_ok=True)
|
||||
|
|
@ -61,14 +62,12 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
|
||||
logging.info("Starting training")
|
||||
logging.info(
|
||||
f"Training parameters: few_shot_size={args.num_exemplars}, "
|
||||
f"num_selected_data={args.num_selected_data}, "
|
||||
f"component={args.component}, str_layer={args.str_layer}"
|
||||
)
|
||||
|
||||
# 解包数据:test / wild train / exemplar
|
||||
test_prompts, train_prompts, exemplar_prompts = prompts[0], prompts[1], prompts[2]
|
||||
test_labels, train_labels, exemplar_labels = labels[0], labels[1], labels[2]
|
||||
test_prompts, train_prompts = prompts[0], prompts[1]
|
||||
test_labels, train_labels = labels[0], labels[1]
|
||||
|
||||
batch_size = args.batch_size
|
||||
losses = []
|
||||
|
|
@ -76,11 +75,9 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
|
||||
scaler = GradScaler() # 混合精度的梯度缩放器
|
||||
|
||||
num_exemplars = args.num_exemplars
|
||||
num_trains = args.num_train
|
||||
|
||||
# ========= Sinkhorn OT 相关初始化(伪标签阶段要用)=========
|
||||
args.num_iters_sk = 3 # Sinkhorn 迭代次数(论文 Eq.(8)(9))
|
||||
args.epsilon_sk = 0.05 # OT 熵正则 ε(论文中也设为 0.05)
|
||||
|
||||
|
||||
# 用 exemplar 的标签估计类分布 w(truthful/hallu 的先验)
|
||||
# ex_hallu = P(hallucinated), ex_true = P(truthful)
|
||||
|
|
@ -98,8 +95,8 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
centroids = F.normalize(centroids, p=2, dim=1)
|
||||
|
||||
# 把 exemplar 的 prompt/label 整理成一个大 batch(padding)
|
||||
exemplar_prompts_, exemplar_labels_ = exemplar_prompts, exemplar_labels
|
||||
exemplar_prompts, exemplar_labels = collate_fn(exemplar_prompts, exemplar_labels)
|
||||
train_prompts_, train_labels_ = train_prompts, train_labels
|
||||
train_prompts, train_labels = collate_fn(train_prompts, train_labels)
|
||||
|
||||
# ========= Phase 1: Initial training on exemplars =========
|
||||
num_epochs = args.init_num_epochs
|
||||
|
|
@ -107,7 +104,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
for epoch in range(num_epochs):
|
||||
running_loss = 0.0
|
||||
total = 0
|
||||
num_samples = num_exemplars
|
||||
num_samples = num_trains
|
||||
|
||||
# 在 exemplar 上分 batch 训练(对应论文中 D_E 只含人工标注数据)
|
||||
for batch_start in tqdm(
|
||||
|
|
@ -115,8 +112,8 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
desc=f"Epoch {epoch+1}/{num_epochs} Batches",
|
||||
leave=False,
|
||||
):
|
||||
batch_prompts = exemplar_prompts[batch_start: batch_start + batch_size]
|
||||
batch_labels = exemplar_labels[batch_start: batch_start + batch_size]
|
||||
batch_prompts = train_prompts[batch_start: batch_start + batch_size]
|
||||
batch_labels = train_labels[batch_start: batch_start + batch_size]
|
||||
|
||||
# attention_mask: 1 = valid token, 0 = padding
|
||||
attention_mask = (batch_prompts != 0).half()
|
||||
|
|
@ -137,7 +134,7 @@ def train_model(model, optimizer, device, prompts, labels, args):
|
|||
hidden_states = torch.stack(hidden_states, dim=0).squeeze()
|
||||
last_layer_hidden_state = hidden_states[layer_number] # [B, T, H]
|
||||
|
||||
# 对应论文里 r_v:最后非 padding token 的 hidden(包含 TSV steer)
|
||||
# 对应论文里 r_v:最后非 padding token 的 hidden(包含 SV steer)
|
||||
last_token_rep = get_last_non_padded_token_rep(
|
||||
last_layer_hidden_state, attention_mask.squeeze()
|
||||
)
|
||||
|
|
@ -405,363 +402,126 @@ def test_model(model, centroids, test_prompts, test_labels, device, batch_size,
|
|||
|
||||
HF_NAMES = {
|
||||
'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B',
|
||||
'qwen2.5-7B': 'Qwen/Qwen2.5-7B'
|
||||
'qwen2.5-7B': 'Qwen/Qwen2.5-7B' ,
|
||||
'llama_7B': 'baffo32/decapoda-research-llama-7B-hf',
|
||||
'honest_llama_7B': 'validation/results_dump/llama_7B_seed_42_top_48_heads_alpha_15',
|
||||
'alpaca_7B': 'circulus/alpaca-7b',
|
||||
'vicuna_7B': 'AlekseyKorshuk/vicuna-7b',
|
||||
'llama2_chat_7B': 'models/Llama-2-7b-chat-hf',
|
||||
'llama2_chat_13B': 'models/Llama-2-13b-chat-hf',
|
||||
'llama2_chat_70B': 'meta-llama/Llama-2-70b-chat-hf',
|
||||
}
|
||||
|
||||
def main():
|
||||
|
||||
# ======================
|
||||
# 1. 解析命令行参数
|
||||
# ======================
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model_name', type=str, default='llama3.1-8B')
|
||||
parser.add_argument('--model_prefix', type=str, default='', help='prefix of model name')
|
||||
parser.add_argument('--num_gene', type=int, default=1)
|
||||
parser.add_argument('--gene', type=int, default=0)
|
||||
parser.add_argument('--generate_gt', type=int, default=0)
|
||||
parser.add_argument('--dataset_name', type=str, default='tqa')
|
||||
parser.add_argument('--device', type=int, default=0)
|
||||
parser.add_argument('--wild_ratio', type=float, default=0.75)
|
||||
parser.add_argument('--thres_gt', type=float, default=0.5)
|
||||
parser.add_argument('--most_likely', type=int, default=0)
|
||||
parser.add_argument('--model_name', type=str, default='alpaca_7B')
|
||||
parser.add_argument('--num_gene', type=int, default=1) # 每个问题生成多少个答案
|
||||
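parser.add_argument('--train_sv', type=bool, default=True) # whether to run the steer-vector training stage. NOTE(review): argparse type=bool converts any non-empty string (including "False") to True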
|
||||
parser.add_argument('--steer_place', type=str, default='layer', help='where to steer (layer, mlp and head)') # which saved representations to use: layer, mlp, or head (passed to load_npy_shapes)
|
||||
parser.add_argument('--dataset_name', type=str, default='AdvBench') # 使用哪个数据集
|
||||
parser.add_argument('--device', type=int, default=0) # GPU id(没怎么用,device_map="auto" 为主)
|
||||
parser.add_argument("--model_dir", type=str, default=None, help='local directory with model data')
|
||||
parser.add_argument("--batch_size", type=int, default=128)
|
||||
parser.add_argument("--cos_temp", type=float, default=0.1)
|
||||
parser.add_argument("--ema_decay", type=float, default=0.99)
|
||||
parser.add_argument("--lr", type=float, default=0.005)
|
||||
parser.add_argument("--str_layer", type=int, default=9)
|
||||
parser.add_argument("--component", type=str, default='res')
|
||||
parser.add_argument("--lam", type=float, default=5)
|
||||
parser.add_argument("--init_num_epochs", type=int, default=20)
|
||||
parser.add_argument("--aug_num_epochs", type=int, default=20)
|
||||
parser.add_argument("--num_exemplars", type=int, default=32)
|
||||
parser.add_argument("--num_selected_data", type=int, default=128)
|
||||
parser.add_argument("--cls_dist", type=str, default='proxy')
|
||||
parser.add_argument("--batch_size", type=int, default=64)
|
||||
parser.add_argument("--cos_temp", type=float, default=0.1) # 余弦相似度的 softmax 温度
|
||||
parser.add_argument("--ema_decay", type=float, default=0.99) # 原型向量 EMA 衰减系数
|
||||
parser.add_argument("--lr", type=float, default=0.005) # SV 的学习率
|
||||
parser.add_argument("--str_layer", type=int, default=9) # 插 SV 的层号(第几层 hidden)
|
||||
parser.add_argument("--component", type=str, default='res') # SV 以何种方式注入(残差等)
|
||||
parser.add_argument("--lam", type=float, default=5) # SV 强度 λ
|
||||
parser.add_argument("--init_num_epochs", type=int, default=20) # SV训练轮数
|
||||
parser.add_argument("--optimizer", type=str, default='AdamW')
|
||||
parser.add_argument("--num_iters_sk", type=int, default=3)
|
||||
parser.add_argument("--epsilon_sk", type=float, default=0.05)
|
||||
parser.add_argument('--train_ratio', type=float, default=0.8)
|
||||
parser.add_argument('--val_ratio', type=float, default=0.1)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# ======================
|
||||
# 2. 确定 HuggingFace 模型名
|
||||
# ======================
|
||||
# HF_NAMES 在文件前面定义:
|
||||
# HF_NAMES = {'llama3.1-8B': 'meta-llama/Meta-Llama-3.1-8B', 'qwen2.5-7B': 'Qwen/Qwen2.5-7B'}
|
||||
# model_prefix 可以拼接一些前缀,这里默认是空
|
||||
model_name_or_path = HF_NAMES[args.model_prefix + args.model_name]
|
||||
|
||||
if args.dataset_name == "tqa":
|
||||
dataset = load_dataset("truthful_qa", 'generation')['validation']
|
||||
|
||||
elif args.dataset_name == 'triviaqa':
|
||||
dataset = load_dataset("trivia_qa", "rc.nocontext", split="validation")
|
||||
id_mem = set()
|
||||
|
||||
def remove_dups(batch):
|
||||
if batch['question_id'][0] in id_mem:
|
||||
return {_: [] for _ in batch.keys()}
|
||||
id_mem.add(batch['question_id'][0])
|
||||
return batch
|
||||
|
||||
dataset = dataset.map(remove_dups, batch_size=1, batched=True, load_from_cache_file=False)
|
||||
|
||||
|
||||
elif args.dataset_name == 'sciq':
|
||||
dataset = load_dataset("allenai/sciq", split="validation")
|
||||
|
||||
elif args.dataset_name == 'nq_open':
|
||||
dataset = load_dataset("google-research-datasets/nq_open", split="validation")
|
||||
|
||||
|
||||
# ======================
|
||||
# 3. 加载不同的数据集
|
||||
# ======================
|
||||
if args.dataset_name == "AdvBench":
|
||||
dataset = load_dataset("LeeWlving/Safety_Datasets", "advbench")['test']
|
||||
elif args.dataset_name == 'HarmBench':
|
||||
dataset = load_dataset("LeeWlving/Safety_Datasets", "harmbench")['standard']
|
||||
elif args.dataset_name == 'JBB':
|
||||
dataset = load_dataset("LeeWlving/Safety_Datasets", "JBB")['test']
|
||||
else:
|
||||
raise ValueError("Invalid dataset name")
|
||||
assert "Not supported dataset name!"
|
||||
|
||||
|
||||
target_directory = f"save_for_eval/{args.dataset_name}/{args.model_name}_jailbreak"
|
||||
benign_data, adverse_data = load_npy_shapes(target_directory, args.steer_place)
|
||||
y_zero=np.zeros(len(benign_data))
|
||||
y_one=np.ones(len(adverse_data))
|
||||
|
||||
data_embedding = np.concatenate((benign_data, adverse_data), axis=0)
|
||||
gts = np.concatenate((y_zero, y_one), axis=0)
|
||||
|
||||
train_index, val_index, test_index=split_indices(len(data_embedding), args.train_ratio, args.val_ratio)
|
||||
|
||||
|
||||
|
||||
if args.gene:
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '')
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '')
|
||||
device = torch.device("cuda")
|
||||
all_decoded_answers = []
|
||||
begin_index = 0
|
||||
end_index = len(dataset)
|
||||
|
||||
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/'):
|
||||
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/')
|
||||
|
||||
if not os.path.exists(f'./save_for_eval/{args.dataset_name}_hal_det/answers'):
|
||||
os.mkdir(f'./save_for_eval/{args.dataset_name}_hal_det/answers')
|
||||
|
||||
period_token_id = [tokenizer(_)['input_ids'][-1] for _ in ['\n']]
|
||||
period_token_id += [tokenizer.eos_token_id]
|
||||
|
||||
for i in range(begin_index, end_index):
|
||||
answers = [None] * args.num_gene
|
||||
answers_ = [None] * args.num_gene
|
||||
|
||||
question = dataset[i]['question']
|
||||
prompt = tokenizer(f"Answer the question concisely. Q: {question}" + " A:", return_tensors='pt').input_ids.cuda()
|
||||
|
||||
for gen_iter in range(args.num_gene):
|
||||
if args.most_likely:
|
||||
generated = model.generate(prompt,
|
||||
num_beams=5,
|
||||
num_return_sequences=1,
|
||||
do_sample=False,
|
||||
max_new_tokens=64,
|
||||
)
|
||||
else:
|
||||
generated = model.generate(prompt,
|
||||
do_sample=True,
|
||||
num_return_sequences=1,
|
||||
num_beams=1,
|
||||
max_new_tokens=64,
|
||||
temperature=0.5,
|
||||
top_p=1.0)
|
||||
|
||||
|
||||
decoded = tokenizer.decode(generated[0, prompt.shape[-1]:],
|
||||
skip_special_tokens=True)
|
||||
# answers[gen_iter] = decoded
|
||||
|
||||
# Cleaning
|
||||
if '\nAnswer the question concisely.' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('\nAnswer the question concisely.')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('\nAnswer the question concisely.')[0]
|
||||
|
||||
if 'Answer the question concisely' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('Answer the question concisely')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('Answer the question concisely')[0]
|
||||
|
||||
if 'The answer to the question' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('The answer to the question')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('The answer to the question')[0]
|
||||
|
||||
if 'How to Write a Concise Statement' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('How to Write a Concise Statement')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('How to Write a Concise Statement')[0]
|
||||
|
||||
if 'Q:' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('Q:')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('Q:')[0]
|
||||
|
||||
if '\nYou are an AI assistant' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('\nYou are an AI assistant')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('\nYou are an AI assistant')[0]
|
||||
|
||||
if 'You are an AI assistant' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('You are an AI assistant')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('You are an AI assistant')[0]
|
||||
|
||||
if 'A:' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('A:')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('A:')[0]
|
||||
|
||||
if 'B:' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('B:')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('B:')[0]
|
||||
|
||||
if 'C:' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('C:')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('C:')[0]
|
||||
|
||||
if 'D:' in decoded:
|
||||
print('#####error')
|
||||
print(decoded.split('D:')[1])
|
||||
print('#####error')
|
||||
decoded = decoded.split('D:')[0]
|
||||
|
||||
print(f'Cleaned Answer: {decoded}')
|
||||
answers[gen_iter] = decoded
|
||||
|
||||
|
||||
|
||||
print('sample: ', i)
|
||||
if args.most_likely:
|
||||
info = 'most_likely_'
|
||||
else:
|
||||
info = 'batch_generations_'
|
||||
|
||||
print("Saving answers")
|
||||
print(decoded)
|
||||
|
||||
np.save(f'./save_for_eval/{args.dataset_name}_hal_det/answers/' + info + f'hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy',
|
||||
answers)
|
||||
|
||||
elif args.generate_gt:
|
||||
from bleurt_pytorch import BleurtForSequenceClassification, BleurtTokenizer
|
||||
|
||||
model = BleurtForSequenceClassification.from_pretrained('lucadiliello/BLEURT-20').cuda()
|
||||
tokenizer = BleurtTokenizer.from_pretrained('lucadiliello/BLEURT-20')
|
||||
model.eval()
|
||||
|
||||
gts = np.zeros(0)
|
||||
length = len(dataset)
|
||||
|
||||
for i in range(length):
|
||||
|
||||
if args.dataset_name == 'tqa':
|
||||
best_answer = dataset[i]['best_answer']
|
||||
correct_answer = dataset[i]['correct_answers']
|
||||
all_answers = [best_answer] + correct_answer
|
||||
question = dataset[i]['question']
|
||||
|
||||
elif args.dataset_name == 'triviaqa':
|
||||
all_answers = dataset[i]['answer']['aliases']
|
||||
|
||||
if args.most_likely:
|
||||
# answers = np.load(
|
||||
# f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
|
||||
answers = np.load(
|
||||
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
|
||||
|
||||
else:
|
||||
answers = np.load(
|
||||
f'./save_for_eval/{args.dataset_name}_hal_det/answers/batch_generations_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
|
||||
|
||||
# get the gt.
|
||||
predictions = answers
|
||||
all_results = np.zeros((len(all_answers), len(predictions)))
|
||||
with torch.no_grad():
|
||||
for anw in range(len(all_answers)):
|
||||
inputs = tokenizer(predictions.tolist(), [all_answers[anw]] * len(predictions),
|
||||
padding='longest', return_tensors='pt')
|
||||
for key in list(inputs.keys()):
|
||||
inputs[key] = inputs[key].cuda()
|
||||
res = np.asarray(model(**inputs).logits.flatten().tolist())
|
||||
all_results[anw] = res
|
||||
gts = np.concatenate([gts, np.max(all_results, axis=0)], 0)
|
||||
if i % 10 == 0:
|
||||
print("samples passed: ", i)
|
||||
|
||||
if args.most_likely:
|
||||
# np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
|
||||
np.save(f'./ml_{args.dataset_name}_bleurt_score.npy', gts)
|
||||
|
||||
else:
|
||||
np.save(f'./bg_{args.dataset_name}_bleurt_score.npy', gts)
|
||||
|
||||
|
||||
|
||||
else:
|
||||
# ============================================================
|
||||
# 6. 模式三:SV 训练阶段(gene=0 且 generate_gt=0)
|
||||
# ============================================================
|
||||
if args.train_sv:
|
||||
|
||||
device = torch.device("cuda")
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto", token = '')
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token = '')
|
||||
# Load the base LLM (the steer vectors (SV) are inserted into it afterwards)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name_or_path,
|
||||
low_cpu_mem_usage=True,
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
token = ''
|
||||
)
|
||||
|
||||
prompts = []
|
||||
qa_pairs = []
|
||||
categories = []
|
||||
train_data = [data_embedding[i] for i in train_index]
|
||||
train_labels = [gts[i] for i in train_index]
|
||||
|
||||
length = len(dataset)
|
||||
|
||||
|
||||
for i in tqdm(range(length)):
|
||||
|
||||
question = dataset[i]['question']
|
||||
if args.dataset_name == 'tqa':
|
||||
categories.append(dataset[i]['category'])
|
||||
|
||||
answers = np.load(
|
||||
f'./save_for_eval/{args.dataset_name}_hal_det/answers/most_likely_hal_det_{args.model_name}_{args.dataset_name}_answers_index_{i}.npy')
|
||||
|
||||
|
||||
for anw in answers:
|
||||
|
||||
prompt = tokenizer(
|
||||
f"Answer the question concisely. Q: {question}" + " A:" + anw,
|
||||
return_tensors='pt').input_ids.cuda()
|
||||
|
||||
prompts.append(prompt)
|
||||
qa_pairs.append({'Question': question, 'Answer': anw})
|
||||
|
||||
gts = np.load(f'./ml_{args.dataset_name}_bleurt_score.npy')
|
||||
|
||||
|
||||
length = len(dataset)
|
||||
|
||||
if args.dataset_name == 'tqa' or args.dataset_name == 'triviaqa':
|
||||
args.thres_gt = 0.5
|
||||
|
||||
else:
|
||||
args.thres_gt = 0.2
|
||||
|
||||
gt_label = np.asarray(gts> args.thres_gt, dtype=np.int32)
|
||||
|
||||
# index = np.random.permutation(length)
|
||||
|
||||
# exemplar_index = index[:args.num_exemplars]
|
||||
|
||||
# wild_q_indices = index[:int(args.wild_ratio * length)]
|
||||
|
||||
index = np.load(f'data_indices/data_index_{args.dataset_name}.npy')
|
||||
|
||||
exemplar_index = np.load(f'data_indices/exemplar_idx_{args.dataset_name}.npy')
|
||||
|
||||
wild_q_indices = index[:int(args.wild_ratio * length)]
|
||||
|
||||
wild_q_indices1 = wild_q_indices[:len(wild_q_indices) - 100]
|
||||
|
||||
args.num_exemplars = len(exemplar_index)
|
||||
|
||||
gt_label_test = []
|
||||
gt_label_wild = []
|
||||
gt_label_exemplar = []
|
||||
|
||||
test_prompts = []
|
||||
train_prompts = []
|
||||
exemplar_prompts = []
|
||||
|
||||
|
||||
for i in range(length):
|
||||
if i not in wild_q_indices:
|
||||
gt_label_test.extend(gt_label[i: i+1])
|
||||
test_prompts.extend(prompts[i:i+1])
|
||||
|
||||
elif i in exemplar_index:
|
||||
gt_label_exemplar.extend(gt_label[i: i+1])
|
||||
exemplar_prompts.extend(prompts[i:i+1])
|
||||
|
||||
elif i in wild_q_indices1:
|
||||
gt_label_wild.extend(gt_label[i: i+1])
|
||||
train_prompts.extend(prompts[i:i+1])
|
||||
|
||||
gt_label_test = np.asarray(gt_label_test)
|
||||
gt_label_exemplar = np.asarray(gt_label_exemplar)
|
||||
gt_label_wild = np.asarray(gt_label_wild)
|
||||
|
||||
labels = [ gt_label_test, gt_label_wild, gt_label_exemplar]
|
||||
prompts = [ test_prompts, train_prompts, exemplar_prompts]
|
||||
test_data = [data_embedding[i] for i in test_index]
|
||||
test_labels = [gts[i] for i in test_index]
|
||||
|
||||
# ====== 6.4 冻结 LLM,只训练 SV 参数 ======
|
||||
num_layers = model.config.num_hidden_layers
|
||||
hidden_size = model.config.hidden_size
|
||||
|
||||
for param in model.parameters():
|
||||
param.requires_grad = False
|
||||
param.requires_grad = False
|
||||
|
||||
tsv = nn.ParameterList(
|
||||
[nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)])
|
||||
# 每一层一个 SV 向量(虽然实际只在某几层生效)
|
||||
sv = nn.ParameterList(
|
||||
[nn.Parameter(torch.zeros(hidden_size), requires_grad=True) for _ in range(num_layers)]
|
||||
)
|
||||
sv.to(device)
|
||||
|
||||
tsv.to(device)
|
||||
|
||||
add_tsv_layers(model, tsv, [args.lam], args)
|
||||
# 在模型对应层里插入 SV(add_sv_layers 会改 forward)
|
||||
add_sv_layers(model, sv, [args.lam], args)
|
||||
|
||||
optimizer = torch.optim.AdamW(list(tsv.parameters()), lr=args.lr)
|
||||
# 只把 SV 的参数丢给优化器
|
||||
optimizer = torch.optim.AdamW(list(sv.parameters()), lr=args.lr)
|
||||
|
||||
# ====== 6.5 调用 train_model,进入两阶段训练流程 ======
|
||||
train_model(model, optimizer, device, train_data, train_labels, args=args)
|
||||
|
||||
else:
|
||||
print("Skip training steer vectors.")
|
||||
raise NotImplementedError("Only training SV is implemented in this script.")
|
||||
|
||||
train_model(model, optimizer, device, prompts, labels, args=args)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 为了结果可复现,固定随机种子
|
||||
seed_everything(42)
|
||||
main()
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ from sklearn.decomposition import PCA
|
|||
from sklearn.preprocessing import StandardScaler
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def check_npy_shapes(directory_path):
|
||||
|
||||
def load_npy_shapes(directory_path, steer_place="layer"):
|
||||
"""
|
||||
Check shapes of all .npy files in the specified directory
|
||||
and load them as benign vs adverse pairs.
|
||||
|
|
@ -29,8 +30,8 @@ def check_npy_shapes(directory_path):
|
|||
print("-" * 50)
|
||||
|
||||
# Separate files into benign and adverse
|
||||
benign_files = [f for f in npy_files if "benign" in f.lower()]
|
||||
adverse_files = [f for f in npy_files if "adverse" in f.lower()]
|
||||
benign_files = [f for f in npy_files if "benign" in f.lower() and steer_place in f.lower()]
|
||||
adverse_files = [f for f in npy_files if "adverse" in f.lower() and steer_place in f.lower()]
|
||||
|
||||
# Check shapes and load data
|
||||
benign_data = {}
|
||||
|
|
@ -75,12 +76,6 @@ def check_npy_shapes(directory_path):
|
|||
|
||||
if benign_array.shape == adverse_array.shape:
|
||||
print(f" ✓ Shapes match")
|
||||
|
||||
# Calculate some basic statistics for comparison
|
||||
diff = np.abs(benign_array - adverse_array)
|
||||
print(f" Mean absolute difference: {np.mean(diff):.6f}")
|
||||
print(f" Max absolute difference: {np.max(diff):.6f}")
|
||||
print(f" Min absolute difference: {np.min(diff):.6f}")
|
||||
else:
|
||||
print(f" ✗ Shapes don't match!")
|
||||
|
||||
|
|
@ -300,7 +295,7 @@ def main():
|
|||
return
|
||||
|
||||
# Check and load the .npy files
|
||||
benign_data, adverse_data = check_npy_shapes(target_directory)
|
||||
benign_data, adverse_data = load_npy_shapes(target_directory)
|
||||
|
||||
# Perform PCA analysis to identify discriminative layers
|
||||
if benign_data and adverse_data:
|
||||
|
|
|
|||
Loading…
Reference in New Issue