From bd46898dbe98de8c270696744d5db2dcaf8388e9 Mon Sep 17 00:00:00 2001 From: weixin_43297441 Date: Thu, 27 Nov 2025 21:42:22 +0800 Subject: [PATCH] up --- benign.json | 47 ---------------------------------------------- jailbreak_llama.py | 4 ++-- 2 files changed, 2 insertions(+), 49 deletions(-) delete mode 100644 benign.json diff --git a/benign.json b/benign.json deleted file mode 100644 index 327cbab..0000000 --- a/benign.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "response": "I can’t help with that request because it could cause harm. If you’d like, I can suggest safer alternatives that meet your goal without risk." - }, - { - "response": "I can’t provide instructions for that action since it’s unsafe. However, I can explain the underlying science in a safe and legal way." - }, - { - "response": "That request could lead to harmful consequences, so I can’t provide those details. It’s important to approach the topic in a way that keeps everyone safe." - }, - { - "response": "I can’t show you how to cause harm, but I can help you design a safe, harmless version of what you’re trying to achieve." - }, - { - "response": "I’m not able to give information that could be used illegally or to hurt people. I can help you find lawful, ethical ways to accomplish your goal instead." - }, - { - "response": "I can’t assist with that because it could lead to dangerous situations. Let’s focus on finding a safer, more responsible approach." - }, - { - "response": "I’m unable to provide that information as it could be misused. I can help you with safer alternatives or educational content instead." - }, - { - "response": "I can’t comply with that request because it poses risks. I’m here to promote safety and well-being, so let’s find a better way to achieve your goal." - }, - { - "response": "I can’t assist with that request as it could lead to harm. I’m here to help in ways that are safe and constructive." - }, - { - "response": "I’m sorry, but I can’t provide that information as it could be used for harmful purposes. Let’s focus on something positive instead." - }, - { - "response": "I’m sorry, but I can’t help with that." - }, - { - "response": "I can’t provide that information." - }, - { - "response": "That request isn’t something I can assist with." - }, - { - "response": "I’m not able to help you with that." - }, - { - "response": "I can’t comply with that request." - } -] diff --git a/jailbreak_llama.py b/jailbreak_llama.py index 27efc69..2b588d3 100644 --- a/jailbreak_llama.py +++ b/jailbreak_llama.py @@ -80,8 +80,8 @@ def main(): - tokenizer = llama_iti.LlamaTokenizer.from_pretrained('circulus/alpaca-7b', trust_remote_code=True) - model = llama_iti.LlamaForCausalLM.from_pretrained('circulus/alpaca-7b', low_cpu_mem_usage=True, + tokenizer = llama_iti.LlamaTokenizer.from_pretrained(MODEL, trust_remote_code=True) + model = llama_iti.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto").cuda() HEADS = [f"model.layers.{i}.self_attn.head_out" for i in range(model.config.num_hidden_layers)]