"""Hallucination-sample generator (HaluEval-style) for Russian QA, dialogue and
summarization data.

For each seed sample the script asks a chat model (deepseek-chat) to produce a
deliberately hallucinated answer / response / summary alongside the ground
truth, and appends the pair to a JSON-lines output file.

Reconstructed from a git format-patch that added this file as ``a.py``.
"""
import argparse
import ast
import csv
import json
import logging
import os
import time

from datasets import load_dataset
from openai import OpenAI
from openai import OpenAIError

# NOTE(review): the original committed a literal "sk-..." API key here, which is
# a credential leak.  Read it from the environment instead; set DEEPSEEK_API_KEY
# before running.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
    base_url="https://api.deepseek.com/v1",
)

# Maps a multiple-choice option letter to its index in an options list.
options_dic = {'A': 0, 'B': 1, 'C': 2, 'D': 3}


def _chat_completion(message):
    """Send *message* to the chat-completions endpoint and return the reply text.

    Retries forever (sleeping 60 s between attempts) on any ``OpenAIError`` —
    this mirrors the original deliberate best-effort behaviour for rate limits
    and transient API failures.

    :param message: list of ``{"role": ..., "content": ...}`` dicts.
    :return: the assistant's reply content (str).
    """
    while True:
        try:
            res = client.chat.completions.create(
                model="deepseek-chat",
                messages=message,
                temperature=1,
                max_tokens=256,
                top_p=1,
            )
            return res.choices[0].message.content
        except OpenAIError:
            logging.warning('openai.error\nRetrying...')
            time.sleep(60)


def get_qa_res(knowledge, question, answer, instruction):
    """Ask the model for a hallucinated answer to a QA sample.

    :param knowledge: supporting passage for the question.
    :param question: the question text.
    :param answer: the correct answer.
    :param instruction: either a one-turn prompt (str) or a multi-turn few-shot
        conversation (list of message dicts) to prepend.
    :return: the hallucinated answer produced by the model.
    :raises TypeError: if *instruction* is neither str nor list.
    """
    if isinstance(instruction, str):
        message = [
            {"role": "user", "content": instruction
                + "\n\n#Знание#: " + knowledge
                + "\n#Задать вопрос#: " + question
                + "\n#правильный ответ#: " + answer
                + "\n#галлюцинация ответ#: "}
        ]
    elif isinstance(instruction, list):
        mes = [{"role": "user",
                "content": "Теперь вы полноценный генератор иллюзий. Пожалуйста, сформулируйте галлюцинационные ответы на следующие вопросы. Вы можете использовать любой метод, который вы изучите, который подходит для конкретной проблемы."
                + "\n\n#Знание#: " + knowledge
                + "\n#Задать вопрос#: " + question
                + "\n#правильный ответ#: " + answer
                + "\n#галлюцинация ответ#: "}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")

    return _chat_completion(message)


def get_dialogue_res(knowledge, dialog, response, instruction):
    """Ask the model for a hallucinated response to a dialogue sample.

    :param knowledge: grounding knowledge for the dialogue.
    :param dialog: the dialogue history as one string.
    :param response: the true (grounded) response.
    :param instruction: one-turn prompt (str) or multi-turn conversation (list).
    :return: the hallucinated response produced by the model.
    :raises TypeError: if *instruction* is neither str nor list.
    """
    if isinstance(instruction, str):
        message = [
            {"role": "user", "content": instruction
                + "\n\n#Knowledge#: " + knowledge
                + "\n#Dialogue History#: " + dialog
                + "\n#True Response#: " + response
                + "\n#Hallucinated Response#: "}
        ]
    elif isinstance(instruction, list):
        mes = [{"role": "user",
                "content": "You are now a mature hallucination generator. Please generate hallucinated response for the following dialogue. You can use any method you have learned that is suitable for the given dialogue history."
                + "\n\n#Knowledge#: " + knowledge
                + "\n#Dialogue History#: " + dialog
                + "\n#True Response#: " + response
                + "\n#Hallucinated Response#: "}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")

    return _chat_completion(message)


def get_summarization_res(text, summary, instruction):
    """Ask the model for a hallucinated summary of a document.

    :param text: the source document.
    :param summary: the correct summary.
    :param instruction: one-turn prompt (str) or multi-turn conversation (list).
    :return: the hallucinated summary produced by the model.
    :raises TypeError: if *instruction* is neither str nor list.
    """
    if isinstance(instruction, str):
        message = [
            {"role": "user", "content": instruction
                + "\n\n#Document#: " + text
                + "\n#Right Summary#: " + summary
                + "\n#Hallucinated Summary#: "}
        ]
    elif isinstance(instruction, list):
        mes = [{"role": "user",
                "content": "You are now a mature hallucination generator. Please generate hallucinated summary for the following document. You can use any method you have learned that is suitable for the given document. #Hallucinated Summary# must not be longer than #Right Summary#."
                + "\n\n#Document#: " + text
                + "\n#Right Summary#: " + summary
                + "\n#Hallucinated Summary#: "}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")

    return _chat_completion(message)


def generate_qa_dataset(datas, instruction, output_path):
    """Generate hallucinated QA pairs and append them to *output_path* (JSONL).

    :param datas: iterable of samples with 'paragraph', 'question' and 'answer'
        keys (e.g. the MuSeRC train split).
    :param instruction: prompt (str) or few-shot conversation (list).
    :param output_path: JSON-lines file to append results to.
    """
    for i, sample in enumerate(datas):
        knowledge = sample['paragraph']
        question = sample['question']
        answer = sample['answer']
        ans = get_qa_res(knowledge, question, answer, instruction)
        data = {"knowledge": knowledge, "question": question,
                "right_answer": answer, "hallucinated_answer": ans}
        dump_jsonl(data, output_path, append=True)
        print(" sample {} completed!".format(i))


def generate_dialogue_dataset(seed_data, instruction, output_path):
    """Generate hallucinated dialogue responses from a CSV seed file.

    The CSV must have a 'Messages' column holding a Python-literal list of
    message dicts (OpenDialKG-style: alternating "message" entries and
    "metadata" entries whose 'path' carries the grounding knowledge).
    Stops after 10000 usable samples.

    NOTE(review): ``__main__`` currently passes a datasets split here, but this
    function opens *seed_data* as a file path — confirm the intended seed
    source before running the dialogue task.

    :param seed_data: path to the seed CSV file.
    :param instruction: prompt (str) or few-shot conversation (list).
    :param output_path: JSON-lines file to append results to.
    """
    SENDER = {"user": "[人間]", "assistant": "[アシスタント]"}
    with open(seed_data, 'r', encoding="utf-8") as f:
        i = 0
        reader = csv.DictReader(f)
        for row in reader:
            if i >= 10000:
                break
            # The column is a serialized Python literal; literal_eval parses it
            # safely (the original used eval(), which executes arbitrary code).
            messages = ast.literal_eval(row['Messages'])
            dialog = ""
            knowledge = ""
            response = ""
            k = 0  # knowledge snippets collected so far
            d = 0  # dialogue turns collected so far
            for message in messages:
                if "message" in message:
                    # Take the assistant turn as the gold response once enough
                    # knowledge (k > 1) or history (d > 3) has accumulated.
                    # NOTE(review): sender values are compared against the
                    # Japanese tag "アシスタント" while SENDER is keyed by
                    # "user"/"assistant" — verify against the actual CSV schema.
                    if k > 1 and message['sender'] == "アシスタント":
                        response = message['message']
                        break
                    if d > 3 and message['sender'] == "アシスタント":
                        response = message['message']
                        break
                    else:
                        dialog = dialog + (SENDER[message['sender']] + ": " + message['message']) + " "
                        d = d + 1

                if "metadata" in message:
                    if "path" in message['metadata']:
                        knowledge = knowledge + message['metadata']['path'][2]
                        k = k + 1

            # Skip rows that never produced a complete (knowledge, history,
            # response) triple.
            if knowledge == "" or dialog == "" or response == "":
                continue
            res = get_dialogue_res(knowledge, dialog, response, instruction)
            data = {"knowledge": knowledge, "dialogue_history": dialog,
                    "right_response": response, "hallucinated_response": res}
            dump_jsonl(data, output_path, append=True)
            i = i + 1
            print("sample {} completed!".format(i))


def generate_summarization_dataset(seed_data, instruction, output_path):
    """Generate hallucinated summaries from a JSONL seed file.

    Processes the first 10000 records; each must carry 'document' and
    'summary' keys.

    NOTE(review): ``__main__`` currently passes a dataset slice here, but this
    function opens *seed_data* as a file path — confirm the intended seed
    source before running the summarization task.

    :param seed_data: path to the seed JSON-lines file.
    :param instruction: prompt (str) or few-shot conversation (list).
    :param output_path: JSON-lines file to append results to.
    """
    with open(seed_data, 'r', encoding="utf-8") as f:
        text = [json.loads(line) for line in f]

    for i in range(10000):
        document = text[i]["document"]
        summary = text[i]["summary"]
        # (local renamed from `sum`, which shadowed the builtin)
        hallucinated = get_summarization_res(document, summary, instruction)
        data = {"document": document, "right_summary": summary,
                "hallucinated_summary": hallucinated}
        dump_jsonl(data, output_path, append=True)
        print("sample {} completed!".format(i))


def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.

    :param data: JSON-serializable object written as one line.
    :param output_path: destination file path.
    :param append: append to the file instead of truncating it.
    """
    mode = 'a' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        json_record = json.dumps(data, ensure_ascii=False)
        f.write(json_record + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Hallucination Generation")
    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
    parser.add_argument("--strategy", default="one-turn", help="one-turn or multi-turn")
    args = parser.parse_args()

    dataset = load_dataset("RussianNLP/russian_super_glue", "muserc")

    # one-turn: a single plain-text instruction; multi-turn: a few-shot
    # conversation stored as JSON-lines of message dicts.
    if args.strategy == "one-turn":
        instruction_file = "{}/ru_{}_{}_instruction.txt".format(args.task, args.task, args.strategy)
        with open(instruction_file, 'r', encoding="utf-8") as f:
            instruction = f.read()
    elif args.strategy == "multi-turn":
        instruction_file = "{}/ru_{}_{}_instruction.json".format(args.task, args.task, args.strategy)
        with open(instruction_file, 'r', encoding="utf-8") as f:
            instruction = [json.loads(line) for line in f]
    else:
        raise ValueError("The strategy must be one-turn or multi-turn!")

    output_path = "{}/ru_{}_{}_data.json".format(args.task, args.task, args.strategy)

    if args.task == "qa":
        generate_qa_dataset(dataset["train"], instruction, output_path)
    elif args.task == "dialogue":
        generate_dialogue_dataset(dataset["train"], instruction, output_path)
    elif args.task == "summarization":
        generate_summarization_dataset(dataset["train"][:], instruction, output_path)
    else:
        raise ValueError("The task must be qa, dialogue, or summarization!")