"""Hallucination-sample generator (HaluEval-style) for Russian QA, dialogue and
summarization data.

For each seed sample the script asks a chat model (deepseek-chat) to produce a
deliberately hallucinated answer / response / summary alongside the ground
truth, and appends the pair to a JSON-lines output file.

Reconstructed from a git format-patch that added this file as ``a.py``.
"""
import argparse
import ast
import csv
import json
import logging
import os
import time

from datasets import load_dataset
from openai import OpenAI
from openai import OpenAIError

# NOTE(review): the original committed a literal "sk-..." API key here, which is
# a credential leak.  Read it from the environment instead; set DEEPSEEK_API_KEY
# before running.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
    base_url="https://api.deepseek.com/v1",
)

# Maps a multiple-choice option letter to its index in an options list.
options_dic = {'A': 0, 'B': 1, 'C': 2, 'D': 3}


def _chat_completion(message):
    """Send *message* to the chat-completions endpoint and return the reply text.

    Retries forever (sleeping 60 s between attempts) on any ``OpenAIError`` —
    this mirrors the original deliberate best-effort behaviour for rate limits
    and transient API failures.

    :param message: list of ``{"role": ..., "content": ...}`` dicts.
    :return: the assistant's reply content (str).
    """
    while True:
        try:
            res = client.chat.completions.create(
                model="deepseek-chat",
                messages=message,
                temperature=1,
                max_tokens=256,
                top_p=1,
            )
            return res.choices[0].message.content
        except OpenAIError:
            logging.warning('openai.error\nRetrying...')
            time.sleep(60)


def get_qa_res(knowledge, question, answer, instruction):
    """Ask the model for a hallucinated answer to a QA sample.

    :param knowledge: supporting passage for the question.
    :param question: the question text.
    :param answer: the correct answer.
    :param instruction: either a one-turn prompt (str) or a multi-turn few-shot
        conversation (list of message dicts) to prepend.
    :return: the hallucinated answer produced by the model.
    :raises TypeError: if *instruction* is neither str nor list.
    """
    if isinstance(instruction, str):
        message = [
            {"role": "user", "content": instruction
                + "\n\n#Знание#: " + knowledge
                + "\n#Задать вопрос#: " + question
                + "\n#правильный ответ#: " + answer
                + "\n#галлюцинация ответ#: "}
        ]
    elif isinstance(instruction, list):
        mes = [{"role": "user",
                "content": "Теперь вы полноценный генератор иллюзий. Пожалуйста, сформулируйте галлюцинационные ответы на следующие вопросы. Вы можете использовать любой метод, который вы изучите, который подходит для конкретной проблемы."
                + "\n\n#Знание#: " + knowledge
                + "\n#Задать вопрос#: " + question
                + "\n#правильный ответ#: " + answer
                + "\n#галлюцинация ответ#: "}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")

    return _chat_completion(message)


def get_dialogue_res(knowledge, dialog, response, instruction):
    """Ask the model for a hallucinated response to a dialogue sample.

    :param knowledge: grounding knowledge for the dialogue.
    :param dialog: the dialogue history as one string.
    :param response: the true (grounded) response.
    :param instruction: one-turn prompt (str) or multi-turn conversation (list).
    :return: the hallucinated response produced by the model.
    :raises TypeError: if *instruction* is neither str nor list.
    """
    if isinstance(instruction, str):
        message = [
            {"role": "user", "content": instruction
                + "\n\n#Knowledge#: " + knowledge
                + "\n#Dialogue History#: " + dialog
                + "\n#True Response#: " + response
                + "\n#Hallucinated Response#: "}
        ]
    elif isinstance(instruction, list):
        mes = [{"role": "user",
                "content": "You are now a mature hallucination generator. Please generate hallucinated response for the following dialogue. You can use any method you have learned that is suitable for the given dialogue history."
                + "\n\n#Knowledge#: " + knowledge
                + "\n#Dialogue History#: " + dialog
                + "\n#True Response#: " + response
                + "\n#Hallucinated Response#: "}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")

    return _chat_completion(message)


def get_summarization_res(text, summary, instruction):
    """Ask the model for a hallucinated summary of a document.

    :param text: the source document.
    :param summary: the correct summary.
    :param instruction: one-turn prompt (str) or multi-turn conversation (list).
    :return: the hallucinated summary produced by the model.
    :raises TypeError: if *instruction* is neither str nor list.
    """
    if isinstance(instruction, str):
        message = [
            {"role": "user", "content": instruction
                + "\n\n#Document#: " + text
                + "\n#Right Summary#: " + summary
                + "\n#Hallucinated Summary#: "}
        ]
    elif isinstance(instruction, list):
        mes = [{"role": "user",
                "content": "You are now a mature hallucination generator. Please generate hallucinated summary for the following document. You can use any method you have learned that is suitable for the given document. #Hallucinated Summary# must not be longer than #Right Summary#."
                + "\n\n#Document#: " + text
                + "\n#Right Summary#: " + summary
                + "\n#Hallucinated Summary#: "}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")

    return _chat_completion(message)


def generate_qa_dataset(datas, instruction, output_path):
    """Generate hallucinated QA pairs and append them to *output_path* (JSONL).

    :param datas: iterable of samples with 'paragraph', 'question' and 'answer'
        keys (e.g. the MuSeRC train split).
    :param instruction: prompt (str) or few-shot conversation (list).
    :param output_path: JSON-lines file to append results to.
    """
    for i, sample in enumerate(datas):
        knowledge = sample['paragraph']
        question = sample['question']
        answer = sample['answer']
        ans = get_qa_res(knowledge, question, answer, instruction)
        data = {"knowledge": knowledge, "question": question,
                "right_answer": answer, "hallucinated_answer": ans}
        dump_jsonl(data, output_path, append=True)
        print(" sample {} completed!".format(i))


def generate_dialogue_dataset(seed_data, instruction, output_path):
    """Generate hallucinated dialogue responses from a CSV seed file.

    The CSV must have a 'Messages' column holding a Python-literal list of
    message dicts (OpenDialKG-style: alternating "message" entries and
    "metadata" entries whose 'path' carries the grounding knowledge).
    Stops after 10000 usable samples.

    NOTE(review): ``__main__`` currently passes a datasets split here, but this
    function opens *seed_data* as a file path — confirm the intended seed
    source before running the dialogue task.

    :param seed_data: path to the seed CSV file.
    :param instruction: prompt (str) or few-shot conversation (list).
    :param output_path: JSON-lines file to append results to.
    """
    SENDER = {"user": "[人間]", "assistant": "[アシスタント]"}
    with open(seed_data, 'r', encoding="utf-8") as f:
        i = 0
        reader = csv.DictReader(f)
        for row in reader:
            if i >= 10000:
                break
            # The column is a serialized Python literal; literal_eval parses it
            # safely (the original used eval(), which executes arbitrary code).
            messages = ast.literal_eval(row['Messages'])
            dialog = ""
            knowledge = ""
            response = ""
            k = 0  # knowledge snippets collected so far
            d = 0  # dialogue turns collected so far
            for message in messages:
                if "message" in message:
                    # Take the assistant turn as the gold response once enough
                    # knowledge (k > 1) or history (d > 3) has accumulated.
                    # NOTE(review): sender values are compared against the
                    # Japanese tag "アシスタント" while SENDER is keyed by
                    # "user"/"assistant" — verify against the actual CSV schema.
                    if k > 1 and message['sender'] == "アシスタント":
                        response = message['message']
                        break
                    if d > 3 and message['sender'] == "アシスタント":
                        response = message['message']
                        break
                    else:
                        dialog = dialog + (SENDER[message['sender']] + ": " + message['message']) + " "
                        d = d + 1

                if "metadata" in message:
                    if "path" in message['metadata']:
                        knowledge = knowledge + message['metadata']['path'][2]
                        k = k + 1

            # Skip rows that never produced a complete (knowledge, history,
            # response) triple.
            if knowledge == "" or dialog == "" or response == "":
                continue
            res = get_dialogue_res(knowledge, dialog, response, instruction)
            data = {"knowledge": knowledge, "dialogue_history": dialog,
                    "right_response": response, "hallucinated_response": res}
            dump_jsonl(data, output_path, append=True)
            i = i + 1
            print("sample {} completed!".format(i))


def generate_summarization_dataset(seed_data, instruction, output_path):
    """Generate hallucinated summaries from a JSONL seed file.

    Processes the first 10000 records; each must carry 'document' and
    'summary' keys.

    NOTE(review): ``__main__`` currently passes a dataset slice here, but this
    function opens *seed_data* as a file path — confirm the intended seed
    source before running the summarization task.

    :param seed_data: path to the seed JSON-lines file.
    :param instruction: prompt (str) or few-shot conversation (list).
    :param output_path: JSON-lines file to append results to.
    """
    with open(seed_data, 'r', encoding="utf-8") as f:
        text = [json.loads(line) for line in f]

    for i in range(10000):
        document = text[i]["document"]
        summary = text[i]["summary"]
        # (local renamed from `sum`, which shadowed the builtin)
        hallucinated = get_summarization_res(document, summary, instruction)
        data = {"document": document, "right_summary": summary,
                "hallucinated_summary": hallucinated}
        dump_jsonl(data, output_path, append=True)
        print("sample {} completed!".format(i))


def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.

    :param data: JSON-serializable object written as one line.
    :param output_path: destination file path.
    :param append: append to the file instead of truncating it.
    """
    mode = 'a' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        json_record = json.dumps(data, ensure_ascii=False)
        f.write(json_record + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Hallucination Generation")
    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
    parser.add_argument("--strategy", default="one-turn", help="one-turn or multi-turn")
    args = parser.parse_args()

    dataset = load_dataset("RussianNLP/russian_super_glue", "muserc")

    # one-turn: a single plain-text instruction; multi-turn: a few-shot
    # conversation stored as JSON-lines of message dicts.
    if args.strategy == "one-turn":
        instruction_file = "{}/ru_{}_{}_instruction.txt".format(args.task, args.task, args.strategy)
        with open(instruction_file, 'r', encoding="utf-8") as f:
            instruction = f.read()
    elif args.strategy == "multi-turn":
        instruction_file = "{}/ru_{}_{}_instruction.json".format(args.task, args.task, args.strategy)
        with open(instruction_file, 'r', encoding="utf-8") as f:
            instruction = [json.loads(line) for line in f]
    else:
        raise ValueError("The strategy must be one-turn or multi-turn!")

    output_path = "{}/ru_{}_{}_data.json".format(args.task, args.task, args.strategy)

    if args.task == "qa":
        generate_qa_dataset(dataset["train"], instruction, output_path)
    elif args.task == "dialogue":
        generate_dialogue_dataset(dataset["train"], instruction, output_path)
    elif args.task == "summarization":
        generate_summarization_dataset(dataset["train"][:], instruction, output_path)
    else:
        raise ValueError("The task must be qa, dialogue, or summarization!")