"""Generate hallucinated QA / dialogue / summarization samples with an LLM.

For each seed sample the script asks a DeepSeek chat model to produce a
deliberately hallucinated counterpart (answer / response / summary) and
appends the pair to a JSON-lines output file.
"""
import argparse
import ast
import csv
import json
import logging
import os
import time

from datasets import load_dataset
from openai import OpenAI, OpenAIError

# NOTE(review): the API key used to be hardcoded here, which leaks the secret
# into version control. Supply it via the DEEPSEEK_API_KEY environment variable.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com/v1",
)

# Maps multiple-choice option letters to their index.
options_dic = {'A': 0, 'B': 1, 'C': 2, 'D': 3}


def _chat_completion(message):
    """Send *message* to the chat API, retrying forever on API errors.

    Args:
        message: list of chat-format dicts ({"role": ..., "content": ...}).

    Returns:
        The completion text (str) of the first choice.
    """
    while True:
        try:
            res = client.chat.completions.create(
                model="deepseek-chat",
                messages=message,
                temperature=1,
                max_tokens=256,
                top_p=1,
            )
            return res.choices[0].message.content
        except OpenAIError:
            # Best-effort retry: rate limits / transient outages resolve with time.
            logging.warning('openai.error\nRetrying...')
            time.sleep(60)


def get_qa_res(knowledge, question, answer, instruction):
    """Ask the model for a hallucinated answer to a QA sample.

    Args:
        knowledge: supporting passage (str).
        question: the question (str).
        answer: the correct answer (str).
        instruction: str (one-turn prompt prefix) or list (multi-turn
            chat history to prepend).

    Returns:
        The hallucinated answer text.

    Raises:
        TypeError: if *instruction* is neither str nor list.
    """
    suffix = ("\n\n#Знание#: " + knowledge + "\n#Задать вопрос#: " + question
              + "\n#правильный ответ#: " + answer + "\n#галлюцинация ответ#: ")
    if isinstance(instruction, str):
        message = [{"role": "user", "content": instruction + suffix}]
    elif isinstance(instruction, list):
        mes = [{"role": "user", "content": "Теперь вы полноценный генератор иллюзий. Пожалуйста, сформулируйте галлюцинационные ответы на следующие вопросы. Вы можете использовать любой метод, который вы изучите, который подходит для конкретной проблемы." + suffix}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")
    return _chat_completion(message)


def get_dialogue_res(knowledge, dialog, response, instruction):
    """Ask the model for a hallucinated response to a dialogue sample.

    Args:
        knowledge: grounding knowledge (str).
        dialog: serialized dialogue history (str).
        response: the true response (str).
        instruction: str (one-turn) or list (multi-turn), as in get_qa_res.

    Returns:
        The hallucinated response text.

    Raises:
        TypeError: if *instruction* is neither str nor list.
    """
    suffix = ("\n\n#Knowledge#: " + knowledge + "\n#Dialogue History#: " + dialog
              + "\n#True Response#: " + response + "\n#Hallucinated Response#: ")
    if isinstance(instruction, str):
        message = [{"role": "user", "content": instruction + suffix}]
    elif isinstance(instruction, list):
        mes = [{"role": "user", "content": "You are now a mature hallucination generator. Please generate hallucinated response for the following dialogue. You can use any method you have learned that is suitable for the given dialogue history." + suffix}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")
    return _chat_completion(message)


def get_summarization_res(text, summary, instruction):
    """Ask the model for a hallucinated summary of a document.

    Args:
        text: the source document (str).
        summary: the correct summary (str).
        instruction: str (one-turn) or list (multi-turn), as in get_qa_res.

    Returns:
        The hallucinated summary text.

    Raises:
        TypeError: if *instruction* is neither str nor list.
    """
    suffix = ("\n\n#Document#: " + text + "\n#Right Summary#: " + summary
              + "\n#Hallucinated Summary#: ")
    if isinstance(instruction, str):
        message = [{"role": "user", "content": instruction + suffix}]
    elif isinstance(instruction, list):
        mes = [{"role": "user", "content": "You are now a mature hallucination generator. Please generate hallucinated summary for the following document. You can use any method you have learned that is suitable for the given document. #Hallucinated Summary# must not be longer than #Right Summary#." + suffix}]
        message = instruction + mes
    else:
        raise TypeError("The instruction must be str or list!")
    return _chat_completion(message)


def generate_qa_dataset(datas, instruction, output_path):
    """Generate hallucinated answers for every QA record in *datas*.

    Args:
        datas: sequence of records with 'paragraph', 'question', 'answer' keys.
        instruction: prompt instruction passed through to get_qa_res.
        output_path: JSON-lines file the samples are appended to.
    """
    for i, record in enumerate(datas):
        knowledge = record['paragraph']
        question = record['question']
        answer = record['answer']
        ans = get_qa_res(knowledge, question, answer, instruction)
        data = {"knowledge": knowledge, "question": question,
                "right_answer": answer, "hallucinated_answer": ans}
        dump_jsonl(data, output_path, append=True)
        print(" sample {} completed!".format(i))


def generate_dialogue_dataset(seed_data, instruction, output_path):
    """Generate hallucinated responses from a CSV dump of dialogues.

    Args:
        seed_data: path to a CSV file with a 'Messages' column holding a
            Python-literal list of message dicts.
        instruction: prompt instruction passed through to get_dialogue_res.
        output_path: JSON-lines file the samples are appended to.
    """
    # NOTE(review): SENDER keys are "user"/"assistant" but senders are compared
    # to the Japanese label below — one of the two conventions must match the
    # actual CSV contents; verify against the seed data.
    SENDER = {"user": "[人間]", "assistant": "[アシスタント]"}
    with open(seed_data, 'r', encoding="utf-8") as f:
        i = 0
        reader = csv.DictReader(f)
        for row in reader:
            if i >= 10000:
                break
            # literal_eval replaces eval: the column is literal data, so this is
            # equivalent but cannot execute arbitrary code from the CSV.
            messages = ast.literal_eval(row['Messages'])
            dialog = ""
            knowledge = ""
            response = ""
            k = 0
            d = 0
            for message in messages:
                if "message" in message:
                    # Stop once enough knowledge (k) or dialogue turns (d) have
                    # accumulated and the assistant speaks: that turn becomes
                    # the "true response".
                    if k > 1 and message['sender'] == "アシスタント":
                        response = message['message']
                        break
                    if d > 3 and message['sender'] == "アシスタント":
                        response = message['message']
                        break
                    else:
                        dialog = dialog + (SENDER[message['sender']] + ": " + message['message']) + " "
                        d = d + 1
                if "metadata" in message:
                    if "path" in message['metadata']:
                        knowledge = knowledge + message['metadata']['path'][2]
                        k = k + 1
            # Skip dialogues that yielded no knowledge, history, or response.
            if knowledge == "" or dialog == "" or response == "":
                continue
            res = get_dialogue_res(knowledge, dialog, response, instruction)
            data = {"knowledge": knowledge, "dialogue_history": dialog,
                    "right_response": response, "hallucinated_response": res}
            dump_jsonl(data, output_path, append=True)
            i = i + 1
            print("sample {} completed!".format(i))


def generate_summarization_dataset(seed_data, instruction, output_path):
    """Generate hallucinated summaries from a JSON-lines seed file.

    Args:
        seed_data: path to a JSON-lines file with 'document' and 'summary' keys.
        instruction: prompt instruction passed through to get_summarization_res.
        output_path: JSON-lines file the samples are appended to.
    """
    with open(seed_data, 'r', encoding="utf-8") as f:
        text = [json.loads(line) for line in f]
    # Cap at 10000 samples, but never index past the end of the seed data.
    for i in range(min(10000, len(text))):
        document = text[i]["document"]
        summary = text[i]["summary"]
        hallucinated = get_summarization_res(document, summary, instruction)
        data = {"document": document, "right_summary": summary,
                "hallucinated_summary": hallucinated}
        dump_jsonl(data, output_path, append=True)
        print("sample {} completed!".format(i))


def dump_jsonl(data, output_path, append=False):
    """Write a single JSON-serializable object as one line of a JSON-lines file.

    Args:
        data: the object to serialize.
        output_path: target file path.
        append: append to the file when True, otherwise overwrite it.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        json_record = json.dumps(data, ensure_ascii=False)
        f.write(json_record + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Hallucination Generation")
    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
    parser.add_argument("--strategy", default="one-turn", help="one-turn or multi-turn")
    args = parser.parse_args()

    dataset = load_dataset("RussianNLP/russian_super_glue", "muserc")

    if args.strategy == "one-turn":
        instruction_file = "{}/ru_{}_{}_instruction.txt".format(args.task, args.task, args.strategy)
        with open(instruction_file, 'r', encoding="utf-8") as f:
            instruction = f.read()
    elif args.strategy == "multi-turn":
        instruction_file = "{}/ru_{}_{}_instruction.json".format(args.task, args.task, args.strategy)
        with open(instruction_file, 'r', encoding="utf-8") as f:
            instruction = [json.loads(line) for line in f]
    else:
        raise ValueError("The strategy must be one-turn or multi-turn!")

    output_path = "{}/ru_{}_{}_data.json".format(args.task, args.task, args.strategy)
    if args.task == "qa":
        generate_qa_dataset(dataset["train"], instruction, output_path)
    elif args.task == "dialogue":
        # NOTE(review): generate_dialogue_dataset expects a CSV file path, not a
        # datasets split — this call would fail at open(); confirm intended input.
        generate_dialogue_dataset(dataset["train"], instruction, output_path)
    elif args.task == "summarization":
        # NOTE(review): generate_summarization_dataset expects a JSON-lines file
        # path, not a datasets slice — confirm intended input.
        generate_summarization_dataset(dataset["train"][:], instruction, output_path)
    else:
        raise ValueError("The task must be qa, dialogue, or summarization!")