# a.py — hallucination dataset generation (QA / dialogue / summarization)
# via a DeepSeek OpenAI-compatible chat endpoint.
import argparse
import csv
import json
import logging
import os
import time

from datasets import load_dataset
# NOTE(review): this looks like an accidental IDE auto-import ('options' is
# never used below) — confirm and remove if so.
from numpy.f2py.f90mod_rules import options
from openai import OpenAI
from openai import OpenAIError
|
||||
|
||||
client = OpenAI(
|
||||
api_key="sk-5f06261529bb44df86d9b2fdbae1a6b5", # 在这里将 MOONSHOT_API_KEY 替换为你从 Kimi 开放平台申请的 API Key
|
||||
base_url="https://api.deepseek.com/v1",
|
||||
)
|
||||
|
||||
options_dic={'A':0,'B':1,'C':2,'D':3}
|
||||
|
||||
def get_qa_res(knowledge, question, answer, instruction):
|
||||
if isinstance(instruction, str):
|
||||
message = [
|
||||
{"role": "user", "content": instruction +
|
||||
"\n\n#Знание#: " + knowledge +
|
||||
"\n#Задать вопрос#: " + question +
|
||||
"\n#правильный ответ#: " + answer +
|
||||
"\n#галлюцинация ответ#: "}
|
||||
]
|
||||
elif isinstance(instruction, list):
|
||||
mes = [{"role": "user",
|
||||
"content": "Теперь вы полноценный генератор иллюзий. Пожалуйста, сформулируйте галлюцинационные ответы на следующие вопросы. Вы можете использовать любой метод, который вы изучите, который подходит для конкретной проблемы." +
|
||||
"\n\n#Знание#: " + knowledge +
|
||||
"\n#Задать вопрос#: " + question +
|
||||
"\n#правильный ответ#: " + answer +
|
||||
"\n#галлюцинация ответ#: "}]
|
||||
message = instruction + mes
|
||||
else:
|
||||
raise TypeError("The instruction must be str or list!")
|
||||
|
||||
while True:
|
||||
try:
|
||||
res = client.chat.completions.create(
|
||||
model="deepseek-chat",
|
||||
messages=message,
|
||||
temperature=1,
|
||||
max_tokens=256,
|
||||
top_p=1
|
||||
)
|
||||
break
|
||||
except OpenAIError:
|
||||
logging.warning('openai.error\nRetrying...')
|
||||
time.sleep(60)
|
||||
|
||||
|
||||
# print(res['choices'][0]['message']['content'])
|
||||
return res.choices[0].message.content
|
||||
|
||||
|
||||
def get_dialogue_res(knowledge, dialog, response, instruction):
|
||||
if isinstance(instruction, str):
|
||||
message = [
|
||||
{"role": "user", "content": instruction +
|
||||
"\n\n#Knowledge#: " + knowledge +
|
||||
"\n#Dialogue History#: " + dialog +
|
||||
"\n#True Response#: " + response +
|
||||
"\n#Hallucinated Response#: "}
|
||||
]
|
||||
elif isinstance(instruction, list):
|
||||
mes = [{"role": "user",
|
||||
"content": "You are now a mature hallucination generator. Please generate hallucinated response for the following dialogue. You can use any method you have learned that is suitable for the given dialogue history." +
|
||||
"\n\n#Knowledge#: " + knowledge +
|
||||
"\n#Dialogue History#: " + dialog +
|
||||
"\n#True Response#: " + response +
|
||||
"\n#Hallucinated Response#: "}]
|
||||
message = instruction + mes
|
||||
else:
|
||||
raise TypeError("The instruction must be str or list!")
|
||||
|
||||
while True:
|
||||
try:
|
||||
res = client.chat.completions.create(
|
||||
model="deepseek-chat",
|
||||
messages=message,
|
||||
temperature=1,
|
||||
max_tokens=256,
|
||||
top_p=1
|
||||
)
|
||||
break
|
||||
except OpenAIError:
|
||||
logging.warning('openai.error\nRetrying...')
|
||||
time.sleep(60)
|
||||
|
||||
# print(res['choices'][0]['message']['content'])
|
||||
return res.choices[0].message.content
|
||||
|
||||
|
||||
def get_summarization_res(text, summary, instruction):
|
||||
if isinstance(instruction, str):
|
||||
message = [
|
||||
{"role": "user", "content": instruction +
|
||||
"\n\n#Document#: " + text +
|
||||
"\n#Right Summary#: " + summary +
|
||||
"\n#Hallucinated Summary#: "}
|
||||
]
|
||||
elif isinstance(instruction, list):
|
||||
mes = [{"role": "user",
|
||||
"content": "You are now a mature hallucination generator. Please generate hallucinated summary for the following document. You can use any method you have learned that is suitable for the given document. #Hallucinated Summary# must not be longer than #Right Summary#." +
|
||||
"\n\n#Document#: " + text +
|
||||
"\n#Right Summary#: " + summary +
|
||||
"\n#Hallucinated Summary#: "}]
|
||||
message = instruction + mes
|
||||
else:
|
||||
raise TypeError("The instruction must be str or list!")
|
||||
|
||||
while True:
|
||||
try:
|
||||
res = client.chat.completions.create(
|
||||
model="deepseek-chat",
|
||||
messages=message,
|
||||
temperature=1,
|
||||
max_tokens=256,
|
||||
top_p=1
|
||||
)
|
||||
break
|
||||
except OpenAIError:
|
||||
logging.warning('openai.error\nRetrying...')
|
||||
time.sleep(60)
|
||||
|
||||
# print(res['choices'][0]['message']['content'])
|
||||
return res.choices[0].message.content
|
||||
|
||||
|
||||
def generate_qa_dataset(datas, instruction, output_path):
|
||||
|
||||
# with open(seed_data, 'r', encoding="utf-8") as f:
|
||||
# text = json.load(f)
|
||||
# text=text["data"]
|
||||
# temp=0
|
||||
for i in range(len(datas)):
|
||||
# print(len(data))
|
||||
content=datas[i]['paragraph']
|
||||
question = datas[i]['question']
|
||||
answer = datas[i]['answer']
|
||||
knowledge = content
|
||||
ans = get_qa_res(knowledge, question, answer, instruction)
|
||||
data = {"knowledge": knowledge, "question": question, "right_answer": answer, "hallucinated_answer": ans}
|
||||
dump_jsonl(data, output_path, append=True)
|
||||
print(" sample {} completed!".format(i))
|
||||
# for j in range(len(text[i]['questions'])):
|
||||
# question = text[i]['questions'][j]
|
||||
# answer= text[i]['options'][j][options_dic[text[i]['answers'][j]]]
|
||||
# derivations= text[i]['evidences'][j]
|
||||
# knowledge =content
|
||||
# for derivation in derivations:
|
||||
# for para in derivation:
|
||||
# if isinstance(para, str):
|
||||
# knowledge = knowledge + para
|
||||
# elif isinstance(para, list):
|
||||
# for p in para:
|
||||
# knowledge = knowledge + p
|
||||
# else:
|
||||
# raise TypeError("The derivations must be str or list!")
|
||||
#
|
||||
#
|
||||
# ans = get_qa_res(knowledge, question, answer, instruction)
|
||||
# data = {"knowledge": knowledge, "question": question, "right_answer": answer, "hallucinated_answer": ans}
|
||||
# dump_jsonl(data, output_path, append=True)
|
||||
# print(" sample {} completed!".format(temp))
|
||||
# temp+=1
|
||||
|
||||
|
||||
def generate_dialogue_dataset(seed_data, instruction, output_path):
|
||||
SENDER = {"user": "[人間]", "assistant": "[アシスタント]"}
|
||||
with open(seed_data, 'r', encoding="utf-8") as f:
|
||||
i = 0
|
||||
data = csv.DictReader(f)
|
||||
for r in data:
|
||||
if i >= 10000:
|
||||
break
|
||||
r = eval(r['Messages'])
|
||||
dialog = ""
|
||||
knowledge = ""
|
||||
response = ""
|
||||
k = 0
|
||||
d = 0
|
||||
for message in r:
|
||||
if "message" in message:
|
||||
if k > 1 and message['sender'] == "アシスタント":
|
||||
response = message['message']
|
||||
break
|
||||
if d > 3 and message['sender'] == "アシスタント":
|
||||
response = message['message']
|
||||
break
|
||||
else:
|
||||
dialog = dialog + (SENDER[message['sender']] + ": " + message['message']) + " "
|
||||
d = d + 1
|
||||
|
||||
if "metadata" in message:
|
||||
if "path" in message['metadata']:
|
||||
knowledge = knowledge + message['metadata']['path'][2]
|
||||
k = k + 1
|
||||
|
||||
if knowledge == "" or dialog == "" or response == "":
|
||||
continue
|
||||
res = get_dialogue_res(knowledge, dialog, response, instruction)
|
||||
data = {"knowledge": knowledge, "dialogue_history": dialog, "right_response": response, "hallucinated_response": res}
|
||||
dump_jsonl(data, output_path, append=True)
|
||||
i = i + 1
|
||||
print("sample {} completed!".format(i))
|
||||
|
||||
|
||||
def generate_summarization_dataset(seed_data, instruction, output_path):
|
||||
with open(seed_data, 'r', encoding="utf-8") as f:
|
||||
data = f.readlines()
|
||||
text = [json.loads(d) for d in data]
|
||||
|
||||
for i in range(10000):
|
||||
document = text[i]["document"]
|
||||
summary = text[i]["summary"]
|
||||
sum = get_summarization_res(document, summary, instruction)
|
||||
data = {"document": document, "right_summary": summary, "hallucinated_summary": sum}
|
||||
dump_jsonl(data, output_path, append=True)
|
||||
print("sample {} completed!".format(i))
|
||||
|
||||
|
||||
def dump_jsonl(data, output_path, append=False):
|
||||
"""
|
||||
Write list of objects to a JSON lines file.
|
||||
"""
|
||||
mode = 'a+' if append else 'w'
|
||||
with open(output_path, mode, encoding='utf-8') as f:
|
||||
json_record = json.dumps(data, ensure_ascii=False)
|
||||
f.write(json_record + '\n')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description="Hallucination Generation")
|
||||
|
||||
# parser.add_argument("--seed_data", default="hotpot_train_v1.1.json", help="the original dataset file")
|
||||
parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
|
||||
parser.add_argument("--strategy",default="one-turn", help="one-turn or multi-turn")
|
||||
args = parser.parse_args()
|
||||
|
||||
# seed_data = args.seed_data
|
||||
# from datasets import clear_cache
|
||||
#
|
||||
# clear_cache()
|
||||
dataset=load_dataset("RussianNLP/russian_super_glue", "muserc")
|
||||
# print(dataset)
|
||||
if args.strategy == "one-turn":
|
||||
instruction_file = "{}/ru_{}_{}_instruction.txt".format(args.task, args.task, args.strategy)
|
||||
f = open(instruction_file, 'r', encoding="utf-8")
|
||||
instruction = f.read()
|
||||
elif args.strategy == "multi-turn":
|
||||
instruction_file = "{}/ru_{}_{}_instruction.json".format(args.task, args.task, args.strategy)
|
||||
with open(instruction_file, 'r', encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
instruction = [json.loads(line) for line in lines]
|
||||
else:
|
||||
raise ValueError("The strategy must be one-turn or multi-turn!")
|
||||
|
||||
output_path = "{}/ru_{}_{}_data.json".format(args.task, args.task, args.strategy)
|
||||
|
||||
if args.task == "qa":
|
||||
generate_qa_dataset(dataset["train"], instruction, output_path)
|
||||
elif args.task == "dialogue":
|
||||
generate_dialogue_dataset(dataset["train"], instruction, output_path)
|
||||
elif args.task == "summarization":
|
||||
generate_summarization_dataset(dataset["train"][:], instruction, output_path)
|
||||
else:
|
||||
raise ValueError("The task must be qa, dialogue, or summarization!")