更新 evaluation.py
This commit is contained in:
parent
061cd74fca
commit
caeb501dfa
283
evaluation.py
283
evaluation.py
|
|
@ -9,19 +9,19 @@ from openai import OpenAIError
|
||||||
|
|
||||||
|
|
||||||
client = OpenAI(
|
client = OpenAI(
|
||||||
api_key="sk-5f06261529bb44df86d9b2fdbae1a6b5", # 在这里将 MOONSHOT_API_KEY 替换为你从 Kimi 开放平台申请的 API Key
|
api_key="sk-", # 在这里将 MOONSHOT_API_KEY 替换为你从 Kimi 开放平台申请的 API Key
|
||||||
base_url="https://api.deepseek.com/v1",
|
base_url="https://api.deepseek.com/v1",
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_qa_response(model, question, answer):
|
def get_qa_response(model, question, answer):
|
||||||
message = [
|
message = [
|
||||||
{"role": "system", "content":"You are a huallucination detector. You MUST determine if the provided answer contains hallucination or not for the question based on the world knowledge. The answer you provided MUST be \"Yes\" or \"No\""},
|
{"role": "system", "content":"你是一个幻觉检测器。你必须根据世界知识确定问题的答案是否符合事实。你提供的答案必须是 \"YES\" or \"NO\" 并且给出你的理由"},
|
||||||
{"role": "user", "content":
|
{"role": "user", "content":
|
||||||
"\n\n#Question#: " + question +
|
"\n\n#Question#: " + question +
|
||||||
"\n#Answer#: " + answer +
|
"\n#Answer#: " + answer +
|
||||||
"\n#Your Judgement#: "}
|
"\n#Your Judgement#: "}
|
||||||
]
|
]
|
||||||
prompt = "\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
|
# prompt = "\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
if model == "gpt-3.5-turbo":
|
if model == "gpt-3.5-turbo":
|
||||||
|
|
@ -35,9 +35,7 @@ def get_qa_response(model, question, answer):
|
||||||
res = client.chat.completions.create(
|
res = client.chat.completions.create(
|
||||||
model="deepseek-chat",
|
model="deepseek-chat",
|
||||||
messages=message,
|
messages=message,
|
||||||
temperature=1,
|
stream=False
|
||||||
max_tokens=256,
|
|
||||||
top_p=1
|
|
||||||
)
|
)
|
||||||
response = res.choices[0].message.content
|
response = res.choices[0].message.content
|
||||||
break
|
break
|
||||||
|
|
@ -60,49 +58,7 @@ def get_qa_response(model, question, answer):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
def get_dialogue_response(model, dialog, response, instruction):
|
|
||||||
message = [
|
|
||||||
{"role": "system", "content": "You are a response judge. You MUST determine if the provided response contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
|
|
||||||
{"role": "user", "content": instruction +
|
|
||||||
"\n\n#Dialogue History#: " + dialog +
|
|
||||||
"\n#Response#: " + response +
|
|
||||||
"\n#Your Judgement#: "}
|
|
||||||
]
|
|
||||||
prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
if model == "gpt-3.5-turbo":
|
|
||||||
res = openai.ChatCompletion.create(
|
|
||||||
model="gpt-3.5-turbo",
|
|
||||||
messages=message,
|
|
||||||
temperature=0.0,
|
|
||||||
)
|
|
||||||
response = res['choices'][0]['message']['content']
|
|
||||||
else:
|
|
||||||
res = openai.Completion.create(
|
|
||||||
model=model,
|
|
||||||
prompt=prompt,
|
|
||||||
temperature=0.0
|
|
||||||
)
|
|
||||||
response = res["choices"][0]['text'].strip()
|
|
||||||
break
|
|
||||||
except openai.error.RateLimitError:
|
|
||||||
print('openai.error.RateLimitError\nRetrying...')
|
|
||||||
time.sleep(60)
|
|
||||||
except openai.error.ServiceUnavailableError:
|
|
||||||
print('openai.error.ServiceUnavailableError\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
except openai.error.Timeout:
|
|
||||||
print('openai.error.Timeout\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
except openai.error.APIError:
|
|
||||||
print('openai.error.APIError\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
except openai.error.APIConnectionError:
|
|
||||||
print('openai.error.APIConnectionError\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
def num_tokens_from_message(message, model="davinci"):
|
def num_tokens_from_message(message, model="davinci"):
|
||||||
|
|
@ -120,61 +76,12 @@ def truncate_message(prompt1, prompt2, model="davinci"):
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
def get_summarization_response(model, document, summary, instruction):
|
|
||||||
message = [
|
|
||||||
{"role": "system", "content": "You are a summary judge. You MUST determine if the provided summary contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
|
|
||||||
{"role": "user", "content": instruction +
|
|
||||||
"\n\n#Document#: " + document +
|
|
||||||
"\n#Summary#: " + summary +
|
|
||||||
"\n#Your Judgement#: "}
|
|
||||||
]
|
|
||||||
prompt1 = instruction + "\n\n#Document#: " + document
|
|
||||||
prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
|
|
||||||
if model == "davinci":
|
|
||||||
prompt = truncate_message(prompt1, prompt2)
|
|
||||||
else:
|
|
||||||
prompt = prompt1 + prompt2
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
if model == "gpt-3.5-turbo":
|
|
||||||
res = openai.ChatCompletion.create(
|
|
||||||
model="gpt-3.5-turbo",
|
|
||||||
messages=message,
|
|
||||||
temperature=0.0,
|
|
||||||
)
|
|
||||||
response = res['choices'][0]['message']['content']
|
|
||||||
else:
|
|
||||||
res = openai.Completion.create(
|
|
||||||
model=model,
|
|
||||||
prompt=prompt,
|
|
||||||
temperature=0.0
|
|
||||||
)
|
|
||||||
response = res["choices"][0]['text'].strip()
|
|
||||||
break
|
|
||||||
except openai.error.RateLimitError:
|
|
||||||
print('openai.error.RateLimitError\nRetrying...')
|
|
||||||
time.sleep(60)
|
|
||||||
except openai.error.ServiceUnavailableError:
|
|
||||||
print('openai.error.ServiceUnavailableError\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
except openai.error.Timeout:
|
|
||||||
print('openai.error.Timeout\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
except openai.error.APIError:
|
|
||||||
print('openai.error.APIError\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
except openai.error.APIConnectionError:
|
|
||||||
print('openai.error.APIConnectionError\nRetrying...')
|
|
||||||
time.sleep(20)
|
|
||||||
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
def evaluation_qa_dataset(model, file, output_path):
|
def evaluation_qa_dataset(model, file, output_path):
|
||||||
result=[]
|
result=[]
|
||||||
TP = 0
|
|
||||||
FP=0
|
|
||||||
FN=0
|
|
||||||
# test_file=json.loads(file)
|
# test_file=json.loads(file)
|
||||||
with open(file, 'r', encoding="utf-8") as f:
|
with open(file, 'r', encoding="utf-8") as f:
|
||||||
# print(f"File content: {file}")
|
# print(f"File content: {file}")
|
||||||
|
|
@ -185,175 +92,19 @@ def evaluation_qa_dataset(model, file, output_path):
|
||||||
for i in range(len(data)):
|
for i in range(len(data)):
|
||||||
question= data[i]["Question"]
|
question= data[i]["Question"]
|
||||||
answer=data[i]["Answer"]
|
answer=data[i]["Answer"]
|
||||||
ground_truth = data[i]["Hallucination"]
|
|
||||||
|
|
||||||
output_samples = get_qa_response(model, question, answer)
|
output_samples = get_qa_response(model, question, answer)
|
||||||
print('sample {} success......'.format(i))
|
print('sample {} success......'.format(i))
|
||||||
if ("Yes" in output_samples and "YES" in ground_truth):
|
result.append({"Question":question,"Answer":answer, "Prediction":"YES" if "YES" in output_samples else "NO"})
|
||||||
TP=TP+1
|
# result.append({"Question":question,"Answer":answer, "Prediction":"YES" if "YES" in output_samples else "NO", "Reason":output_samples})
|
||||||
elif ("Yes" in output_samples and "NO" in ground_truth):
|
dump_jsonl(result, output_path, append=False)
|
||||||
FP=FP+1
|
|
||||||
else:
|
|
||||||
FN=FN+1
|
|
||||||
result.append({"Question":question,"Answer":answer,"Hallucination":ground_truth, "res":output_samples})
|
|
||||||
Precision=TP/(TP+FP)
|
|
||||||
Recall=TP/(TP+FN)
|
|
||||||
F1score=2*(Precision*Recall)/(Precision+Recall)
|
|
||||||
print(' F1score: {}'.format(F1score))
|
|
||||||
dump_jsonl(result, output_path, append=True)
|
|
||||||
# correct = 0
|
|
||||||
# incorrect = 0
|
|
||||||
# for i in range(len(data)):
|
|
||||||
# knowledge = data[i]["knowledge"]
|
|
||||||
# question = data[i]["question"]
|
|
||||||
# hallucinated_answer = data[i]["hallucinated_answer"]
|
|
||||||
# right_answer = data[i]["right_answer"]
|
|
||||||
|
|
||||||
# if random.random() > 0.5:
|
|
||||||
# answer = hallucinated_answer
|
|
||||||
# ground_truth = "Yes"
|
|
||||||
# else:
|
|
||||||
# answer = right_answer
|
|
||||||
# ground_truth = "No"
|
|
||||||
|
|
||||||
# ans = get_qa_response(model, question, answer, instruction)
|
|
||||||
# ans = ans.replace(".", "")
|
|
||||||
|
|
||||||
# if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
|
|
||||||
# gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": "failed!"}
|
|
||||||
# dump_jsonl(gen, output_path, append=True)
|
|
||||||
# incorrect += 1
|
|
||||||
# print('sample {} fails......'.format(i))
|
|
||||||
# continue
|
|
||||||
# elif "Yes" in ans:
|
|
||||||
# if ans != "Yes":
|
|
||||||
# ans = "Yes"
|
|
||||||
# gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
|
|
||||||
# elif "No" in ans:
|
|
||||||
# if ans != "No":
|
|
||||||
# ans = "No"
|
|
||||||
# gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
|
|
||||||
# else:
|
|
||||||
# gen = None
|
|
||||||
# incorrect += 1
|
|
||||||
|
|
||||||
# assert(gen is not None)
|
|
||||||
|
|
||||||
# if ground_truth == ans:
|
|
||||||
# correct += 1
|
|
||||||
# else:
|
|
||||||
# incorrect += 1
|
|
||||||
|
|
||||||
# print('sample {} success......'.format(i))
|
|
||||||
|
|
||||||
|
|
||||||
# print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct/len(data)))
|
|
||||||
|
|
||||||
|
|
||||||
def evaluation_dialogue_dataset(model, file, instruction, output_path):
|
|
||||||
with open(file, 'r', encoding="utf-8") as f:
|
|
||||||
data = []
|
|
||||||
for line in f:
|
|
||||||
data.append(json.loads(line))
|
|
||||||
|
|
||||||
correct = 0
|
|
||||||
incorrect = 0
|
|
||||||
for i in range(len(data)):
|
|
||||||
knowledge = data[i]["knowledge"]
|
|
||||||
dialog = data[i]["dialogue_history"]
|
|
||||||
hallucinated_response = data[i]["hallucinated_response"]
|
|
||||||
right_response = data[i]["right_response"]
|
|
||||||
|
|
||||||
if random.random() > 0.5:
|
|
||||||
response = hallucinated_response
|
|
||||||
ground_truth = "Yes"
|
|
||||||
else:
|
|
||||||
response = right_response
|
|
||||||
ground_truth = "No"
|
|
||||||
|
|
||||||
ans = get_dialogue_response(model, dialog, response, instruction)
|
|
||||||
ans = ans.replace(".", "")
|
|
||||||
|
|
||||||
if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
|
|
||||||
gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": "failed!"}
|
|
||||||
dump_jsonl(gen, output_path, append=True)
|
|
||||||
incorrect += 1
|
|
||||||
print('sample {} fails......'.format(i))
|
|
||||||
continue
|
|
||||||
elif "Yes" in ans:
|
|
||||||
if ans != "Yes":
|
|
||||||
ans = "Yes"
|
|
||||||
gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
|
|
||||||
elif "No" in ans:
|
|
||||||
if ans != "No":
|
|
||||||
ans = "No"
|
|
||||||
gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
|
|
||||||
else:
|
|
||||||
gen = None
|
|
||||||
assert (gen is not None)
|
|
||||||
|
|
||||||
if ground_truth == ans:
|
|
||||||
correct += 1
|
|
||||||
else:
|
|
||||||
incorrect += 1
|
|
||||||
|
|
||||||
print('sample {} success......'.format(i))
|
|
||||||
dump_jsonl(gen, output_path, append=True)
|
|
||||||
|
|
||||||
print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
|
|
||||||
|
|
||||||
|
|
||||||
def evaluation_summarization_dataset(model, file, instruction, output_path):
|
|
||||||
with open(file, 'r', encoding="utf-8") as f:
|
|
||||||
data = []
|
|
||||||
for line in f:
|
|
||||||
data.append(json.loads(line))
|
|
||||||
|
|
||||||
correct = 0
|
|
||||||
incorrect = 0
|
|
||||||
for i in range(len(data)):
|
|
||||||
|
|
||||||
document = data[i]["document"]
|
|
||||||
hallucinated_summary = data[i]["hallucinated_summary"]
|
|
||||||
right_summary = data[i]["right_summary"]
|
|
||||||
|
|
||||||
if random.random() > 0.5:
|
|
||||||
summary = hallucinated_summary
|
|
||||||
ground_truth = "Yes"
|
|
||||||
else:
|
|
||||||
summary = right_summary
|
|
||||||
ground_truth = "No"
|
|
||||||
|
|
||||||
ans = get_summarization_response(model, document, summary, instruction)
|
|
||||||
ans = ans.replace(".", "")
|
|
||||||
|
|
||||||
if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
|
|
||||||
gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": "failed!"}
|
|
||||||
dump_jsonl(gen, output_path, append=True)
|
|
||||||
incorrect += 1
|
|
||||||
print('sample {} fails......'.format(i))
|
|
||||||
continue
|
|
||||||
elif "Yes" in ans:
|
|
||||||
if ans != "Yes":
|
|
||||||
ans = "Yes"
|
|
||||||
gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
|
|
||||||
elif "No" in ans:
|
|
||||||
if ans != "No":
|
|
||||||
ans = "No"
|
|
||||||
gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
|
|
||||||
else:
|
|
||||||
gen = None
|
|
||||||
assert (gen is not None)
|
|
||||||
|
|
||||||
if ground_truth == ans:
|
|
||||||
correct += 1
|
|
||||||
else:
|
|
||||||
incorrect += 1
|
|
||||||
|
|
||||||
print('sample {} success......'.format(i))
|
|
||||||
dump_jsonl(gen, output_path, append=True)
|
|
||||||
|
|
||||||
print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
|
|
||||||
|
|
||||||
|
|
||||||
def dump_jsonl(data, output_path, append=False):
|
def dump_jsonl(data, output_path, append=False):
|
||||||
|
|
@ -370,24 +121,20 @@ if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser(description="Hallucination Generation")
|
parser = argparse.ArgumentParser(description="Hallucination Generation")
|
||||||
|
|
||||||
parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
|
parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
|
||||||
parser.add_argument("--model", default="qwen", help="model name")
|
parser.add_argument("--model", default="deepseek", help="model name")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
|
# instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
|
||||||
f = open(instruction_file, 'r', encoding="utf-8")
|
# f = open(instruction_file, 'r', encoding="utf-8")
|
||||||
instruction = f.read()
|
# instruction = f.read()
|
||||||
|
|
||||||
model = args.model
|
model = args.model
|
||||||
output_path = "{}/{}_{}_results.json".format(args.task, args.task, args.model)
|
output_path = "{}/{}_results.json".format(args.task, args.model)
|
||||||
|
|
||||||
# data = "../data/{}_data.json".format(args.task)
|
# data = "../data/{}_data.json".format(args.task)
|
||||||
data="/home/lee/code/HaluEval/evaluation/factuality_train.json"
|
data="/home/leewlving/PycharmProjects/xianxing_cup3/factuality_predict.json"
|
||||||
|
|
||||||
if args.task == "qa":
|
if args.task == "qa":
|
||||||
evaluation_qa_dataset(model, data, output_path)
|
evaluation_qa_dataset(model, data, output_path)
|
||||||
elif args.task == "dialogue":
|
|
||||||
evaluation_dialogue_dataset(model, data, instruction, output_path)
|
|
||||||
elif args.task == "summarization":
|
|
||||||
evaluation_summarization_dataset(model, data, instruction, output_path)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError("The task must be qa, dialogue, or summarization!")
|
raise ValueError("The task must be qa, dialogue, or summarization!")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue