From 061cd74fca5256e6309f632f50308277248d4db3 Mon Sep 17 00:00:00 2001
From: liwenyun
Date: Wed, 27 Nov 2024 00:22:05 +0800
Subject: [PATCH] Add evaluation.py

---
 evaluation.py | 393 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 393 insertions(+)
 create mode 100644 evaluation.py

diff --git a/evaluation.py b/evaluation.py
new file mode 100644
index 0000000..3c7dcab
--- /dev/null
+++ b/evaluation.py
@@ -0,0 +1,393 @@
+import os
+import random
+import time
+import json
+import argparse
+import tiktoken
+import openai
+from openai import OpenAI
+from openai import OpenAIError
+
+
+# DeepSeek-compatible endpoint. Set DEEPSEEK_API_KEY in your environment;
+# the key is read from there instead of being hard-coded.
+client = OpenAI(
+    api_key=os.environ.get("DEEPSEEK_API_KEY", "YOUR_DEEPSEEK_API_KEY"),
+    base_url="https://api.deepseek.com/v1",
+)
+
+# Client for OpenAI-hosted models (gpt-3.5-turbo, davinci); it reads
+# OPENAI_API_KEY from the environment.
+openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY"))
+
+
+def get_qa_response(model, question, answer):
+    message = [
+        {"role": "system", "content": "You are a hallucination detector. You MUST determine if the provided answer contains hallucination or not for the question based on world knowledge. Your answer MUST be \"Yes\" or \"No\"."},
+        {"role": "user", "content":
+            "\n\n#Question#: " + question +
+            "\n#Answer#: " + answer +
+            "\n#Your Judgement#: "}
+    ]
+    while True:
+        try:
+            if model == "gpt-3.5-turbo":
+                res = openai_client.chat.completions.create(
+                    model="gpt-3.5-turbo",
+                    messages=message,
+                    temperature=0.0,
+                )
+            else:
+                # Any non-GPT model name is routed to deepseek-chat through
+                # the DeepSeek endpoint configured above.
+                res = client.chat.completions.create(
+                    model="deepseek-chat",
+                    messages=message,
+                    temperature=1,
+                    max_tokens=256,
+                    top_p=1
+                )
+            response = res.choices[0].message.content
+            break
+        except openai.RateLimitError:
+            print('RateLimitError\nRetrying...')
+            time.sleep(60)
+        except OpenAIError as e:
+            print('{}\nRetrying...'.format(type(e).__name__))
+            time.sleep(20)
+
+    return response
+
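+# Usage sketch (hypothetical inputs; not executed by this script): the judge
+# is expected to return a bare "Yes"/"No" verdict.
+#   judgement = get_qa_response("deepseek-chat", "Who wrote Hamlet?", "Charles Dickens")
+#   # -> "Yes" (the answer is hallucinated)
+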
+
+def get_dialogue_response(model, dialog, response, instruction):
+    message = [
+        {"role": "system", "content": "You are a response judge. You MUST determine if the provided response contains non-factual or hallucinated information. Your answer MUST be \"Yes\" or \"No\"."},
+        {"role": "user", "content": instruction +
+            "\n\n#Dialogue History#: " + dialog +
+            "\n#Response#: " + response +
+            "\n#Your Judgement#: "}
+    ]
+    prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
+    while True:
+        try:
+            if model == "gpt-3.5-turbo":
+                res = openai_client.chat.completions.create(
+                    model="gpt-3.5-turbo",
+                    messages=message,
+                    temperature=0.0,
+                )
+                judgement = res.choices[0].message.content
+            else:
+                # Completion-style models (e.g. davinci) are judged with a
+                # plain prompt through the legacy completions endpoint.
+                res = openai_client.completions.create(
+                    model=model,
+                    prompt=prompt,
+                    temperature=0.0
+                )
+                judgement = res.choices[0].text.strip()
+            break
+        except openai.RateLimitError:
+            print('RateLimitError\nRetrying...')
+            time.sleep(60)
+        except OpenAIError as e:
+            print('{}\nRetrying...'.format(type(e).__name__))
+            time.sleep(20)
+
+    return judgement
+
+
+def num_tokens_from_message(message, model="davinci"):
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = len(encoding.encode(message))
+    return num_tokens
+
+
+def truncate_message(prompt1, prompt2, model="davinci"):
+    # Drop trailing words from prompt1 until the combined prompt fits the
+    # 2033-token budget used here for davinci.
+    if num_tokens_from_message(prompt1 + prompt2, model) > 2033:
+        truncation_length = 2033 - num_tokens_from_message(prompt2, model)
+        while num_tokens_from_message(prompt1, model) > truncation_length:
+            prompt1 = " ".join(prompt1.split()[:-1])
+    prompt = prompt1 + prompt2
+    return prompt
+
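+# Illustrative usage of the token helpers (long_document is a hypothetical
+# oversized prompt string; not part of the evaluation flow):
+#   num_tokens_from_message("hello world")               # a couple of tokens
+#   truncate_message(long_document, "\n#Summary#: ...")  # trims long_document
+#   # so that the combined davinci prompt stays within the 2033-token budget
+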
+
+def get_summarization_response(model, document, summary, instruction):
+    message = [
+        {"role": "system", "content": "You are a summary judge. You MUST determine if the provided summary contains non-factual or hallucinated information. Your answer MUST be \"Yes\" or \"No\"."},
+        {"role": "user", "content": instruction +
+            "\n\n#Document#: " + document +
+            "\n#Summary#: " + summary +
+            "\n#Your Judgement#: "}
+    ]
+    prompt1 = instruction + "\n\n#Document#: " + document
+    prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
+    if model == "davinci":
+        prompt = truncate_message(prompt1, prompt2)
+    else:
+        prompt = prompt1 + prompt2
+    while True:
+        try:
+            if model == "gpt-3.5-turbo":
+                res = openai_client.chat.completions.create(
+                    model="gpt-3.5-turbo",
+                    messages=message,
+                    temperature=0.0,
+                )
+                judgement = res.choices[0].message.content
+            else:
+                res = openai_client.completions.create(
+                    model=model,
+                    prompt=prompt,
+                    temperature=0.0
+                )
+                judgement = res.choices[0].text.strip()
+            break
+        except openai.RateLimitError:
+            print('RateLimitError\nRetrying...')
+            time.sleep(60)
+        except OpenAIError as e:
+            print('{}\nRetrying...'.format(type(e).__name__))
+            time.sleep(20)
+
+    return judgement
+
+
+def evaluation_qa_dataset(model, file, output_path):
+    result = []
+    TP = 0
+    FP = 0
+    FN = 0
+    with open(file, 'r', encoding="utf-8") as f:
+        data = json.load(f)
+    for i in range(len(data)):
+        question = data[i]["Question"]
+        answer = data[i]["Answer"]
+        ground_truth = data[i]["Hallucination"]
+
+        output_samples = get_qa_response(model, question, answer)
+        print('sample {} success......'.format(i))
+        if "Yes" in output_samples and "YES" in ground_truth:
+            TP += 1
+        elif "Yes" in output_samples and "NO" in ground_truth:
+            FP += 1
+        elif "No" in output_samples and "YES" in ground_truth:
+            FN += 1
+        # "No" on a "NO" ground truth is a true negative and does not enter
+        # precision/recall.
+        result.append({"Question": question, "Answer": answer, "Hallucination": ground_truth, "res": output_samples})
+    Precision = TP / (TP + FP) if (TP + FP) else 0.0
+    Recall = TP / (TP + FN) if (TP + FN) else 0.0
+    F1score = 2 * Precision * Recall / (Precision + Recall) if (Precision + Recall) else 0.0
+    print('F1score: {}'.format(F1score))
+    dump_jsonl(result, output_path, append=True)
+    # The original HaluEval QA loop (randomly judging either the hallucinated
+    # or the right answer and reporting accuracy, mirroring the dialogue and
+    # summarization functions below) is replaced here by direct evaluation of
+    # labeled (Question, Answer, Hallucination) records.
+
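+# Expected QA record format (field names come from the loop above; the
+# values are made-up examples). The input file is a JSON array of objects:
+#   {"Question": "...", "Answer": "...", "Hallucination": "YES"}
+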
+
+def evaluation_dialogue_dataset(model, file, instruction, output_path):
+    with open(file, 'r', encoding="utf-8") as f:
+        data = []
+        for line in f:
+            data.append(json.loads(line))
+
+    correct = 0
+    incorrect = 0
+    for i in range(len(data)):
+        knowledge = data[i]["knowledge"]
+        dialog = data[i]["dialogue_history"]
+        hallucinated_response = data[i]["hallucinated_response"]
+        right_response = data[i]["right_response"]
+
+        if random.random() > 0.5:
+            response = hallucinated_response
+            ground_truth = "Yes"
+        else:
+            response = right_response
+            ground_truth = "No"
+
+        ans = get_dialogue_response(model, dialog, response, instruction)
+        ans = ans.replace(".", "")
+
+        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
+            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": "failed!"}
+            dump_jsonl(gen, output_path, append=True)
+            incorrect += 1
+            print('sample {} fails......'.format(i))
+            continue
+        elif "Yes" in ans:
+            if ans != "Yes":
+                ans = "Yes"
+            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
+        elif "No" in ans:
+            if ans != "No":
+                ans = "No"
+            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
+        else:
+            gen = None
+        assert (gen is not None)
+
+        if ground_truth == ans:
+            correct += 1
+        else:
+            incorrect += 1
+
+        print('sample {} success......'.format(i))
+        dump_jsonl(gen, output_path, append=True)
+
+    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
+
+
+def evaluation_summarization_dataset(model, file, instruction, output_path):
+    with open(file, 'r', encoding="utf-8") as f:
+        data = []
+        for line in f:
+            data.append(json.loads(line))
+
+    correct = 0
+    incorrect = 0
+    for i in range(len(data)):
+        document = data[i]["document"]
+        hallucinated_summary = data[i]["hallucinated_summary"]
+        right_summary = data[i]["right_summary"]
+
+        if random.random() > 0.5:
+            summary = hallucinated_summary
+            ground_truth = "Yes"
+        else:
+            summary = right_summary
+            ground_truth = "No"
+
+        ans = get_summarization_response(model, document, summary, instruction)
+        ans = ans.replace(".", "")
+
+        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
+            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": "failed!"}
+            dump_jsonl(gen, output_path, append=True)
+            incorrect += 1
+            print('sample {} fails......'.format(i))
+            continue
+        elif "Yes" in ans:
+            if ans != "Yes":
+                ans = "Yes"
+            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
+        elif "No" in ans:
+            if ans != "No":
+                ans = "No"
+            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
+        else:
+            gen = None
+        assert (gen is not None)
+
+        if ground_truth == ans:
+            correct += 1
+        else:
+            incorrect += 1
+
+        print('sample {} success......'.format(i))
+        dump_jsonl(gen, output_path, append=True)
+
+    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
+
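+# dump_jsonl below writes one JSON object per line (JSON Lines). A typical
+# record produced by the judges above looks like (values illustrative):
+#   {"document": "...", "summary": "...", "ground_truth": "No", "judgement": "No"}
+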
+
+def dump_jsonl(data, output_path, append=False):
+    """
+    Write a record, or a list of records, to a JSON Lines file
+    (one JSON object per line).
+    """
+    mode = 'a+' if append else 'w'
+    records = data if isinstance(data, list) else [data]
+    with open(output_path, mode, encoding='utf-8') as f:
+        for record in records:
+            f.write(json.dumps(record, ensure_ascii=False) + '\n')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Hallucination Evaluation")
+
+    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
+    parser.add_argument("--model", default="qwen", help="model name")
+    args = parser.parse_args()
+
+    instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
+    with open(instruction_file, 'r', encoding="utf-8") as f:
+        instruction = f.read()
+
+    model = args.model
+    output_path = "{}/{}_{}_results.json".format(args.task, args.task, args.model)
+
+    # data = "../data/{}_data.json".format(args.task)
+    data = "/home/lee/code/HaluEval/evaluation/factuality_train.json"
+
+    if args.task == "qa":
+        evaluation_qa_dataset(model, data, output_path)
+    elif args.task == "dialogue":
+        evaluation_dialogue_dataset(model, data, instruction, output_path)
+    elif args.task == "summarization":
+        evaluation_summarization_dataset(model, data, instruction, output_path)
+    else:
+        raise ValueError("The task must be qa, dialogue, or summarization!")
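+
+# Example invocation (assumes qa/qa_evaluation_instruction.txt and the data
+# path above exist, and that DEEPSEEK_API_KEY is set in the environment):
+#   python evaluation.py --task qa --model deepseek-chat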