diff --git a/evaluation.py b/evaluation.py
index 3c7dcab..2701b7e 100644
--- a/evaluation.py
+++ b/evaluation.py
@@ -9,19 +9,19 @@ from openai import OpenAIError
 
 
 client = OpenAI(
-    api_key="sk-5f06261529bb44df86d9b2fdbae1a6b5", # 在这里将 MOONSHOT_API_KEY 替换为你从 Kimi 开放平台申请的 API Key
+    api_key="sk-", # Placeholder: replace "sk-" with your own API key for the DeepSeek endpoint configured in base_url below; never commit a real key
     base_url="https://api.deepseek.com/v1",
 )
 
 def get_qa_response(model, question, answer):
     message = [
-        {"role": "system", "content":"You are a huallucination detector. You MUST determine if the provided answer contains hallucination or not for the question based on the world knowledge. The answer you provided MUST be \"Yes\" or \"No\""},
+        {"role": "system", "content":"你是一个幻觉检测器。你必须根据世界知识确定问题的答案是否符合事实。你提供的答案必须是 \"YES\" or \"NO\" 并且给出你的理由"},
         {"role": "user", "content": 
                                     "\n\n#Question#: " + question +
                                     "\n#Answer#: " + answer +
                                     "\n#Your Judgement#: "} 
     ]
-    prompt =  "\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
+    # prompt =  "\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
     while True:
         try:
             if model == "gpt-3.5-turbo":
@@ -35,9 +35,7 @@ def get_qa_response(model, question, answer):
                 res = client.chat.completions.create(
                 model="deepseek-chat",
                 messages=message,
-                temperature=1,
-                max_tokens=256,
-                top_p=1
+                stream=False
             )
                 response = res.choices[0].message.content
             break
@@ -60,49 +58,7 @@ def get_qa_response(model, question, answer):
     return response
 
 
-def get_dialogue_response(model, dialog, response, instruction):
-    message = [
-        {"role": "system", "content": "You are a response judge. You MUST determine if the provided response contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
-        {"role": "user", "content": instruction +
-                                    "\n\n#Dialogue History#: " + dialog +
-                                    "\n#Response#: " + response +
-                                    "\n#Your Judgement#: "}
-    ]
-    prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
-    while True:
-        try:
-            if model == "gpt-3.5-turbo":
-                res = openai.ChatCompletion.create(
-                    model="gpt-3.5-turbo",
-                    messages=message,
-                    temperature=0.0,
-                )
-                response = res['choices'][0]['message']['content']
-            else:
-                res = openai.Completion.create(
-                    model=model,
-                    prompt=prompt,
-                    temperature=0.0
-                )
-                response = res["choices"][0]['text'].strip()
-            break
-        except openai.error.RateLimitError:
-            print('openai.error.RateLimitError\nRetrying...')
-            time.sleep(60)
-        except openai.error.ServiceUnavailableError:
-            print('openai.error.ServiceUnavailableError\nRetrying...')
-            time.sleep(20)
-        except openai.error.Timeout:
-            print('openai.error.Timeout\nRetrying...')
-            time.sleep(20)
-        except openai.error.APIError:
-            print('openai.error.APIError\nRetrying...')
-            time.sleep(20)
-        except openai.error.APIConnectionError:
-            print('openai.error.APIConnectionError\nRetrying...')
-            time.sleep(20)
 
-    return response
 
 
 def num_tokens_from_message(message, model="davinci"):
@@ -120,61 +76,12 @@ def truncate_message(prompt1, prompt2, model="davinci"):
     return prompt
 
 
-def get_summarization_response(model, document, summary, instruction):
-    message = [
-        {"role": "system", "content": "You are a summary judge. You MUST determine if the provided summary contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
-        {"role": "user", "content": instruction +
-                                    "\n\n#Document#: " + document +
-                                    "\n#Summary#: " + summary +
-                                    "\n#Your Judgement#: "}
-    ]
-    prompt1 = instruction + "\n\n#Document#: " + document
-    prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
-    if model == "davinci":
-        prompt = truncate_message(prompt1, prompt2)
-    else:
-        prompt = prompt1 + prompt2
-    while True:
-        try:
-            if model == "gpt-3.5-turbo":
-                res = openai.ChatCompletion.create(
-                    model="gpt-3.5-turbo",
-                    messages=message,
-                    temperature=0.0,
-                )
-                response = res['choices'][0]['message']['content']
-            else:
-                res = openai.Completion.create(
-                    model=model,
-                    prompt=prompt,
-                    temperature=0.0
-                )
-                response = res["choices"][0]['text'].strip()
-            break
-        except openai.error.RateLimitError:
-            print('openai.error.RateLimitError\nRetrying...')
-            time.sleep(60)
-        except openai.error.ServiceUnavailableError:
-            print('openai.error.ServiceUnavailableError\nRetrying...')
-            time.sleep(20)
-        except openai.error.Timeout:
-            print('openai.error.Timeout\nRetrying...')
-            time.sleep(20)
-        except openai.error.APIError:
-            print('openai.error.APIError\nRetrying...')
-            time.sleep(20)
-        except openai.error.APIConnectionError:
-            print('openai.error.APIConnectionError\nRetrying...')
-            time.sleep(20)
 
-    return response
 
 
 def evaluation_qa_dataset(model, file,  output_path):
     result=[]
-    TP = 0
-    FP=0
-    FN=0
+    
     # test_file=json.loads(file)
     with open(file, 'r', encoding="utf-8") as f:
         # print(f"File content: {file}")
@@ -185,175 +92,19 @@ def evaluation_qa_dataset(model, file,  output_path):
         for i in range(len(data)):
             question= data[i]["Question"]
             answer=data[i]["Answer"]
-            ground_truth = data[i]["Hallucination"]
             
             output_samples = get_qa_response(model, question, answer)
             print('sample {} success......'.format(i))
-            if ("Yes" in output_samples and "YES" in ground_truth):
-                TP=TP+1
-            elif ("Yes" in output_samples and "NO" in ground_truth):
-                FP=FP+1
-            else:
-                FN=FN+1
-            result.append({"Question":question,"Answer":answer,"Hallucination":ground_truth, "res":output_samples})
-        Precision=TP/(TP+FP)
-        Recall=TP/(TP+FN)
-        F1score=2*(Precision*Recall)/(Precision+Recall)
-        print(' F1score: {}'.format(F1score))
-        dump_jsonl(result, output_path, append=True)
-        # correct = 0
-        # incorrect = 0
-        # for i in range(len(data)):
-        #     knowledge = data[i]["knowledge"]
-        #     question = data[i]["question"]
-        #     hallucinated_answer = data[i]["hallucinated_answer"]
-        #     right_answer = data[i]["right_answer"]
-
-        #     if random.random() > 0.5:
-        #         answer = hallucinated_answer
-        #         ground_truth = "Yes"
-        #     else:
-        #         answer = right_answer
-        #         ground_truth = "No"
-
-        #     ans = get_qa_response(model, question, answer, instruction)
-        #     ans = ans.replace(".", "")
-
-        #     if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
-        #         gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": "failed!"}
-        #         dump_jsonl(gen, output_path, append=True)
-        #         incorrect += 1
-        #         print('sample {} fails......'.format(i))
-        #         continue
-        #     elif "Yes" in ans:
-        #         if ans != "Yes":
-        #             ans = "Yes"
-        #         gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
-        #     elif "No" in ans:
-        #         if ans != "No":
-        #             ans = "No"
-        #         gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
-        #     else:
-        #         gen = None
-        #         incorrect += 1
-
-        #     assert(gen is not None)
-
-        #     if ground_truth == ans:
-        #         correct += 1
-        #     else:
-        #         incorrect += 1
-
-        #     print('sample {} success......'.format(i))
-            
-
-        # print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct/len(data)))
+            result.append({"Question":question,"Answer":answer, "Prediction":"YES" if "YES" in output_samples else "NO"})
+            # result.append({"Question":question,"Answer":answer, "Prediction":"YES" if "YES" in output_samples else "NO", "Reason":output_samples})
+        dump_jsonl(result, output_path, append=False)
+        
 
 
-def evaluation_dialogue_dataset(model, file, instruction, output_path):
-    with open(file, 'r', encoding="utf-8") as f:
-        data = []
-        for line in f:
-            data.append(json.loads(line))
-
-        correct = 0
-        incorrect = 0
-        for i in range(len(data)):
-            knowledge = data[i]["knowledge"]
-            dialog = data[i]["dialogue_history"]
-            hallucinated_response = data[i]["hallucinated_response"]
-            right_response = data[i]["right_response"]
-
-            if random.random() > 0.5:
-                response = hallucinated_response
-                ground_truth = "Yes"
-            else:
-                response = right_response
-                ground_truth = "No"
-
-            ans = get_dialogue_response(model, dialog, response, instruction)
-            ans = ans.replace(".", "")
-
-            if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
-                gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": "failed!"}
-                dump_jsonl(gen, output_path, append=True)
-                incorrect += 1
-                print('sample {} fails......'.format(i))
-                continue
-            elif "Yes" in ans:
-                if ans != "Yes":
-                    ans = "Yes"
-                gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
-            elif "No" in ans:
-                if ans != "No":
-                    ans = "No"
-                gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
-            else:
-                gen = None
-            assert (gen is not None)
-
-            if ground_truth == ans:
-                correct += 1
-            else:
-                incorrect += 1
-
-            print('sample {} success......'.format(i))
-            dump_jsonl(gen, output_path, append=True)
-
-        print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
 
 
-def evaluation_summarization_dataset(model, file, instruction, output_path):
-    with open(file, 'r', encoding="utf-8") as f:
-        data = []
-        for line in f:
-            data.append(json.loads(line))
 
-        correct = 0
-        incorrect = 0
-        for i in range(len(data)):
 
-            document = data[i]["document"]
-            hallucinated_summary = data[i]["hallucinated_summary"]
-            right_summary = data[i]["right_summary"]
-
-            if random.random() > 0.5:
-                summary = hallucinated_summary
-                ground_truth = "Yes"
-            else:
-                summary = right_summary
-                ground_truth = "No"
-
-            ans = get_summarization_response(model, document, summary, instruction)
-            ans = ans.replace(".", "")
-
-            if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
-                gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": "failed!"}
-                dump_jsonl(gen, output_path, append=True)
-                incorrect += 1
-                print('sample {} fails......'.format(i))
-                continue
-            elif "Yes" in ans:
-                if ans != "Yes":
-                    ans = "Yes"
-                gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
-            elif "No" in ans:
-                if ans != "No":
-                    ans = "No"
-                gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
-            else:
-                gen = None
-            assert (gen is not None)
-
-            if ground_truth == ans:
-                correct += 1
-            else:
-                incorrect += 1
-
-            print('sample {} success......'.format(i))
-            dump_jsonl(gen, output_path, append=True)
-
-        print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
 
 
 def dump_jsonl(data, output_path, append=False):
@@ -370,24 +121,20 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Hallucination Generation")
 
     parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
-    parser.add_argument("--model", default="qwen", help="model name")
+    parser.add_argument("--model", default="deepseek", help="model name")
     args = parser.parse_args()
 
-    instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
-    f = open(instruction_file, 'r', encoding="utf-8")
-    instruction = f.read()
+    # instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
+    # f = open(instruction_file, 'r', encoding="utf-8")
+    # instruction = f.read()
 
     model = args.model
-    output_path = "{}/{}_{}_results.json".format(args.task, args.task, args.model)
+    output_path = "{}/{}_results.json".format(args.task,  args.model)
 
     # data = "../data/{}_data.json".format(args.task)
-    data="/home/lee/code/HaluEval/evaluation/factuality_train.json"
+    data="/home/leewlving/PycharmProjects/xianxing_cup3/factuality_predict.json"
 
     if args.task == "qa":
         evaluation_qa_dataset(model, data,  output_path)
-    elif args.task == "dialogue":
-        evaluation_dialogue_dataset(model, data, instruction, output_path)
-    elif args.task == "summarization":
-        evaluation_summarization_dataset(model, data, instruction, output_path)
     else:
         raise ValueError("The task must be qa, dialogue, or summarization!")