| import json |
| import os |
| import sys |
| import pandas as pd |
| from timeit import default_timer as timer |
| import nltk |
|
|
# Make project-local packages (app_modules) importable when the script is
# run from the repository root.
sys.path.insert(0, os.getcwd())

# "chat" as the first CLI argument switches the script into interactive mode.
chatting = len(sys.argv) > 1 and sys.argv[1] == "chat"

# Interactive chat answers one question at a time, so force a batch size of 1.
# NOTE(review): this must happen BEFORE the app_modules imports below —
# app_init presumably reads BATCH_SIZE at import/initialization time, so do
# not reorder these statements.
if chatting:
    os.environ["BATCH_SIZE"] = "1"

from app_modules.init import app_init
from app_modules.llm_qa_chain import QAChain
from app_modules.utils import print_llm_response, calc_metrics, detect_repetition_scores

# Build the LLM loader and QA chain once; both are reused for every question.
llm_loader, qa_chain = app_init()
|
|
if chatting:
    # Interactive REPL: keep answering questions until the user types "exit".
    print("Starting chat mode")
    while True:
        question = input("Please enter your question: ")
        # strip() makes the exit command whitespace-tolerant ("exit " / " EXIT").
        if question.strip().lower() == "exit":
            break
        # Empty chat history: each question is answered independently.
        result = qa_chain.call_chain({"question": question, "chat_history": []}, None)
        print_llm_response(result)

    # Exit INSIDE the `if chatting:` branch — an unconditional exit here would
    # prevent the batch evaluation below from ever running.
    sys.exit(0)
|
|
# Optional CLI argument: evaluate only the first N questions (0 = all).
num_of_questions = 0
if len(sys.argv) > 1:
    num_of_questions = int(sys.argv[1])

# Accumulates one row per answered question.
df = pd.DataFrame(
    columns=[
        "id",
        "question",
        "answer",
    ]
)

# Number of questions sent to the chain per call_chain invocation.
batch_size = int(os.getenv("BATCH_SIZE", "1"))
print(f"Batch size: {batch_size}")

questions_file_path = os.environ.get("QUESTIONS_FILE_PATH")
debug_retrieval = os.getenv("DEBUG_RETRIEVAL", "false").lower() == "true"

print(f"Reading questions from file: {questions_file_path}")
# Context manager closes the file deterministically (the original left the
# handle to the garbage collector) and pins the encoding.
with open(questions_file_path, encoding="utf-8") as questions_file:
    test_data = json.load(questions_file)

# The questions file is either a mapping {id: question_record} or a list of
# records that each carry their own "id" field.
if isinstance(test_data, dict):
    questions = list(test_data.values())
    ids = list(test_data.keys())
else:
    questions = test_data
    ids = [row["id"] for row in questions]

if num_of_questions > 0:
    questions = questions[:num_of_questions]

print(f"Number of questions: {len(questions)}")
|
|
if __name__ == "__main__":
    chat_start = timer()
    index = 0

    # Answer all questions, `batch_size` at a time.
    while index < len(questions):
        batch_ids = ids[index : index + batch_size]
        batch_questions = [q["question"] for q in questions[index : index + batch_size]]

        # QAChain (RAG) also expects a chat history; other chains do not.
        if isinstance(qa_chain, QAChain):
            inputs = [{"question": q, "chat_history": []} for q in batch_questions]
        else:
            inputs = [{"question": q} for q in batch_questions]

        start = timer()
        result = qa_chain.call_chain(inputs, None)
        end = timer()
        print(f"Completed in {end - start:.3f}s")

        batch_answers = [r["answer"] for r in result]

        # `qid` rather than `id` to avoid shadowing the builtin.
        for qid, question, answer in zip(batch_ids, batch_questions, batch_answers):
            df.loc[len(df)] = {
                "id": qid,
                "question": question,
                "answer": answer,
            }

        index += batch_size

        for r in result:
            print_llm_response(r, debug_retrieval)

    chat_end = timer()
    total_time = chat_end - chat_start
    print(f"Total time used: {total_time:.3f} s")

    # Enrich each answer with its word count and the reference answer(s).
    df2 = pd.DataFrame(
        columns=[
            "id",
            "question",
            "answer",
            "word_count",
            "ground_truth",
        ]
    )

    # itertuples() avoids the repeated per-column indexing of the original
    # range(len(df)) loop; rows in df line up 1:1 with `questions`.
    for i, row in enumerate(df.itertuples(index=False)):
        source = questions[i]

        # Prefer the curated "wellFormedAnswers" field when the record has
        # one; fall back to the raw "answers" field otherwise.
        ground_truth = source[
            "wellFormedAnswers" if "wellFormedAnswers" in source else "answers"
        ]

        word_count = len(nltk.word_tokenize(row.answer))

        df2.loc[len(df2)] = {
            "id": row.id,
            "question": row.question,
            "answer": row.answer,
            "word_count": word_count,
            "ground_truth": ground_truth,
        }

    # detect_repetition_scores yields three values per answer, one per column.
    df2[["newline_score", "repetition_score", "total_repetitions"]] = df2[
        "answer"
    ].apply(detect_repetition_scores)

    pd.options.display.float_format = "{:.3f}".format
    print(df2.describe())

    word_count = df2["word_count"].sum()

    # Per-question results CSV, prefixed with two comment lines of run metadata.
    csv_file = (
        os.getenv("TEST_RESULTS_CSV_FILE") or f"qa_batch_{batch_size}_test_results.csv"
    )
    with open(csv_file, "w") as f:
        f.write(
            f"# RAG: {isinstance(qa_chain, QAChain)} questions: {questions_file_path}\n"
        )
        f.write(
            f"# model: {llm_loader.model_name} repetition_penalty: {llm_loader.repetition_penalty}\n"
        )

    df2.to_csv(csv_file, mode="a", index=False, header=True)
    print(f"test results saved to file: {csv_file}")

    # BLEU / ROUGE against the ground-truth answers.
    scores = calc_metrics(df2)

    # One summary row for this whole run.
    df = pd.DataFrame(
        {
            "model": [llm_loader.model_name],
            "repetition_penalty": [llm_loader.repetition_penalty],
            "word_count": [word_count],
            "inference_time": [total_time],
            "inference_speed": [word_count / total_time],
            "bleu1": [scores["bleu_scores"]["bleu"]],
            "rougeL": [scores["rouge_scores"]["rougeL"]],
        }
    )

    print(f"Number of words generated: {word_count}")
    print(f"Average generation speed: {word_count / total_time:.3f} words/s")

    # Append the summary row to the cross-run results file; write the header
    # only when the file is new or empty.
    csv_file = os.getenv("ALL_RESULTS_CSV_FILE") or "qa_chain_all_results.csv"
    file_existed = os.path.exists(csv_file) and os.path.getsize(csv_file) > 0
    df.to_csv(csv_file, mode="a", index=False, header=not file_existed)
    print(f"all results appended to file: {csv_file}")
|
|