| import json |
| import os |
| from llm_utils import generate_with_retry |
| import google.generativeai as genai |
| from dotenv import load_dotenv |
|
|
# --- Runtime configuration --------------------------------------------------
load_dotenv()  # pull GEMINI_API_KEY (and friends) from a local .env file

LOG_FILE = "rag_eval_logs.jsonl"   # one JSON record per line, written by the RAG app
MODEL_NAME = "gemini-2.5-flash"    # judge model used for both metrics
API_KEY = os.getenv("GEMINI_API_KEY")

if not API_KEY:
    print("❌ GEMINI_API_KEY not found in env.")
    # NOTE: `exit()` is an interactive-shell helper injected by the `site`
    # module and is not guaranteed to exist; raising SystemExit is the
    # reliable way to abort a script with a non-zero status.
    raise SystemExit(1)

genai.configure(api_key=API_KEY)
|
|
def calculate_faithfulness(answer, contexts):
    """
    Score the 'faithfulness' of an answer on a 0.0-1.0 scale.

    Faithfulness measures whether the answer is derived *only* from the
    retrieved context (1.0) versus containing hallucinated information (0.0),
    judged by an LLM-as-judge call to MODEL_NAME.

    Args:
        answer: Generated answer text to evaluate.
        contexts: List of retrieved context strings. Empty or None scores 0.0
            immediately (nothing to be faithful to).

    Returns:
        float in [0.0, 1.0]; 0.5 as a neutral fallback when the judge call
        fails or its reply cannot be parsed as a float.
    """
    if not contexts:
        return 0.0

    context_text = "\n".join(contexts)
    # Truncate the context to keep the judge prompt small and cheap.
    prompt = f"""
You are an AI Judge.
Rate the 'Faithfulness' of the Answer to the Context on a scale of 0.0 to 1.0.
1.0 = Answer is strictly derived from Context.
0.0 = Answer contains hallucinations or info not in Context.

Context: {context_text[:3000]}

Answer: {answer}

Return ONLY a single float number (e.g. 0.9).
"""
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        score = float(resp.text.strip())
    except Exception:
        # A bare `except:` here would also swallow SystemExit and
        # KeyboardInterrupt; catch Exception only, and fall back to a
        # neutral mid-point score on API or parse failure.
        return 0.5
    # Clamp in case the judge returns a value outside [0, 1].
    return max(0.0, min(1.0, score))
|
|
def calculate_relevancy(query, answer):
    """
    Score the 'relevancy' of an answer to its query on a 0.0-1.0 scale.

    Relevancy measures whether the answer directly addresses the user's
    query (1.0) or is unrelated / ignores it (0.0), judged by an
    LLM-as-judge call to MODEL_NAME.

    Args:
        query: The user's original question.
        answer: The generated answer text to evaluate.

    Returns:
        float in [0.0, 1.0]; 0.5 as a neutral fallback when the judge call
        fails or its reply cannot be parsed as a float.
    """
    prompt = f"""
You are an AI Judge.
Rate the 'Relevancy' of the Answer to the Query on a scale of 0.0 to 1.0.
1.0 = Answer directly addresses the query.
0.0 = Answer is unrelated or ignores the user.

Query: {query}
Answer: {answer}

Return ONLY a single float number (e.g. 0.9).
"""
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        score = float(resp.text.strip())
    except Exception:
        # A bare `except:` here would also swallow SystemExit and
        # KeyboardInterrupt; catch Exception only, and fall back to a
        # neutral mid-point score on API or parse failure.
        return 0.5
    # Clamp in case the judge returns a value outside [0, 1].
    return max(0.0, min(1.0, score))
|
|
def run_audit():
    """
    Replay every complete record in LOG_FILE through the two judge metrics
    and print a per-query score table plus overall averages.

    Records without a non-empty 'final_answer' are silently skipped (the
    interaction never completed); lines that fail to parse or score are
    counted and reported instead of being dropped without a trace.
    """
    if not os.path.exists(LOG_FILE):
        print(f"No log file found at {LOG_FILE}")
        return

    print(f"📊 Running Post-Hoc Audit on {LOG_FILE}...\n")
    print(f"{'Query':<30} | {'Faithful':<10} | {'Relevancy':<10}")
    print("-" * 60)

    # Running totals for the final averages; floats to match the scores.
    total_f = 0.0
    total_r = 0.0
    count = 0
    skipped = 0  # malformed or un-scorable lines (best-effort, never fatal)

    with open(LOG_FILE, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line)

                # Only audit records that actually produced an answer.
                if not data.get("final_answer"):
                    continue

                q = data.get("query", "")
                a = data["final_answer"]
                c = data.get("context_list", [])

                f_score = calculate_faithfulness(a, c)
                r_score = calculate_relevancy(q, a)

                # Score widths match the header row so columns stay aligned.
                print(f"{q[:30]:<30} | {f_score:<10.2f} | {r_score:<10.2f}")

                total_f += f_score
                total_r += r_score
                count += 1
            except Exception:
                # Keep the audit best-effort, but don't lose lines silently.
                skipped += 1

    if skipped:
        print(f"\n⚠️ Skipped {skipped} malformed log line(s).")

    if count > 0:
        print("-" * 60)
        print(f"\n✅ Audit Complete.")
        print(f"Average Faithfulness: {total_f/count:.2f}")
        print(f"Average Relevancy: {total_r/count:.2f}")
    else:
        print("\n⚠️ No complete records found to audit. Ask some questions first!")
|
|
# Script entry point: run the post-hoc audit when executed directly
# (importing this module for its metric functions triggers no audit).
if __name__ == "__main__":
    run_audit()
|
|