# NOTE: hosting-page header captured with this file (not part of the script):
# "Spaces: Runtime error"
import json
import math
import os
import sys

import google.generativeai as genai
from dotenv import load_dotenv

from llm_utils import generate_with_retry
# Module setup: load environment variables, define the audit constants and
# configure the Gemini client. This runs at import time.
load_dotenv()

LOG_FILE = "rag_eval_logs.jsonl"  # JSONL log read by run_audit()
MODEL_NAME = "gemini-2.5-flash"  # name passed to genai.GenerativeModel
API_KEY = os.getenv("GEMINI_API_KEY")

if not API_KEY:
    # Report on stderr and terminate through sys.exit(): the bare ``exit``
    # name is an interactive helper injected by the ``site`` module and is
    # not guaranteed to exist in every interpreter configuration.
    print("❌ GEMINI_API_KEY not found in env.", file=sys.stderr)
    sys.exit(1)

genai.configure(api_key=API_KEY)
def calculate_faithfulness(answer, contexts, model=None):
    """Ask the LLM judge how faithful ``answer`` is to ``contexts``.

    Measures whether the answer is derived *only* from the context: the judge
    is prompted to reply with one float, where 1.0 means strictly derived
    from the context and 0.0 means the answer contains hallucinations or
    information not in the context.

    Args:
        answer: The generated answer to grade; interpolated into the prompt.
        contexts: Iterable of context passages (strings). A single string is
            treated as one passage. Only the first 3000 characters of the
            newline-joined passages are sent to the judge.
        model: Optional judge object exposing ``generate_content(prompt)``
            whose result has a ``.text`` attribute. When omitted, a new
            ``genai.GenerativeModel(MODEL_NAME)`` is created.

    Returns:
        The judge's score clamped to [0.0, 1.0]; 0.0 when ``contexts`` is
        empty; 0.5 when the judge call fails or its reply is not a finite
        number (the failure is reported on stderr).
    """
    if not contexts:
        return 0.0
    if isinstance(contexts, str):
        # str.join over a bare string would interleave newlines between
        # every character and corrupt the prompt.
        contexts = [contexts]
    context_text = "\n".join(contexts)

    prompt = f"""
    You are an AI Judge.
    Rate the 'Faithfulness' of the Answer to the Context on a scale of 0.0 to 1.0.
    1.0 = Answer is strictly derived from Context.
    0.0 = Answer contains hallucinations or info not in Context.
    Context: {context_text[:3000]}
    Answer: {answer}
    Return ONLY a single float number (e.g. 0.9).
    """

    if model is None:
        model = genai.GenerativeModel(MODEL_NAME)

    try:
        resp = model.generate_content(prompt)
        score = float(resp.text.strip())
        if not math.isfinite(score):
            # float() accepts "nan"/"inf"; NaN would slip through the
            # min/max clamp below and come out as 1.0.
            raise ValueError(f"non-finite score: {score!r}")
    except Exception as exc:
        # Best-effort scoring: keep the audit going, but do not hide the
        # failure. KeyboardInterrupt/SystemExit are deliberately not caught.
        print(
            f"WARNING: faithfulness judge failed ({exc!r}); using default 0.5",
            file=sys.stderr,
        )
        return 0.5  # Default on error
    return max(0.0, min(1.0, score))
def calculate_relevancy(query, answer, model=None):
    """Ask the LLM judge how directly ``answer`` addresses ``query``.

    The judge is prompted to reply with one float, where 1.0 means the answer
    directly addresses the query and 0.0 means it is unrelated or ignores
    the user.

    Args:
        query: The user's question; interpolated into the prompt.
        answer: The generated answer to grade; interpolated into the prompt.
        model: Optional judge object exposing ``generate_content(prompt)``
            whose result has a ``.text`` attribute. When omitted, a new
            ``genai.GenerativeModel(MODEL_NAME)`` is created.

    Returns:
        The judge's score clamped to [0.0, 1.0], or 0.5 when the judge call
        fails or its reply is not a finite number (the failure is reported
        on stderr).
    """
    prompt = f"""
    You are an AI Judge.
    Rate the 'Relevancy' of the Answer to the Query on a scale of 0.0 to 1.0.
    1.0 = Answer directly addresses the query.
    0.0 = Answer is unrelated or ignores the user.
    Query: {query}
    Answer: {answer}
    Return ONLY a single float number (e.g. 0.9).
    """

    if model is None:
        model = genai.GenerativeModel(MODEL_NAME)

    try:
        resp = model.generate_content(prompt)
        score = float(resp.text.strip())
        if not math.isfinite(score):
            # float() accepts "nan"/"inf"; NaN would slip through the
            # min/max clamp below and come out as 1.0.
            raise ValueError(f"non-finite score: {score!r}")
    except Exception as exc:
        # Best-effort scoring: keep the audit going, but do not hide the
        # failure. KeyboardInterrupt/SystemExit are deliberately not caught.
        print(
            f"WARNING: relevancy judge failed ({exc!r}); using default 0.5",
            file=sys.stderr,
        )
        return 0.5  # Default on error
    return max(0.0, min(1.0, score))
def run_audit(log_file=None):
    """Score every complete record of a JSONL log and print a summary.

    Each non-blank line is parsed as one JSON object. Records without a
    non-empty ``final_answer`` are treated as legacy entries and skipped
    silently. Every remaining record is graded with
    ``calculate_faithfulness`` and ``calculate_relevancy``; one table row is
    printed per graded record, followed by the two averages.

    Lines that cannot be audited (invalid JSON, a non-object value, a
    missing or non-string ``query``, or an error raised while scoring) are
    skipped and reported on stderr with their line number, so the table on
    stdout stays clean.

    Args:
        log_file: Path of the JSONL log to audit. Defaults to the
            module-level ``LOG_FILE``.

    Returns:
        None. All results are printed.
    """
    path = LOG_FILE if log_file is None else log_file
    if not os.path.exists(path):
        print(f"No log file found at {path}")
        return

    print(f"📊 Running Post-Hoc Audit on {path}...\n")
    print(f"{'Query':<30} | {'Faithful':<10} | {'Relevancy':<10}")
    print("-" * 60)

    total_f = 0.0
    total_r = 0.0
    count = 0
    skipped = 0

    with open(path, "r", encoding="utf-8") as log:
        for line_no, line in enumerate(log, start=1):
            if not line.strip():
                continue

            # Only the parse is guarded here, so unrelated failures are not
            # mistaken for "bad lines".
            try:
                data = json.loads(line)
            except json.JSONDecodeError as exc:
                skipped += 1
                print(
                    f"WARNING: line {line_no}: invalid JSON ({exc.msg}); skipped",
                    file=sys.stderr,
                )
                continue

            if not isinstance(data, dict):
                skipped += 1
                print(
                    f"WARNING: line {line_no}: not a JSON object; skipped",
                    file=sys.stderr,
                )
                continue

            # Skip legacy logs without final answer
            answer = data.get("final_answer")
            if not answer:
                continue

            # Validate the query before scoring so no judge calls are spent
            # on a record whose row could not be printed anyway.
            query = data.get("query")
            if not isinstance(query, str):
                skipped += 1
                print(
                    f"WARNING: line {line_no}: missing or non-string 'query'; skipped",
                    file=sys.stderr,
                )
                continue

            contexts = data.get("context_list", [])

            try:
                f_score = calculate_faithfulness(answer, contexts)
                r_score = calculate_relevancy(query, answer)
            except Exception as exc:
                # Keep the audit best-effort, but say which record failed.
                skipped += 1
                print(
                    f"WARNING: line {line_no}: scoring failed ({exc!r}); skipped",
                    file=sys.stderr,
                )
                continue

            # Pad the score columns to the same width as the header.
            print(f"{query[:30]:<30} | {f_score:<10.2f} | {r_score:<10.2f}")
            total_f += f_score
            total_r += r_score
            count += 1

    if skipped:
        print(f"WARNING: {skipped} line(s) could not be audited.", file=sys.stderr)

    if count > 0:
        print("-" * 60)
        print("\n✅ Audit Complete.")
        print(f"Average Faithfulness: {total_f/count:.2f}")
        print(f"Average Relevancy: {total_r/count:.2f}")
    else:
        print("\n⚠️ No complete records found to audit. Ask some questions first!")
# Script entry point: run the audit only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_audit()