| import json |
| import os |
| from llm_utils import generate_with_retry |
| import google.generativeai as genai |
| from dotenv import load_dotenv |
|
|
# --- Runtime configuration --------------------------------------------------
load_dotenv()  # pull GEMINI_API_KEY (and friends) from a local .env file

LOG_FILE = "rag_eval_logs.jsonl"   # one JSON record per line, written by the RAG app
MODEL_NAME = "gemini-2.5-flash"    # judge model used for both metrics
API_KEY = os.getenv("GEMINI_API_KEY")

if not API_KEY:
    print("❌ GEMINI_API_KEY not found in env.")
    # NOTE: `exit()` is an interactive-shell helper injected by the `site`
    # module and is not guaranteed to exist; raising SystemExit is the
    # reliable way to abort a script with a non-zero status.
    raise SystemExit(1)

genai.configure(api_key=API_KEY)
|
|
def calculate_faithfulness(answer, contexts):
    """
    Score the 'faithfulness' of an answer on a 0.0-1.0 scale.

    Faithfulness measures whether the answer is derived *only* from the
    retrieved context (1.0) versus containing hallucinated information (0.0),
    judged by an LLM-as-judge call to MODEL_NAME.

    Args:
        answer: Generated answer text to evaluate.
        contexts: List of retrieved context strings. Empty or None scores 0.0
            immediately (nothing to be faithful to).

    Returns:
        float in [0.0, 1.0]; 0.5 as a neutral fallback when the judge call
        fails or its reply cannot be parsed as a float.
    """
    if not contexts:
        return 0.0

    context_text = "\n".join(contexts)
    # Truncate the context to keep the judge prompt small and cheap.
    prompt = f"""
You are an AI Judge.
Rate the 'Faithfulness' of the Answer to the Context on a scale of 0.0 to 1.0.
1.0 = Answer is strictly derived from Context.
0.0 = Answer contains hallucinations or info not in Context.

Context: {context_text[:3000]}

Answer: {answer}

Return ONLY a single float number (e.g. 0.9).
"""
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        score = float(resp.text.strip())
    except Exception:
        # A bare `except:` here would also swallow SystemExit and
        # KeyboardInterrupt; catch Exception only, and fall back to a
        # neutral mid-point score on API or parse failure.
        return 0.5
    # Clamp in case the judge returns a value outside [0, 1].
    return max(0.0, min(1.0, score))
|
|
def calculate_relevancy(query, answer):
    """
    Score the 'relevancy' of an answer to its query on a 0.0-1.0 scale.

    Relevancy measures whether the answer directly addresses the user's
    query (1.0) or is unrelated / ignores it (0.0), judged by an
    LLM-as-judge call to MODEL_NAME.

    Args:
        query: The user's original question.
        answer: The generated answer text to evaluate.

    Returns:
        float in [0.0, 1.0]; 0.5 as a neutral fallback when the judge call
        fails or its reply cannot be parsed as a float.
    """
    prompt = f"""
You are an AI Judge.
Rate the 'Relevancy' of the Answer to the Query on a scale of 0.0 to 1.0.
1.0 = Answer directly addresses the query.
0.0 = Answer is unrelated or ignores the user.

Query: {query}
Answer: {answer}

Return ONLY a single float number (e.g. 0.9).
"""
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        score = float(resp.text.strip())
    except Exception:
        # A bare `except:` here would also swallow SystemExit and
        # KeyboardInterrupt; catch Exception only, and fall back to a
        # neutral mid-point score on API or parse failure.
        return 0.5
    # Clamp in case the judge returns a value outside [0, 1].
    return max(0.0, min(1.0, score))
|
|
def run_audit():
    """
    Replay every complete record in LOG_FILE through the two judge metrics
    and print a per-query score table plus overall averages.

    Records without a non-empty 'final_answer' are silently skipped (the
    interaction never completed); lines that fail to parse or score are
    counted and reported instead of being dropped without a trace.
    """
    if not os.path.exists(LOG_FILE):
        print(f"No log file found at {LOG_FILE}")
        return

    print(f"📊 Running Post-Hoc Audit on {LOG_FILE}...\n")
    print(f"{'Query':<30} | {'Faithful':<10} | {'Relevancy':<10}")
    print("-" * 60)

    # Running totals for the final averages; floats to match the scores.
    total_f = 0.0
    total_r = 0.0
    count = 0
    skipped = 0  # malformed or un-scorable lines (best-effort, never fatal)

    with open(LOG_FILE, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line)

                # Only audit records that actually produced an answer.
                if not data.get("final_answer"):
                    continue

                q = data.get("query", "")
                a = data["final_answer"]
                c = data.get("context_list", [])

                f_score = calculate_faithfulness(a, c)
                r_score = calculate_relevancy(q, a)

                # Score widths match the header row so columns stay aligned.
                print(f"{q[:30]:<30} | {f_score:<10.2f} | {r_score:<10.2f}")

                total_f += f_score
                total_r += r_score
                count += 1
            except Exception:
                # Keep the audit best-effort, but don't lose lines silently.
                skipped += 1

    if skipped:
        print(f"\n⚠️ Skipped {skipped} malformed log line(s).")

    if count > 0:
        print("-" * 60)
        print(f"\n✅ Audit Complete.")
        print(f"Average Faithfulness: {total_f/count:.2f}")
        print(f"Average Relevancy: {total_r/count:.2f}")
    else:
        print("\n⚠️ No complete records found to audit. Ask some questions first!")
|
|
# Script entry point: run the post-hoc audit when executed directly
# (importing this module for its metric functions triggers no audit).
if __name__ == "__main__":
    run_audit()
|
|