# NLP-RAG / test.py
# Synced from GitHub main (commit 8f37cc7) via the backend Docker context.
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
import sys
import traceback
from datetime import datetime
from dotenv import load_dotenv
from config_loader import cfg
from data.vector_db import get_index_by_name
from retriever.retriever import HybridRetriever
from retriever.processor import ChunkProcessor
from data.ingest import CHUNKING_TECHNIQUES
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
    """
    Generate a Markdown document summarizing the retrieved chunks
    for each query, chunking technique, and retrieval strategy.

    Args:
        all_results: dict mapping query index -> {result_key: {'chunks': [...],
            'score': float}} as produced by main().
        queries: list of query strings; indexed by the keys of all_results.
        output_file: path of the Markdown report to write (UTF-8).

    The report is assembled into a list of fragments and joined once at the
    end, instead of repeated string `+=` in nested loops (which is quadratic).
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    parts = [f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"]
    parts.append("## Test Queries\n\n")
    for i, q in enumerate(queries, 1):
        parts.append(f"{i}. {q}\n")
    parts.append("\n## Retrieval Results by Query\n\n")
    for q_idx, q_results in all_results.items():
        parts.append(f"### Query {q_idx + 1}: {queries[q_idx]}\n\n")
        for tech_strat_key, chunks_data in q_results.items():
            parts.append(f"#### Strategy & Technique: {tech_strat_key}\n\n")
            chunks = chunks_data.get('chunks', [])
            score = chunks_data.get('score', 0)
            parts.append(f"*ChunkScore:* {score:.4f} | *Chunks retrieved:* {len(chunks)}\n\n")
            if not chunks:
                parts.append("No chunks retrieved.\n\n")
            else:
                for i, chunk in enumerate(chunks, 1):
                    parts.append(f"*[Chunk {i}]* ({len(chunk)} chars):\n")
                    parts.append(f"text\n{chunk}\n\n\n")
            # Horizontal rule separates each strategy section.
            parts.append("---\n\n")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"\nRetrieval report saved to: {output_file}")
def main():
    """
    Evaluate every chunking technique x retrieval strategy combination on a
    fixed set of CBT-related test queries, then write a Markdown report.

    Requires PINECONE_API_KEY in the environment (loaded via .env).

    Raises:
        RuntimeError: if PINECONE_API_KEY is not set.
    """
    # Load environment variables
    load_dotenv()
    pinecone_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_key:
        raise RuntimeError("PINECONE_API_KEY not found in environment variables")

    test_queries = [
        "What is cognitive behavior therapy and how does it work?",
        "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
        "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
        "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
        "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
        "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
        "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
        "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
        "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
        "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
    ]

    # Use all 7 chunking techniques from ingest.py
    CHUNKING_TECHNIQUES_FILTERED = CHUNKING_TECHNIQUES
    print(f"Testing all {len(CHUNKING_TECHNIQUES_FILTERED)} chunking techniques:")
    for tech in CHUNKING_TECHNIQUES_FILTERED:
        print(f" - {tech['name']}: {tech['description']}")

    # Each entry defines one retrieval configuration. NOTE: the label must
    # match use_mmr, because the label is part of the result key — a mismatch
    # would silently overwrite another configuration's results in
    # query_results. (The hybrid/no-mmr/rrf entry previously carried the
    # "hybrid-with-mmr" label and collided with the with-mmr/rrf entry.)
    RETRIEVAL_STRATEGIES = [
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr", "rerank_strategy": "none"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr", "rerank_strategy": "none"},
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr", "rerank_strategy": "rrf"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr", "rerank_strategy": "rrf"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr", "rerank_strategy": "none"},
    ]

    print("Initializing ChunkProcessor to load Embedding Model...")
    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)

    print("Initializing HybridRetriever...")
    retriever = HybridRetriever(
        embed_model=proc.encoder,
        rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
        verbose=False
    )

    all_query_results = {}
    for query_idx, query in enumerate(test_queries):
        print(f"\n{'='*80}")
        print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
        print(f"{'='*80}")
        query_results = {}

        # Connect to the single index where all techniques are stored with
        # metadata differentiation. (Reconnecting per query means one failed
        # connection only skips that query rather than aborting the run.)
        index_name = "cbt-book-recursive"
        try:
            index = get_index_by_name(pinecone_key, index_name)
            stats = index.describe_index_stats()
            if stats.get('total_vector_count', 0) == 0:
                print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
        except Exception as e:
            print(f" [X] Failed to connect to index {index_name}: {e}")
            continue

        for technique in CHUNKING_TECHNIQUES_FILTERED:
            technique_name = technique['name']
            for strategy in RETRIEVAL_STRATEGIES:
                result_key = f"{technique_name} + {strategy['label']} + {strategy['rerank_strategy']}"
                print(f"\nEvaluating: {result_key}")
                try:
                    context_chunks, chunk_score = retriever.search(
                        query=query,
                        index=index,
                        mode=strategy['mode'],
                        rerank_strategy=strategy['rerank_strategy'],
                        use_mmr=strategy['use_mmr'],
                        top_k=50,
                        final_k=4,
                        technique_name=technique_name,
                        verbose=False,
                        test=True
                    )
                    query_results[result_key] = {
                        'chunks': context_chunks,
                        'score': chunk_score
                    }
                    print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")
                except Exception as e:
                    print(f" -> Error retrieving for {result_key}: {e}")

        all_query_results[query_idx] = query_results

    # Generate isolated retrieval test report
    generate_retrieval_report(all_query_results, test_queries)
if __name__ == '__main__':
main()