# NLP-RAG / test.py
# Synced from GitHub main (commit 8f37cc7) via the backend Docker context.
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
import sys
import traceback
from datetime import datetime
from dotenv import load_dotenv
from config_loader import cfg
from data.vector_db import get_index_by_name
from retriever.retriever import HybridRetriever
from retriever.processor import ChunkProcessor
from data.ingest import CHUNKING_TECHNIQUES
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
    """
    Generate a Markdown document summarizing the retrieved chunks
    for each query, chunking technique, and retrieval strategy.

    Args:
        all_results: dict mapping query index -> {result_key: {'chunks': [...],
            'score': float}} as produced by main().
        queries: list of query strings; indexed by the keys of all_results.
        output_file: path of the Markdown report to write (UTF-8).

    The report is assembled into a list of fragments and joined once at the
    end, instead of repeated string `+=` in nested loops (which is quadratic).
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    parts = [f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"]
    parts.append("## Test Queries\n\n")
    for i, q in enumerate(queries, 1):
        parts.append(f"{i}. {q}\n")
    parts.append("\n## Retrieval Results by Query\n\n")
    for q_idx, q_results in all_results.items():
        parts.append(f"### Query {q_idx + 1}: {queries[q_idx]}\n\n")
        for tech_strat_key, chunks_data in q_results.items():
            parts.append(f"#### Strategy & Technique: {tech_strat_key}\n\n")
            chunks = chunks_data.get('chunks', [])
            score = chunks_data.get('score', 0)
            parts.append(f"*ChunkScore:* {score:.4f} | *Chunks retrieved:* {len(chunks)}\n\n")
            if not chunks:
                parts.append("No chunks retrieved.\n\n")
            else:
                for i, chunk in enumerate(chunks, 1):
                    parts.append(f"*[Chunk {i}]* ({len(chunk)} chars):\n")
                    parts.append(f"text\n{chunk}\n\n\n")
            # Horizontal rule separates each strategy section.
            parts.append("---\n\n")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"\nRetrieval report saved to: {output_file}")
def main():
    """
    Evaluate every chunking technique x retrieval strategy combination on a
    fixed set of CBT-related test queries, then write a Markdown report.

    Requires PINECONE_API_KEY in the environment (loaded via .env).

    Raises:
        RuntimeError: if PINECONE_API_KEY is not set.
    """
    # Load environment variables
    load_dotenv()
    pinecone_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_key:
        raise RuntimeError("PINECONE_API_KEY not found in environment variables")

    test_queries = [
        "What is cognitive behavior therapy and how does it work?",
        "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
        "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
        "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
        "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
        "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
        "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
        "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
        "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
        "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
    ]

    # Use all 7 chunking techniques from ingest.py
    CHUNKING_TECHNIQUES_FILTERED = CHUNKING_TECHNIQUES
    print(f"Testing all {len(CHUNKING_TECHNIQUES_FILTERED)} chunking techniques:")
    for tech in CHUNKING_TECHNIQUES_FILTERED:
        print(f" - {tech['name']}: {tech['description']}")

    # Each entry defines one retrieval configuration. NOTE: the label must
    # match use_mmr, because the label is part of the result key — a mismatch
    # would silently overwrite another configuration's results in
    # query_results. (The hybrid/no-mmr/rrf entry previously carried the
    # "hybrid-with-mmr" label and collided with the with-mmr/rrf entry.)
    RETRIEVAL_STRATEGIES = [
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr", "rerank_strategy": "none"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr", "rerank_strategy": "none"},
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr", "rerank_strategy": "rrf"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr", "rerank_strategy": "rrf"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr", "rerank_strategy": "none"},
    ]

    print("Initializing ChunkProcessor to load Embedding Model...")
    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)

    print("Initializing HybridRetriever...")
    retriever = HybridRetriever(
        embed_model=proc.encoder,
        rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
        verbose=False
    )

    all_query_results = {}
    for query_idx, query in enumerate(test_queries):
        print(f"\n{'='*80}")
        print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
        print(f"{'='*80}")
        query_results = {}

        # Connect to the single index where all techniques are stored with
        # metadata differentiation. (Reconnecting per query means one failed
        # connection only skips that query rather than aborting the run.)
        index_name = "cbt-book-recursive"
        try:
            index = get_index_by_name(pinecone_key, index_name)
            stats = index.describe_index_stats()
            if stats.get('total_vector_count', 0) == 0:
                print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
        except Exception as e:
            print(f" [X] Failed to connect to index {index_name}: {e}")
            continue

        for technique in CHUNKING_TECHNIQUES_FILTERED:
            technique_name = technique['name']
            for strategy in RETRIEVAL_STRATEGIES:
                result_key = f"{technique_name} + {strategy['label']} + {strategy['rerank_strategy']}"
                print(f"\nEvaluating: {result_key}")
                try:
                    context_chunks, chunk_score = retriever.search(
                        query=query,
                        index=index,
                        mode=strategy['mode'],
                        rerank_strategy=strategy['rerank_strategy'],
                        use_mmr=strategy['use_mmr'],
                        top_k=50,
                        final_k=4,
                        technique_name=technique_name,
                        verbose=False,
                        test=True
                    )
                    query_results[result_key] = {
                        'chunks': context_chunks,
                        'score': chunk_score
                    }
                    print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")
                except Exception as e:
                    print(f" -> Error retrieving for {result_key}: {e}")

        all_query_results[query_idx] = query_results

    # Generate isolated retrieval test report
    generate_retrieval_report(all_query_results, test_queries)
if __name__ == '__main__':
main()