| import os |
| os.environ["OMP_NUM_THREADS"] = "1" |
| os.environ["MKL_NUM_THREADS"] = "1" |
| import sys |
| import traceback |
| from datetime import datetime |
| from dotenv import load_dotenv |
|
|
| from config_loader import cfg |
| from data.vector_db import get_index_by_name |
| from retriever.retriever import HybridRetriever |
| from retriever.processor import ChunkProcessor |
| from data.ingest import CHUNKING_TECHNIQUES |
|
|
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
    """
    Generate a Markdown report summarizing the retrieved chunks for each
    query, chunking technique, and retrieval strategy.

    Args:
        all_results: Mapping of query index -> {result_key: {'chunks': list[str],
            'score': float}} as produced by main().
        queries: List of query strings; indexed by the keys of all_results.
        output_file: Path of the Markdown file to write.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Accumulate fragments in a list and join once at the end — avoids
    # quadratic string concatenation when reports get large.
    parts = [f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"]

    parts.append("## Test Queries\n\n")
    for i, q in enumerate(queries, 1):
        parts.append(f"{i}. {q}\n")

    parts.append("\n## Retrieval Results by Query\n\n")

    for q_idx, q_results in all_results.items():
        parts.append(f"### Query {q_idx + 1}: {queries[q_idx]}\n\n")

        for tech_strat_key, chunks_data in q_results.items():
            parts.append(f"#### Strategy & Technique: {tech_strat_key}\n\n")

            chunks = chunks_data.get('chunks', [])
            score = chunks_data.get('score', 0)

            parts.append(f"*ChunkScore:* {score:.4f} | *Chunks retrieved:* {len(chunks)}\n\n")

            if not chunks:
                parts.append("No chunks retrieved.\n\n")
            else:
                for i, chunk in enumerate(chunks, 1):
                    parts.append(f"*[Chunk {i}]* ({len(chunk)} chars):\n")
                    # BUG FIX: the original emitted a bare "text" line with no
                    # backticks — the Markdown fence was evidently lost, leaving
                    # broken markup. Wrap each chunk in a proper fenced code
                    # block so the raw text renders verbatim.
                    parts.append(f"```text\n{chunk}\n```\n\n")

        parts.append("---\n\n")

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"\nRetrieval report saved to: {output_file}")
|
|
|
|
def main():
    """
    Run a grid evaluation of every chunking technique against every retrieval
    strategy over a fixed set of CBT test queries, then write the results to
    a Markdown report via generate_retrieval_report().

    Raises:
        RuntimeError: if PINECONE_API_KEY is not set in the environment.
    """
    load_dotenv()

    pinecone_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_key:
        raise RuntimeError("PINECONE_API_KEY not found in environment variables")

    test_queries = [
        "What is cognitive behavior therapy and how does it work?",
        "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
        "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
        "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying.",
        "My friend didn't text me back for five hours. I'm certain they are mad at me or that I've done something to ruin our friendship.",
        "Can you explain the difference between a 'situation,' a 'thought,' and an 'emotion' in the context of a CBT thought record?",
        "I have to do everything perfectly. If I make even one small mistake, it means the entire project is a total disaster and I've wasted everyone's time.",
        "Whenever I have to give a presentation, my heart starts racing and I'm sure I'm going to have a heart attack or pass out in front of everyone.",
        "I feel like I'm fundamentally broken and that if people really knew me, they would never want to be around me.",
        "What is 'behavioral activation' and how can it help someone who is struggling with a lack of motivation or depression?"
    ]

    print(f"Testing all {len(CHUNKING_TECHNIQUES)} chunking techniques:")
    for tech in CHUNKING_TECHNIQUES:
        print(f" - {tech['name']}: {tech['description']}")

    # Each strategy is one cell of the evaluation grid. NOTE: labels must be
    # consistent with use_mmr, since (technique, label, rerank_strategy) forms
    # the result key used to store results.
    RETRIEVAL_STRATEGIES = [
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr", "rerank_strategy": "none"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr", "rerank_strategy": "none"},
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr", "rerank_strategy": "rrf"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr", "rerank_strategy": "cross-encoder"},
        # BUG FIX: this entry has use_mmr=False but was labeled
        # "hybrid-with-mmr", which both mislabeled the run and collided with
        # the hybrid-with-mmr + rrf entry above — the two shared a result_key,
        # so one silently overwrote the other in query_results.
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr", "rerank_strategy": "rrf"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr", "rerank_strategy": "cross-encoder"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr", "rerank_strategy": "none"},
    ]

    print("Initializing ChunkProcessor to load Embedding Model...")
    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)

    print("Initializing HybridRetriever...")
    retriever = HybridRetriever(
        embed_model=proc.encoder,
        rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
        verbose=False
    )

    # The index name is constant, so connect once instead of once per query
    # (the original reconnected inside the query loop; a failure there would
    # simply have repeated for every query).
    index_name = "cbt-book-recursive"
    try:
        index = get_index_by_name(pinecone_key, index_name)
        stats = index.describe_index_stats()
        if stats.get('total_vector_count', 0) == 0:
            print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
    except Exception as e:
        print(f" [X] Failed to connect to index {index_name}: {e}")
        return

    # query index -> {result_key -> {'chunks': [...], 'score': float}}
    all_query_results = {}

    for query_idx, query in enumerate(test_queries):
        print(f"\n{'='*80}")
        print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
        print(f"{'='*80}")

        query_results = {}

        for technique in CHUNKING_TECHNIQUES:
            technique_name = technique['name']

            for strategy in RETRIEVAL_STRATEGIES:
                result_key = f"{technique_name} + {strategy['label']} + {strategy['rerank_strategy']}"
                print(f"\nEvaluating: {result_key}")

                try:
                    context_chunks, chunk_score = retriever.search(
                        query=query,
                        index=index,
                        mode=strategy['mode'],
                        rerank_strategy=strategy['rerank_strategy'],
                        use_mmr=strategy['use_mmr'],
                        top_k=50,
                        final_k=4,
                        technique_name=technique_name,
                        verbose=False,
                        test=True
                    )

                    query_results[result_key] = {
                        'chunks': context_chunks,
                        'score': chunk_score
                    }
                    print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")

                except Exception as e:
                    # Best-effort: a failing combination is reported and skipped
                    # so the rest of the grid still runs.
                    print(f" -> Error retrieving for {result_key}: {e}")

        all_query_results[query_idx] = query_results

    generate_retrieval_report(all_query_results, test_queries)
|
# Script entry point: run the full retrieval evaluation grid when executed
# directly (no effect when this module is imported).
if __name__ == '__main__':
    main()