""" Generate training datasets for ALL frameworks automatically. This script auto-discovers all chunk files and processes them, generating separate datasets for each framework PLUS a combined dataset. Usage: python scripts/generate_all_frameworks.py Output Structure: data/processed/training_crewai/ - positive_pairs.json - triplets.json data/processed/training_langgraph/ - positive_pairs.json - triplets.json data/processed/training_combined/ - positive_pairs.json (ALL frameworks merged) - triplets.json (ALL frameworks merged) """ import sys import json from pathlib import Path from typing import List, Tuple from dataclasses import asdict # Add project root to path PROJECT_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from src.task_3_data_engineering.export.pairs_triplets_generator import ( generate_pairs_and_triplets, PositivePair, Triplet ) def discover_all_chunk_files() -> List[Tuple[Path, str]]: """ Discover all chunk files in the workspace. Returns: List of (chunk_path, framework_name) tuples """ chunk_files = [] # Check local chunks local_paths = [ PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl", PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl", ] for path in local_paths: if path.exists(): # Extract framework from parent directory or use "local" if "Local_saved_files" in str(path): framework = "crewai" elif "sample_code" in str(path): framework = "sample" else: framework = path.parent.name chunk_files.append((path, framework)) # Check repository chunks repos_dir = PROJECT_ROOT / "data" / "processed" / "repos" if repos_dir.exists(): for repo_dir in repos_dir.iterdir(): if repo_dir.is_dir(): for jsonl_file in repo_dir.glob("*_chunks.jsonl"): # Extract framework from filename or directory framework = jsonl_file.stem.replace("_chunks", "").split("_")[0] chunk_files.append((jsonl_file, framework)) return chunk_files def merge_datasets(all_pairs: List[List[PositivePair]], all_triplets: List[List[Triplet]], output_dir: Path) -> None: """Merge all framework datasets into combined files (JSON + JSONL).""" output_dir.mkdir(parents=True, exist_ok=True) # Flatten lists combined_pairs = [] for pairs in all_pairs: combined_pairs.extend(pairs) combined_triplets = [] for triplets in all_triplets: combined_triplets.extend(triplets) # Export combined positive pairs - JSON pairs_json_path = output_dir / "positive_pairs.json" with open(pairs_json_path, "w", encoding="utf-8") as f: json.dump([asdict(p) for p in combined_pairs], f, indent=2, ensure_ascii=False) print(f"āœ… Combined positive pairs (JSON): {pairs_json_path}") # Export combined positive pairs - JSONL pairs_jsonl_path = output_dir / "positive_pairs.jsonl" with open(pairs_jsonl_path, "w", encoding="utf-8") as f: for p in combined_pairs: f.write(json.dumps(asdict(p), ensure_ascii=False) + "\n") print(f"āœ… Combined positive pairs (JSONL): {pairs_jsonl_path}") # Export combined triplets - JSON triplets_json_path = output_dir / "triplets.json" with open(triplets_json_path, "w", encoding="utf-8") as f: json.dump([asdict(t) for t in combined_triplets], f, indent=2, ensure_ascii=False) print(f"āœ… Combined triplets (JSON): {triplets_json_path}") # Export combined triplets - JSONL triplets_jsonl_path = output_dir / "triplets.jsonl" with open(triplets_jsonl_path, "w", encoding="utf-8") as f: for t in combined_triplets: f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n") print(f"āœ… Combined triplets (JSONL): {triplets_jsonl_path}") return 
def main():
    """Generate datasets for all discovered frameworks + a combined dataset."""
    print("=" * 80)
    print("šŸš€ MULTI-FRAMEWORK TRAINING DATA GENERATOR")
    print("=" * 80)

    # Discover all chunk files
    print("\nšŸ” Discovering chunk files...")
    chunk_files = discover_all_chunk_files()

    if not chunk_files:
        print("āŒ No chunk files found!")
        print("\nPlease ensure chunks exist in:")
        print("  - data/processed/chunks/Local_saved_files/")
        print("  - data/processed/repos/*/")
        return

    print(f"āœ… Found {len(chunk_files)} chunk file(s):\n")
    for path, framework in chunk_files:
        print(f"  šŸ“¦ {framework}: {path.name}")

    # Process each framework individually
    print("\n" + "=" * 80)
    print("šŸ”„ PROCESSING INDIVIDUAL FRAMEWORKS")
    print("=" * 80 + "\n")

    results = []
    all_pairs = []
    all_triplets = []

    for i, (chunks_path, framework) in enumerate(chunk_files, 1):
        print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
        print("-" * 60)

        output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"

        try:
            pairs, triplets = generate_pairs_and_triplets(
                chunks_path=chunks_path,
                output_dir=output_dir,
                num_pairs=100,
                num_triplets=100,
                variance=5,
                export_format="both"  # JSON + JSONL
            )

            # Collect for the combined dataset
            all_pairs.append(pairs)
            all_triplets.append(triplets)

            results.append({
                "framework": framework,
                "status": "āœ… SUCCESS",
                "pairs": len(pairs),
                "variations": sum(len(p.variations) for p in pairs),
                "triplets": len(triplets),
                "output": output_dir
            })
        except Exception as e:
            results.append({
                "framework": framework,
                "status": f"āŒ FAILED: {str(e)}",
                "output": output_dir
            })

    # Create the combined dataset
    print("\n" + "=" * 80)
    print("šŸ”— CREATING COMBINED DATASET (ALL FRAMEWORKS)")
    print("=" * 80 + "\n")

    combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
    total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)

    # Final summary
    print("\n" + "=" * 80)
    print("šŸ“Š FINAL SUMMARY")
    print("=" * 80 + "\n")

    print("INDIVIDUAL FRAMEWORK DATASETS:")
    print("-" * 40)
    for result in results:
        print(f"\nšŸ“¦ {result['framework'].upper()}")
        print(f"   Status: {result['status']}")
        if "pairs" in result:
            print(f"   - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
            print(f"   - triplets.json: {result['triplets']} docs")
        print(f"   šŸ“ {result['output']}")

    print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
    print("-" * 40)
    print(f"šŸ“ {combined_dir}")
    print(f"   - positive_pairs.json: {total_pairs} docs")
    print(f"   - triplets.json: {total_triplets} docs")

    # File count summary
    successful = sum(1 for r in results if "SUCCESS" in r["status"])
    total_files = (successful * 4) + 4  # 4 files per framework + 4 combined
    print(f"\n\nšŸ“„ TOTAL FILES GENERATED: {total_files}")
    print(f"  - {successful} frameworks Ɨ 4 files = {successful * 4} files")
    print("  - Combined dataset = 4 files")
    print("=" * 80)


if __name__ == "__main__":
    main()