# scripts/generate_all_frameworks.py
"""
Generate training datasets for ALL frameworks automatically.
This script auto-discovers all chunk files and processes them,
generating separate datasets for each framework PLUS a combined dataset.
Usage:
python scripts/generate_all_frameworks.py
Output Structure:
data/processed/training_crewai/
- positive_pairs.json
- triplets.json
data/processed/training_langgraph/
- positive_pairs.json
- triplets.json
data/processed/training_combined/
- positive_pairs.json (ALL frameworks merged)
- triplets.json (ALL frameworks merged)
"""
import sys
import json
from pathlib import Path
from typing import List, Tuple
from dataclasses import asdict
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
from src.task_3_data_engineering.export.pairs_triplets_generator import (
generate_pairs_and_triplets,
PositivePair,
Triplet
)
def discover_all_chunk_files() -> List[Tuple[Path, str]]:
"""
Discover all chunk files in the workspace.
Returns:
List of (chunk_path, framework_name) tuples
"""
chunk_files = []
# Check local chunks
local_paths = [
PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
]
for path in local_paths:
if path.exists():
            # Map known local chunk directories to framework names; fall back to the parent directory name
if "Local_saved_files" in str(path):
framework = "crewai"
elif "sample_code" in str(path):
framework = "sample"
else:
framework = path.parent.name
chunk_files.append((path, framework))
# Check repository chunks
repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
if repos_dir.exists():
for repo_dir in repos_dir.iterdir():
if repo_dir.is_dir():
for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
# Extract framework from filename or directory
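                    # e.g. "langgraph_chunks.jsonl" -> stem "langgraph_chunks" -> framework "langgraph"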
framework = jsonl_file.stem.replace("_chunks", "").split("_")[0]
chunk_files.append((jsonl_file, framework))
return chunk_files
def merge_datasets(all_pairs: List[List[PositivePair]],
                   all_triplets: List[List[Triplet]],
                   output_dir: Path) -> Tuple[int, int]:
    """Merge all framework datasets into combined files (JSON + JSONL).

    Returns:
        Tuple of (total combined pairs, total combined triplets).
    """
output_dir.mkdir(parents=True, exist_ok=True)
# Flatten lists
combined_pairs = []
for pairs in all_pairs:
combined_pairs.extend(pairs)
combined_triplets = []
for triplets in all_triplets:
combined_triplets.extend(triplets)
# Export combined positive pairs - JSON
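    # (asdict() converts each dataclass instance into a plain dict for JSON serialization)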
pairs_json_path = output_dir / "positive_pairs.json"
with open(pairs_json_path, "w", encoding="utf-8") as f:
json.dump([asdict(p) for p in combined_pairs], f, indent=2, ensure_ascii=False)
print(f"βœ… Combined positive pairs (JSON): {pairs_json_path}")
# Export combined positive pairs - JSONL
pairs_jsonl_path = output_dir / "positive_pairs.jsonl"
with open(pairs_jsonl_path, "w", encoding="utf-8") as f:
for p in combined_pairs:
f.write(json.dumps(asdict(p), ensure_ascii=False) + "\n")
print(f"βœ… Combined positive pairs (JSONL): {pairs_jsonl_path}")
# Export combined triplets - JSON
triplets_json_path = output_dir / "triplets.json"
with open(triplets_json_path, "w", encoding="utf-8") as f:
json.dump([asdict(t) for t in combined_triplets], f, indent=2, ensure_ascii=False)
print(f"βœ… Combined triplets (JSON): {triplets_json_path}")
# Export combined triplets - JSONL
triplets_jsonl_path = output_dir / "triplets.jsonl"
with open(triplets_jsonl_path, "w", encoding="utf-8") as f:
for t in combined_triplets:
f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n")
print(f"βœ… Combined triplets (JSONL): {triplets_jsonl_path}")
return len(combined_pairs), len(combined_triplets)
def main():
"""Generate datasets for all discovered frameworks + combined dataset."""
print("=" * 80)
print("πŸš€ MULTI-FRAMEWORK TRAINING DATA GENERATOR")
print("=" * 80)
# Discover all chunk files
print("\nπŸ” Discovering chunk files...")
chunk_files = discover_all_chunk_files()
if not chunk_files:
print("❌ No chunk files found!")
print("\nPlease ensure chunks exist in:")
print(" - data/processed/chunks/Local_saved_files/")
print(" - data/processed/repos/*/")
return
print(f"βœ… Found {len(chunk_files)} chunk file(s):\n")
for path, framework in chunk_files:
print(f" πŸ“¦ {framework}: {path.name}")
# Process each framework
print("\n" + "=" * 80)
print("πŸ”„ PROCESSING INDIVIDUAL FRAMEWORKS")
print("=" * 80 + "\n")
results = []
all_pairs = []
all_triplets = []
for i, (chunks_path, framework) in enumerate(chunk_files, 1):
print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
print("-" * 60)
output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"
try:
pairs, triplets = generate_pairs_and_triplets(
chunks_path=chunks_path,
output_dir=output_dir,
num_pairs=100,
num_triplets=100,
variance=5,
export_format="both" # JSON + JSONL
)
# Collect for combined dataset
all_pairs.append(pairs)
all_triplets.append(triplets)
results.append({
"framework": framework,
"status": "βœ… SUCCESS",
"pairs": len(pairs),
"variations": sum(len(p.variations) for p in pairs),
"triplets": len(triplets),
"output": output_dir
})
except Exception as e:
results.append({
"framework": framework,
"status": f"❌ FAILED: {str(e)}",
"output": output_dir
})
# Create combined dataset
print("\n" + "=" * 80)
print("πŸ”— CREATING COMBINED DATASET (ALL FRAMEWORKS)")
print("=" * 80 + "\n")
combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)
# Final summary
print("\n" + "=" * 80)
print("πŸ“Š FINAL SUMMARY")
print("=" * 80 + "\n")
print("INDIVIDUAL FRAMEWORK DATASETS:")
print("-" * 40)
for result in results:
print(f"\nπŸ“¦ {result['framework'].upper()}")
print(f" Status: {result['status']}")
if "pairs" in result:
print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
print(f" - triplets.json: {result['triplets']} docs")
print(f" πŸ“ {result['output']}")
print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
print("-" * 40)
print(f"πŸ“ {combined_dir}")
print(f" - positive_pairs.json: {total_pairs} docs")
print(f" - triplets.json: {total_triplets} docs")
# File count summary
successful = sum(1 for r in results if "SUCCESS" in r["status"])
total_files = (successful * 4) + 4 # 4 per framework + 4 combined
print(f"\n\nπŸ“„ TOTAL FILES GENERATED: {total_files}")
print(f" - {successful} frameworks Γ— 4 files = {successful * 4} files")
print(f" - Combined dataset = 4 files")
print("=" * 80)
if __name__ == "__main__":
main()