"""
Generate training datasets for ALL frameworks automatically.

This script auto-discovers all chunk files and processes them,
generating separate datasets for each framework PLUS a combined dataset.

Usage:
    python scripts/generate_all_frameworks.py

Output Structure:
    data/processed/training_crewai/
        - positive_pairs.json / .jsonl
        - triplets.json / .jsonl
    data/processed/training_langgraph/
        - positive_pairs.json / .jsonl
        - triplets.json / .jsonl
    data/processed/training_combined/
        - positive_pairs.json / .jsonl (ALL frameworks merged)
        - triplets.json / .jsonl (ALL frameworks merged)
"""

import sys
import json
from pathlib import Path
from typing import List, Tuple
from dataclasses import asdict

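# Make the project root importable so the "src" package resolves when this
# script is run directly.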
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from src.task_3_data_engineering.export.pairs_triplets_generator import (
    generate_pairs_and_triplets,
    PositivePair,
    Triplet,
)


def discover_all_chunk_files() -> List[Tuple[Path, str]]:
    """
    Discover all chunk files in the workspace.

    Returns:
        List of (chunk_path, framework_name) tuples
    """
    chunk_files = []

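    # Ad-hoc chunk files saved outside the repos directory; the framework
    # name for each is inferred from the directory it lives in.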
    local_paths = [
        PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
        PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
    ]

    for path in local_paths:
        if path.exists():
            if "Local_saved_files" in str(path):
                framework = "crewai"
            elif "sample_code" in str(path):
                framework = "sample"
            else:
                framework = path.parent.name
            chunk_files.append((path, framework))

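    # Cloned repositories: chunk files follow the "<framework>_chunks.jsonl"
    # naming convention (e.g. "langgraph_chunks.jsonl" -> "langgraph").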
    repos_dir = PROJECT_ROOT / "data" / "processed" / "repos"
    if repos_dir.exists():
        for repo_dir in repos_dir.iterdir():
            if repo_dir.is_dir():
                for jsonl_file in repo_dir.glob("*_chunks.jsonl"):
                    framework = jsonl_file.stem.replace("_chunks", "").split("_")[0]
                    chunk_files.append((jsonl_file, framework))

    return chunk_files


def merge_datasets(all_pairs: List[List[PositivePair]],
                   all_triplets: List[List[Triplet]],
                   output_dir: Path) -> Tuple[int, int]:
    """Merge all framework datasets into combined files (JSON + JSONL).

    Returns:
        (total combined pairs, total combined triplets)
    """
    output_dir.mkdir(parents=True, exist_ok=True)

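    # Flatten the per-framework lists into single combined lists.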
    combined_pairs = []
    for pairs in all_pairs:
        combined_pairs.extend(pairs)

    combined_triplets = []
    for triplets in all_triplets:
        combined_triplets.extend(triplets)

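    # Each merged dataset is written twice: indented JSON for easy inspection
    # and JSONL with one record per line.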
    pairs_json_path = output_dir / "positive_pairs.json"
    with open(pairs_json_path, "w", encoding="utf-8") as f:
        json.dump([asdict(p) for p in combined_pairs], f, indent=2, ensure_ascii=False)
    print(f"✅ Combined positive pairs (JSON): {pairs_json_path}")

    pairs_jsonl_path = output_dir / "positive_pairs.jsonl"
    with open(pairs_jsonl_path, "w", encoding="utf-8") as f:
        for p in combined_pairs:
            f.write(json.dumps(asdict(p), ensure_ascii=False) + "\n")
    print(f"✅ Combined positive pairs (JSONL): {pairs_jsonl_path}")

    triplets_json_path = output_dir / "triplets.json"
    with open(triplets_json_path, "w", encoding="utf-8") as f:
        json.dump([asdict(t) for t in combined_triplets], f, indent=2, ensure_ascii=False)
    print(f"✅ Combined triplets (JSON): {triplets_json_path}")

    triplets_jsonl_path = output_dir / "triplets.jsonl"
    with open(triplets_jsonl_path, "w", encoding="utf-8") as f:
        for t in combined_triplets:
            f.write(json.dumps(asdict(t), ensure_ascii=False) + "\n")
    print(f"✅ Combined triplets (JSONL): {triplets_jsonl_path}")

    return len(combined_pairs), len(combined_triplets)


def main():
    """Generate datasets for all discovered frameworks + combined dataset."""
    print("=" * 80)
    print("🚀 MULTI-FRAMEWORK TRAINING DATA GENERATOR")
    print("=" * 80)

    print("\n🔍 Discovering chunk files...")
    chunk_files = discover_all_chunk_files()

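    # Nothing discovered: point the user at the expected chunk locations.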
    if not chunk_files:
        print("❌ No chunk files found!")
        print("\nPlease ensure chunks exist in:")
        print("  - data/processed/chunks/Local_saved_files/")
        print("  - data/processed/repos/*/")
        return

    print(f"✅ Found {len(chunk_files)} chunk file(s):\n")
    for path, framework in chunk_files:
        print(f"  📦 {framework}: {path.name}")

print("\n" + "=" * 80) |
|
|
print("π PROCESSING INDIVIDUAL FRAMEWORKS") |
|
|
print("=" * 80 + "\n") |
|
|
|
|
|
results = [] |
|
|
all_pairs = [] |
|
|
all_triplets = [] |
|
|
|
|
|
    for i, (chunks_path, framework) in enumerate(chunk_files, 1):
        print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
        print("-" * 60)

        output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"

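        # One framework failing should not abort the rest, so each dataset is
        # generated inside its own try/except. export_format="both" is assumed
        # to emit JSON and JSONL variants, mirroring merge_datasets() below.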
        try:
            pairs, triplets = generate_pairs_and_triplets(
                chunks_path=chunks_path,
                output_dir=output_dir,
                num_pairs=100,
                num_triplets=100,
                variance=5,
                export_format="both"
            )

            all_pairs.append(pairs)
            all_triplets.append(triplets)

            results.append({
                "framework": framework,
                "status": "✅ SUCCESS",
                "pairs": len(pairs),
                "variations": sum(len(p.variations) for p in pairs),
                "triplets": len(triplets),
                "output": output_dir
            })

        except Exception as e:
            results.append({
                "framework": framework,
                "status": f"❌ FAILED: {str(e)}",
                "output": output_dir
            })

print("\n" + "=" * 80) |
|
|
print("π CREATING COMBINED DATASET (ALL FRAMEWORKS)") |
|
|
print("=" * 80 + "\n") |
|
|
|
|
|
combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined" |
|
|
total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir) |
|
|
|
|
|
|
|
|
print("\n" + "=" * 80) |
|
|
print("π FINAL SUMMARY") |
|
|
print("=" * 80 + "\n") |
|
|
|
|
|
print("INDIVIDUAL FRAMEWORK DATASETS:") |
|
|
print("-" * 40) |
|
|
for result in results: |
|
|
print(f"\nπ¦ {result['framework'].upper()}") |
|
|
print(f" Status: {result['status']}") |
|
|
if "pairs" in result: |
|
|
print(f" - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)") |
|
|
print(f" - triplets.json: {result['triplets']} docs") |
|
|
print(f" π {result['output']}") |
|
|
|
|
|
print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):") |
|
|
print("-" * 40) |
|
|
print(f"π {combined_dir}") |
|
|
print(f" - positive_pairs.json: {total_pairs} docs") |
|
|
print(f" - triplets.json: {total_triplets} docs") |
|
|
|
|
|
|
|
|
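    # Each successful framework yields 4 files (pairs/triplets × JSON/JSONL);
    # the combined dataset adds 4 more.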
    successful = sum(1 for r in results if "SUCCESS" in r["status"])
    total_files = (successful * 4) + 4

    print(f"\n\n📊 TOTAL FILES GENERATED: {total_files}")
    print(f"  - {successful} frameworks × 4 files = {successful * 4} files")
    print("  - Combined dataset = 4 files")
    print("=" * 80)


if __name__ == "__main__":
    main()