CodeMode / scripts /aggregate_datasets.py
CodeMode Agent
Deploy CodeMode via Agent
463fc7e
'''
Aggregate the synthetic datasets produced by separate runs of triplets_synthesis.py into a single combined dataset.
'''
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict
# Directory whose immediate sub-directories are scanned as individual runs.
BASE_SYNTHETIC_DIR = Path("data/synthetic")
# Destination for the merged dataset; it sits inside the scanned directory.
OUTPUT_DIR = BASE_SYNTHETIC_DIR.joinpath("combined")
def load_jsonl(path: Path) -> List[Dict]:
    """Read a JSON Lines file and return its records in file order.

    Args:
        path: Location of the JSONL file; it is decoded as UTF-8.

    Returns:
        One parsed JSON value per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as f:
        # Blank lines (a stray trailing newline, a hand-edited gap) carry no
        # record; passing them to json.loads would abort the whole load.
        return [json.loads(line) for line in f if line.strip()]
def save_jsonl(path: Path, records: List[Dict]):
    """Write ``records`` to ``path`` as JSON Lines, one record per line.

    Missing parent directories are created and an existing file is
    overwritten. Output is UTF-8 with non-ASCII characters left unescaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
def save_json(path: Path, records: List[Dict]):
    """Write ``records`` to ``path`` as one indented JSON document.

    Missing parent directories are created and an existing file is
    overwritten.

    Args:
        path: Destination file, written as UTF-8.
        records: JSON-serialisable data to dump.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False matches save_jsonl: the file is UTF-8 anyway, and
        # \uXXXX escapes would make this copy harder to read.
        json.dump(records, f, indent=2, ensure_ascii=False)
def aggregate():
    """Merge every run under ``BASE_SYNTHETIC_DIR`` into ``OUTPUT_DIR``.

    A run is an immediate sub-directory of ``BASE_SYNTHETIC_DIR`` (other than
    the output directory itself) that contains both ``positive_pairs.jsonl``
    and ``triplets.jsonl``. Sub-directories missing either file are skipped.

    The concatenated records are written as JSONL and as indented JSON,
    together with a ``metadata.json`` listing the included runs, the record
    counts and a UTC creation timestamp.

    Raises:
        OSError: If ``BASE_SYNTHETIC_DIR`` cannot be listed or a file cannot
            be read or written.
        json.JSONDecodeError: If a run file contains invalid JSON.
    """
    positive_pairs_all = []
    triplets_all = []
    included_runs = []

    # iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the combined dataset and run list reproducible.
    for run_dir in sorted(BASE_SYNTHETIC_DIR.iterdir()):
        if not run_dir.is_dir():
            continue
        # Never re-ingest the output of a previous aggregation.
        if run_dir.name == "combined":
            continue

        pos_path = run_dir / "positive_pairs.jsonl"
        tri_path = run_dir / "triplets.jsonl"
        # Only complete runs (both files present) are merged.
        if not (pos_path.exists() and tri_path.exists()):
            continue

        positive_pairs_all.extend(load_jsonl(pos_path))
        triplets_all.extend(load_jsonl(tri_path))
        included_runs.append(run_dir.name)

    # Save JSONL (training)
    save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
    save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)

    # Save JSON (inspection / upload)
    save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
    save_json(OUTPUT_DIR / "triplets.json", triplets_all)

    # Metadata
    metadata = {
        "type": "combined_dataset",
        "included_runs": included_runs,
        "total_positive_pairs": len(positive_pairs_all),
        "total_triplets": len(triplets_all),
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
        # returns a naive value that does not record its own zone.
        "created_at": datetime.now(timezone.utc).isoformat(),
    }
    # OUTPUT_DIR already exists here: save_jsonl/save_json created it above.
    with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print("✅ Combined dataset created at:", OUTPUT_DIR)
# Run the aggregation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    aggregate()