"""Aggregate synthetic datasets from multiple runs into a single combined dataset.

Each run directory under ``data/synthetic/`` (produced by ``triplets_synthesis.py``)
is expected to contain ``positive_pairs.jsonl`` and ``triplets.jsonl``. Runs that
contain both files are concatenated into ``data/synthetic/combined/``.
"""
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict

BASE_SYNTHETIC_DIR = Path("data/synthetic")
OUTPUT_DIR = BASE_SYNTHETIC_DIR / "combined"


def load_jsonl(path: Path) -> List[Dict]:
    """Read a JSONL file and return its records as a list of dicts."""
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def save_jsonl(path: Path, records: List[Dict]) -> None:
    """Write records as JSONL (one JSON object per line), creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


def save_json(path: Path, records: List[Dict]) -> None:
    """Write records as a single pretty-printed JSON array, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable, matching save_jsonl.
        json.dump(records, f, indent=2, ensure_ascii=False)


def aggregate() -> None:
    """Combine all run datasets under BASE_SYNTHETIC_DIR into OUTPUT_DIR.

    Runs missing either input file are skipped silently; the ``combined``
    directory itself is excluded. Writes JSONL (for training), JSON (for
    inspection/upload), and a ``metadata.json`` summary.
    """
    positive_pairs_all: List[Dict] = []
    triplets_all: List[Dict] = []
    included_runs: List[str] = []

    # Sort for a deterministic run order: iterdir() yields entries in
    # arbitrary, platform-dependent order, which would make the combined
    # dataset's record order (and metadata) non-reproducible.
    for run_dir in sorted(BASE_SYNTHETIC_DIR.iterdir()):
        if not run_dir.is_dir():
            continue
        if run_dir.name == "combined":
            continue

        pos_path = run_dir / "positive_pairs.jsonl"
        tri_path = run_dir / "triplets.jsonl"

        if pos_path.exists() and tri_path.exists():
            positive_pairs_all.extend(load_jsonl(pos_path))
            triplets_all.extend(load_jsonl(tri_path))
            included_runs.append(run_dir.name)

    # Save JSONL (training)
    save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
    save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)

    # Save JSON (inspection / upload)
    save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
    save_json(OUTPUT_DIR / "triplets.json", triplets_all)

    # Metadata: timezone-aware UTC timestamp (datetime.utcnow() is deprecated
    # and returns a naive datetime).
    metadata = {
        "type": "combined_dataset",
        "included_runs": included_runs,
        "total_positive_pairs": len(positive_pairs_all),
        "total_triplets": len(triplets_all),
        "created_at": datetime.now(timezone.utc).isoformat(),
    }
    with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print("✅ Combined dataset created at:", OUTPUT_DIR)


if __name__ == "__main__":
    aggregate()