'''
Aggregate the synthetic datasets produced by triplets_synthesis.py across
multiple runs into a single combined dataset.
'''

import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict

BASE_SYNTHETIC_DIR = Path("data/synthetic")
OUTPUT_DIR = BASE_SYNTHETIC_DIR / "combined"
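
# Expected input layout: each synthesis run writes its own subdirectory under
# data/synthetic/ containing both files below. Run directory names are not
# fixed by this script; <run_name> is a placeholder, not a literal name:
#
#   data/synthetic/<run_name>/positive_pairs.jsonl
#   data/synthetic/<run_name>/triplets.jsonl
#
# The aggregated output goes to data/synthetic/combined/.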


def load_jsonl(path: Path) -> List[Dict]:
    '''Read a JSONL file: one JSON object per line.'''
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def save_jsonl(path: Path, records: List[Dict]):
    '''Write records as JSONL, one JSON object per line.'''
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


def save_json(path: Path, records: List[Dict]):
    '''Write records as a single pretty-printed JSON array.'''
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)


def aggregate():
    '''Collect per-run pairs and triplets, then write the combined dataset.'''
    positive_pairs_all = []
    triplets_all = []
    included_runs = []

    for run_dir in BASE_SYNTHETIC_DIR.iterdir():
        if not run_dir.is_dir():
            continue
        if run_dir.name == "combined":
            # Skip the output directory so previous aggregations are not re-read.
            continue

        pos_path = run_dir / "positive_pairs.jsonl"
        tri_path = run_dir / "triplets.jsonl"

        # Only include runs that produced both files.
        if pos_path.exists() and tri_path.exists():
            positive_pairs_all.extend(load_jsonl(pos_path))
            triplets_all.extend(load_jsonl(tri_path))
            included_runs.append(run_dir.name)
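
    # Runs missing either file are skipped silently and do not appear in
    # included_runs.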

    save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
    save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)

    save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
    save_json(OUTPUT_DIR / "triplets.json", triplets_all)

    # Summarise which runs were combined and how many records each split holds.
    metadata = {
        "type": "combined_dataset",
        "included_runs": included_runs,
        "total_positive_pairs": len(positive_pairs_all),
        "total_triplets": len(triplets_all),
        "created_at": datetime.utcnow().isoformat(),
    }

    with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print("✅ Combined dataset created at:", OUTPUT_DIR)


if __name__ == "__main__":
    aggregate()
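
# Typical invocation (the module filename below is an assumption, not taken
# from this file):
#
#   python aggregate_synthetic.py
#
# This reads positive_pairs.jsonl and triplets.jsonl from every run directory
# under data/synthetic/ and writes the merged JSONL/JSON files plus
# metadata.json to data/synthetic/combined/.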