File size: 2,270 Bytes
463fc7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'''
Aggregate the synthetic datasets produced by triplets_synthesis.py across
multiple runs into a single combined dataset.
'''

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

BASE_SYNTHETIC_DIR = Path("data/synthetic")
OUTPUT_DIR = BASE_SYNTHETIC_DIR / "combined"


def load_jsonl(path: Path) -> List[Dict]:
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


def save_jsonl(path: Path, records: List[Dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


def save_json(path: Path, records: List[Dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)


def aggregate():
    positive_pairs_all = []
    triplets_all = []
    included_runs = []

    for run_dir in BASE_SYNTHETIC_DIR.iterdir():
        if not run_dir.is_dir():
            continue
        if run_dir.name == "combined":
            continue

        pos_path = run_dir / "positive_pairs.jsonl"
        tri_path = run_dir / "triplets.jsonl"

        if pos_path.exists() and tri_path.exists():
            positive_pairs_all.extend(load_jsonl(pos_path))
            triplets_all.extend(load_jsonl(tri_path))
            included_runs.append(run_dir.name)

    # Save JSONL (training)
    save_jsonl(OUTPUT_DIR / "positive_pairs.jsonl", positive_pairs_all)
    save_jsonl(OUTPUT_DIR / "triplets.jsonl", triplets_all)

    # Save JSON (inspection / upload)
    save_json(OUTPUT_DIR / "positive_pairs.json", positive_pairs_all)
    save_json(OUTPUT_DIR / "triplets.json", triplets_all)

    # Metadata
    metadata = {
        "type": "combined_dataset",
        "included_runs": included_runs,
        "total_positive_pairs": len(positive_pairs_all),
        "total_triplets": len(triplets_all),
        "created_at": datetime.utcnow().isoformat(),
    }

    with (OUTPUT_DIR / "metadata.json").open("w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    print("✅ Combined dataset created at:", OUTPUT_DIR)


if __name__ == "__main__":
    aggregate()