"""
Pulls a raw sample of 10k examples from each of the datasets [cited in README] used in this project.
In the final version of the training data, many of the example outputs are tuned, and they are all merged into a single dataset.

NOTE: HuggingFace seems to have disabled this functionality.
A workaround is currently being investigated.
"""

import json
from datasets import load_dataset

# Datasets to sample, keyed by the short name that prefixes the output file.
# Each value is a (hub repository id, configuration name or None, split) tuple.
targets = dict(
    mediasum=("nbroad/mediasum", None, "train"),
    dialogsum=("knkarthick/dialogsum", None, "train"),
    squality=("mattercalm/squality", None, "train"),
    msmarco_corpus=("Hyukkyu/beir-msmarco", "corpus", "train"),
)

# Maximum number of examples kept per dataset (the "10k" in the output name).
SAMPLE_SIZE = 10_000

for name, (repo, config, split) in targets.items():
    # Load with the generic loader (no trust_remote_code). `config` is passed
    # as the optional configuration name; None selects the default
    # configuration, so no separate branch is needed.
    ds = load_dataset(repo, config, split=split)

    # Draw a reproducible random sample: shuffle with a fixed seed, then keep
    # the leading rows. The count is capped at the dataset size because
    # select() rejects indices past the end of the dataset, which would abort
    # the whole script on a split with fewer than SAMPLE_SIZE rows.
    keep = min(SAMPLE_SIZE, len(ds))
    if keep < SAMPLE_SIZE:
        print(f"{name}: only {keep} examples available; writing all of them")
    small = ds.shuffle(seed=42).select(range(keep))

    # The file name keeps the "_10k" suffix even when fewer rows are written,
    # so downstream consumers find the file under the expected name.
    out = f"{name}_10k.jsonl"
    with open(out, "w", encoding="utf-8") as f:
        # One JSON object per line (JSON Lines).
        for example in small:
            f.write(json.dumps(example) + "\n")