precis / scripts /pull.py
compendious's picture
data cleanres
f71ba81
"""
Pulls raw samples of 10k each from the [cited in README] datasets used in this project.
In the final version of the training data, many of the example outputs are tuned, and they are all merged into a single dataset.
HuggingFace seems to have disabled this functionality.
Currently trying to see how to work around it.
"""
import json
from datasets import load_dataset
# Datasets to sample, keyed by the short name that prefixes the output file.
# Each value is a (hub repo id, config name or None, split) triple that is
# handed to load_dataset by the export loop.
targets = dict(
    mediasum=("nbroad/mediasum", None, "train"),
    dialogsum=("knkarthick/dialogsum", None, "train"),
    squality=("mattercalm/squality", None, "train"),
    msmarco_corpus=("Hyukkyu/beir-msmarco", "corpus", "train"),
)
# Export a seeded random sample of each target dataset as one JSON-lines file
# (<name>_10k.jsonl) in the current working directory.
for name, (repo, config, split) in targets.items():
    # Load with the generic loader (no trust_remote_code); only pass a config
    # name when the target defines one.
    if config:
        ds = load_dataset(repo, config, split=split)
    else:
        ds = load_dataset(repo, split=split)
    # Shuffle with a fixed seed for reproducibility, then keep at most 10k
    # rows. Capping at len(ds) matters: select() raises IndexError for
    # out-of-range indices, so a split with fewer than 10k rows would
    # otherwise abort the whole run.
    sample_count = min(10_000, len(ds))
    small = ds.shuffle(seed=42).select(range(sample_count))
    out = f"{name}_10k.jsonl"
    with open(out, "w", encoding="utf-8") as f:
        # One JSON object per line.
        for example in small:
            f.write(json.dumps(example) + "\n")