from functools import partial


from litgpt.tokenizer import Tokenizer
from litdata import optimize, TokensLoader, StreamingDataset
from transformers import AutoTokenizer


from utils import tokenize_fn
from core_base_datasets import core_base_datasets
from core_instruct_datasets import core_instruct_datasets
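
# NOTE: `tokenize_fn` is imported from the local utils module and is not shown
# here. A hypothetical sketch of the interface implied by the call site below
# (names and behaviour are assumptions, not the actual implementation):
#
#     def tokenize_fn(dataset_config, hf_tokenizer=None, tokenizer=None):
#         """Load the dataset described by `dataset_config`, tokenize each
#         sample, and yield 1-D token arrays for litdata to pack into chunks."""
#         ...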
|
|
|
|
for i, (block_size, subchunk_size) in enumerate([(8192, 2000)]):
    chunk_size = block_size * subchunk_size
    output_dir = f'../core-data-{i}-{block_size}-{subchunk_size}'

    # Tokenize every base and instruct dataset and pack the resulting token
    # streams into chunks of `chunk_size` tokens under `output_dir`.
    outputs = optimize(
        fn=partial(
            tokenize_fn,
            hf_tokenizer=AutoTokenizer.from_pretrained('..', trust_remote_code=True, use_fast=True),
            tokenizer=Tokenizer('..'),
        ),
        inputs=core_base_datasets + core_instruct_datasets,
        output_dir=output_dir,
        chunk_size=chunk_size,
        num_workers=32,
        reorder_files=False,
    )
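    # After optimize() completes, `output_dir` should contain litdata's packed
    # chunk files plus an index.json describing them; the second loop below
    # streams them back to sanity-check the token counts.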
|
|
for i, (block_size, subchunk_size) in enumerate([(8192, 2000)]):
    chunk_size = block_size * subchunk_size
    input_dir = f'../core-data-{i}-{block_size}-{subchunk_size}'

    # Stream the optimized chunks back; TokensLoader yields contiguous blocks
    # of `block_size` tokens, so len(dataset) is the number of training blocks.
    dataset = StreamingDataset(
        input_dir=input_dir,
        item_loader=TokensLoader(block_size=block_size),
    )

    print(f'{i=}, {block_size=}, {chunk_size=}, {len(dataset)=}, {len(dataset) * block_size=}')

    total_tokens = len(dataset) * block_size
    print(f'Total number of tokens in the optimized dataset {input_dir!r} is {total_tokens}')
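    # Hedged usage sketch (illustrative, not part of this script): during
    # pretraining the optimized data can be consumed with litdata's
    # StreamingDataLoader, e.g.
    #
    #     from litdata import StreamingDataLoader
    #
    #     train_dataloader = StreamingDataLoader(dataset, batch_size=4, num_workers=8)
    #     for batch in train_dataloader:
    #         ...  # each batch holds `batch_size` blocks of `block_size` tokens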
|
|