| |
|
|
|
|
| |
| |
| |
| |
|
|
|
|
|
|
| |
| |
|
|
| |
|
|
|
|
|
|
| |
|
|
| |
|
|
|
|
|
|
| |
|
|
| |
|
|
from datasets import load_dataset

# Read the training and validation CSV files into a single object that is
# indexed by split name ("train" / "validation").
Falcon = load_dataset(
    "csv",
    data_files={
        "train": "FalconData_train2.csv",
        "validation": "FalconData_validation2.csv",
    },
)

print("Dataset Loaded!")

# Then take a look at an example. Outside a notebook the values of these two
# bare expressions are discarded; each one still reads the first row of its
# split.
Falcon["train"][0]

Falcon["validation"][0]
|
|
|
|
|
|
| |
| |
|
|
| |
|
|
| """The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:""" |
|
|
from transformers import AutoTokenizer, GPT2TokenizerFast

# Tokenizer used for every preprocessing step below.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Use the end-of-sequence token as the padding token as well.
tokenizer.pad_token = tokenizer.eos_token

# Flatten the dataset, then peek at the first training row (the value of the
# bare expression is discarded when run as a script).
Falcon = Falcon.flatten()
Falcon["train"][0]
|
|
|
|
|
|
def preprocess_function(examples, *, tokenize=None):
    """Tokenize the ``"Text"`` column of a batch of examples.

    Each value of the column is turned into one string before tokenizing:

    * a ``str`` is passed through unchanged (joining a plain string with
      ``" "`` would insert a space between every character);
    * ``None`` (e.g. an empty CSV cell) becomes the empty string;
    * any other iterable of strings is joined with single spaces.

    Args:
        examples: Batch mapping column names to lists of values. Only the
            ``"Text"`` column is read.
        tokenize: Optional callable applied to the list of prepared strings.
            Defaults to the module-level ``tokenizer``.

    Returns:
        Whatever the tokenizer callable returns for the list of strings.
    """
    if tokenize is None:
        tokenize = tokenizer

    texts = []
    for value in examples["Text"]:
        if value is None:
            texts.append("")
        elif isinstance(value, str):
            texts.append(value)
        else:
            texts.append(" ".join(value))
    return tokenize(texts)
|
|
|
|
|
|
# Tokenize every split in batches using 4 worker processes. The columns
# present on the train split are removed from the mapped result.
tokenized_Falcon = Falcon.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=Falcon["train"].column_names,
)

# Chunk length (in tokens) used by group_texts.
# NOTE(review): this is the tokenizer's limit, not the model's — confirm the
# checkpoint being trained accepts sequences of this length.
block_size = tokenizer.model_max_length
| |
|
|
|
|
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of token sequences and re-split it into chunks.

    Every column of ``examples`` is flattened into one long list, which is
    then cut into consecutive pieces of ``chunk_size`` items. When the
    flattened length is at least ``chunk_size`` the trailing remainder is
    dropped so that all chunks have equal length; a shorter batch is kept as
    a single, shorter chunk.

    Args:
        examples: Batch mapping column names to lists of lists. Must contain
            an ``"input_ids"`` column.
        chunk_size: Length of each chunk. Defaults to the module-level
            ``block_size``.

    Returns:
        A dict with the same columns, chunked, plus a ``"labels"`` column
        that is a shallow copy of the chunked ``"input_ids"``.

    Raises:
        ValueError: If the chunk size is not a positive integer.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size
    if chunk_size < 1:
        raise ValueError(f"chunk size must be positive, got {chunk_size}")

    # chain.from_iterable flattens in linear time; sum(lists, []) copies the
    # growing accumulator once per row, which is quadratic.
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    total_length = len(concatenated[list(examples.keys())[0]])

    # Drop the tail that does not fill a whole chunk, unless that would
    # discard the entire batch.
    if total_length >= chunk_size:
        total_length = (total_length // chunk_size) * chunk_size

    result = {
        key: [
            tokens[start : start + chunk_size]
            for start in range(0, total_length, chunk_size)
        ]
        for key, tokens in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
|
|
| """Apply the `group_texts` function over the entire dataset:""" |
|
|
from transformers import DataCollatorForLanguageModeling

# Regroup the tokenized splits into fixed-size chunks, batch by batch, using
# 4 worker processes.
lm_dataset = tokenized_Falcon.map(
    group_texts,
    batched=True,
    num_proc=4,
)

# Batch collator built on the tokenizer above, with masked-language-model
# masking switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
|
|
|
|
|
|
import torch
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Load the pretrained checkpoint with its weights in bfloat16.
# NOTE(review): the data is tokenized with "distilgpt2" while the checkpoint
# is "rwh/tiny8" — confirm the two share a compatible vocabulary.
model = AutoModelForCausalLM.from_pretrained(
    "rwh/tiny8",
    torch_dtype=torch.bfloat16,
)

print("Model Loaded!")

# Move the weights to the GPU.
# NOTE(review): the device is hard-coded, so this line needs a CUDA device.
model.to("cuda")
|
|
# Directory that receives checkpoints and the final model.
OutputDir = "C1ReadyModel"

training_args = TrainingArguments(
    # Checkpointing and output.
    output_dir=OutputDir,
    overwrite_output_dir=True,
    save_strategy="steps",
    save_total_limit=2,
    save_safetensors=True,
    push_to_hub=False,
    # Evaluation; both evaluation and saving are step-based.
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    # Optimisation schedule.
    learning_rate=1e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Precision.
    bf16=True,
)

# Wire the model, the chunked splits and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
)
|
|
| |
import math

# Run training.
print("Started Training!")
trainer.train()

# Write the resulting model to the output directory.
trainer.save_model(OutputDir)
print("Saved Model Path:", OutputDir)

# Evaluate on the validation split and report perplexity, i.e. the
# exponential of the evaluation loss.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
|
|
|
|
|
|