| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import dataclasses |
| import gc |
| import importlib |
| import json |
| import math |
| import os |
| import random |
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import unittest |
| from functools import partial |
| from itertools import product |
| from pathlib import Path |
| from typing import Any |
| from unittest.mock import Mock, patch |
|
|
| import numpy as np |
| from huggingface_hub import HfFolder, ModelCard, create_branch, list_repo_commits, list_repo_files |
| from packaging import version |
| from parameterized import parameterized |
|
|
| from transformers import ( |
| AutoFeatureExtractor, |
| AutoImageProcessor, |
| AutoProcessor, |
| AutoTokenizer, |
| DataCollatorForLanguageModeling, |
| IntervalStrategy, |
| PretrainedConfig, |
| TrainerCallback, |
| TrainingArguments, |
| default_data_collator, |
| enable_full_determinism, |
| get_polynomial_decay_schedule_with_warmup, |
| is_torch_available, |
| logging, |
| set_seed, |
| ) |
| from transformers.hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS |
| from transformers.testing_utils import ( |
| ENDPOINT_STAGING, |
| TOKEN, |
| USER, |
| CaptureLogger, |
| LoggingLevel, |
| TemporaryHubRepo, |
| TestCasePlus, |
| backend_device_count, |
| evaluate_side_effect_factory, |
| execute_subprocess_async, |
| get_gpu_count, |
| get_steps_per_epoch, |
| get_tests_dir, |
| is_staging_test, |
| require_accelerate, |
| require_apollo_torch, |
| require_bitsandbytes, |
| require_deepspeed, |
| require_galore_torch, |
| require_grokadamw, |
| require_intel_extension_for_pytorch, |
| require_liger_kernel, |
| require_lomo, |
| require_non_hpu, |
| require_non_xpu, |
| require_optuna, |
| require_peft, |
| require_ray, |
| require_safetensors, |
| require_schedulefree, |
| require_sentencepiece, |
| require_sigopt, |
| require_tensorboard, |
| require_tokenizers, |
| require_torch, |
| require_torch_accelerator, |
| require_torch_bf16, |
| require_torch_fp16, |
| require_torch_gpu, |
| require_torch_multi_accelerator, |
| require_torch_non_multi_accelerator, |
| require_torch_non_multi_gpu, |
| require_torch_tensorrt_fx, |
| require_torch_tf32, |
| require_torch_up_to_2_accelerators, |
| require_vision, |
| require_wandb, |
| run_first, |
| run_test_using_subprocess, |
| slow, |
| torch_device, |
| ) |
| from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend, check_target_module_exists |
| from transformers.training_args import OptimizerNames |
| from transformers.utils import ( |
| SAFE_WEIGHTS_INDEX_NAME, |
| SAFE_WEIGHTS_NAME, |
| WEIGHTS_INDEX_NAME, |
| WEIGHTS_NAME, |
| is_accelerate_available, |
| is_apex_available, |
| is_bitsandbytes_available, |
| is_safetensors_available, |
| is_torchao_available, |
| is_torchdistx_available, |
| ) |
| from transformers.utils.hp_naming import TrialShortNamer |
|
|
|
|
# HPU (Habana Gaudi) accumulates in a different order/precision than CUDA/CPU,
# so numeric comparisons on that backend need looser tolerances.
if torch_device == "hpu":
    RTOL = 1e-3
    ATOL = 1e-3
else:
    RTOL = 1e-5
    ATOL = 1e-5
|
|
| if is_torch_available(): |
| import torch |
| from torch import nn |
| from torch.utils.data import IterableDataset |
|
|
| import transformers.optimization |
| from transformers import ( |
| AutoModelForCausalLM, |
| AutoModelForSequenceClassification, |
| EarlyStoppingCallback, |
| GlueDataset, |
| GlueDataTrainingArguments, |
| GPT2Config, |
| GPT2LMHeadModel, |
| LineByLineTextDataset, |
| LlamaConfig, |
| LlamaForCausalLM, |
| PreTrainedModel, |
| Trainer, |
| TrainerState, |
| ) |
| from transformers.trainer_pt_utils import AcceleratorConfig |
|
|
| if is_safetensors_available(): |
| import safetensors.torch |
|
|
|
|
| |
# Convenience decorators pinning a minimum `accelerate` version for single tests.
require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28")
require_accelerate_version_min_0_30 = partial(require_accelerate, min_version="0.30")
# accelerate >= 0.28 is needed for forwarding gradient-accumulation kwargs.
GRAD_ACCUM_KWARGS_VERSION_AVAILABLE = is_accelerate_available("0.28")
if is_accelerate_available():
    from accelerate import Accelerator
    from accelerate.state import AcceleratorState


# Plain-text fixture used by the language-modeling dataset tests.
PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt"
|
|
|
|
class StoreLossCallback(TrainerCallback):
    """Trainer callback that accumulates every logged training "loss" value."""

    def __init__(self):
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Non-training logs (eval metrics, final summaries) carry no "loss" key.
        if "loss" not in logs:
            return
        self.losses.append(logs["loss"])
|
|
|
|
class MockCudaOOMCallback(TrainerCallback):
    """Callback that fakes a CUDA out-of-memory error once the training batch
    size reaches `batch_size_limit`, to exercise auto-batch-size reduction."""

    def __init__(self, batch_size_limit=16):
        self.batch_size_limit = batch_size_limit

    def on_step_end(self, args, state, control, **kwargs):
        if state.train_batch_size < self.batch_size_limit:
            return
        # Message text matters: Trainer recognizes OOM by matching on it.
        raise RuntimeError("CUDA out of memory.")
|
|
|
|
def ForCausalLMLoss(logits, labels, vocab_size, num_items_in_batch, disable_num_items_in_batch=False):
    """Next-token cross-entropy loss for causal LM tests.

    When `num_items_in_batch` is given (and not disabled), the loss is the sum
    over tokens divided by that count — the correct normalization under
    gradient accumulation; otherwise it is the plain token mean.
    """
    # Upcast to float32 for numerically stable softmax/log.
    float_logits = logits.float()
    # Position t predicts token t+1: drop the last logit and the first label,
    # then flatten to (num_tokens, vocab) / (num_tokens,).
    preds = float_logits[..., :-1, :].contiguous().view(-1, vocab_size)
    targets = labels[..., 1:].contiguous().view(-1).to(preds.device)

    use_token_count = num_items_in_batch is not None and not disable_num_items_in_batch
    if not use_token_count:
        return nn.functional.cross_entropy(preds, targets, ignore_index=-100, reduction="mean")
    summed = nn.functional.cross_entropy(preds, targets, ignore_index=-100, reduction="sum")
    return summed / num_items_in_batch
|
|
|
|
class RegressionDataset:
    """Synthetic `y = a * x + b + noise` regression data.

    One noisy target series is drawn per entry of `label_names`, all sharing
    the same inputs `x`; seeding makes instances reproducible.
    """

    def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
        np.random.seed(seed)
        self.label_names = label_names if label_names is not None else ["labels"]
        self.length = length
        self.x = np.random.normal(size=(length,)).astype(np.float32)
        noisy_targets = [a * self.x + b + np.random.normal(scale=0.1, size=(length,)) for _ in self.label_names]
        self.ys = [targets.astype(np.float32) for targets in noisy_targets]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        sample = {name: targets[i] for name, targets in zip(self.label_names, self.ys)}
        sample["input_x"] = self.x[i]
        return sample
|
|
|
|
| |
def bytes2megabytes(x):
    """Convert a byte count to whole mebibytes (truncating toward zero)."""
    return int(x / (1 << 20))
|
|
|
|
| |
class TorchTracemalloc:
    """Context manager reporting CUDA memory used (`used`) and peak extra
    memory (`peaked`) across its body, both in MB."""

    def __enter__(self):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # drop cached-but-free blocks from the count
            torch.cuda.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.cuda.memory_allocated()
        return self

    def __exit__(self, *exc):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            self.end = torch.cuda.memory_allocated()
            self.peak = torch.cuda.max_memory_allocated()
        # NOTE(review): begin/end/peak are only set on CUDA machines — this
        # helper appears to be used only in CUDA-gated tests; confirm.
        self.used = bytes2megabytes(self.end - self.begin)
        self.peaked = bytes2megabytes(self.peak - self.begin)
|
|
|
|
@dataclasses.dataclass
class RegressionTrainingArguments(TrainingArguments):
    # Initial slope/intercept handed through to the regression model.
    a: float = 0.0
    b: float = 0.0
    # When False (the default for tests), integration reporting is disabled.
    keep_report_to: bool = False

    def __post_init__(self):
        super().__post_init__()
        # save_strategy -> "no" would be another candidate default for tests;
        # here we only silence reporting integrations (wandb/tensorboard/...)
        # unless a test explicitly opts in via keep_report_to.
        if not self.keep_report_to:
            self.report_to = []
|
|
|
|
class RepeatDataset:
    """Dataset yielding the same token sequence `length` times, with labels
    equal to the inputs (causal-LM style)."""

    def __init__(self, x, length=64):
        self.x = x
        self.length = length

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        # Every index maps to the identical sample.
        return dict.fromkeys(("input_ids", "labels"), self.x)
|
|
|
|
class SequenceClassificationDataset:
    """Random fixed-length (64-token) sequences with integer class labels."""

    def __init__(self, length=64, vocab_size=100, num_labels=5):
        self.length = length
        # Keep the RNG call order: one randint per sequence, then one for labels.
        self.sequences = [torch.randint(0, vocab_size, (64,)).tolist() for _ in range(length)]
        self.labels = torch.randint(0, num_labels, (length,)).tolist()

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        return {"input_ids": self.sequences[i], "label": self.labels[i]}
|
|
|
|
class DynamicShapesDataset:
    """Regression-style dataset whose feature/label vectors change length from
    batch to batch but stay constant within a batch (dynamic-shape tests)."""

    def __init__(self, length=64, seed=42, batch_size=8):
        self.length = length
        np.random.seed(seed)
        sizes = np.random.randint(1, 20, (length // batch_size,))
        # Repeating each size `batch_size` times keeps shapes uniform per batch.
        self.xs = [np.random.normal(size=(s,)).astype(np.float32) for s in sizes.repeat(batch_size)]
        self.ys = [np.random.normal(size=(s,)).astype(np.float32) for s in sizes.repeat(batch_size)]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        return {"input_x": self.xs[i], "labels": self.ys[i]}
|
|
|
|
class AlmostAccuracy:
    """Metric: fraction of predictions within `thresh` of their label."""

    def __init__(self, thresh=0.25):
        self.thresh = thresh

    def __call__(self, eval_pred):
        predictions, labels = eval_pred
        within_thresh = np.abs(predictions - labels) <= self.thresh
        return {"accuracy": within_thresh.astype(np.float32).mean().item()}
|
|
|
|
class AlmostAccuracyBatched:
    """Batch-wise accuracy-within-threshold metric for `batch_eval_metrics`:
    accumulates per-batch means and emits the average when `compute_result`."""

    def __init__(self, thresh=0.25):
        self.thresh = thresh
        self.batch_acc = []

    def __call__(self, eval_pred, compute_result):
        predictions, labels = eval_pred
        # Models may return tuples; the first element carries the values.
        predictions = predictions[0] if isinstance(predictions, tuple) else predictions
        labels = labels[0] if isinstance(labels, tuple) else labels
        batch_size = len(predictions)
        within_thresh = torch.abs(predictions - labels) <= self.thresh
        batch_mean = within_thresh.type(torch.FloatTensor).mean().item()
        # Weight the batch mean by its size so the final average is per-sample.
        self.batch_acc.extend([batch_mean] * batch_size)
        if not compute_result:
            return None
        result = {"accuracy": np.mean(self.batch_acc).item()}
        self.batch_acc = []
        return result
|
|
|
|
class RegressionModelConfig(PretrainedConfig):
    """Config for the toy regression models: `a`/`b` are the initial line
    coefficients; `double_output` makes forward return the prediction twice."""

    def __init__(self, a=0, b=0, double_output=False, random_torch=True, **kwargs):
        super().__init__(**kwargs)
        self.a = a
        self.b = b
        self.double_output = double_output
        # Only consumed by RegressionRandomPreTrainedModel to toggle torch RNG use.
        self.random_torch = random_torch
        self.hidden_size = 1
|
|
|
|
| if is_torch_available(): |
|
|
| class SampleIterableDataset(IterableDataset): |
| def __init__(self, a=2, b=3, length=64, seed=42, label_names=None): |
| self.dataset = RegressionDataset(a=a, b=b, length=length, seed=seed, label_names=label_names) |
|
|
| def __iter__(self): |
| for i in range(len(self.dataset)): |
| yield self.dataset[i] |
|
|
    class FiniteIterableDataset(SampleIterableDataset):
        """Iterable dataset that is permanently exhausted after one pass:
        `current_sample` is never reset, so a second iteration yields nothing
        (used to test end-of-data handling)."""

        def __init__(self, a=2, b=3, length=64, seed=42, label_names=None):
            super().__init__(a, b, length, seed, label_names)
            # Cursor shared across __iter__ calls — deliberately not reset.
            self.current_sample = 0

        def __iter__(self):
            while self.current_sample < len(self.dataset):
                yield self.dataset[self.current_sample]
                self.current_sample += 1
|
|
| class MultiLoader: |
| def __init__(self, loaders): |
| self.loaders = loaders |
|
|
| def __len__(self): |
| return sum(len(loader) for loader in self.loaders) |
|
|
| def __iter__(self): |
| for loader in self.loaders: |
| yield from loader |
|
|
    class CustomDataloaderTrainer(Trainer):
        """Trainer whose train/eval dataloaders are a custom type (two standard
        loaders chained), to test that Trainer tolerates non-DataLoader loaders."""

        def get_train_dataloader(self):
            # Two independent copies of the default loader, iterated back to back.
            dataloaders = [super().get_train_dataloader(), super().get_train_dataloader()]
            return MultiLoader(dataloaders)

        def get_eval_dataloader(self, eval_dataset):
            dataloaders = [super().get_eval_dataloader(eval_dataset), super().get_eval_dataloader(eval_dataset)]
            return MultiLoader(dataloaders)
|
|
| class RegressionModel(nn.Module): |
| def __init__(self, a=0, b=0, double_output=False): |
| super().__init__() |
| self.a = nn.Parameter(torch.tensor(a).float()) |
| self.b = nn.Parameter(torch.tensor(b).float()) |
| self.double_output = double_output |
| self.config = None |
|
|
| def forward(self, input_x, labels=None, **kwargs): |
| y = input_x * self.a + self.b |
| if labels is None: |
| return (y, y) if self.double_output else (y,) |
| loss = nn.functional.mse_loss(y, labels) |
| return (loss, y, y) if self.double_output else (loss, y) |
|
|
| class RegressionDictModel(nn.Module): |
| def __init__(self, a=0, b=0): |
| super().__init__() |
| self.a = nn.Parameter(torch.tensor(a).float()) |
| self.b = nn.Parameter(torch.tensor(b).float()) |
| self.config = None |
|
|
| def forward(self, input_x, labels=None, **kwargs): |
| y = input_x * self.a + self.b |
| result = {"output": y} |
| if labels is not None: |
| result["loss"] = nn.functional.mse_loss(y, labels) |
| return result |
|
|
    class RegressionPreTrainedModel(PreTrainedModel):
        """`y = a * x + b` regression wrapped as a PreTrainedModel so the
        save/load/from_pretrained checkpoint paths can be exercised."""

        config_class = RegressionModelConfig
        base_model_prefix = "regression"

        def __init__(self, config):
            super().__init__(config)
            self.a = nn.Parameter(torch.tensor(config.a).float())
            self.b = nn.Parameter(torch.tensor(config.b).float())
            self.double_output = config.double_output

        def forward(self, input_x, labels=None, **kwargs):
            y = input_x * self.a + self.b
            if labels is None:
                # HF convention: tuple of outputs; duplicated when double_output.
                return (y, y) if self.double_output else (y,)
            loss = nn.functional.mse_loss(y, labels)
            return (loss, y, y) if self.double_output else (loss, y)
|
|
    class RegressionPreTrainedModelWithGradientCheckpointing(PreTrainedModel):
        """Small stack of linear layers that supports gradient checkpointing,
        used to test the Trainer's `gradient_checkpointing` plumbing."""

        config_class = RegressionModelConfig
        base_model_prefix = "regression"
        supports_gradient_checkpointing = True

        def __init__(self, config):
            super().__init__(config)
            self.layers = nn.ModuleList([nn.Linear(config.hidden_size, config.hidden_size) for _ in range(4)])
            self.head = nn.Linear(config.hidden_size, 1)
            self.gradient_checkpointing = False
            self.double_output = config.double_output

        def forward(self, input_x, labels=None, **kwargs):
            y = input_x.unsqueeze(0)

            for layer in self.layers:
                if self.training and self.gradient_checkpointing:
                    # Recompute this layer's activations during backward instead
                    # of storing them (only active in training mode).
                    outputs = self._gradient_checkpointing_func(layer.__call__, y)
                else:
                    outputs = layer(y)

                y = outputs * 3

            logits = self.head(y)

            if labels is None:
                return (logits, logits) if self.double_output else (logits,)

            loss = nn.functional.mse_loss(logits, labels)

            # NOTE(review): this branch returns `y` (last hidden state) rather
            # than `logits`, unlike the no-label branch — confirm intended.
            return (loss, y, y) if self.double_output else (loss, y)
|
|
    class RegressionRandomPreTrainedModel(PreTrainedModel):
        """Regression model that consumes torch/numpy/python RNG state on every
        forward, used to verify full RNG-state restoration when resuming from a
        checkpoint."""

        config_class = RegressionModelConfig
        base_model_prefix = "regression"

        def __init__(self, config):
            super().__init__(config)
            self.a = nn.Parameter(torch.tensor(config.a).float())
            self.b = nn.Parameter(torch.tensor(config.b).float())
            # When False, the torch RNG is left untouched (numpy/python still draw).
            self.random_torch = config.random_torch

        def forward(self, input_x, labels=None, **kwargs):
            y = input_x * self.a + self.b
            if self.random_torch:
                torch_rand = torch.randn(1).squeeze()
            # numpy and python RNGs are always consumed, torch only optionally.
            np_rand = np.random.rand()
            rand_rand = random.random()

            if self.random_torch:
                y += 0.05 * torch_rand
            y += 0.05 * torch.tensor(np_rand + rand_rand)

            if labels is None:
                return (y,)
            loss = nn.functional.mse_loss(y, labels)
            return (loss, y)
|
|
| class BasicTextGenerationModel(nn.Module): |
| def __init__(self, vocab_size, hidden_size): |
| super().__init__() |
| self.embedding = nn.Embedding(vocab_size, hidden_size) |
| self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True) |
| self.fc = nn.Linear(hidden_size, vocab_size) |
|
|
| def forward(self, input_ids, **kwargs): |
| embedded = self.embedding(input_ids) |
| lstm_out, _ = self.lstm(embedded) |
| logits = self.fc(lstm_out) |
| return logits |
|
|
    def create_dummy_dataset_for_text_generation(vocab_size, seq_length, num_samples):
        """Build a random `datasets.Dataset` of token ids with labels equal to
        the inputs, suitable for causal-LM smoke tests."""
        import datasets
        import numpy as np

        # Random token ids in [0, vocab_size).
        input_ids = np.random.randint(0, vocab_size, (num_samples, seq_length))

        # Labels mirror inputs for next-token-prediction losses.
        dataset = datasets.Dataset.from_dict({"input_ids": input_ids, "labels": input_ids})

        return dataset
|
|
    class TstLayer(nn.Module):
        """Small layer with two linears, two layer-norms and a free-standing
        bias, used by optimizer parameter-grouping / weight-decay tests."""

        def __init__(self, hidden_size):
            super().__init__()
            self.linear1 = nn.Linear(hidden_size, hidden_size)
            self.ln1 = nn.LayerNorm(hidden_size)
            self.linear2 = nn.Linear(hidden_size, hidden_size)
            self.ln2 = nn.LayerNorm(hidden_size)
            self.bias = nn.Parameter(torch.zeros(hidden_size))

        def forward(self, x):
            # NOTE(review): this first `h` is immediately overwritten below, so
            # linear1/ln1 never affect the output — presumably only their
            # parameters matter for the grouping tests; confirm intended.
            h = self.ln1(nn.functional.relu(self.linear1(x)))
            h = nn.functional.relu(self.linear2(x))
            return self.ln2(x + h + self.bias)
|
|
    def get_regression_trainer(
        a=0,
        b=0,
        double_output=False,
        train_len=64,
        eval_len=64,
        pretrained=True,
        keep_report_to=False,
        output_dir=None,
        **kwargs,
    ):
        """Build a Trainer over synthetic regression data.

        `kwargs` are split: `model_init`, `compute_metrics`, `data_collator`,
        `optimizers` and `preprocess_logits_for_metrics` are popped and passed
        to Trainer; everything else goes to RegressionTrainingArguments.
        """
        label_names = kwargs.get("label_names", None)
        gradient_checkpointing = kwargs.get("gradient_checkpointing", False)
        train_dataset = RegressionDataset(length=train_len, label_names=label_names)
        eval_dataset = RegressionDataset(length=eval_len, label_names=label_names)

        model_init = kwargs.pop("model_init", None)
        if model_init is not None:
            # With model_init, the Trainer instantiates the model itself
            # (required e.g. for hyperparameter search).
            model = None
        else:
            if pretrained:
                config = RegressionModelConfig(a=a, b=b, double_output=double_output)
                # Checkpointing tests need the variant that supports it.
                target_cls = (
                    RegressionPreTrainedModel
                    if not gradient_checkpointing
                    else RegressionPreTrainedModelWithGradientCheckpointing
                )
                model = target_cls(config)
            else:
                model = RegressionModel(a=a, b=b, double_output=double_output)

        compute_metrics = kwargs.pop("compute_metrics", None)
        data_collator = kwargs.pop("data_collator", None)
        optimizers = kwargs.pop("optimizers", (None, None))
        preprocess_logits_for_metrics = kwargs.pop("preprocess_logits_for_metrics", None)
        assert output_dir is not None, "output_dir should be specified for testing"
        args = RegressionTrainingArguments(output_dir, a=a, b=b, keep_report_to=keep_report_to, **kwargs)
        return Trainer(
            model,
            args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            optimizers=optimizers,
            model_init=model_init,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
|
|
    def get_language_model_trainer(**kwargs):
        """Build a Trainer fine-tuning GPT-2 on a small hub dataset; `kwargs`
        are forwarded verbatim to TrainingArguments (requires network access)."""
        import datasets

        dataset = datasets.load_dataset("fka/awesome-chatgpt-prompts")
        model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        # GPT-2 ships without a pad token; reuse EOS so padding works.
        tokenizer.pad_token = tokenizer.eos_token

        def _tokenize_function(examples):
            model_inputs = tokenizer(examples["prompt"], padding="max_length", truncation=True)
            # Causal LM: labels are the inputs themselves.
            model_inputs["labels"] = np.array(model_inputs["input_ids"]).astype(np.int64)
            return model_inputs

        tokenized_datasets = dataset.map(_tokenize_function, batched=True)
        training_args = TrainingArguments(**kwargs)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
        )

        return trainer
|
|
|
|
class TrainerIntegrationCommon:
    """Shared assertion helpers mixed into the trainer integration test cases."""

    def check_saved_checkpoints(
        self, output_dir, freq, total, is_pretrained=True, safe_weights=True, use_scaler=False
    ):
        """Assert every expected checkpoint dir (each `freq` steps up to `total`,
        exclusive) exists and contains the full set of checkpoint files."""
        weights_file = WEIGHTS_NAME if not safe_weights else SAFE_WEIGHTS_NAME
        file_list = [weights_file, "training_args.bin", "optimizer.pt", "scheduler.pt", "trainer_state.json"]
        if is_pretrained:
            # Only PreTrainedModel checkpoints carry a config.json.
            file_list.append("config.json")
        if use_scaler:
            file_list.append("scaler.pt")
        for step in range(freq, total, freq):
            checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
            self.assertTrue(os.path.isdir(checkpoint))
            for filename in file_list:
                self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename)))

    def check_best_model_has_been_loaded(
        self, output_dir, freq, total, trainer, metric, greater_is_better=False, is_pretrained=True, safe_weights=True
    ):
        """Assert the weights held by `trainer` at the end of training match the
        checkpoint with the best `metric`, and re-evaluating reproduces it."""
        checkpoint = os.path.join(output_dir, f"checkpoint-{(total // freq) * freq}")
        log_history = TrainerState.load_from_json(os.path.join(checkpoint, "trainer_state.json")).log_history

        values = [d[metric] for d in log_history]
        best_value = max(values) if greater_is_better else min(values)
        # Evaluations run every `freq` steps, so history index i is step (i+1)*freq.
        best_checkpoint = (values.index(best_value) + 1) * freq
        checkpoint = os.path.join(output_dir, f"checkpoint-{best_checkpoint}")
        if is_pretrained:
            best_model = RegressionPreTrainedModel.from_pretrained(checkpoint)
            best_model.to(trainer.args.device)
        else:
            best_model = RegressionModel()
            if not safe_weights:
                state_dict = torch.load(os.path.join(checkpoint, WEIGHTS_NAME), weights_only=True)
            else:
                state_dict = safetensors.torch.load_file(os.path.join(checkpoint, SAFE_WEIGHTS_NAME))
            best_model.load_state_dict(state_dict)
            best_model.to(trainer.args.device)
        torch.testing.assert_close(best_model.a, trainer.model.a)
        torch.testing.assert_close(best_model.b, trainer.model.b)

        metrics = trainer.evaluate()
        self.assertEqual(metrics[metric], best_value)

    def check_trainer_state_are_the_same(self, trainer_state, trainer_state1):
        """Assert two serialized trainer states are equal, ignoring timing- and
        throughput-dependent log entries."""
        # Compare on shallow copies so the callers' dicts keep their keys.
        state = trainer_state.copy()
        state1 = trainer_state1.copy()
        # Log histories are compared entry-by-entry below.
        log_history = state.pop("log_history", None)
        log_history1 = state1.pop("log_history", None)
        self.assertEqual(state, state1)
        skip_log_keys = ["train_runtime", "train_samples_per_second", "train_steps_per_second", "train_loss"]
        for log, log1 in zip(log_history, log_history1):
            for key in skip_log_keys:
                _ = log.pop(key, None)
                _ = log1.pop(key, None)
            self.assertEqual(log, log1)

    def convert_to_sharded_checkpoint(self, folder, save_safe=True, load_safe=True):
        """Rewrite the single-file checkpoint in `folder` as one shard per
        tensor plus the matching index file (safetensors or torch format)."""
        # Pick the loader matching the existing on-disk format.
        if load_safe:
            loader = safetensors.torch.load_file
            weights_file = os.path.join(folder, SAFE_WEIGHTS_NAME)
        else:
            loader = torch.load
            weights_file = os.path.join(folder, WEIGHTS_NAME)

        if save_safe:
            extension = "safetensors"
            saver = safetensors.torch.save_file
            index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME)
            shard_name = SAFE_WEIGHTS_NAME
        else:
            extension = "bin"
            saver = torch.save
            index_file = os.path.join(folder, WEIGHTS_INDEX_NAME)
            shard_name = WEIGHTS_NAME

        state_dict = loader(weights_file)

        os.remove(weights_file)
        keys = list(state_dict.keys())

        # One shard per parameter, named like "model-00001-of-00004.safetensors".
        shard_files = [
            shard_name.replace(f".{extension}", f"-{idx + 1:05d}-of-{len(keys):05d}.{extension}")
            for idx in range(len(keys))
        ]
        index = {"metadata": {}, "weight_map": {key: shard_files[i] for i, key in enumerate(keys)}}

        with open(index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)

        for param_name, shard_file in zip(keys, shard_files):
            saver({param_name: state_dict[param_name]}, os.path.join(folder, shard_file))
|
|
|
|
| @require_torch |
| @require_sentencepiece |
| @require_tokenizers |
| class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon): |
| """ |
| Only tests that want to tap into the auto-pre-run 2 trainings: |
| - self.default_trained_model |
| - self.alternate_trained_model |
| directly, or via check_trained_model |
| """ |
|
|
    def setUp(self):
        """Pre-train two reference models (default seed and seed=314) whose
        final (a, b) weights later tests compare against."""
        super().setUp()
        # NOTE(review): ".." as output_dir looks like a placeholder — the args
        # are only read for epochs/batch size, nothing is written there.
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            trainer.train()
            self.default_trained_model = (trainer.model.a, trainer.model.b)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir)
            trainer.train()
            self.alternate_trained_model = (trainer.model.a, trainer.model.b)
|
|
    def check_trained_model(self, model, alternate_seed=False, **kwargs):
        """Assert `model`'s (a, b) match the reference run (default seed, or
        the seed=314 run when `alternate_seed`); kwargs go to assert_close."""
        (a, b) = self.alternate_trained_model if alternate_seed else self.default_trained_model
        torch.testing.assert_close(model.a, a, **kwargs)
        torch.testing.assert_close(model.b, b, **kwargs)
|
|
    def test_reproducible_training(self):
        """Same seed must reproduce the reference weights; a different seed
        must reproduce the alternate reference."""
        # Checking that reproducibility with the default seed holds.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model)

        # And with the alternate seed.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, seed=314, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, alternate_seed=True)
|
|
    def test_trainer_with_datasets(self):
        """Trainer should accept `datasets.Dataset` inputs: numpy-formatted,
        torch-formatted, and with extra columns the model does not use."""
        import datasets

        np.random.seed(42)
        x = np.random.normal(size=(64,)).astype(np.float32)
        y = 2.0 * x + 3.0 + np.random.normal(scale=0.1, size=(64,)).astype(np.float32)
        train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y})

        # Base training. Should match the reference run from setUp.
        model = RegressionModel()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, learning_rate=0.1, report_to="none")
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

            # The dataset can also hand out torch tensors directly.
            train_dataset.set_format(type="torch", dtype=torch.float32)
            model = RegressionModel()
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)

            # An extra column unused by the model must have no impact.
            z = np.random.normal(size=(64,)).astype(np.float32)
            train_dataset = datasets.Dataset.from_dict({"input_x": x, "label": y, "extra": z})
            model = RegressionModel()
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.train()
            self.check_trained_model(trainer.model)
|
|
    def test_model_init(self):
        """`model_init` must be re-invoked on every `train()` call and respect
        the (possibly updated) seed."""
        train_dataset = RegressionDataset()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, learning_rate=0.1, report_to="none")
            trainer = Trainer(args=args, train_dataset=train_dataset, model_init=lambda: RegressionModel())
            trainer.train()
            self.check_trained_model(trainer.model)

            # Re-training restarts from a freshly initialized model.
            trainer.train()
            self.check_trained_model(trainer.model)

            # Changing the seed before re-training yields the alternate reference.
            trainer.args.seed = 314
            trainer.train()
            self.check_trained_model(trainer.model, alternate_seed=True)
|
|
    @slow
    def test_gradient_accumulation_loss_alignment_with_model_loss(self):
        """With model-internal loss, gradient accumulation (2 x batch 4) must
        log the same per-step losses as the plain run; with
        `model_accepts_loss_kwargs` forced off they must diverge."""
        set_seed(42)
        import datasets

        model_name = "nickypro/tinyllama-15M"
        dataset_name = "wikitext"
        dataset_config = "wikitext-2-raw-v1"
        dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:40]")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenizer.pad_token = tokenizer.eos_token

        def tokenize_function(examples):
            return tokenizer(examples["text"], max_length=16, padding="max_length", truncation=True)

        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        args_kwargs = {
            "report_to": "none",
            "logging_steps": 1,
            "max_steps": 5,
            "learning_rate": 3e-4,
            "disable_tqdm": True,
        }

        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                **args_kwargs,
            )
            # Reference run: no gradient accumulation.
            set_seed(42)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            base_loss_callback = StoreLossCallback()
            trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_dataset,
                callbacks=[base_loss_callback],
                data_collator=data_collator,
            )
            assert trainer.model_accepts_loss_kwargs
            trainer.train()

            args = TrainingArguments(
                tmp_dir,
                **args_kwargs,
                gradient_accumulation_steps=2,
                per_device_train_batch_size=4,
            )

            # Same effective batch via accumulation: losses should align.
            set_seed(42)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            grad_accum_loss_callback = StoreLossCallback()
            trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_dataset,
                callbacks=[grad_accum_loss_callback],
                data_collator=data_collator,
            )
            assert trainer.model_accepts_loss_kwargs
            trainer.train()

            # "Broken" run: num_items_in_batch is not forwarded to the loss.
            set_seed(42)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            broken_loss_callback = StoreLossCallback()
            trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_dataset,
                callbacks=[broken_loss_callback],
                data_collator=data_collator,
            )
            # Simulate the pre-fix behavior by disabling loss kwargs.
            trainer.model_accepts_loss_kwargs = False
            trainer.train()

            # Per-step gaps versus the reference run.
            diff_truth = [
                abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
            ]
            diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)]

            # Correct accumulation tracks the reference closely...
            self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")
            # ...while the broken normalization visibly diverges.
            self.assertGreater(max(diff_broken), 0.7, f"Difference {max(diff_broken)} is not greater than 0.7")

            loss_base = sum(base_loss_callback.losses)
            loss_broken = sum(broken_loss_callback.losses)

            # The summed losses still land in the same ballpark.
            relative_diff = abs(loss_base - loss_broken) / max(loss_base, loss_broken)
            self.assertLess(relative_diff, 0.2, f"Relative difference {relative_diff} is not within 0.2")
|
|
    def test_gradient_accumulation_loss_alignment_with_loss_func(self):
        """With a custom `compute_loss_func`, accumulated losses must match the
        plain run; a loss func that ignores `num_items_in_batch` must not."""
        set_seed(42)
        import datasets

        model_name = "roneneldan/TinyStories-33M"
        dataset_name = "wikitext"
        dataset_config = "wikitext-2-raw-v1"
        dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:40]")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenizer.pad_token = tokenizer.eos_token

        def tokenize_function(examples):
            return tokenizer(examples["text"], max_length=16, padding="max_length", truncation=True)

        tokenized_dataset = dataset.map(tokenize_function, batched=True)

        tokenizer.pad_token = tokenizer.eos_token
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        model = AutoModelForCausalLM.from_pretrained(model_name)

        def compute_loss(logits, labels, vocab_size, num_items_in_batch, disable_num_items_in_batch=False):
            # The Trainer hands model outputs as a dict; unwrap the logits.
            return ForCausalLMLoss(
                logits["logits"], labels, vocab_size, num_items_in_batch, disable_num_items_in_batch
            )

        loss_fn = partial(compute_loss, vocab_size=model.config.vocab_size, disable_num_items_in_batch=False)

        base_loss_callback = StoreLossCallback()

        args_kwargs = {
            "report_to": "none",
            "logging_steps": 1,
            "max_steps": 5,
            "learning_rate": 3e-4,
            "disable_tqdm": True,
        }

        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                **args_kwargs,
            )
            # Reference run: no gradient accumulation.
            trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_dataset,
                callbacks=[base_loss_callback],
                compute_loss_func=loss_fn,
                data_collator=data_collator,
            )
            trainer.train()

        grad_accum_loss_callback = StoreLossCallback()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                **args_kwargs,
                gradient_accumulation_steps=2,
                per_device_train_batch_size=4,
            )
            # Same effective batch via accumulation, with the correct loss func.
            set_seed(42)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_dataset,
                callbacks=[grad_accum_loss_callback],
                compute_loss_func=loss_fn,
                data_collator=data_collator,
            )
            trainer.train()

            # "Broken" run: the loss func ignores num_items_in_batch.
            set_seed(42)
            model = AutoModelForCausalLM.from_pretrained(model_name)
            broken_loss_callback = StoreLossCallback()
            loss_fn = partial(compute_loss, vocab_size=model.config.vocab_size, disable_num_items_in_batch=True)
            trainer = Trainer(
                model,
                args,
                train_dataset=tokenized_dataset,
                callbacks=[broken_loss_callback],
                compute_loss_func=loss_fn,
                data_collator=data_collator,
            )
            trainer.train()

            # Per-step gaps versus the reference run.
            diff_truth = [
                abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)
            ]
            diff_broken = [
                abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)
            ]

            # Correct normalization keeps all losses very close...
            self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01")

            # ...while ignoring num_items_in_batch diverges noticeably.
            self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3")
|
|
    def test_gradient_accumulation(self):
        """Accumulation 2 x batch 4 should reproduce the default batch-8 run."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                gradient_accumulation_steps=2, per_device_train_batch_size=4, learning_rate=0.1, output_dir=tmp_dir
            )
            trainer.train()
            self.check_trained_model(trainer.model)
|
|
| def test_gradient_checkpointing(self): |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| trainer = get_regression_trainer( |
| per_device_train_batch_size=1, |
| learning_rate=0.1, |
| gradient_checkpointing=True, |
| gradient_checkpointing_kwargs={"use_reentrant": False}, |
| output_dir=tmp_dir, |
| ) |
| previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()} |
|
|
| trainer.train() |
|
|
| |
| for k, v in trainer.model.named_parameters(): |
| self.assertFalse( |
| torch.allclose(previous_params[k], v, rtol=1e-4, atol=1e-4), |
| f"Model weights for {k} have not been updated", |
| ) |
|
|
    def test_training_loss(self):
        # Scale logging frequency by device count so log boundaries line up the
        # same way regardless of hardware.
        n_gpus = max(1, backend_device_count(torch_device))

        # Even logs: logging_steps divides the 64-sample run exactly, so the
        # final reported train_loss must equal the mean of the logged losses.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(logging_steps=64 / (8 * n_gpus), output_dir=tmp_dir)
            trainer.train()
            log_history = trainer.state.log_history

            losses = [log["loss"] for log in log_history if "loss" in log]
            train_loss = log_history[-1]["train_loss"]
            self.assertAlmostEqual(sum(losses) / len(losses), train_loss, places=4)

        # Uneven logs: the end-of-training train_loss must still agree with the
        # even-log run above.
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(logging_steps=5, output_dir=tmp_dir)
            trainer.train()
            log_history = trainer.state.log_history

            new_train_loss = log_history[-1]["train_loss"]
            self.assertAlmostEqual(train_loss, new_train_loss, places=4)
|
|
| def test_custom_optimizer(self): |
| train_dataset = RegressionDataset() |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| args = TrainingArguments(tmp_dir, report_to="none") |
| model = RegressionModel() |
| optimizer = torch.optim.SGD(model.parameters(), lr=1.0) |
| lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: 1.0) |
| trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler)) |
| trainer.train() |
|
|
| (a, b) = self.default_trained_model |
| self.assertFalse(torch.allclose(trainer.model.a, a)) |
| self.assertFalse(torch.allclose(trainer.model.b, b)) |
| self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0) |
|
|
    def test_lr_scheduler_kwargs(self):
        # lr_scheduler_kwargs must be forwarded to the scheduler factory: the
        # trainer-built polynomial schedule should match one constructed
        # directly with get_polynomial_decay_schedule_with_warmup.
        train_dataset = RegressionDataset()
        model = RegressionModel()
        num_steps, num_warmup_steps = 10, 2
        extra_kwargs = {"power": 5.0, "lr_end": 1e-5}
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                lr_scheduler_type="polynomial",
                lr_scheduler_kwargs=extra_kwargs,
                learning_rate=0.2,
                warmup_steps=num_warmup_steps,
                report_to="none",
            )
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)

            # The scheduler should exist after create_optimizer_and_scheduler.
            self.assertIsNotNone(trainer.lr_scheduler)

            # Compare against a directly built reference schedule. The lambdas
            # are stored as partials (hence .args / .keywords): equal bound
            # arguments mean the two schedules are identical.
            sched1 = trainer.lr_scheduler
            sched2 = get_polynomial_decay_schedule_with_warmup(
                trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs
            )
            self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args)
            self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords)
|
|
    def test_cosine_with_min_lr_scheduler(self):
        # The cosine_with_min_lr schedule must not decay below min_lr: after
        # stepping through the whole schedule the LR should sit exactly at the
        # configured floor.
        train_dataset = RegressionDataset()
        model = RegressionModel()
        num_steps, num_warmup_steps = 10, 2
        extra_kwargs = {"min_lr": 1e-5}
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                lr_scheduler_type="cosine_with_min_lr",
                lr_scheduler_kwargs=extra_kwargs,
                learning_rate=0.2,
                warmup_steps=num_warmup_steps,
                report_to="none",
            )
            trainer = Trainer(model, args, train_dataset=train_dataset)
            trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)

            # The scheduler should exist after create_optimizer_and_scheduler.
            self.assertIsNotNone(trainer.lr_scheduler)

            # Walk the full schedule; the final LR equals min_lr.
            for _ in range(num_steps):
                trainer.lr_scheduler.step()
            self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5)
|
|
| def test_reduce_lr_on_plateau_args(self): |
| |
| train_dataset = RegressionDataset(length=64) |
| eval_dataset = RegressionDataset(length=64) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| args = TrainingArguments( |
| tmp_dir, |
| eval_strategy="epoch", |
| metric_for_best_model="eval_loss", |
| report_to="none", |
| ) |
| model = RegressionModel() |
| optimizer = torch.optim.SGD(model.parameters(), lr=1.0) |
| lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5, cooldown=2) |
| trainer = Trainer( |
| model, |
| args, |
| train_dataset=train_dataset, |
| eval_dataset=eval_dataset, |
| optimizers=(optimizer, lr_scheduler), |
| ) |
| trainer.train() |
|
|
| self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) |
| self.assertEqual(trainer.lr_scheduler.factor, 0.2) |
| self.assertEqual(trainer.lr_scheduler.patience, 5) |
| self.assertEqual(trainer.lr_scheduler.cooldown, 2) |
|
|
    def test_reduce_lr_on_plateau(self):
        # End-to-end check that the "reduce_lr_on_plateau" scheduler only cuts
        # the learning rate after `patience` consecutive epochs without an
        # eval_loss improvement.

        class TrainerWithLRLogs(Trainer):
            def log(self, logs):
                # ReduceLROnPlateau keeps the LR it last applied in _last_lr;
                # record it in the logs so the schedule can be replayed below.
                if hasattr(self.lr_scheduler, "_last_lr"):
                    logs["learning_rate"] = self.lr_scheduler._last_lr[0]
                super().log(logs)

        train_dataset = RegressionDataset(length=64)
        eval_dataset = RegressionDataset(length=64)

        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir,
                lr_scheduler_type="reduce_lr_on_plateau",
                eval_strategy="epoch",
                metric_for_best_model="eval_loss",
                num_train_epochs=10,
                learning_rate=0.2,
                report_to="none",
            )
            model = RegressionModel()
            trainer = TrainerWithLRLogs(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
            trainer.train()

            self.assertIsInstance(trainer.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau)
            patience = trainer.lr_scheduler.patience

            # Replay the logged history: the LR must drop exactly when the
            # plateau rule says it should, and stay flat otherwise.
            logs = trainer.state.log_history[1:]
            best_loss = logs[0]["eval_loss"]
            bad_epochs = 0
            for i, log in enumerate(logs[:-1]):
                loss = log["eval_loss"]
                just_decreased = False
                if loss > best_loss:
                    # No improvement this epoch.
                    bad_epochs += 1
                    if bad_epochs > patience:
                        # Patience exhausted: next log must show a lower LR.
                        self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"])
                        just_decreased = True
                        bad_epochs = 0
                else:
                    best_loss = loss
                    bad_epochs = 0
                if not just_decreased:
                    # Otherwise the LR must not move.
                    self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"])
|
|
    def test_adafactor_lr_none(self):
        # With lr=None Adafactor derives the learning rate internally
        # (relative_step/warmup_init); AdafactorSchedule is the matching proxy
        # scheduler. Training must run and expose a positive effective LR.
        from transformers.optimization import Adafactor, AdafactorSchedule

        train_dataset = RegressionDataset()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, report_to="none")
            model = RegressionModel()
            optimizer = Adafactor(
                model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None
            )
            lr_scheduler = AdafactorSchedule(optimizer)
            trainer = Trainer(model, args, train_dataset=train_dataset, optimizers=(optimizer, lr_scheduler))
            trainer.train()

            # Weights must differ from the default-trained reference run, and
            # the derived LR must be strictly positive.
            (a, b) = self.default_trained_model
            self.assertFalse(torch.allclose(trainer.model.a, a))
            self.assertFalse(torch.allclose(trainer.model.b, b))
            self.assertGreater(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 0)
|
|
    @require_torch_bf16
    @require_torch_accelerator
    def test_mixed_bf16(self):
        # bf16 mixed precision must still converge to the expected regression
        # weights (within the looser ATOL/RTOL tolerances).
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(learning_rate=0.1, bf16=True, output_dir=tmp_dir)
            trainer.train()
            self.check_trained_model(trainer.model, atol=ATOL, rtol=RTOL)

        # Combining bf16 with the apex half-precision backend is rejected with
        # a ValueError at trainer construction time.
        with tempfile.TemporaryDirectory() as tmp_dir:
            with self.assertRaises(ValueError):
                trainer = get_regression_trainer(
                    learning_rate=0.1, bf16=True, half_precision_backend="apex", output_dir=tmp_dir
                )
|
|
| |
|
|
| @require_non_xpu |
| @require_torch_gpu |
| @require_torch_tf32 |
| def test_tf32(self): |
| |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| trainer = get_regression_trainer(learning_rate=0.1, tf32=True, output_dir=tmp_dir) |
| trainer.train() |
| self.check_trained_model(trainer.model) |
|
|
|
|
| @require_torch |
| @require_sentencepiece |
| @require_tokenizers |
| class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): |
    def setUp(self):
        super().setUp()
        # Instantiate default TrainingArguments only to read library defaults
        # used by the step-count assertions below; ".." is a throwaway
        # output_dir — no training happens in setUp.
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size
|
|
| def test_trainer_works_with_dict(self): |
| |
| |
| train_dataset = RegressionDataset() |
| eval_dataset = RegressionDataset() |
| model = RegressionDictModel() |
| args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none") |
| trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) |
| trainer.train() |
| _ = trainer.evaluate() |
| _ = trainer.predict(eval_dataset) |
|
|
    def test_evaluation_with_keys_to_drop(self):
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        eval_dataset = RepeatDataset(x)
        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
        trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
        # By default predict() drops the ignorable output keys and returns a
        # bare logits array.
        result = trainer.predict(eval_dataset)
        self.assertTrue(isinstance(result.predictions, np.ndarray))
        # With ignore_keys=[] nothing is dropped: predictions come back as the
        # full tuple of model outputs (two entries for this GPT-2 config).
        result = trainer.predict(eval_dataset, ignore_keys=[])
        self.assertTrue(isinstance(result.predictions, tuple))
        self.assertEqual(len(result.predictions), 2)
|
|
| def test_training_arguments_are_left_untouched(self): |
| tmp_dir = self.get_auto_remove_tmp_dir() |
| trainer = get_regression_trainer(output_dir=tmp_dir) |
| trainer.train() |
| args = TrainingArguments(tmp_dir, report_to=[]) |
| dict1, dict2 = args.to_dict(), trainer.args.to_dict() |
| for key in dict1.keys(): |
| |
| if key != "logging_dir": |
| self.assertEqual(dict1[key], dict2[key]) |
|
|
| def test_number_of_steps_in_training(self): |
| |
| tmp_dir = self.get_auto_remove_tmp_dir() |
| trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir) |
| train_output = trainer.train() |
| self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size) |
|
|
| |
| trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, output_dir=tmp_dir) |
| train_output = trainer.train() |
| self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size)) |
|
|
| |
| trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, output_dir=tmp_dir) |
| train_output = trainer.train() |
| self.assertEqual(train_output.global_step, 10) |
|
|
    @require_torch_bf16
    @require_intel_extension_for_pytorch
    def test_number_of_steps_in_training_with_ipex(self):
        # Same step-count checks as test_number_of_steps_in_training, but with
        # IPEX enabled on CPU, in both bf16 and fp32 modes.
        for mix_bf16 in [True, False]:
            tmp_dir = self.get_auto_remove_tmp_dir()
            # Regular run: global_step = epochs * dataset_size / batch_size.
            trainer = get_regression_trainer(
                learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
            )
            train_output = trainer.train()
            self.assertEqual(train_output.global_step, self.n_epochs * 64 / trainer.args.train_batch_size)

            # Fractional epoch counts truncate to whole optimizer steps.
            trainer = get_regression_trainer(
                learning_rate=0.1,
                num_train_epochs=1.5,
                use_ipex=True,
                bf16=mix_bf16,
                use_cpu=True,
                output_dir=tmp_dir,
            )
            train_output = trainer.train()
            self.assertEqual(train_output.global_step, int(1.5 * 64 / trainer.args.train_batch_size))

            # An explicit max_steps takes precedence.
            trainer = get_regression_trainer(
                learning_rate=0.1, max_steps=10, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
            )
            train_output = trainer.train()
            self.assertEqual(train_output.global_step, 10)
|
|
| def test_torch_compile_loss_func_compatibility(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
|
|
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| per_device_train_batch_size=2, |
| torch_compile=True, |
| max_steps=1, |
| ) |
| trainer = Trainer(model=tiny_llama, args=args, train_dataset=train_dataset) |
| trainer.train() |
|
|
    @require_peft
    @require_bitsandbytes
    def test_bnb_compile(self):
        from peft import LoraConfig, get_peft_model

        # Building a Trainer around a torch.compile()-wrapped 4-bit quantized
        # PEFT model is unsupported and must raise a ValueError.
        tiny_model = AutoModelForCausalLM.from_pretrained(
            "hf-internal-testing/tiny-random-LlamaForCausalLM", load_in_4bit=True
        )

        peft_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        tiny_model = get_peft_model(tiny_model, peft_config)

        # The torch.compile wrapper on the quantized adapter model is what the
        # Trainer constructor rejects below.
        tiny_model = torch.compile(tiny_model)

        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
        )
        with self.assertRaises(ValueError):
            _ = Trainer(tiny_model, args, train_dataset=train_dataset)
|
|
    @require_peft
    def test_multiple_peft_adapters(self):
        from peft import LoraConfig, get_peft_model

        # Resuming from a checkpoint must restore trainer state and weights for
        # a model carrying more than one LoRA adapter.
        MODEL_ID = "hf-internal-testing/tiny-random-LlamaForCausalLM"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        tiny_model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

        peft_config = LoraConfig(
            r=4,
            lora_alpha=16,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        # Two adapters sharing the same config, added under distinct names.
        tiny_model = get_peft_model(tiny_model, peft_config, "adapter1")
        tiny_model.add_adapter("adapter2", peft_config)

        train_dataset = LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=PATH_SAMPLE_TEXT,
            block_size=tokenizer.max_len_single_sentence,
        )
        # Causal-LM style labels: the targets are the input ids themselves.
        for example in train_dataset.examples:
            example["labels"] = example["input_ids"]

        tokenizer.pad_token = tokenizer.eos_token

        # Tiny LR plus save_steps=5 over max_steps=10 gives a mid-run
        # checkpoint to resume from.
        tmp_dir = self.get_auto_remove_tmp_dir()
        args = TrainingArguments(
            tmp_dir,
            per_device_train_batch_size=1,
            learning_rate=1e-9,
            save_steps=5,
            logging_steps=5,
            max_steps=10,
            use_cpu=True,
        )
        trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)

        trainer.train()
        parameters = dict(tiny_model.named_parameters())
        state = dataclasses.asdict(trainer.state)

        # Train again, this time resuming from the step-5 checkpoint.
        trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)

        checkpoint = os.path.join(tmp_dir, "checkpoint-5")

        trainer.train(resume_from_checkpoint=checkpoint)
        parameters1 = dict(tiny_model.named_parameters())
        state1 = dataclasses.asdict(trainer.state)
        # Weights and trainer state must match the uninterrupted run.
        self.assertEqual(parameters, parameters1)
        self.check_trainer_state_are_the_same(state, state1)
|
|
| @require_bitsandbytes |
| def test_rmsprop_bnb(self): |
| config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) |
| tiny_gpt2 = GPT2LMHeadModel(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| logging_nan_inf_filter=False, |
| optim="rmsprop_bnb", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
|
|
| @require_bitsandbytes |
| def test_ademamix_bnb(self): |
| config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) |
| tiny_gpt2 = GPT2LMHeadModel(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| logging_nan_inf_filter=False, |
| optim="ademamix", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
|
|
| @require_bitsandbytes |
| def test_ademamix_bnb_8bit(self): |
| config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) |
| tiny_gpt2 = GPT2LMHeadModel(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| logging_nan_inf_filter=False, |
| optim="ademamix_8bit", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
|
|
| @require_bitsandbytes |
| def test_rmsprop_bnb_8bit(self): |
| config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) |
| tiny_gpt2 = GPT2LMHeadModel(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| logging_nan_inf_filter=False, |
| optim="rmsprop_bnb_8bit", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
|
|
| @require_bitsandbytes |
| def test_rmsprop_bnb_32bit(self): |
| config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) |
| tiny_gpt2 = GPT2LMHeadModel(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| logging_nan_inf_filter=False, |
| optim="rmsprop_bnb_32bit", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
|
|
    def test_neftune(self):
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        # Trainer with NEFTune noise enabled (alpha=0.4).
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            neftune_noise_alpha=0.4,
            report_to="none",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

        trainer.model = trainer._activate_neftune(trainer.model)

        dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)

        # While NEFTune is active the embedding forward is noisy: two passes
        # over the same input must differ.
        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)

        self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")

        # Redo with a fresh model to exercise the full train() lifecycle.
        tiny_gpt2 = GPT2LMHeadModel(config)
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            neftune_noise_alpha=0.4,
            report_to="none",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)

        trainer.train()

        # After training the NEFTune forward hook must have been removed.
        _ = trainer.model(dummy_input)
        self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0)

        trainer.model.eval()

        # With the hook gone, embedding passes are deterministic again.
        emb1 = trainer.model.get_input_embeddings()(dummy_input)
        emb2 = trainer.model.get_input_embeddings()(dummy_input)
        torch.testing.assert_close(emb1, emb2)
|
|
| def test_logging_inf_nan_filter(self): |
| config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4) |
| tiny_gpt2 = GPT2LMHeadModel(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e9, |
| logging_steps=5, |
| logging_nan_inf_filter=False, |
| report_to="none", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
| trainer.train() |
| log_history_no_filter = trainer.state.log_history |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e9, |
| logging_steps=5, |
| logging_nan_inf_filter=True, |
| report_to="none", |
| ) |
| trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset) |
| trainer.train() |
| log_history_filter = trainer.state.log_history |
|
|
| def is_any_loss_nan_or_inf(log_history): |
| losses = [l["loss"] for l in log_history[:-1]] |
| return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses) |
|
|
| self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter)) |
| self.assertFalse(is_any_loss_nan_or_inf(log_history_filter)) |
|
|
    def test_train_and_eval_dataloaders(self):
        # The effective (total) batch size scales with the device count on CUDA.
        if torch_device == "cuda":
            n_gpu = max(1, backend_device_count(torch_device))
        else:
            n_gpu = 1

        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
        self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
        trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
        self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu)

        # Dataset sizes that don't divide the batch size: by default the last,
        # smaller batch is kept — hence the +1 in the dataloader length.
        trainer = get_regression_trainer(
            train_len=66,
            eval_len=74,
            learning_rate=0.1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            output_dir=tmp_dir,
        )
        self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu) + 1)
        self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu) + 1)

        # With dataloader_drop_last=True the trailing partial batch is dropped.
        trainer = get_regression_trainer(
            train_len=66,
            eval_len=74,
            learning_rate=0.1,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            dataloader_drop_last=True,
            output_dir=tmp_dir,
        )
        self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu))
        self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu))

        # get_eval_dataloader also accepts an ad-hoc dataset argument.
        new_eval_dataset = RegressionDataset(length=128)
        self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu))
|
|
| |
| def test_dataloader_without_dataset(self): |
| train_dataset = RegressionDataset(length=128) |
| trainer = CustomDataloaderTrainer( |
| model=RegressionModel(), |
| train_dataset=train_dataset, |
| eval_dataset=train_dataset, |
| args=TrainingArguments(output_dir=self.get_auto_remove_tmp_dir(), report_to="none"), |
| ) |
|
|
| trainer.train() |
| trainer.evaluate() |
|
|
    def test_get_eval_dataloader_without_persistent_workers(self):
        # Without persistent workers, get_eval_dataloader() must build a fresh
        # dataloader on every call (same underlying dataset, distinct objects).
        train_dataset = RegressionDataset()
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none", dataloader_persistent_workers=False)

        # Single eval dataset case.
        eval_dataset = RegressionDataset()
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
        # Stub out accelerator.prepare so the raw dataloaders can be compared.
        trainer.accelerator.prepare = lambda x: x

        default_dataloader = trainer.get_eval_dataloader()
        dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset)

        self.assertEqual(default_dataloader.dataset, eval_dataset)
        self.assertEqual(dataloader_with_dataset.dataset, eval_dataset)
        # Distinct dataloader objects: nothing is cached.
        self.assertNotEqual(default_dataloader, dataloader_with_dataset)

        # Multiple named eval datasets, selected by key.
        first_dataset = RegressionDataset()
        second_dataset = RegressionDataset()
        trainer = Trainer(
            tiny_gpt2,
            args,
            train_dataset=train_dataset,
            eval_dataset={"first": first_dataset, "second": second_dataset},
        )
        # Stub out accelerator.prepare so the raw dataloaders can be compared.
        trainer.accelerator.prepare = lambda x: x

        first_dataloader = trainer.get_eval_dataloader("first")
        first_dataloader_repeated = trainer.get_eval_dataloader("first")
        second_dataloader = trainer.get_eval_dataloader("second")
        second_dataloader_repeated = trainer.get_eval_dataloader("second")

        # Each key resolves to its own dataset; repeated calls rebuild.
        self.assertEqual(first_dataset, first_dataloader.dataset)
        self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset)
        self.assertEqual(second_dataset, second_dataloader.dataset)
        self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset)
        self.assertNotEqual(first_dataloader, first_dataloader_repeated)
        self.assertNotEqual(second_dataloader, second_dataloader_repeated)
|
|
    def test_get_eval_dataloader_with_persistent_workers(self):
        # With persistent workers, get_eval_dataloader() must return the SAME
        # dataloader object across calls so worker processes are reused.
        train_dataset = RegressionDataset()
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(),
            report_to="none",
            dataloader_persistent_workers=True,
            dataloader_num_workers=2,
        )

        # Single eval dataset case.
        eval_dataset = RegressionDataset()
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
        # Stub out accelerator.prepare so the raw dataloaders can be compared.
        trainer.accelerator.prepare = lambda x: x

        default_dataloader = trainer.get_eval_dataloader()
        dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset)

        self.assertEqual(default_dataloader.dataset, eval_dataset)
        self.assertEqual(dataloader_with_dataset.dataset, eval_dataset)
        # Same object: the dataloader is cached and reused.
        self.assertEqual(default_dataloader, dataloader_with_dataset)

        # Multiple named eval datasets, selected by key.
        first_dataset = RegressionDataset()
        second_dataset = RegressionDataset()
        trainer = Trainer(
            tiny_gpt2,
            args,
            train_dataset=train_dataset,
            eval_dataset={"first": first_dataset, "second": second_dataset},
        )
        # Stub out accelerator.prepare so the raw dataloaders can be compared.
        trainer.accelerator.prepare = lambda x: x

        first_dataloader = trainer.get_eval_dataloader("first")
        first_dataloader_repeated = trainer.get_eval_dataloader("first")
        second_dataloader = trainer.get_eval_dataloader("second")
        second_dataloader_repeated = trainer.get_eval_dataloader("second")

        # Each key resolves to its own dataset; repeated calls reuse the
        # cached dataloader per key.
        self.assertEqual(first_dataset, first_dataloader.dataset)
        self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset)
        self.assertEqual(second_dataset, second_dataloader.dataset)
        self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset)
        self.assertEqual(first_dataloader, first_dataloader_repeated)
        self.assertEqual(second_dataloader, second_dataloader_repeated)
|
|
    @require_liger_kernel
    def test_use_liger_kernel_patching(self):
        # NOTE(review): the patch() context presumably isolates liger's
        # module-level monkey-patching so it is reverted after the test —
        # confirm against mock.patch semantics for module targets.
        with patch("transformers.models.llama.modeling_llama"):
            from liger_kernel.transformers import LigerRMSNorm, liger_rotary_pos_emb

            from transformers.models.llama import modeling_llama

            config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
            tiny_llama = LlamaForCausalLM(config)

            # Before Trainer construction nothing is patched yet.
            self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
            self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))

            args = TrainingArguments(
                self.get_auto_remove_tmp_dir(),
                use_liger_kernel=True,
            )
            Trainer(tiny_llama, args)

            # Constructing the Trainer with use_liger_kernel=True applies the
            # liger kernels: the rotary-embedding function is swapped at module
            # level and the model's RMSNorm is replaced in place.
            self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
            self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm))
|
|
| @require_liger_kernel |
| @require_torch_accelerator |
| def test_use_liger_kernel_trainer(self): |
| |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
|
|
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
    @require_lomo
    @require_torch_gpu
    def test_lomo(self):
        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)

        # Snapshot initial weights so we can verify LOMO actually updates them.
        previous_params = {n: p.clone() for n, p in tiny_llama.named_parameters()}

        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        # Trainer with the LOMO optimizer for a short 20-step run.
        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20
        )
        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)

        # A crash anywhere in the LOMO path fails the test.
        _ = trainer.train()

        # Every parameter must have moved from its initial value.
        for name, param in tiny_llama.named_parameters():
            self.assertFalse(torch.allclose(param, previous_params[name].to(param.device), rtol=1e-12, atol=1e-12))
|
|
| @require_lomo |
| @require_torch_gpu |
| def test_adalomo(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="adalomo", |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_grokadamw |
| @require_torch_accelerator |
| def test_grokadamw(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=2e-5, |
| logging_steps=5, |
| optim="grokadamw", |
| max_steps=20, |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_schedulefree |
| @require_torch_accelerator |
| def test_schedulefree_adam(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| with tempfile.TemporaryDirectory() as tmpdir: |
| |
| args = TrainingArguments( |
| tmpdir, |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="schedule_free_adamw", |
| lr_scheduler_type="constant", |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_schedulefree |
| @require_torch_accelerator |
| def test_schedulefree_radam(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| with tempfile.TemporaryDirectory() as tmpdir: |
| |
| args = TrainingArguments( |
| tmpdir, |
| learning_rate=1e-9, |
| logging_steps=5, |
| lr_scheduler_type="constant", |
| optim="schedule_free_radam", |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| def test_galore_matched_modules(self): |
| regex_patterns = [r".*.attn.*", r".*.mlp.*"] |
|
|
| module_names = [ |
| "model.transformer.h.0.ln_1", |
| "model.transformer.h.0.attn.q_proj", |
| "model.lm_head", |
| "model.transformer.h.0.mlp.up_proj", |
| ] |
| expected_values = [False, True, False, True] |
|
|
| for expected_value, module_name in zip(expected_values, module_names): |
| is_module_matched, is_regex = check_target_module_exists(regex_patterns, module_name, return_is_regex=True) |
| self.assertTrue(is_module_matched == expected_value) |
| if is_module_matched: |
| self.assertTrue(is_regex) |
|
|
| exact_patterns = ["q_proj", "up_proj"] |
|
|
| module_names = [ |
| "model.transformer.h.0.ln_1", |
| "model.transformer.h.0.attn.q_proj", |
| "model.lm_head", |
| "model.transformer.h.0.mlp.up_proj", |
| ] |
| expected_values = [False, True, False, True] |
|
|
| for expected_value, module_name in zip(expected_values, module_names): |
| is_module_matched, is_regex = check_target_module_exists(exact_patterns, module_name, return_is_regex=True) |
| self.assertTrue(is_module_matched == expected_value) |
| if is_module_matched: |
| self.assertFalse(is_regex) |
|
|
| simple_regex = r".*.attn.*" |
|
|
| module_names = [ |
| "model.transformer.h.0.ln_1", |
| "model.transformer.h.0.attn.q_proj", |
| "model.lm_head", |
| "model.transformer.h.0.mlp.up_proj", |
| ] |
| expected_values = [False, True, False, False] |
|
|
| for expected_value, module_name in zip(expected_values, module_names): |
| is_module_matched, is_regex = check_target_module_exists(simple_regex, module_name, return_is_regex=True) |
| self.assertTrue(is_module_matched == expected_value) |
| if is_module_matched: |
| self.assertTrue(is_regex) |
|
|
| simple_regex = "model.transformer.h.0.attn.q_proj" |
|
|
| module_names = [ |
| "model.transformer.h.0.ln_1", |
| "model.transformer.h.0.attn.q_proj", |
| "model.lm_head", |
| "model.transformer.h.0.mlp.up_proj", |
| ] |
| expected_values = [False, True, False, False] |
|
|
| for expected_value, module_name in zip(expected_values, module_names): |
| is_module_matched, is_regex = check_target_module_exists(simple_regex, module_name, return_is_regex=True) |
| self.assertTrue(is_module_matched == expected_value) |
| if is_module_matched: |
| self.assertFalse(is_regex) |
|
|
| target_modules = ["attn", "mlp"] |
|
|
| module_names = [ |
| "model.transformer.h.0.ln_1", |
| "model.transformer.h.0.attn.q_proj", |
| "model.lm_head", |
| "model.transformer.h.0.mlp.up_proj", |
| ] |
| expected_values = [False, True, False, True] |
|
|
| for expected_value, module_name in zip(expected_values, module_names): |
| is_module_matched, is_regex = check_target_module_exists(target_modules, module_name, return_is_regex=True) |
| self.assertTrue(is_module_matched == expected_value) |
| if is_module_matched: |
| self.assertFalse(is_regex) |
|
|
| @require_galore_torch |
| @require_torch_gpu |
| def test_galore(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="galore_adamw", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_galore_torch |
| @require_torch_gpu |
| def test_galore_extra_args(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="galore_adamw", |
| optim_args="rank=64, update_proj_gap=100, scale=0.10", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_galore_torch |
| @require_torch_gpu |
| def test_galore_layerwise(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="galore_adamw_layerwise", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_galore_torch |
| @require_torch_gpu |
| def test_galore_layerwise_with_scheduler(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="galore_adamw_layerwise", |
| lr_scheduler_type="cosine", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_galore_torch |
| @require_torch_gpu |
| def test_galore_adamw_8bit(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="galore_adamw_8bit", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
    @require_galore_torch
    @require_torch_gpu
    def test_galore_adafactor(self):
        """GaLore + Adafactor: train a tiny Llama model and check peak GPU memory stays in band."""
        # Expected peak-memory window. Above the upper bound would indicate a memory
        # regression; below the lower bound would suggest the setup isn't exercising
        # the optimizer as intended. Presumably both are in MB (begin is converted
        # via bytes2megabytes below) — TODO confirm the unit of `tracemalloc.peaked`.
        upper_bound_pm = 700
        lower_bound_pm = 650

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        # TorchTracemalloc records memory usage across the whole training run.
        with tempfile.TemporaryDirectory() as tmpdir, TorchTracemalloc() as tracemalloc:
            # Tiny LR: this is a smoke/memory test, not a convergence test.
            args = TrainingArguments(
                tmpdir,
                learning_rate=1e-9,
                logging_steps=5,
                optim="galore_adafactor",
                optim_target_modules=[r".*attn.*", r".*mlp.*"],
            )
            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)

            # Training must complete without raising.
            _ = trainer.train()

        # Total peak = peak delta during training + memory already allocated at entry.
        galore_peak_memory = tracemalloc.peaked + bytes2megabytes(tracemalloc.begin)

        self.assertTrue(galore_peak_memory < upper_bound_pm)
        self.assertTrue(lower_bound_pm < galore_peak_memory)
|
|
    @require_galore_torch
    @require_torch_gpu
    def test_galore_adafactor_attention_only(self):
        """GaLore + Adafactor targeting only attention projections: memory must stay in band."""
        # Expected peak-memory window (see test_galore_adafactor for the rationale).
        upper_bound_pm = 700
        lower_bound_pm = 650

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        with tempfile.TemporaryDirectory() as tmpdir, TorchTracemalloc() as tracemalloc:
            # Target modules given as exact layer names instead of regexes.
            args = TrainingArguments(
                tmpdir,
                learning_rate=1e-9,
                logging_steps=5,
                optim="galore_adafactor",
                optim_target_modules=["q_proj", "k_proj", "v_proj"],
            )
            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)

            # Training must complete without raising.
            _ = trainer.train()

        # Total peak = peak delta during training + memory already allocated at entry.
        galore_peak_memory = tracemalloc.peaked + bytes2megabytes(tracemalloc.begin)
        self.assertTrue(galore_peak_memory < upper_bound_pm)
        self.assertTrue(lower_bound_pm < galore_peak_memory)
|
|
    @require_galore_torch
    @require_torch_gpu
    def test_galore_adafactor_all_linear(self):
        """GaLore + Adafactor with the "all-linear" shortcut: memory must stay in band."""
        # Expected peak-memory window (see test_galore_adafactor for the rationale).
        upper_bound_pm = 700
        lower_bound_pm = 650

        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)

        with tempfile.TemporaryDirectory() as tmpdir, TorchTracemalloc() as tracemalloc:
            # "all-linear" targets every linear layer rather than an explicit list.
            args = TrainingArguments(
                tmpdir,
                learning_rate=1e-9,
                logging_steps=5,
                optim="galore_adafactor",
                optim_target_modules="all-linear",
            )
            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)

            # Training must complete without raising.
            _ = trainer.train()

        # Total peak = peak delta during training + memory already allocated at entry.
        galore_peak_memory = tracemalloc.peaked + bytes2megabytes(tracemalloc.begin)
        self.assertTrue(galore_peak_memory < upper_bound_pm)
        self.assertTrue(lower_bound_pm < galore_peak_memory)
|
|
| @require_galore_torch |
| @require_torch_accelerator |
| def test_galore_lr_display_without_scheduler(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| learning_rate = 1e-9 |
| num_steps = 10 |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=learning_rate, |
| logging_steps=5, |
| optim="galore_adamw", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
| trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) |
|
|
| |
| self.assertEqual(trainer.get_learning_rates(), [learning_rate, learning_rate]) |
|
|
| @require_galore_torch |
| @require_torch_accelerator |
| def test_galore_lr_display_with_scheduler(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| learning_rate = 2e-4 |
| num_train_epochs = 2 |
| num_warmup_steps = 5 |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| num_train_epochs=num_train_epochs, |
| learning_rate=learning_rate, |
| warmup_steps=num_warmup_steps, |
| lr_scheduler_type="cosine", |
| logging_steps=1, |
| optim="galore_adamw", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
| logs = trainer.state.log_history[1:-1] |
|
|
| |
| self.assertTrue(logs[num_warmup_steps - 1]["learning_rate"] == learning_rate) |
| |
| self.assertTrue(np.allclose(logs[-1]["learning_rate"], 0, atol=5e-6)) |
|
|
| |
| increasing_lrs = [ |
| logs[i]["learning_rate"] < logs[i + 1]["learning_rate"] |
| for i in range(len(logs)) |
| if i < num_warmup_steps - 1 |
| ] |
| decreasing_lrs = [ |
| logs[i]["learning_rate"] > logs[i + 1]["learning_rate"] |
| for i in range(len(logs) - 1) |
| if i >= num_warmup_steps - 1 |
| ] |
|
|
| self.assertTrue(all(increasing_lrs)) |
| self.assertTrue(all(decreasing_lrs)) |
|
|
| |
| self.assertTrue(len(decreasing_lrs) > len(increasing_lrs)) |
|
|
| @require_apollo_torch |
| @require_torch_gpu |
| def test_apollo(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="apollo_adamw", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_apollo_torch |
| @require_torch_gpu |
| def test_apollo_extra_args(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="apollo_adamw", |
| optim_args="proj=random,scale_type=tensor,rank=1,update_proj_gap=100,scale=128.0", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_apollo_torch |
| @require_torch_gpu |
| def test_apollo_layerwise(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="apollo_adamw_layerwise", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_apollo_torch |
| @require_torch_gpu |
| def test_apollo_layerwise_with_scheduler(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=1e-9, |
| logging_steps=5, |
| optim="apollo_adamw_layerwise", |
| lr_scheduler_type="cosine", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| _ = trainer.train() |
|
|
| @require_apollo_torch |
| @require_torch_gpu |
| def test_apollo_lr_display_without_scheduler(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| learning_rate = 1e-9 |
| num_steps = 10 |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| learning_rate=learning_rate, |
| logging_steps=5, |
| optim="apollo_adamw", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
| trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) |
|
|
| |
| self.assertEqual(trainer.get_learning_rates(), [learning_rate, learning_rate]) |
|
|
| @require_apollo_torch |
| @require_torch_gpu |
| def test_apollo_lr_display_with_scheduler(self): |
| config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) |
| tiny_llama = LlamaForCausalLM(config) |
| x = torch.randint(0, 100, (128,)) |
| train_dataset = RepeatDataset(x) |
|
|
| learning_rate = 2e-4 |
| num_train_epochs = 10 |
| num_warmup_steps = 5 |
|
|
| |
| args = TrainingArguments( |
| self.get_auto_remove_tmp_dir(), |
| num_train_epochs=num_train_epochs, |
| learning_rate=learning_rate, |
| warmup_steps=num_warmup_steps, |
| lr_scheduler_type="cosine", |
| logging_steps=1, |
| optim="apollo_adamw", |
| optim_target_modules=[r".*attn.*", r".*mlp.*"], |
| ) |
| trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) |
|
|
| |
| trainer.train() |
| logs = trainer.state.log_history[1:][:-1] |
|
|
| |
| self.assertTrue(logs[num_warmup_steps - 2]["learning_rate"] == learning_rate) |
| self.assertTrue(logs[-1]["learning_rate"] == 0) |
|
|
| |
| increasing_lrs = [ |
| logs[i]["learning_rate"] < logs[i + 1]["learning_rate"] |
| for i in range(len(logs)) |
| if i < num_warmup_steps - 2 |
| ] |
| decreasing_lrs = [ |
| logs[i]["learning_rate"] > logs[i + 1]["learning_rate"] |
| for i in range(len(logs) - 1) |
| if i >= num_warmup_steps - 2 |
| ] |
|
|
| self.assertTrue(all(increasing_lrs)) |
| self.assertTrue(all(decreasing_lrs)) |
|
|
| |
| self.assertTrue(len(decreasing_lrs) > len(increasing_lrs)) |
|
|
    @require_torch_multi_accelerator
    def test_data_is_not_parallelized_when_model_is_parallel(self):
        """If the model is already model-parallel, the Trainer must not also data-parallelize it."""
        model = RegressionModel()
        # Simulate a model-parallel model by setting the flags the Trainer inspects.
        model.is_parallelizable = True
        model.model_parallel = True
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
                tmp_dir, per_device_train_batch_size=16, per_device_eval_batch_size=16, report_to="none"
            )
            trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset())
            # Model parallelism must be detected, and n_gpu forced to 1 so batches
            # are not multiplied across devices.
            self.assertTrue(trainer.is_model_parallel)
            self.assertEqual(trainer.args.n_gpu, 1)

            # Dataloader batch sizes must stay at 16 (not 16 * n_gpu), so the
            # 64-sample RegressionDataset yields 64 // 16 batches.
            self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16)
            self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16)
            self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16)
            self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16)
|
|
    def test_evaluate(self):
        """Trainer.evaluate must reproduce the closed-form loss/accuracy of the regression model."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir)
            results = trainer.evaluate()

            # The regression model predicts exactly a * x + b, so the expected
            # loss (MSE) and accuracy can be recomputed here in closed form.
            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # Same check with an eval set whose size (66) is not a multiple of the batch size.
            trainer = get_regression_trainer(
                a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), output_dir=tmp_dir
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # With preprocess_logits_for_metrics: metrics must see logits shifted by +1.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                compute_metrics=AlmostAccuracy(),
                preprocess_logits_for_metrics=lambda logits, labels: logits + 1,
                output_dir=tmp_dir,
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
|
|
    def test_evaluate_with_batch_eval_metrics(self):
        """Same as test_evaluate, but with `batch_eval_metrics=True` (metrics computed per batch)."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir
            )
            results = trainer.evaluate()

            # Batched metric accumulation must yield the same result as computing
            # the metric over all predictions at once.
            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # Same check with an eval set whose size (66) is not a multiple of the batch size.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                eval_len=66,
                compute_metrics=AlmostAccuracyBatched(),
                batch_eval_metrics=True,
                output_dir=tmp_dir,
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # With preprocess_logits_for_metrics: metrics must see logits shifted by +1.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                compute_metrics=AlmostAccuracyBatched(),
                batch_eval_metrics=True,
                preprocess_logits_for_metrics=lambda logits, labels: logits + 1,
                output_dir=tmp_dir,
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
|
|
    def test_evaluate_with_jit(self):
        """Same as test_evaluate, but with `jit_mode_eval=True` (torch JIT used for evaluation)."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), jit_mode_eval=True, output_dir=tmp_dir
            )
            results = trainer.evaluate()

            # JIT-mode evaluation must match the closed-form loss/accuracy.
            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # Same check with an eval set whose size (66) is not a multiple of the batch size.
            trainer = get_regression_trainer(
                a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracy(), jit_mode_eval=True, output_dir=tmp_dir
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

            # With preprocess_logits_for_metrics: metrics must see logits shifted by +1.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                compute_metrics=AlmostAccuracy(),
                preprocess_logits_for_metrics=lambda logits, labels: logits + 1,
                jit_mode_eval=True,
                output_dir=tmp_dir,
            )
            results = trainer.evaluate()

            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            pred = 1.5 * x + 2.5
            expected_loss = ((pred - y) ** 2).mean()
            self.assertAlmostEqual(results["eval_loss"], expected_loss)
            expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]
            self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
|
|
    @require_torch_bf16
    @require_intel_extension_for_pytorch
    def test_evaluate_with_ipex(self):
        """Same as test_evaluate, but on CPU with IPEX enabled, with and without bf16 autocast."""
        for mix_bf16 in [True, False]:
            with tempfile.TemporaryDirectory() as tmp_dir:
                trainer = get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    use_ipex=True,
                    compute_metrics=AlmostAccuracy(),
                    bf16=mix_bf16,
                    use_cpu=True,
                    output_dir=tmp_dir,
                )
                results = trainer.evaluate()

                # IPEX evaluation must match the closed-form loss/accuracy.
                x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
                pred = 1.5 * x + 2.5
                expected_loss = ((pred - y) ** 2).mean()
                self.assertAlmostEqual(results["eval_loss"], expected_loss)
                expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
                self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

                # Same check with an eval set whose size (66) is not a multiple of the batch size.
                trainer = get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    use_ipex=True,
                    eval_len=66,
                    compute_metrics=AlmostAccuracy(),
                    bf16=mix_bf16,
                    use_cpu=True,
                    output_dir=tmp_dir,
                )
                results = trainer.evaluate()

                x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
                pred = 1.5 * x + 2.5
                expected_loss = ((pred - y) ** 2).mean()
                self.assertAlmostEqual(results["eval_loss"], expected_loss)
                expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
                self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

                # With preprocess_logits_for_metrics: metrics must see logits shifted by +1.
                trainer = get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    use_ipex=True,
                    compute_metrics=AlmostAccuracy(),
                    preprocess_logits_for_metrics=lambda logits, labels: logits + 1,
                    bf16=mix_bf16,
                    use_cpu=True,
                    output_dir=tmp_dir,
                )
                results = trainer.evaluate()

                x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
                pred = 1.5 * x + 2.5
                expected_loss = ((pred - y) ** 2).mean()
                self.assertAlmostEqual(results["eval_loss"], expected_loss)
                expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]
                self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
|
|
    def test_predict(self):
        """Trainer.predict must return the model's exact predictions (and labels when multi-output)."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(a=1.5, b=2.5, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            # The regression model predicts exactly a * x + b.
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # Same check with an eval set whose size (66) is not a multiple of the batch size.
            trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # With a model returning two outputs, predictions is a 2-tuple.
            trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))

            # With two label sets, label_ids is a 2-tuple matching the dataset's targets.
            trainer = get_regression_trainer(
                a=1.5, b=2.5, double_output=True, label_names=["labels", "labels_2"], output_dir=tmp_dir
            )
            outputs = trainer.predict(trainer.eval_dataset)
            preds = outputs.predictions
            labels = outputs.label_ids
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
            self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))
            self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))
|
|
    def test_predict_with_batch_eval_metrics(self):
        """Same as test_predict, but with `batch_eval_metrics=True` (metrics computed per batch)."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True, output_dir=tmp_dir
            )
            results = trainer.predict(trainer.eval_dataset)
            preds = results.predictions
            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            # The regression model predicts exactly a * x + b; the batched metric
            # must equal the metric computed over all predictions at once.
            gt = 1.5 * x + 2.5
            self.assertTrue(np.allclose(preds, gt))
            expected_acc = AlmostAccuracy()((preds, y))["accuracy"]
            self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc)

            # Same check with an eval set whose size (66) is not a multiple of the batch size.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                eval_len=66,
                compute_metrics=AlmostAccuracyBatched(),
                batch_eval_metrics=True,
                output_dir=tmp_dir,
            )
            results = trainer.predict(trainer.eval_dataset)
            preds = results.predictions
            x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))
            expected_acc = AlmostAccuracy()((preds, y))["accuracy"]
            self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc)

            # With a model returning two outputs, predictions is a 2-tuple.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                double_output=True,
                compute_metrics=AlmostAccuracyBatched(),
                batch_eval_metrics=True,
                output_dir=tmp_dir,
            )
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))

            # With two label sets, label_ids is a 2-tuple matching the dataset's targets.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                double_output=True,
                label_names=["labels", "labels_2"],
                compute_metrics=AlmostAccuracyBatched(),
                batch_eval_metrics=True,
                output_dir=tmp_dir,
            )
            outputs = trainer.predict(trainer.eval_dataset)
            preds = outputs.predictions
            labels = outputs.label_ids
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
            self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))
            self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))
|
|
    def test_predict_with_jit(self):
        """Same as test_predict, but with `jit_mode_eval=True` (torch JIT used for prediction)."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(a=1.5, b=2.5, jit_mode_eval=True, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            # The regression model predicts exactly a * x + b.
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # Same check with an eval set whose size (66) is not a multiple of the batch size.
            trainer = get_regression_trainer(a=1.5, b=2.5, eval_len=66, jit_mode_eval=True, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

            # With a model returning two outputs, predictions is a 2-tuple.
            trainer = get_regression_trainer(a=1.5, b=2.5, double_output=True, jit_mode_eval=True, output_dir=tmp_dir)
            preds = trainer.predict(trainer.eval_dataset).predictions
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))

            # With two label sets, label_ids is a 2-tuple matching the dataset's targets.
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                double_output=True,
                label_names=["labels", "labels_2"],
                jit_mode_eval=True,
                output_dir=tmp_dir,
            )
            outputs = trainer.predict(trainer.eval_dataset)
            preds = outputs.predictions
            labels = outputs.label_ids
            x = trainer.eval_dataset.x
            self.assertEqual(len(preds), 2)
            self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
            self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
            self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))
            self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))
|
|
    @require_torch_bf16
    @require_intel_extension_for_pytorch
    def test_predict_with_ipex(self):
        """Same as test_predict, but on CPU with IPEX enabled, with and without bf16 autocast."""
        for mix_bf16 in [True, False]:
            with tempfile.TemporaryDirectory() as tmp_dir:
                trainer = get_regression_trainer(
                    a=1.5, b=2.5, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
                )
                preds = trainer.predict(trainer.eval_dataset).predictions
                x = trainer.eval_dataset.x
                # The regression model predicts exactly a * x + b.
                self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

                # Same check with an eval set whose size (66) is not a multiple of the batch size.
                trainer = get_regression_trainer(
                    a=1.5, b=2.5, eval_len=66, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
                )
                preds = trainer.predict(trainer.eval_dataset).predictions
                x = trainer.eval_dataset.x
                self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))

                # With a model returning two outputs, predictions is a 2-tuple.
                trainer = get_regression_trainer(
                    a=1.5, b=2.5, double_output=True, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
                )
                preds = trainer.predict(trainer.eval_dataset).predictions
                x = trainer.eval_dataset.x
                self.assertEqual(len(preds), 2)
                self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
                self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))

                # With two label sets, label_ids is a 2-tuple matching the dataset's targets.
                trainer = get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    double_output=True,
                    label_names=["labels", "labels_2"],
                    use_ipex=True,
                    bf16=mix_bf16,
                    use_cpu=True,
                    output_dir=tmp_dir,
                )
                outputs = trainer.predict(trainer.eval_dataset)
                preds = outputs.predictions
                labels = outputs.label_ids
                x = trainer.eval_dataset.x
                self.assertEqual(len(preds), 2)
                self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
                self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
                self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))
                self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))
|
|
    def test_dynamic_shapes(self):
        """Eval/predict over variable-length samples must pad gathered arrays with -100."""
        eval_dataset = DynamicShapesDataset(batch_size=self.batch_size)
        model = RegressionModel(a=2, b=1)
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, report_to="none")
            trainer = Trainer(model, args, eval_dataset=eval_dataset)

            # Evaluation must run to completion despite the dynamic shapes.
            _ = trainer.evaluate()

            # Each gathered row carries the true values up to the sample's own
            # length, and -100 padding beyond it.
            preds = trainer.predict(eval_dataset)
            for expected, seen in zip(eval_dataset.ys, preds.label_ids):
                self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]]))
                self.assertTrue(np.all(seen[expected.shape[0] :] == -100))

            # Predictions follow the model's exact output 2 * x + 1, padded the same way.
            for expected, seen in zip(eval_dataset.xs, preds.predictions):
                self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]]))
                self.assertTrue(np.all(seen[expected.shape[0] :] == -100))

        # Same checks with eval accumulation enabled.
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(tmp_dir, eval_accumulation_steps=2, report_to="none")
            trainer = Trainer(model, args, eval_dataset=eval_dataset)

            # Evaluation must run to completion despite the dynamic shapes.
            _ = trainer.evaluate()

            # Padding behavior must be unchanged under accumulation.
            preds = trainer.predict(eval_dataset)
            for expected, seen in zip(eval_dataset.ys, preds.label_ids):
                self.assertTrue(np.array_equal(expected, seen[: expected.shape[0]]))
                self.assertTrue(np.all(seen[expected.shape[0] :] == -100))

            for expected, seen in zip(eval_dataset.xs, preds.predictions):
                self.assertTrue(np.array_equal(2 * expected + 1, seen[: expected.shape[0]]))
                self.assertTrue(np.all(seen[expected.shape[0] :] == -100))
|
|
    def test_log_level(self):
        """The `log_level` training argument must control transformers' logger verbosity."""
        # Capture the root transformers logger; "Running training" is emitted at INFO.
        logger = logging.get_logger()
        log_info_string = "Running training"

        # INFO corresponds to level 20; whether the default run shows the string
        # depends on the ambient verbosity of the test environment.
        is_info = logging.get_verbosity() <= 20

        with tempfile.TemporaryDirectory() as tmp_dir:
            # Default log_level: inherits the ambient verbosity.
            with CaptureLogger(logger) as cl:
                trainer = get_regression_trainer(output_dir=tmp_dir)
                trainer.train()
            if is_info:
                self.assertIn(log_info_string, cl.out)
            else:
                self.assertNotIn(log_info_string, cl.out)

            with LoggingLevel(logging.INFO):
                # log_level="debug" is more verbose than INFO, so the message must appear.
                with CaptureLogger(logger) as cl:
                    trainer = get_regression_trainer(log_level="debug", output_dir=tmp_dir)
                    trainer.train()
                self.assertIn(log_info_string, cl.out)

            with LoggingLevel(logging.INFO):
                # log_level="error" suppresses INFO output, so the message must not appear.
                with CaptureLogger(logger) as cl:
                    trainer = get_regression_trainer(log_level="error", output_dir=tmp_dir)
                    trainer.train()
                self.assertNotIn(log_info_string, cl.out)
|
|
def test_save_checkpoints(self):
    """Checkpoints are written every `save_steps`, for both pretrained and from-config models."""
    expected_total = int(self.n_epochs * 64 / self.batch_size)
    for pretrained in (True, False):
        output_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=output_dir, save_steps=5, pretrained=pretrained)
        trainer.train()
        self.check_saved_checkpoints(output_dir, 5, expected_total, pretrained)
|
|
@require_safetensors
def test_safe_checkpoints(self):
    """Checkpoint saving works with and without safetensors, pretrained or from config."""
    expected_total = int(self.n_epochs * 64 / self.batch_size)
    # Same iteration order as nesting pretrained inside save_safetensors.
    for save_safetensors, pretrained in product([True, False], [True, False]):
        output_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(
            output_dir=output_dir,
            save_steps=5,
            pretrained=pretrained,
            save_safetensors=save_safetensors,
        )
        trainer.train()
        self.check_saved_checkpoints(
            output_dir, 5, expected_total, pretrained, safe_weights=save_safetensors
        )
|
|
def test_save_collator_tokenizer_by_default(self):
    """A tokenizer attached to the data collator is saved with each checkpoint.

    The collator's tokenizer gains two extra tokens; the tokenizer reloaded from a
    saved checkpoint must have the same (extended) length.
    """

    class FakeCollator:
        def __init__(self):
            self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            self.tokenizer.add_tokens(["<NEW_TOKEN1>", "<NEW_TOKEN2>"])

        def __call__(self, features: list[Any], return_tensors="pt") -> dict[str, Any]:
            return default_data_collator(features, return_tensors)

    data_collator = FakeCollator()
    tmp_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(
        output_dir=tmp_dir, save_steps=5, save_safetensors=True, data_collator=data_collator
    )
    trainer.train()
    # Fix: `os.listdir(tmp_dir)[0]` returned an arbitrary, OS-order-dependent entry,
    # which could be a non-checkpoint file. Select a checkpoint directory explicitly.
    checkpoints = sorted(d for d in os.listdir(tmp_dir) if d.startswith(PREFIX_CHECKPOINT_DIR))
    assert checkpoints, "No checkpoint directory was saved"
    loaded_tokenizer = AutoTokenizer.from_pretrained(os.path.join(tmp_dir, checkpoints[0]))
    assert len(loaded_tokenizer) == len(trainer.data_collator.tokenizer), "Failed to load updated tokenizer"
|
|
def test_load_best_model_with_save(self):
    """Saving happens at `save_steps` multiples and at the final step, and with
    `save_total_limit` + `load_best_model_at_end` the best checkpoint's weights are
    the ones loaded back into the model at the end of training."""
    tmp_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(
        output_dir=tmp_dir,
        save_steps=5,
        eval_strategy="steps",
        eval_steps=5,
        max_steps=9,
    )
    trainer.train()
    # A checkpoint exists for the last known step...
    assert os.path.exists(os.path.join(tmp_dir, f"checkpoint-{trainer.state.max_steps}")), (
        f"Could not find checkpoint-{trainer.state.max_steps}"
    )
    # ...which is checkpoint-9 (max_steps=9, not a multiple of save_steps).
    assert os.path.exists(os.path.join(tmp_dir, "checkpoint-9")), "Could not find checkpoint-9"

    # Now with save_total_limit=2 and load_best_model_at_end=True over 11 steps:
    # checkpoint-5 should be rotated away, checkpoint-10 and checkpoint-11 kept.
    tmp_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(
        output_dir=tmp_dir,
        save_steps=5,
        eval_strategy="steps",
        eval_steps=5,
        load_best_model_at_end=True,
        save_total_limit=2,
        max_steps=11,
    )
    trainer.train()
    # Final step checkpoint survives.
    assert os.path.exists(os.path.join(tmp_dir, "checkpoint-11")), "Could not find checkpoint-11"
    # Last save_steps multiple survives.
    assert os.path.exists(os.path.join(tmp_dir, "checkpoint-10")), "Could not find checkpoint-10"
    # Oldest checkpoint was deleted by the rotation limit.
    assert not os.path.exists(os.path.join(tmp_dir, "checkpoint-5")), "Found checkpoint-5, limit not respected"

    # The in-memory model must now hold the weights from checkpoint-10 (the best,
    # evaluated checkpoint), not necessarily the last checkpoint saved.
    model_state = trainer.model.state_dict()
    final_model_weights = safetensors.torch.load_file(os.path.join(tmp_dir, "checkpoint-10", "model.safetensors"))
    for k, v in model_state.items():
        assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
|
|
@require_torch_multi_accelerator
def test_run_seq2seq_double_train_wrap_once(self):
    """Calling train() twice must not wrap the model a second time."""
    trainer = get_regression_trainer(output_dir=self.get_auto_remove_tmp_dir())
    trainer.train()
    first_wrapped = trainer.model_wrapped
    trainer.train()
    # The wrapped object must be the identical instance after the second run.
    self.assertIs(first_wrapped, trainer.model_wrapped, "should be not wrapped twice")
|
|
@require_torch_up_to_2_accelerators
def test_can_resume_training(self):
    """Resuming from an intermediate or a later checkpoint reproduces the final model
    weights (`a`, `b`) and trainer state of an uninterrupted run, for both pretrained
    and from-config regression models; bogus/missing checkpoints raise clear errors."""
    # NOTE(review): capped at <=2 accelerators — presumably more devices change the
    # effective batch size / checkpoint cadence and break the comparison; confirm.

    tmp_dir = self.get_auto_remove_tmp_dir()
    kwargs = {
        "output_dir": tmp_dir,
        "train_len": 128,
        "save_steps": 5,
        "learning_rate": 0.1,
        "logging_steps": 5,
    }
    trainer = get_regression_trainer(**kwargs)
    trainer.train()
    # Reference end-of-training weights and state from the uninterrupted run.
    (a, b) = trainer.model.a.item(), trainer.model.b.item()
    state = dataclasses.asdict(trainer.state)

    checkpoint = os.path.join(tmp_dir, "checkpoint-5")

    # Fresh trainer, resume from the early checkpoint: must converge to the same end state.
    trainer = get_regression_trainer(**kwargs)

    trainer.train(resume_from_checkpoint=checkpoint)
    (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
    state1 = dataclasses.asdict(trainer.state)
    self.assertEqual(a, a1)
    self.assertEqual(b, b1)
    self.check_trainer_state_are_the_same(state, state1)

    # Same check from a later checkpoint.
    checkpoint = os.path.join(tmp_dir, "checkpoint-15")

    trainer = get_regression_trainer(**kwargs)

    trainer.train(resume_from_checkpoint=checkpoint)
    (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
    state1 = dataclasses.asdict(trainer.state)
    self.assertEqual(a, a1)
    self.assertEqual(b, b1)
    self.check_trainer_state_are_the_same(state, state1)

    # Repeat the whole scenario with a model that is not a PreTrainedModel.
    tmp_dir = self.get_auto_remove_tmp_dir()
    kwargs = {
        "output_dir": tmp_dir,
        "train_len": 128,
        "save_steps": 5,
        "learning_rate": 0.1,
        "pretrained": False,
    }

    trainer = get_regression_trainer(**kwargs)
    trainer.train()
    (a, b) = trainer.model.a.item(), trainer.model.b.item()
    state = dataclasses.asdict(trainer.state)

    checkpoint = os.path.join(tmp_dir, "checkpoint-5")

    # Resume from the early checkpoint.
    trainer = get_regression_trainer(**kwargs)

    trainer.train(resume_from_checkpoint=checkpoint)
    (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
    state1 = dataclasses.asdict(trainer.state)
    self.assertEqual(a, a1)
    self.assertEqual(b, b1)
    self.check_trainer_state_are_the_same(state, state1)

    # Resume from the later checkpoint.
    checkpoint = os.path.join(tmp_dir, "checkpoint-15")

    trainer = get_regression_trainer(**kwargs)

    trainer.train(resume_from_checkpoint=checkpoint)
    (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
    state1 = dataclasses.asdict(trainer.state)
    self.assertEqual(a, a1)
    self.assertEqual(b, b1)
    self.check_trainer_state_are_the_same(state, state1)

    # Failure modes:

    # 1. a path that does not point at a checkpoint
    tmp_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(output_dir=tmp_dir)
    with self.assertRaises(Exception) as context:
        trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
    self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))

    # 2. resume_from_checkpoint=True with an output_dir containing no checkpoints
    tmp_dir = self.get_auto_remove_tmp_dir()
    trainer = get_regression_trainer(output_dir=tmp_dir)
    with self.assertRaises(Exception) as context:
        trainer.train(resume_from_checkpoint=True)
    self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
|
|
| |
| |
@require_torch_non_multi_accelerator
@run_test_using_subprocess
@slow
def test_can_resume_training_lm(self):
    """Resuming a deterministic fp16 LM run from a mid-training checkpoint reproduces
    the final weights (spot-checked on a random parameter sample) and trainer state."""
    training_steps = 10
    resume_from_step = 8
    with tempfile.TemporaryDirectory() as tmpdir:
        # Full determinism so the interrupted and uninterrupted runs are comparable.
        enable_full_determinism(0)
        kwargs = {
            "output_dir": tmpdir,
            "fp16": True,
            "max_steps": training_steps,
            "per_device_train_batch_size": 1,
            "learning_rate": 1e-5,
            "lr_scheduler_type": "cosine",
            "save_strategy": "steps",
            "save_steps": 1,  # checkpoint after every step
            "logging_strategy": "steps",
            "logging_steps": 1,
            "report_to": "none",
        }

        trainer = get_language_model_trainer(**kwargs)
        trainer.train(resume_from_checkpoint=False)
        # Flatten all parameters into a single vector for comparison.
        model_params = torch.cat([p.cpu().flatten() for p in trainer.model.parameters()])
        model_param_len = len(model_params)
        # Compare only 1000 randomly sampled entries rather than every parameter.
        indices = torch.randint(0, model_param_len, (1000,))
        model_params_sample = model_params[indices].detach().clone()
        state1 = dataclasses.asdict(trainer.state)
        # Free the reference run before starting the resumed one.
        del model_params, trainer
        self.check_saved_checkpoints(
            tmpdir, freq=1, total=training_steps + 1, is_pretrained=True, safe_weights=True, use_scaler=True
        )

        # Re-seed identically and resume from the checkpoint after `resume_from_step` steps.
        enable_full_determinism(0)
        checkpoint = os.path.join(tmpdir, f"checkpoint-{resume_from_step + 1}")
        trainer = get_language_model_trainer(**kwargs)
        trainer.train(resume_from_checkpoint=checkpoint)
        model_params = torch.cat([p.cpu().flatten() for p in trainer.model.parameters()])

        # Sampled parameters and state must match the uninterrupted run exactly.
        self.assertTrue(torch.allclose(model_params[indices], model_params_sample))
        state2 = dataclasses.asdict(trainer.state)
        self.check_trainer_state_are_the_same(state1, state2)
        del model_params, trainer
|
|
@unittest.skip(
    reason="@muellerzr: Fix once Trainer can take an accelerate configuration. Need to set `seedable_sampler=True`."
)
def test_resume_training_with_randomness(self):
    """(Currently skipped) Resuming a run whose model adds torch randomness should land
    within a small delta of the uninterrupted run, for step- and epoch-based saving."""
    # Extra torch randomness is only enabled on 0/1-GPU setups where it stays reproducible.
    random_torch = not torch.cuda.is_available() or torch.cuda.device_count() <= 1

    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
    train_dataset = RegressionDataset(length=128)
    eval_dataset = RegressionDataset()

    with self.subTest("Test every step"):
        config = RegressionModelConfig(a=0, b=2, random_torch=random_torch)
        model = RegressionRandomPreTrainedModel(config)

        tmp_dir = self.get_auto_remove_tmp_dir()
        args = RegressionTrainingArguments(tmp_dir, save_steps=5, learning_rate=0.1)
        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)

        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()

        # Fresh model, resume from a mid-training checkpoint.
        model = RegressionRandomPreTrainedModel(config)
        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
        trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, "checkpoint-15"))
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()

        # Randomness means only near-equality is expected.
        self.assertAlmostEqual(a, a1, delta=1e-5)
        self.assertAlmostEqual(b, b1, delta=1e-5)

    with self.subTest("Test every epoch"):
        config = RegressionModelConfig(a=0, b=2, random_torch=random_torch)
        model = RegressionRandomPreTrainedModel(config)

        tmp_dir = self.get_auto_remove_tmp_dir()
        args = RegressionTrainingArguments(tmp_dir, save_strategy="epoch", learning_rate=0.1)
        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)

        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()

        model = RegressionRandomPreTrainedModel(config)
        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)

        checkpoints = [d for d in os.listdir(tmp_dir) if d.startswith("checkpoint-")]
        # One checkpoint per epoch is expected (3 epochs).
        self.assertEqual(len(checkpoints), 3)
        # Resume from the earliest (lowest-step) epoch checkpoint.
        checkpoint_dir = sorted(checkpoints, key=lambda x: int(x.replace("checkpoint-", "")))[0]

        trainer.train(resume_from_checkpoint=os.path.join(tmp_dir, checkpoint_dir))
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()

        self.assertAlmostEqual(a, a1, delta=1e-5)
        self.assertAlmostEqual(b, b1, delta=1e-5)
|
|
@slow
@require_accelerate
@require_torch_non_multi_accelerator
def test_auto_batch_size_finder(self):
    """run_glue with an oversized batch size raises RuntimeError (presumably OOM —
    TODO confirm) unless `--auto_find_batch_size` is enabled, in which case it succeeds."""
    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True

    # Make the example script importable from the examples folder.
    SRC_DIR = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "..", "examples", "pytorch", "text-classification")
    )
    sys.path.append(SRC_DIR)
    import run_glue

    with tempfile.TemporaryDirectory() as tmpdir:
        testargs = f"""
            run_glue.py
            --model_name_or_path distilbert/distilbert-base-uncased
            --task_name mrpc
            --do_train
            --do_eval
            --max_seq_len 128
            --per_device_train_batch_size 4096
            --learning_rate 2e-5
            --num_train_epochs 1
            --output_dir {tmpdir}
            --auto_find_batch_size 0
            """.split()
        # With auto_find_batch_size off, the huge batch size must fail.
        with self.assertRaises(RuntimeError):
            with patch.object(sys, "argv", testargs):
                run_glue.main()

        # Flip the trailing "--auto_find_batch_size" value to 1: same run must succeed.
        testargs[-1] = "1"
        with patch.object(sys, "argv", testargs):
            run_glue.main()
|
|
@require_deepspeed
def test_auto_batch_size_with_deepspeed(self):
    """auto_find_batch_size halves the train batch size (16 -> 8) after a simulated
    OOM, under DeepSpeed ZeRO stages 1 and 2 with "auto" batch size entries."""
    train_dataset = RegressionDataset(length=128)

    config = RegressionModelConfig(a=0, b=2)
    model = RegressionRandomPreTrainedModel(config)

    tmp_dir = self.get_auto_remove_tmp_dir()

    for stage in [1, 2]:
        deepspeed = {
            "zero_optimization": {
                "stage": stage,
            },
            # Let the Trainer fill in the batch sizes it discovers.
            "train_batch_size": "auto",
            "train_micro_batch_size_per_gpu": "auto",
        }

        args = RegressionTrainingArguments(
            tmp_dir,
            do_train=True,
            max_steps=2,
            save_strategy="no",
            per_device_train_batch_size=16,
            auto_find_batch_size=True,
            deepspeed=deepspeed,
        )
        # MockCudaOOMCallback simulates an OOM so the finder shrinks the batch size.
        trainer = Trainer(model, args, train_dataset=train_dataset, callbacks=[MockCudaOOMCallback()])
        trainer.train()
        self.assertEqual(trainer._train_batch_size, 8)
|
|
def test_auto_batch_size_with_resume_from_checkpoint(self):
    """After auto_find_batch_size shrinks the batch size, resuming from a checkpoint
    restores the shrunken size rather than the configured one."""
    train_dataset = RegressionDataset(length=128)

    config = RegressionModelConfig(a=0, b=2)
    model = RegressionRandomPreTrainedModel(config)

    tmp_dir = self.get_auto_remove_tmp_dir()

    args = RegressionTrainingArguments(
        tmp_dir,
        do_train=True,
        max_steps=2,
        save_steps=1,
        per_device_train_batch_size=16,
        auto_find_batch_size=True,
    )
    # MockCudaOOMCallback simulates an OOM, so the finder halves 16 -> 8.
    trainer = Trainer(model, args, train_dataset=train_dataset, callbacks=[MockCudaOOMCallback()])
    trainer.train()
    # The batch size was halved once during training.
    self.assertEqual(trainer._train_batch_size, 8)

    # A brand-new Trainer starts from the configured per-device size again.
    trainer = Trainer(model, args, train_dataset=train_dataset)
    # Sanity check: back to 16 (per device) before resuming.
    self.assertEqual(trainer._train_batch_size, 16 * max(trainer.args.n_gpu, 1))
    trainer.train(resume_from_checkpoint=True)
    # Resuming picks the shrunken batch size back up from the checkpoint.
    self.assertEqual(trainer._train_batch_size, 8)
|
|
| |
def test_training_with_resume_from_checkpoint_false(self):
    """train(resume_from_checkpoint=False) must start a fresh run without erroring."""
    model = RegressionRandomPreTrainedModel(RegressionModelConfig(a=0, b=2))
    args = RegressionTrainingArguments(self.get_auto_remove_tmp_dir(), save_steps=5, learning_rate=0.1)
    trainer = Trainer(
        model,
        args,
        train_dataset=RegressionDataset(length=128),
        eval_dataset=RegressionDataset(),
    )

    trainer.train(resume_from_checkpoint=False)
|
|
@require_torch_up_to_2_accelerators
def test_resume_training_with_shard_checkpoint(self):
    """Resuming from a checkpoint that was converted to a sharded one reproduces the
    final weights and trainer state of the uninterrupted run."""
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1)
        trainer.train()
        # Reference end-of-training weights and state.
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(tmpdir, "checkpoint-5")
        # Rewrite the checkpoint in sharded format before resuming.
        self.convert_to_sharded_checkpoint(checkpoint)

        # Fresh trainer, resume from the sharded checkpoint.
        trainer = get_regression_trainer(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
|
|
@require_safetensors
@require_torch_up_to_2_accelerators
def test_resume_training_with_safe_checkpoint(self):
    """Resuming works across every save-format combination: the initial run and the
    resumed run each using safetensors or not (4 combinations in total)."""
    for initial_safe in [False, True]:
        for loaded_safe in [False, True]:
            with tempfile.TemporaryDirectory() as tmpdir:
                trainer = get_regression_trainer(
                    output_dir=tmpdir,
                    train_len=128,
                    save_steps=5,
                    learning_rate=0.1,
                    save_safetensors=initial_safe,
                )
                trainer.train()
                # Reference end-of-training weights and state.
                (a, b) = trainer.model.a.item(), trainer.model.b.item()
                state = dataclasses.asdict(trainer.state)

                checkpoint = os.path.join(tmpdir, "checkpoint-5")
                # Re-shard the checkpoint, converting between formats as requested.
                self.convert_to_sharded_checkpoint(checkpoint, load_safe=initial_safe, save_safe=loaded_safe)

                # Fresh trainer configured for the target format, resume, and compare.
                trainer = get_regression_trainer(
                    output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, save_safetensors=loaded_safe
                )

                trainer.train(resume_from_checkpoint=checkpoint)
                (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
                state1 = dataclasses.asdict(trainer.state)
                self.assertEqual(a, a1)
                self.assertEqual(b, b1)
                self.check_trainer_state_are_the_same(state, state1)
|
|
@require_torch_up_to_2_accelerators
def test_resume_training_with_gradient_accumulation(self):
    """Resuming mid-run with gradient accumulation enabled reproduces the final
    weights and trainer state of the uninterrupted run."""
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(
            output_dir=tmpdir,
            train_len=128,
            gradient_accumulation_steps=2,
            per_device_train_batch_size=4,
            save_steps=5,
            learning_rate=0.1,
        )
        trainer.train()
        # Reference end-of-training weights and state.
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(tmpdir, "checkpoint-5")

        # Fresh trainer with identical accumulation settings, then resume.
        trainer = get_regression_trainer(
            output_dir=tmpdir,
            train_len=128,
            gradient_accumulation_steps=2,
            per_device_train_batch_size=4,
            save_steps=5,
            learning_rate=0.1,
        )

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
|
|
@require_torch_up_to_2_accelerators
def test_resume_training_with_frozen_params(self):
    """Resuming with a frozen parameter keeps it frozen and reproduces the final
    weights and trainer state of the uninterrupted run."""
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(
            output_dir=tmpdir,
            train_len=128,
            per_device_train_batch_size=4,
            save_steps=5,
            learning_rate=0.1,
        )
        # Freeze `a` for the whole reference run.
        trainer.model.a.requires_grad_(False)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(tmpdir, "checkpoint-5")

        # Fresh trainer with the same frozen parameter, then resume.
        trainer = get_regression_trainer(
            output_dir=tmpdir,
            train_len=128,
            per_device_train_batch_size=4,
            save_steps=5,
            learning_rate=0.1,
        )
        trainer.model.a.requires_grad_(False)

        trainer.train(resume_from_checkpoint=checkpoint)

        # Resuming must not silently unfreeze the parameter.
        self.assertFalse(trainer.model.a.requires_grad)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
|
|
def test_load_best_model_at_end(self):
    """With load_best_model_at_end, the best checkpoint — by loss or accuracy, saved
    per step or per epoch, pretrained or not — is reloaded after training."""
    total = int(self.n_epochs * 64 / self.batch_size)
    # Best model tracked by eval_loss (lower is better), checkpoint every 5 steps.
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(
            a=1.5,
            b=2.5,
            output_dir=tmpdir,
            learning_rate=0.1,
            eval_steps=5,
            eval_strategy="steps",
            save_steps=5,
            load_best_model_at_end=True,
        )
        self.assertFalse(trainer.args.greater_is_better)
        trainer.train()
        self.check_saved_checkpoints(tmpdir, 5, total)
        self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss")

    # Best model tracked by accuracy (higher is better).
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(
            a=1.5,
            b=2.5,
            output_dir=tmpdir,
            learning_rate=0.1,
            eval_steps=5,
            eval_strategy="steps",
            save_steps=5,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            compute_metrics=AlmostAccuracy(),
        )
        self.assertTrue(trainer.args.greater_is_better)
        trainer.train()
        self.check_saved_checkpoints(tmpdir, 5, total)
        self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_accuracy", greater_is_better=True)

    # Evaluate/save once per epoch instead of every N steps.
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(
            a=1.5,
            b=2.5,
            output_dir=tmpdir,
            learning_rate=0.1,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            compute_metrics=AlmostAccuracy(),
        )
        self.assertTrue(trainer.args.greater_is_better)
        trainer.train()
        self.check_saved_checkpoints(tmpdir, 64 // self.batch_size, total)
        self.check_best_model_has_been_loaded(
            tmpdir, 64 // self.batch_size, total, trainer, "eval_accuracy", greater_is_better=True
        )

    # Same flow with a model that is not a PreTrainedModel.
    with tempfile.TemporaryDirectory() as tmpdir:
        trainer = get_regression_trainer(
            output_dir=tmpdir,
            learning_rate=0.1,
            eval_steps=5,
            eval_strategy="steps",
            save_steps=5,
            load_best_model_at_end=True,
            pretrained=False,
        )
        self.assertFalse(trainer.args.greater_is_better)
        trainer.train()
        self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=False)
        self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_loss", is_pretrained=False)
|
|
@require_safetensors
def test_load_best_model_from_safetensors(self):
    """Best-model reloading works for every combination of safetensors on/off and
    pretrained/from-config models."""
    total = int(self.n_epochs * 64 / self.batch_size)
    for save_safetensors, pretrained in product([False, True], [False, True]):
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                eval_steps=5,
                eval_strategy="steps",
                save_steps=5,
                load_best_model_at_end=True,
                save_safetensors=save_safetensors,
                pretrained=pretrained,
            )
            # eval_loss is the default best-model metric, so lower is better.
            self.assertFalse(trainer.args.greater_is_better)
            trainer.train()
            self.check_saved_checkpoints(tmpdir, 5, total, is_pretrained=pretrained, safe_weights=save_safetensors)
            self.check_best_model_has_been_loaded(
                tmpdir, 5, total, trainer, "eval_loss", is_pretrained=pretrained, safe_weights=save_safetensors
            )
|
|
@slow
@run_first
def test_trainer_eval_mrpc(self):
    """Evaluating a fine-tuned MRPC model on the fixture dev set yields a low loss."""
    model_name = "google-bert/bert-base-cased-finetuned-mrpc"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir=f"{get_tests_dir()}/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev")

    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = Trainer(
            model=model,
            args=TrainingArguments(output_dir=tmp_dir, use_cpu=True, report_to="none"),
            eval_dataset=eval_dataset,
        )
        result = trainer.evaluate()
        self.assertLess(result["eval_loss"], 0.2)
|
|
@slow
@run_first
def test_trainer_eval_multiple(self):
    """Passing a dict of eval datasets produces one metric set per key, each prefixed
    `eval_<key>_`."""
    MODEL_ID = "openai-community/gpt2"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=PATH_SAMPLE_TEXT,
        block_size=tokenizer.max_len_single_sentence,
    )
    # Causal-LM loss needs labels; reuse the inputs as targets.
    for example in dataset.examples:
        example["labels"] = example["input_ids"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        training_args = TrainingArguments(
            output_dir=tmp_dir,
            use_cpu=True,
            per_device_eval_batch_size=1,
            report_to="none",
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            # Same dataset under two keys — we only care about the metric prefixes.
            eval_dataset={
                "data1": dataset,
                "data2": dataset,
            },
        )
        result = trainer.evaluate()
        self.assertIn("eval_data1_loss", result)
        self.assertIn("eval_data2_loss", result)
|
|
@slow
def test_trainer_eval_lm(self):
    """A LineByLineTextDataset built from the sample text has the expected length."""
    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=PATH_SAMPLE_TEXT,
        block_size=tokenizer.max_len_single_sentence,
    )
    num_examples = len(dataset)
    self.assertEqual(num_examples, 31)
|
|
def test_training_iterable_dataset(self):
    """Training runs for exactly max_steps on an iterable dataset and the dataloader
    uses the infinite-sampler path."""
    model = RegressionPreTrainedModel(RegressionModelConfig())
    # Dataset also yields an "extra" label column alongside "labels".
    train_dataset = SampleIterableDataset(label_names=["labels", "extra"])

    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = Trainer(
            model=model,
            args=RegressionTrainingArguments(output_dir=tmp_dir, max_steps=4),
            train_dataset=train_dataset,
        )
        trainer.train()
        self.assertEqual(trainer.state.global_step, 4)

        dataloader = trainer.get_train_dataloader()
        self.assertIsInstance(dataloader, torch.utils.data.DataLoader)
        self.assertIsInstance(dataloader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)
|
|
def test_evaluation_iterable_dataset(self):
    """evaluate() computes the exact expected loss/accuracy on an iterable dataset,
    both via the constructor and when passed directly to evaluate()."""
    config = RegressionModelConfig(a=1.5, b=2.5)
    model = RegressionPreTrainedModel(config)
    # "extra" is a label column the model does not consume — it should have no impact.
    eval_dataset = SampleIterableDataset(label_names=["labels", "extra"])

    with tempfile.TemporaryDirectory() as tmp_dir:
        args = RegressionTrainingArguments(output_dir=tmp_dir)
        trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy())
        results = trainer.evaluate()

        # The model computes pred = a*x + b, so loss/accuracy are reproducible in numpy.
        x, y = trainer.eval_dataset.dataset.x, trainer.eval_dataset.dataset.ys[0]
        pred = 1.5 * x + 2.5
        expected_loss = ((pred - y) ** 2).mean()
        self.assertAlmostEqual(results["eval_loss"], expected_loss)
        expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
        self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

        # Passing a different dataset (length 66) directly to evaluate() works too.
        eval_dataset = SampleIterableDataset(length=66)
        results = trainer.evaluate(eval_dataset)

        x, y = eval_dataset.dataset.x, eval_dataset.dataset.ys[0]
        pred = 1.5 * x + 2.5
        expected_loss = ((pred - y) ** 2).mean()
        self.assertAlmostEqual(results["eval_loss"], expected_loss)
        expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
        self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
|
|
def test_predict_iterable_dataset(self):
    """predict() returns a*x + b on iterable datasets, including one of a different
    length with an extra label column."""
    model = RegressionPreTrainedModel(RegressionModelConfig(a=1.5, b=2.5))
    eval_dataset = SampleIterableDataset()

    with tempfile.TemporaryDirectory() as tmp_dir:
        args = RegressionTrainingArguments(output_dir=tmp_dir)
        trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset, compute_metrics=AlmostAccuracy())

        predictions = trainer.predict(trainer.eval_dataset).predictions
        self.assertTrue(np.allclose(predictions, 1.5 * eval_dataset.dataset.x + 2.5))

        # A second dataset with a different length and an unused "extra" label column.
        other_dataset = SampleIterableDataset(length=66, label_names=["labels", "extra"])
        predictions = trainer.predict(other_dataset).predictions
        self.assertTrue(np.allclose(predictions, 1.5 * other_dataset.dataset.x + 2.5))
|
|
def test_num_train_epochs_in_training(self):
    """global_step honors max_steps when set, otherwise runs int(n_epochs) steps even
    when accumulation makes each epoch a single optimizer step."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        shared_kwargs = {
            "train_len": 64,
            "per_device_train_batch_size": 16,
            "gradient_accumulation_steps": 5,
            "output_dir": tmp_dir,
        }
        # With max_steps=3 the run stops after exactly 3 optimizer steps.
        trainer = get_regression_trainer(max_steps=3, **shared_kwargs)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, 3)

        # Without max_steps, one (possibly partial) step per epoch is performed.
        trainer = get_regression_trainer(**shared_kwargs)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, int(self.n_epochs))
|
|
def test_early_stopping_callback(self):
    """EarlyStoppingCallback stops training before num_train_epochs completes, errors
    out on an invalid setup, and records a best checkpoint even without
    load_best_model_at_end."""
    # Valid setup: early stopping must end training before 20 epochs' worth of steps.
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=tmp_dir,
            num_train_epochs=20,
            gradient_accumulation_steps=1,
            per_device_train_batch_size=16,
            load_best_model_at_end=True,
            eval_strategy=IntervalStrategy.EPOCH,
            save_strategy=IntervalStrategy.EPOCH,
            compute_metrics=AlmostAccuracy(),
            metric_for_best_model="accuracy",
        )
        # patience=1, threshold=0.0001
        trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
        train_output = trainer.train()
        self.assertLess(train_output.global_step, 20 * 64 / 16)

    # Invalid setup: no save strategy / load_best_model_at_end, training should abort
    # before making any step.
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=tmp_dir,
            num_train_epochs=20,
            gradient_accumulation_steps=1,
            per_device_train_batch_size=16,
            eval_strategy=IntervalStrategy.EPOCH,
            compute_metrics=AlmostAccuracy(),
            metric_for_best_model="accuracy",
        )
        trainer.add_callback(EarlyStoppingCallback(1))
        self.assertEqual(trainer.state.global_step, 0)
        # NOTE(review): if train() does NOT raise AssertionError, this subtest passes
        # silently without checking anything — consider self.fail() in an else branch.
        try:
            trainer.train()
        except AssertionError:
            self.assertEqual(trainer.state.global_step, 0)

    # Without load_best_model_at_end, the best checkpoint must still be tracked.
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=tmp_dir,
            num_train_epochs=20,
            gradient_accumulation_steps=1,
            per_device_train_batch_size=16,
            load_best_model_at_end=False,
            eval_strategy=IntervalStrategy.EPOCH,
            save_strategy=IntervalStrategy.EPOCH,
            compute_metrics=AlmostAccuracy(),
            metric_for_best_model="accuracy",
        )
        trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
        train_output = trainer.train()
        self.assertIsNotNone(trainer.state.best_model_checkpoint)
|
|
def test_flos_extraction(self):
    """total_flos is readable from the (unwrapped) model config and ends up as a float
    on the trainer state after training."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)

        def assert_flos_extraction(trainer, wrapped_model_to_check):
            # Unwrapping must recover the trainer's model, and its recorded
            # total_flos (default 0) must be non-negative.
            self.assertEqual(trainer.model, trainer.accelerator.unwrap_model(wrapped_model_to_check))
            self.assertGreaterEqual(
                getattr(trainer.accelerator.unwrap_model(wrapped_model_to_check).config, "total_flos", 0), 0
            )

        # With the plain (unwrapped) model.
        assert_flos_extraction(trainer, trainer.model)

        # With an explicit DataParallel wrapper around the same model.
        assert_flos_extraction(trainer, nn.DataParallel(trainer.model))

        trainer.train()
        self.assertTrue(isinstance(trainer.state.total_flos, float))
|
|
def check_checkpoint_deletion(self, trainer, output_dir, expected):
    """Fabricate checkpoint dirs at steps 5..25, run rotation, and assert that exactly
    the `expected` step numbers survive."""
    for step in (5, 10, 15, 20, 25):
        os.makedirs(os.path.join(output_dir, f"{PREFIX_CHECKPOINT_DIR}-{step}"), exist_ok=True)
    trainer._rotate_checkpoints(output_dir=output_dir)
    step_pattern = re.compile(f".*{PREFIX_CHECKPOINT_DIR}-([0-9]+)")
    surviving_steps = {
        int(step_pattern.match(str(path)).groups()[0])
        for path in Path(output_dir).glob(f"{PREFIX_CHECKPOINT_DIR}-*")
    }
    self.assertSetEqual(surviving_steps, set(expected))
|
|
def test_checkpoint_rotation(self):
    """Checkpoint rotation keeps the newest `save_total_limit` checkpoints, while the
    best checkpoint is always preserved when load_best_model_at_end is set."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Without best-model tracking: only the two newest survive.
        trainer = get_regression_trainer(output_dir=tmp_dir, save_total_limit=2)
        self.check_checkpoint_deletion(trainer, tmp_dir, [20, 25])

        # With best-model tracking: the best (checkpoint-5) survives next to the newest.
        trainer = get_regression_trainer(
            output_dir=tmp_dir, eval_strategy="steps", load_best_model_at_end=True, save_total_limit=2
        )
        trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5")
        self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25])

        # Edge case save_total_limit=1: when the best checkpoint is also the newest,
        # only it survives...
        trainer = get_regression_trainer(
            output_dir=tmp_dir, eval_strategy="steps", load_best_model_at_end=True, save_total_limit=1
        )
        trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-25")
        self.check_checkpoint_deletion(trainer, tmp_dir, [25])

        # ...but when the best checkpoint is older, both it and the newest are kept.
        trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5")
        self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25])
|
|
def test_compare_trainer_and_checkpoint_args_logging(self):
    """Resuming with training args that differ from the checkpoint's logs a warning
    line for each mismatching argument."""
    logger = logging.get_logger()

    with tempfile.TemporaryDirectory() as tmpdir, CaptureLogger(logger) as cl:
        # First run writes checkpoints with these argument values.
        trainer = get_regression_trainer(
            output_dir=tmpdir,
            train_len=128,
            eval_steps=5,
            gradient_accumulation_steps=2,
            per_device_train_batch_size=4,
            save_steps=5,
            learning_rate=0.1,
        )
        trainer.train()

        # Second run resumes from checkpoint-5 with deliberately different values
        # for eval_steps, accumulation, batch size and save_steps.
        checkpoint = os.path.join(tmpdir, "checkpoint-5")
        checkpoint_trainer = get_regression_trainer(
            output_dir=tmpdir,
            train_len=256,
            eval_steps=10,
            gradient_accumulation_steps=4,
            per_device_train_batch_size=8,
            save_steps=10,
            learning_rate=0.1,
        )
        checkpoint_trainer.train(resume_from_checkpoint=checkpoint)

    # Each mismatch must be reported as "<arg>: <new> (from args) != <old> (from trainer_state.json)".
    self.assertIn("save_steps: 10 (from args) != 5 (from trainer_state.json)", cl.out)

    self.assertIn(
        "per_device_train_batch_size: 8 (from args) != 4 (from trainer_state.json)",
        cl.out,
    )
    self.assertIn(
        "eval_steps: 10 (from args) != 5 (from trainer_state.json)",
        cl.out,
    )
|
|
| def check_mem_metrics(self, trainer, check_func): |
| metrics = trainer.train().metrics |
| check_func("init_mem_cpu_alloc_delta", metrics) |
| check_func("train_mem_cpu_alloc_delta", metrics) |
| if backend_device_count(torch_device) > 0: |
| check_func("init_mem_gpu_alloc_delta", metrics) |
| check_func("train_mem_gpu_alloc_delta", metrics) |
|
|
| metrics = trainer.evaluate() |
| check_func("eval_mem_cpu_alloc_delta", metrics) |
| if backend_device_count(torch_device) > 0: |
| check_func("eval_mem_gpu_alloc_delta", metrics) |
|
|
| metrics = trainer.predict(RegressionDataset()).metrics |
| check_func("test_mem_cpu_alloc_delta", metrics) |
| if backend_device_count(torch_device) > 0: |
| check_func("test_mem_gpu_alloc_delta", metrics) |
|
|
| def test_mem_metrics(self): |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| |
| trainer = get_regression_trainer(skip_memory_metrics=False, output_dir=tmp_dir) |
| self.check_mem_metrics(trainer, self.assertIn) |
|
|
| |
| trainer = get_regression_trainer(skip_memory_metrics=True, output_dir=tmp_dir) |
| self.check_mem_metrics(trainer, self.assertNotIn) |
|
|
    @require_torch_fp16
    @require_torch_accelerator
    def test_fp16_full_eval(self):
        """`fp16_full_eval` should roughly halve the GPU memory taken by the model weights.

        The absolute byte thresholds below are empirical calibrations for this small
        regression model — NOTE(review): they include headroom and may need adjustment
        on different hardware/driver stacks.
        """
        debug = 0  # flip to 1 locally to print the measured deltas
        n_gpus = backend_device_count(torch_device)

        with tempfile.TemporaryDirectory() as tmp_dir:
            bs = 8
            eval_len = 16 * n_gpus
            # Inputs slightly perturbed from all-ones so the regression loss is non-trivial.
            a = torch.ones(1000, bs) + 0.001
            b = torch.ones(1000, bs) - 0.001

            # 1. Baseline: default fp32 evaluation.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            del trainer
            gc.collect()

            fp32_init = metrics["init_mem_gpu_alloc_delta"]
            fp32_eval = metrics["eval_mem_gpu_alloc_delta"]

            if debug:
                print(f"fp32_init {fp32_init}")
                print(f"fp32_eval {fp32_eval}")

            # fp32: the bulk of GPU allocation happens at init (weights moved to device)...
            self.assertGreater(fp32_init, 59_000)
            # ...so the eval phase itself allocates comparatively little.
            self.assertLess(fp32_eval, 5_000)

            # 2. Same run with fp16_full_eval: init allocates almost nothing on GPU,
            # the (half-sized) weights show up in the eval delta instead.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False, output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            fp16_init = metrics["init_mem_gpu_alloc_delta"]
            fp16_eval = metrics["eval_mem_gpu_alloc_delta"]

            if debug:
                print(f"fp16_init {fp16_init}")
                print(f"fp16_eval {fp16_eval}")

            # Little GPU memory allocated during init under fp16_full_eval...
            self.assertLess(fp16_init, 5_000)
            # ...and the eval delta now carries the half-precision weights.
            self.assertGreater(fp16_eval, 27_000)

            # Cross-check: fp16 eval footprint ≈ half the fp32 init footprint.
            self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
|
|
    @require_non_xpu
    @require_torch_non_multi_gpu
    @require_torch_tensorrt_fx
    def test_torchdynamo_full_eval(self):
        """Eval loss must be numerically unchanged under each torchdynamo backend (eager, nvfuser, fx2trt)."""
        from torch import _dynamo as torchdynamo

        n_gpus = get_gpu_count()

        bs = 8
        eval_len = 16 * n_gpus
        # Inputs slightly perturbed from all-ones so eval_loss is non-zero.
        a = torch.ones(1000, bs) + 0.001
        b = torch.ones(1000, bs) - 0.001

        with tempfile.TemporaryDirectory() as tmp_dir:
            # 1. Baseline: no compilation.
            trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, output_dir=tmp_dir)
            metrics = trainer.evaluate()
            original_eval_loss = metrics["eval_loss"]
            del trainer

            # 2. Eager backend — dynamo tracing with no real codegen; must match the baseline.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, torch_compile_backend="eager", output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
            del trainer
            # Clear compiled caches between backends so each run compiles from scratch.
            torchdynamo.reset()

            # 3. nvfuser backend.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, torch_compile_backend="nvfuser", output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
            torchdynamo.reset()

            # 4. TensorRT fx2trt backend.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, torch_compile_backend="fx2trt", output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            self.assertAlmostEqual(metrics["eval_loss"], original_eval_loss)
            torchdynamo.reset()
|
|
    @require_torch_non_multi_gpu
    @require_torch_gpu
    def test_torchdynamo_memory(self):
        """Operator fusion via torchdynamo/nvfuser should cut peak GPU memory of a cos-chain by >2x."""
        from torch import _dynamo as torchdynamo

        class CustomTrainer(Trainer):
            # NOTE(review): signature mirrors the base `compute_loss` contract used here;
            # confirm against the current Trainer API if it gains extra parameters.
            def compute_loss(self, model, inputs, return_outputs=False):
                x = inputs["x"]
                output = model(x)
                if self.args.n_gpu == 1:
                    return output.mean()
                return output

        class MyModule(torch.nn.Module):
            """Simple module that does aggressive fusion"""

            def __init__(self):
                super().__init__()

            def forward(self, x):
                # 20 chained elementwise ops: eager mode keeps every intermediate alive,
                # a fusing backend can collapse them into one kernel.
                for _ in range(20):
                    x = torch.cos(x)
                return x

        mod = MyModule()

        # --- Eager baseline ---
        a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
        a.grad = None
        trainer = CustomTrainer(model=mod)
        # Warmup so one measured step is representative (allocator caches, autograd setup).
        for _ in range(10):
            orig_loss = trainer.training_step(mod, {"x": a})

        # Reset allocator statistics so max_memory_allocated() reflects only the measured step.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        orig_loss = trainer.training_step(mod, {"x": a})
        orig_peak_mem = torch.cuda.max_memory_allocated()
        torchdynamo.reset()
        del trainer

        # --- Compiled (nvfuser) run, same measurement protocol ---
        with tempfile.TemporaryDirectory() as tmp_dir:
            a = torch.ones(1024, 1024, device="cuda", requires_grad=True)
            a.grad = None
            args = TrainingArguments(output_dir=tmp_dir, torch_compile_backend="nvfuser")
            trainer = CustomTrainer(model=mod, args=args)
            # Warmup also triggers compilation before measuring.
            for _ in range(10):
                loss = trainer.training_step(mod, {"x": a})

            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

            loss = trainer.training_step(mod, {"x": a})
            peak_mem = torch.cuda.max_memory_allocated()
            torchdynamo.reset()
            del trainer

            # Compilation must not change the numerical result...
            self.assertAlmostEqual(loss, orig_loss)

            # ...while fusion should more than halve the peak memory.
            self.assertGreater(orig_peak_mem, peak_mem * 2)
|
|
    @require_torch_accelerator
    @require_torch_bf16
    def test_bf16_full_eval(self):
        """`bf16_full_eval` should roughly halve the GPU memory taken by the model weights.

        Mirrors `test_fp16_full_eval`; the absolute byte thresholds are empirical
        calibrations for this small regression model — NOTE(review): they include
        headroom and may need adjustment on other hardware.
        """
        debug = 0  # flip to 1 locally to print the measured deltas
        n_gpus = backend_device_count(torch_device)

        bs = 8
        eval_len = 16 * n_gpus
        # Inputs slightly perturbed from all-ones so the regression loss is non-trivial.
        a = torch.ones(1000, bs) + 0.001
        b = torch.ones(1000, bs) - 0.001

        with tempfile.TemporaryDirectory() as tmp_dir:
            # 1. Baseline: default fp32 evaluation.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, skip_memory_metrics=False, output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            del trainer
            gc.collect()

            fp32_init = metrics["init_mem_gpu_alloc_delta"]
            fp32_eval = metrics["eval_mem_gpu_alloc_delta"]

            if debug:
                print(f"fp32_init {fp32_init}")
                print(f"fp32_eval {fp32_eval}")

            # fp32: the bulk of GPU allocation happens at init (weights moved to device)...
            self.assertGreater(fp32_init, 59_000)
            # ...so the eval phase itself allocates comparatively little.
            self.assertLess(fp32_eval, 5_000)

            # 2. Same run with bf16_full_eval: init allocates almost nothing on GPU,
            # the (half-sized) weights show up in the eval delta instead.
            trainer = get_regression_trainer(
                a=a, b=b, eval_len=eval_len, bf16_full_eval=True, skip_memory_metrics=False, output_dir=tmp_dir
            )
            metrics = trainer.evaluate()
            bf16_init = metrics["init_mem_gpu_alloc_delta"]
            bf16_eval = metrics["eval_mem_gpu_alloc_delta"]

            if debug:
                print(f"bf16_init {bf16_init}")
                print(f"bf16_eval {bf16_eval}")

            # Little GPU memory allocated during init under bf16_full_eval...
            self.assertLess(bf16_init, 5_000)
            # ...and the eval delta now carries the bfloat16 weights.
            self.assertGreater(bf16_eval, 27_000)

            # Cross-check: bf16 eval footprint ≈ half the fp32 init footprint.
            self.assertAlmostEqual(bf16_eval, fp32_init / 2, delta=5_000)
|
|
| def test_no_wd_param_group(self): |
| model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)])) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) |
| trainer.create_optimizer_and_scheduler(10) |
| wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight'] |
| wd_params = [p for n, p in model.named_parameters() if n in wd_names] |
| no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names] |
| self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params) |
| self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) |
|
|
    @slow
    @require_non_hpu
    @require_torch_multi_accelerator
    def test_end_to_end_example(self):
        """Smoke test: launch the translation example end to end under `accelerate launch`."""
        # Resolve examples/pytorch/translation/run_translation.py relative to this test file.
        script_path = os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "..", "examples", "pytorch", "translation", "run_translation.py"
            )
        )

        with tempfile.TemporaryDirectory() as tmpdir:
            # Minimal train + predict configuration (64 samples, 1 epoch) to keep the run short.
            command = [
                "accelerate",
                "launch",
                script_path,
                "--model_name_or_path",
                "google-t5/t5-small",
                "--per_device_train_batch_size",
                "1",
                "--output_dir",
                tmpdir,
                "--overwrite_output_dir",
                "--do_train",
                "--max_train_samples",
                "64",
                "--num_train_epochs",
                "1",
                "--dataset_name",
                "wmt16",
                "--dataset_config",
                "ro-en",
                "--source_lang",
                "en",
                "--target_lang",
                "ro",
                "--do_predict",
                "--max_predict_samples",
                "64",
                "--predict_with_generate",
                "--ddp_timeout",
                "60",
                "--report_to",
                "none",
            ]
            # Completing without raising is the success criterion.
            execute_subprocess_async(command)
| |
|
|
| def test_accelerator_config_empty(self): |
| |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| config = RegressionModelConfig(a=1.5, b=2.5) |
| model = RegressionPreTrainedModel(config) |
| eval_dataset = SampleIterableDataset() |
|
|
| |
| args = RegressionTrainingArguments(output_dir=tmp_dir) |
| trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset) |
| self.assertEqual(trainer.accelerator.split_batches, False) |
| self.assertEqual(trainer.accelerator.dispatch_batches, None) |
| self.assertEqual(trainer.accelerator.even_batches, True) |
| self.assertEqual(trainer.accelerator.use_seedable_sampler, True) |
|
|
| if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: |
| |
| self.assertNotIn("sync_each_batch", trainer.accelerator.gradient_state.plugin_kwargs) |
|
|
    def test_accelerator_config_from_dict(self):
        """A plain dict `accelerator_config` must be forwarded onto the created Accelerator."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            config = RegressionModelConfig(a=1.5, b=2.5)
            model = RegressionPreTrainedModel(config)
            eval_dataset = SampleIterableDataset()

            accelerator_config: dict[str, Any] = {
                "split_batches": True,
                "dispatch_batches": True,
                "even_batches": False,
                "use_seedable_sampler": True,
            }
            # gradient_accumulation_kwargs only exist on recent accelerate versions.
            if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE:
                accelerator_config["gradient_accumulation_kwargs"] = {"sync_each_batch": True}

            args = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config=accelerator_config)
            trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
            self.assertEqual(trainer.accelerator.split_batches, True)
            self.assertEqual(trainer.accelerator.dispatch_batches, True)
            self.assertEqual(trainer.accelerator.even_batches, False)
            self.assertEqual(trainer.accelerator.use_seedable_sampler, True)
|
|
    def test_accelerator_config_from_yaml(self):
        """`accelerator_config` may be a path to a config file that is read and applied.

        NOTE(review): despite the test name, the file written here is JSON
        (`accelerator_config.json`), which the loader accepts.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            path_file = Path(tmp_dir) / "accelerator_config.json"
            with open(path_file, "w") as f:
                accelerator_config = {
                    "split_batches": True,
                    "dispatch_batches": True,
                    "even_batches": False,
                    "use_seedable_sampler": False,
                }
                json.dump(accelerator_config, f)
            config = RegressionModelConfig(a=1.5, b=2.5)
            model = RegressionPreTrainedModel(config)
            eval_dataset = SampleIterableDataset()

            # Pass the file path (not the dict) and verify every value lands on the Accelerator.
            args = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config=path_file)
            trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
            self.assertEqual(trainer.accelerator.split_batches, True)
            self.assertEqual(trainer.accelerator.dispatch_batches, True)
            self.assertEqual(trainer.accelerator.even_batches, False)
            self.assertEqual(trainer.accelerator.use_seedable_sampler, False)
|
|
| def test_accelerator_config_from_dataclass(self): |
| |
| |
|
|
| accelerator_config = AcceleratorConfig( |
| split_batches=True, |
| dispatch_batches=True, |
| even_batches=False, |
| use_seedable_sampler=False, |
| ) |
| config = RegressionModelConfig(a=1.5, b=2.5) |
| model = RegressionPreTrainedModel(config) |
| eval_dataset = SampleIterableDataset() |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| args = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config=accelerator_config) |
| trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset) |
| self.assertEqual(trainer.accelerator.split_batches, True) |
| self.assertEqual(trainer.accelerator.dispatch_batches, True) |
| self.assertEqual(trainer.accelerator.even_batches, False) |
| self.assertEqual(trainer.accelerator.use_seedable_sampler, False) |
|
|
    @require_accelerate_version_min_0_28
    def test_accelerate_config_from_dataclass_grad_accum(self):
        """`gradient_accumulation_kwargs["num_steps"]` in the config must drive `args.gradient_accumulation_steps`."""
        grad_acc_kwargs = {
            "num_steps": 10,
            "adjust_scheduler": False,
            "sync_with_dataloader": False,
            "sync_each_batch": True,
        }
        accelerator_config = AcceleratorConfig(
            split_batches=True,
            dispatch_batches=True,
            even_batches=False,
            use_seedable_sampler=False,
            gradient_accumulation_kwargs=grad_acc_kwargs,
        )
        config = RegressionModelConfig(a=1.5, b=2.5)
        model = RegressionPreTrainedModel(config)
        eval_dataset = SampleIterableDataset()
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config=accelerator_config)
            trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
            # The accelerate-side num_steps overrides the TrainingArguments default.
            self.assertEqual(trainer.args.gradient_accumulation_steps, 10)
|
|
    def test_accelerator_config_from_partial(self):
        """A partial `accelerator_config` dict applies the given keys and leaves the rest at defaults."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            config = RegressionModelConfig(a=1.5, b=2.5)
            model = RegressionPreTrainedModel(config)
            eval_dataset = SampleIterableDataset()

            # Only override split_batches; everything else should keep its default.
            args = RegressionTrainingArguments(
                output_dir=tmp_dir,
                accelerator_config={
                    "split_batches": True,
                },
            )
            trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
            self.assertEqual(trainer.accelerator.split_batches, True)
            self.assertEqual(trainer.accelerator.dispatch_batches, None)
            self.assertEqual(trainer.accelerator.even_batches, True)
            self.assertEqual(trainer.accelerator.use_seedable_sampler, True)
|
|
| def test_accelerator_custom_state(self): |
| AcceleratorState._reset_state(reset_partial_state=True) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| with self.assertRaises(ValueError) as cm: |
| _ = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config={"use_configured_state": True}) |
| self.assertIn("Please define this beforehand", str(cm.warnings[0].message)) |
| _ = Accelerator() |
| _ = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config={"use_configured_state": True}) |
| AcceleratorState._reset_state(reset_partial_state=True) |
|
|
    @require_accelerate_version_min_0_28
    def test_accelerator_config_from_dict_grad_accum_num_steps(self):
        """`gradient_accumulation_kwargs["num_steps"]` must agree with `gradient_accumulation_steps`."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            config = RegressionModelConfig(a=1.5, b=2.5)
            model = RegressionPreTrainedModel(config)
            eval_dataset = SampleIterableDataset()

            # Matching values: accepted, and the value is visible on the gradient state.
            args = RegressionTrainingArguments(
                output_dir=tmp_dir,
                gradient_accumulation_steps=1,
                accelerator_config={
                    "gradient_accumulation_kwargs": {
                        "num_steps": 1,
                    }
                },
            )
            trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
            self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["num_steps"], 1)

            # Conflicting values: Trainer construction must fail.
            # NOTE(review): asserts on `Exception` broadly — the concrete type raised is
            # not pinned here; only the message is checked.
            args = RegressionTrainingArguments(
                output_dir=tmp_dir,
                gradient_accumulation_steps=2,
                accelerator_config={
                    "gradient_accumulation_kwargs": {
                        "num_steps": 10,
                    }
                },
            )
            with self.assertRaises(Exception) as context:
                trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
            self.assertTrue("The `AcceleratorConfig`'s `num_steps` is set but" in str(context.exception))
|
|
    def test_accelerator_config_not_instantiated(self):
        """Passing the `AcceleratorConfig` class itself (a callable, not an instance) must be rejected."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            with self.assertRaises(NotImplementedError) as context:
                _ = RegressionTrainingArguments(
                    output_dir=tmp_dir,
                    accelerator_config=AcceleratorConfig,
                )
            self.assertTrue("Tried passing in a callable to `accelerator_config`" in str(context.exception))

        # Same rejection when a subclass is used as a field default in a custom TrainingArguments.
        @dataclasses.dataclass
        class CustomAcceleratorConfig(AcceleratorConfig):
            pass

        @dataclasses.dataclass
        class CustomTrainingArguments(TrainingArguments):
            accelerator_config: dict = dataclasses.field(
                default=CustomAcceleratorConfig,
            )

        with tempfile.TemporaryDirectory() as tmp_dir:
            with self.assertRaises(NotImplementedError) as context:
                _ = CustomTrainingArguments(
                    output_dir=tmp_dir,
                )
            self.assertTrue("Tried passing in a callable to `accelerator_config`" in str(context.exception))
|
|
| def test_torch_dtype_to_json(self): |
| @dataclasses.dataclass |
| class TorchDtypeTrainingArguments(TrainingArguments): |
| torch_dtype: torch.dtype = dataclasses.field( |
| default=torch.float32, |
| ) |
|
|
| for dtype in [ |
| "float32", |
| "float64", |
| "complex64", |
| "complex128", |
| "float16", |
| "bfloat16", |
| "uint8", |
| "int8", |
| "int16", |
| "int32", |
| "int64", |
| "bool", |
| ]: |
| torch_dtype = getattr(torch, dtype) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| args = TorchDtypeTrainingArguments(output_dir=tmp_dir, torch_dtype=torch_dtype) |
|
|
| args_dict = args.to_dict() |
| self.assertIn("torch_dtype", args_dict) |
| self.assertEqual(args_dict["torch_dtype"], dtype) |
|
|
| @require_accelerate_version_min_0_30 |
| def test_eval_use_gather_object(self): |
| train_dataset = RegressionDataset() |
| eval_dataset = RegressionDataset() |
| model = RegressionDictModel() |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| args = TrainingArguments(tmp_dir, report_to="none", eval_use_gather_object=True) |
| trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset) |
| trainer.train() |
| _ = trainer.evaluate() |
| _ = trainer.predict(eval_dataset) |
|
|
    def test_trainer_saves_tokenizer(self):
        """`save_model` must also persist the tokenizer passed as `processing_class`."""
        MODEL_ID = "google-bert/bert-base-uncased"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

        with tempfile.TemporaryDirectory() as tmp_dir:
            config = RegressionModelConfig(a=1.5, b=2.5)
            trainer = Trainer(
                model=RegressionPreTrainedModel(config),
                args=TrainingArguments(output_dir=tmp_dir),
                processing_class=tokenizer,
            )
            trainer.save_model()

            reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)

            # Functional round-trip check: the reloaded tokenizer must produce identical
            # encodings rather than merely identical config dicts.
            test_sentence = "This is a test sentence"
            self.assertListEqual(
                tokenizer(test_sentence, padding="max_length").input_ids,
                reloaded_tokenizer(test_sentence, padding="max_length").input_ids,
            )
|
|
    @require_vision
    def test_trainer_saves_image_processor(self):
        """`save_model` must also persist the image processor passed as `processing_class`."""
        MODEL_ID = "openai/clip-vit-base-patch32"
        image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)

        with tempfile.TemporaryDirectory() as tmp_dir:
            config = RegressionModelConfig(a=1.5, b=2.5)
            trainer = Trainer(
                model=RegressionPreTrainedModel(config),
                args=TrainingArguments(output_dir=tmp_dir),
                processing_class=image_processor,
            )
            trainer.save_model()
            reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)

            # Round-trip: the reloaded processor must be configured identically.
            self.assertDictEqual(image_processor.to_dict(), reloaded_image_processor.to_dict())
|
|
| def test_trainer_saves_feature_extractor(self): |
| MODEL_ID = "facebook/wav2vec2-base-960h" |
| feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID) |
|
|
| with tempfile.TemporaryDirectory() as tmp_dir: |
| config = RegressionModelConfig(a=1.5, b=2.5) |
| trainer = Trainer( |
| model=RegressionPreTrainedModel(config), |
| args=TrainingArguments(output_dir=tmp_dir), |
| processing_class=feature_extractor, |
| ) |
| trainer.save_model() |
|
|
| reloaded_feature_extractor = AutoFeatureExtractor.from_pretrained(tmp_dir) |
|
|
| self.assertDictEqual(feature_extractor.to_dict(), reloaded_feature_extractor.to_dict()) |
|
|
    @require_vision
    def test_trainer_saves_processor(self):
        """`save_model` with a composite processor must persist processor, image processor and tokenizer."""
        MODEL_ID = "openai/clip-vit-base-patch32"
        image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
        processor = AutoProcessor.from_pretrained(MODEL_ID)

        with tempfile.TemporaryDirectory() as tmp_dir:
            config = RegressionModelConfig(a=1.5, b=2.5)
            trainer = Trainer(
                model=RegressionPreTrainedModel(config),
                args=TrainingArguments(output_dir=tmp_dir),
                processing_class=processor,
            )
            trainer.save_model()

            # All three entry points must be loadable from the save directory.
            reloaded_processor = AutoProcessor.from_pretrained(tmp_dir)
            reloaded_image_processor = AutoImageProcessor.from_pretrained(tmp_dir)
            reloaded_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)

            self.assertDictEqual(reloaded_processor.to_dict(), processor.to_dict())

            image_processor_dict = image_processor.to_dict()
            reloaded_image_processor_dict = reloaded_image_processor.to_dict()
            # `_processor_class` is bookkeeping set on save through the processor; strip it
            # so the comparison covers only the real image-processing configuration.
            image_processor_dict.pop("_processor_class")
            reloaded_image_processor_dict.pop("_processor_class")
            self.assertDictEqual(image_processor_dict, reloaded_image_processor_dict)

            # Functional round-trip check on the tokenizer: identical encodings, not just config.
            test_sentence = "This is a test sentence"
            self.assertListEqual(
                tokenizer(test_sentence, padding="max_length").input_ids,
                reloaded_tokenizer(test_sentence, padding="max_length").input_ids,
            )
|
|
    def test_save_best_checkpoint(self):
        """`save_strategy="best"` must save a checkpoint only when the monitored metric improves."""
        # One eval/save opportunity per epoch: 64 training samples / batch_size steps per epoch.
        freq = int(64 / self.batch_size)
        total = int(self.n_epochs * 64 / self.batch_size)

        # Case 1: monitor accuracy (higher is better). Improvements at epochs 1 and 2,
        # regression at epoch 3 -> exactly two checkpoints saved.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                eval_strategy="epoch",
                save_strategy="best",
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
            )
            self.assertTrue(trainer.args.metric_for_best_model == "accuracy")

            # Stub out evaluation so the metric trajectory is fully controlled.
            with patch.object(
                trainer,
                "_evaluate",
                side_effect=[
                    {"eval_loss": 0.03, "eval_accuracy": 0.60, "epoch": 1.0},
                    {"eval_loss": 0.02, "eval_accuracy": 0.65, "epoch": 2.0},
                    {"eval_loss": 0.01, "eval_accuracy": 0.64, "epoch": 3.0},
                ],
            ):
                trainer.train()

            # Two improving evaluations -> two checkpoint directories in the output dir.
            self.assertEqual(len(os.listdir(tmpdir)), 2)
            self.check_saved_checkpoints(
                output_dir=tmpdir,
                freq=freq,
                total=total,
            )

        # Case 2: monitor loss (lower is better). Improvements at epochs 1 and 2,
        # regression at epoch 3 -> again exactly two checkpoints.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                eval_strategy="epoch",
                save_strategy="best",
                metric_for_best_model="loss",
                compute_metrics=AlmostAccuracy(),
            )
            self.assertTrue(trainer.args.metric_for_best_model == "loss")

            with patch.object(
                trainer,
                "_evaluate",
                side_effect=[
                    {"eval_loss": 0.03, "eval_accuracy": 0.60, "epoch": 1.0},
                    {"eval_loss": 0.02, "eval_accuracy": 0.65, "epoch": 2.0},
                    {"eval_loss": 0.03, "eval_accuracy": 0.66, "epoch": 3.0},
                ],
            ):
                trainer.train()

            self.assertEqual(len(os.listdir(tmpdir)), 2)
            self.check_saved_checkpoints(
                output_dir=tmpdir,
                freq=freq,
                total=total,
            )
|
|
    def test_metric_for_best_model_behavior(self):
        """Validation rules around `metric_for_best_model`.

        1. `save_strategy="best"` requires an explicit metric -> ValueError when missing.
        2. `load_best_model_at_end=True` without a metric silently defaults to "loss".
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            with self.assertRaises(ValueError) as context:
                trainer = get_regression_trainer(
                    a=1.5,
                    b=2.5,
                    output_dir=tmpdir,
                    learning_rate=0.1,
                    eval_strategy="epoch",
                    save_strategy="best",
                    compute_metrics=AlmostAccuracy(),
                )
            self.assertIn("`args.metric_for_best_model` must be provided", str(context.exception))

        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                a=1.5,
                b=2.5,
                output_dir=tmpdir,
                learning_rate=0.1,
                eval_strategy="steps",
                save_strategy="steps",
                load_best_model_at_end=True,
            )
            # Defaulted, not raised: "loss" is assumed when load_best_model_at_end is set.
            self.assertTrue(trainer.args.metric_for_best_model == "loss")
|
|
    def test_best_model_checkpoint_behavior(self):
        """How `best_metric` / `best_model_checkpoint` / `best_global_step` interact with
        save/eval strategies, `save_total_limit`, and checkpoint rotation."""
        # Scenario 1: saving every step, but no compute_metrics -> the monitored metric
        # ("accuracy") is never produced, so no "best" state is ever recorded.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                eval_strategy="steps",
                save_strategy="steps",
                save_steps=1,
                metric_for_best_model="accuracy",
                greater_is_better=True,
            )
            trainer.train()

            assert trainer.state.best_metric is None
            assert trainer.state.best_model_checkpoint is None
            # One checkpoint directory per training step.
            assert len(os.listdir(tmpdir)) == trainer.state.global_step

        # Scenario 2: same, plus save_total_limit=1 -> rotation keeps only the newest
        # checkpoint since there is no best one to protect.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                eval_strategy="steps",
                save_strategy="steps",
                save_steps=1,
                metric_for_best_model="accuracy",
                greater_is_better=True,
                save_total_limit=1,
            )
            trainer.train()

            num_steps = trainer.state.global_step

            assert trainer.state.best_metric is None
            assert trainer.state.best_model_checkpoint is None
            assert len(os.listdir(tmpdir)) == 1

            # The surviving checkpoint is the one from the final step.
            ckpt = os.path.join(tmpdir, f"{PREFIX_CHECKPOINT_DIR}-{num_steps}")
            assert os.path.isdir(ckpt)
            assert os.listdir(tmpdir)[0] == f"{PREFIX_CHECKPOINT_DIR}-{num_steps}"

        # Scenario 3: eval+save per epoch with strictly decreasing accuracy -> the first
        # epoch remains the best; best_model_checkpoint points at its checkpoint.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                eval_strategy="epoch",
                save_strategy="epoch",
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
                greater_is_better=True,
                load_best_model_at_end=False,
            )

            with patch.object(
                trainer,
                "_evaluate",
                side_effect=evaluate_side_effect_factory(
                    [
                        {"eval_accuracy": 0.59},
                        {"eval_accuracy": 0.57},
                        {"eval_accuracy": 0.55},
                    ]
                ),
            ):
                trainer.train()

            steps_per_epoch = get_steps_per_epoch(trainer)

            assert trainer.state.best_metric == 0.59
            # Best was reached at the end of epoch 1.
            assert trainer.state.best_global_step == steps_per_epoch

            best_ckpt = os.path.join(tmpdir, f"{PREFIX_CHECKPOINT_DIR}-{trainer.state.best_global_step}")
            assert trainer.state.best_model_checkpoint == best_ckpt

            # One checkpoint per epoch.
            assert len(os.listdir(tmpdir)) == trainer.state.num_train_epochs

        # Scenario 4: eval per epoch but save per step -> the best step still aligns with
        # an existing per-step checkpoint.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                eval_strategy="epoch",
                save_strategy="steps",
                save_steps=1,
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
                greater_is_better=True,
                load_best_model_at_end=False,
            )

            with patch.object(
                trainer,
                "_evaluate",
                side_effect=evaluate_side_effect_factory(
                    [
                        {"eval_accuracy": 0.59},
                        {"eval_accuracy": 0.57},
                        {"eval_accuracy": 0.55},
                    ]
                ),
            ):
                trainer.train()

            steps_per_epoch = get_steps_per_epoch(trainer)

            assert trainer.state.best_metric == 0.59
            assert trainer.state.best_global_step == steps_per_epoch

            best_ckpt = os.path.join(tmpdir, f"{PREFIX_CHECKPOINT_DIR}-{trainer.state.best_global_step}")
            assert trainer.state.best_model_checkpoint == best_ckpt

            # One checkpoint per step this time.
            assert len(os.listdir(tmpdir)) == trainer.state.global_step

        # Scenario 5: eval+save each step with save_total_limit=1 and decreasing accuracy ->
        # rotation must protect the best checkpoint (step 1), which is the only one kept.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                eval_strategy="steps",
                eval_steps=1,
                save_strategy="steps",
                save_steps=1,
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
                greater_is_better=True,
                save_total_limit=1,
            )

            with patch.object(
                trainer,
                "_evaluate",
                side_effect=evaluate_side_effect_factory(
                    [
                        {"eval_accuracy": 0.90},
                        {"eval_accuracy": 0.80},
                        {"eval_accuracy": 0.70},
                    ]
                ),
            ):
                trainer.train()

            assert trainer.state.best_metric == 0.90
            assert trainer.state.best_global_step == 1

            best_ckpt = os.path.join(tmpdir, f"{PREFIX_CHECKPOINT_DIR}-{trainer.state.best_global_step}")
            assert trainer.state.best_model_checkpoint == best_ckpt

            assert len(os.listdir(tmpdir)) == 1

        # Scenario 6: eval every 3 steps but save every 2 -> the best step (3) never has
        # a checkpoint, so best_model_checkpoint must remain unset.
        with tempfile.TemporaryDirectory() as tmpdir:
            trainer = get_regression_trainer(
                output_dir=tmpdir,
                eval_strategy="steps",
                eval_steps=3,
                save_strategy="steps",
                save_steps=2,
                metric_for_best_model="accuracy",
                compute_metrics=AlmostAccuracy(),
                greater_is_better=True,
            )

            with patch.object(
                trainer,
                "_evaluate",
                side_effect=evaluate_side_effect_factory(
                    [
                        {"eval_accuracy": 0.90},
                        {"eval_accuracy": 0.80},
                        {"eval_accuracy": 0.70},
                    ]
                ),
            ):
                trainer.train()

            assert trainer.state.best_metric == 0.90
            assert trainer.state.best_global_step == 3

            assert trainer.state.best_model_checkpoint is None

            # Checkpoints exist only at even steps.
            assert len(os.listdir(tmpdir)) == trainer.state.global_step // 2
|
|
|
|
| @require_torch |
| @is_staging_test |
| class TrainerIntegrationWithHubTester(unittest.TestCase): |
    @classmethod
    def setUpClass(cls):
        """Authenticate against the staging Hub once for every test in this class."""
        cls._token = TOKEN
        HfFolder.save_token(TOKEN)
|
|
    def test_push_to_hub(self):
        """`push_to_hub` must create a repo under the user namespace and upload loadable weights."""
        with TemporaryHubRepo(token=self._token) as tmp_repo:
            output_dir_name = tmp_repo.repo_name
            with tempfile.TemporaryDirectory() as tmp_dir:
                trainer = get_regression_trainer(
                    output_dir=os.path.join(tmp_dir, output_dir_name),
                    push_to_hub=True,
                    hub_token=self._token,
                )
                url = trainer.push_to_hub()

            # Extract "<namespace>/<repo>" from the returned staging-endpoint URL.
            re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
            self.assertTrue(re_search is not None)
            repo_name = re_search.groups()[0]

            self.assertEqual(repo_name, f"{USER}/{output_dir_name}")

            # The pushed weights must round-trip through from_pretrained.
            model = RegressionPreTrainedModel.from_pretrained(repo_name)
            self.assertEqual(model.a.item(), trainer.model.a.item())
            self.assertEqual(model.b.item(), trainer.model.b.item())
|
|
    def test_push_to_hub_in_organization(self):
        """`push_to_hub` with `hub_model_id="<org>/<name>"` must create the repo in that organization."""
        with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo:
            with tempfile.TemporaryDirectory() as tmp_dir:
                # First trainer only produces a local save; the pushing trainer is built below.
                trainer = get_regression_trainer(output_dir=tmp_dir)
                trainer.save_model()
                output_dir_name = tmp_repo.repo_name
                trainer = get_regression_trainer(
                    output_dir=os.path.join(tmp_dir, output_dir_name),
                    push_to_hub=True,
                    hub_model_id=f"valid_org/{output_dir_name}",
                    hub_token=self._token,
                )
                url = trainer.push_to_hub()

            # Extract "<namespace>/<repo>" from the returned staging-endpoint URL.
            re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url)
            self.assertTrue(re_search is not None)
            repo_name = re_search.groups()[0]
            self.assertEqual(repo_name, f"valid_org/{output_dir_name}")

            # The pushed weights must round-trip through from_pretrained under the org namespace.
            model = RegressionPreTrainedModel.from_pretrained(f"valid_org/{output_dir_name}")
            self.assertEqual(model.a.item(), trainer.model.a.item())
            self.assertEqual(model.b.item(), trainer.model.b.item())
|
|
| def get_commit_history(self, repo): |
| commit_logs = subprocess.run( |
| "git log".split(), |
| capture_output=True, |
| check=True, |
| encoding="utf-8", |
| cwd=repo, |
| ).stdout |
| commits = commit_logs.split("\n\n")[1::2] |
| return [commit.strip() for commit in commits] |
|
|
    def test_push_to_hub_with_saves_each_epoch(self):
        """With `save_strategy="epoch"` and push enabled, each epoch produces a Hub commit."""
        with TemporaryHubRepo(token=self._token) as tmp_repo:
            with tempfile.TemporaryDirectory() as tmp_dir:
                with self.assertLogs(level="WARNING") as logs:
                    output_dir_name = tmp_repo.repo_name
                    trainer = get_regression_trainer(
                        output_dir=os.path.join(tmp_dir, output_dir_name),
                        push_to_hub=True,
                        hub_token=self._token,
                        # Force a push attempt even when nothing changed, so the
                        # empty-commit warning path below is exercised deterministically.
                        hub_always_push=True,
                        save_strategy="epoch",
                    )
                    trainer.train()

            commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token)
            commits = [c.title for c in commits]
            self.assertIn("initial commit", commits)
            self.assertIn("Training in progress, epoch 1", commits)
            self.assertIn("Training in progress, epoch 2", commits)
            # The final epoch's save has no file changes, so the push is skipped with a warning.
            self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))
|
|
| def test_push_to_hub_with_saves_each_n_steps(self): |
| num_gpus = max(1, backend_device_count(torch_device)) |
| if num_gpus > 2: |
| self.skipTest(reason="More than 2 GPUs available") |
|
|
| with TemporaryHubRepo(token=self._token) as tmp_repo: |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| with self.assertLogs(level="WARNING") as logs: |
| output_dir_name = tmp_repo.repo_name |
| trainer = get_regression_trainer( |
| output_dir=os.path.join(tmp_dir, output_dir_name), |
| push_to_hub=True, |
| hub_token=self._token, |
| |
| hub_always_push=True, |
| save_strategy="steps", |
| save_steps=5, |
| ) |
| trainer.train() |
|
|
| commits = list_repo_commits(f"{USER}/{output_dir_name}", token=self._token) |
| commits = [c.title for c in commits] |
| self.assertIn("initial commit", commits) |
|
|
| |
| |
| nb_empty_commits = len( |
| [record for record in logs.records if "Skipping to prevent empty commit." in record.message] |
| ) |
| nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit]) |
|
|
| |
| max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader())) |
| nb_expected_commits = len(range(5, max_steps, 5)) |
|
|
| |
| self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits) |
|
|
| @require_tensorboard |
| def test_push_to_hub_with_tensorboard_logs(self): |
| with TemporaryHubRepo(token=self._token) as tmp_repo: |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| output_dir_name = tmp_repo.repo_name |
| trainer = get_regression_trainer( |
| output_dir=os.path.join(tmp_dir, output_dir_name), |
| hub_token=self._token, |
| save_strategy="epoch", |
| report_to=["tensorboard"], |
| keep_report_to=True, |
| ) |
| trainer.train() |
| |
| trainer.push_to_hub() |
|
|
| files = list_repo_files(f"{USER}/{output_dir_name}", token=self._token) |
| found_log = False |
| for f in files: |
| if len(f.split("runs")) > 1 and "events.out.tfevents" in f: |
| found_log = True |
|
|
| assert found_log is True, "No tensorboard log found in repo" |
|
|
| def test_push_to_hub_tags(self): |
| |
| |
| |
| with TemporaryHubRepo(token=self._token) as tmp_repo: |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| output_dir_name = tmp_repo.repo_name |
| trainer = get_regression_trainer( |
| output_dir=os.path.join(tmp_dir, output_dir_name), |
| push_to_hub=True, |
| hub_token=self._token, |
| ) |
|
|
| trainer.model.add_model_tags(["test-trainer-tags"]) |
|
|
| url = trainer.push_to_hub() |
|
|
| |
| re_search = re.search(ENDPOINT_STAGING + r"/([^/]+/[^/]+)/", url) |
| self.assertTrue(re_search is not None) |
| repo_name = re_search.groups()[0] |
|
|
| self.assertEqual(repo_name, f"{USER}/{output_dir_name}") |
|
|
| model_card = ModelCard.load(repo_name) |
| self.assertTrue("test-trainer-tags" in model_card.data.tags) |
|
|
| def test_push_to_hub_with_revision(self): |
| |
| with TemporaryHubRepo(token=self._token) as tmp_repo: |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| output_dir_name = tmp_repo.repo_name |
| trainer = get_regression_trainer( |
| output_dir=os.path.join(tmp_dir, output_dir_name), |
| push_to_hub=True, |
| hub_token=self._token, |
| ) |
| branch = "v1.0" |
| create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True) |
| url = trainer.push_to_hub(revision=branch) |
|
|
| |
| re_search = re.search(r"tree/([^/]+)/", url) |
| self.assertIsNotNone(re_search) |
|
|
| branch_name = re_search.groups()[0] |
| self.assertEqual(branch_name, branch) |
|
|
|
|
@require_torch
@require_optuna
class TrainerHyperParameterOptunaIntegrationTest(unittest.TestCase):
    """Smoke-test single-objective hyperparameter search with the optuna backend."""

    def setUp(self):
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_hyperparameter_search(self):
        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            # Empty search space: the trial is sampled inside model_init instead.
            return {}

        def model_init(trial):
            # Sample (a, b) from the trial when one is given, else fall back to zeros.
            if trial is None:
                a = b = 0
            else:
                a = trial.suggest_int("a", -4, 4)
                b = trial.suggest_int("b", -4, 4)
            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        def hp_name(trial):
            return MyTrialShortNamer.shortname(trial.params)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            trainer.hyperparameter_search(direction="minimize", hp_space=hp_space, hp_name=hp_name, n_trials=4)
|
|
|
|
@require_torch
@require_optuna
class TrainerHyperParameterMultiObjectOptunaIntegrationTest(unittest.TestCase):
    """Smoke-test multi-objective (loss + accuracy) hyperparameter search with optuna."""

    def setUp(self):
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_hyperparameter_search(self):
        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            # Empty search space: the trial is sampled inside model_init instead.
            return {}

        def model_init(trial):
            # Sample (a, b) from the trial when one is given, else fall back to zeros.
            if trial is None:
                a = b = 0
            else:
                a = trial.suggest_int("a", -4, 4)
                b = trial.suggest_int("b", -4, 4)
            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        def hp_name(trial):
            return MyTrialShortNamer.shortname(trial.params)

        def compute_objective(metrics: dict[str, float]) -> list[float]:
            # Two objectives, matching direction=["minimize", "maximize"] below.
            return metrics["eval_loss"], metrics["eval_accuracy"]

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=10,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
                compute_metrics=AlmostAccuracy(),
            )
            trainer.hyperparameter_search(
                direction=["minimize", "maximize"],
                hp_space=hp_space,
                hp_name=hp_name,
                n_trials=4,
                compute_objective=compute_objective,
            )
|
|
|
|
@require_torch
@require_optuna
class TrainerHyperParameterOptunaIntegrationTestWithFullEval(unittest.TestCase):
    """Hyperparameter search must also work when evaluation runs fully in fp16."""

    def test_hyperparameter_search(self):
        def hp_space(trial):
            # Empty search space: the trial is sampled inside model_init instead.
            return {}

        def model_init(trial):
            # Sample (a, b) from the trial when one is given, else fall back to zeros.
            if trial is None:
                a = b = 0
            else:
                a = trial.suggest_int("a", -4, 4)
                b = trial.suggest_int("b", -4, 4)
            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                disable_tqdm=True,
                model_init=model_init,
                fp16_full_eval=True,
            )
            trainer.hyperparameter_search(
                direction="minimize",
                hp_space=hp_space,
                n_trials=2,
            )
|
|
|
|
@require_torch
@require_ray
class TrainerHyperParameterRayIntegrationTest(unittest.TestCase):
    """Smoke-test hyperparameter search with the ray-tune backend, directly and via a ray client."""

    def setUp(self):
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def ray_hyperparameter_search(self):
        # Shared driver: used both with a local ray runtime and through a ray client.
        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            from ray import tune

            return {
                "a": tune.randint(-4, 4),
                "b": tune.randint(-4, 4),
            }

        def model_init(config):
            # Build the regression model from a sweep config; zeros when no config is given.
            if config is None:
                a = b = 0
            else:
                a, b = config["a"], config["b"]
            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        def hp_name(params):
            return MyTrialShortNamer.shortname(params)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            trainer.hyperparameter_search(
                direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="ray", n_trials=4
            )

    def test_hyperparameter_search(self):
        self.ray_hyperparameter_search()

    def test_hyperparameter_search_ray_client(self):
        import ray
        from ray.util.client.ray_client_helpers import ray_start_client_server

        with ray_start_client_server():
            assert ray.util.client.ray.is_connected()
            self.ray_hyperparameter_search()
|
|
|
|
@slow
@require_torch
@require_sigopt
class TrainerHyperParameterSigOptIntegrationTest(unittest.TestCase):
    """Smoke-test hyperparameter search with the sigopt backend."""

    def setUp(self):
        default_args = TrainingArguments("..")
        self.n_epochs = default_args.num_train_epochs
        self.batch_size = default_args.train_batch_size

    def test_hyperparameter_search(self):
        class MyTrialShortNamer(TrialShortNamer):
            DEFAULTS = {"a": 0, "b": 0}

        def hp_space(trial):
            # SigOpt expects a list of parameter descriptors rather than a dict.
            return [
                {"bounds": {"min": -4, "max": 4}, "name": "a", "type": "int"},
                {"bounds": {"min": -4, "max": 4}, "name": "b", "type": "int"},
            ]

        def model_init(trial):
            # Read (a, b) from the trial assignments when given, else fall back to zeros.
            if trial is None:
                a = b = 0
            else:
                a = trial.assignments["a"]
                b = trial.assignments["b"]
            return RegressionPreTrainedModel(RegressionModelConfig(a=a, b=b, double_output=False))

        def hp_name(trial):
            return MyTrialShortNamer.shortname(trial.assignments)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            trainer.hyperparameter_search(
                direction="minimize", hp_space=hp_space, hp_name=hp_name, backend="sigopt", n_trials=4
            )
|
|
|
|
# Parameter grid for TrainerOptimizerChoiceTest.test_optim_supported. Each entry is
# (optim value, expected optimizer class, expected constructor kwargs). Entries are
# appended conditionally so the grid only exercises backends that are installed.
optim_test_params = []
if is_torch_available():
    # AdamW-style kwargs derived from the TrainingArguments class-level defaults.
    default_adam_kwargs = {
        "betas": (TrainingArguments.adam_beta1, TrainingArguments.adam_beta2),
        "eps": TrainingArguments.adam_epsilon,
        "lr": TrainingArguments.learning_rate,
    }

    # Lion takes betas and lr but no epsilon.
    default_lion_kwargs = {
        "betas": (TrainingArguments.adam_beta1, TrainingArguments.adam_beta2),
        "lr": TrainingArguments.learning_rate,
    }

    # AdEMAMix takes a third beta and an alpha mixing coefficient on top of Adam's kwargs.
    default_ademamix_kwargs = {
        "betas": (TrainingArguments.adam_beta1, TrainingArguments.adam_beta2, 0.9999),
        "alpha": 5.0,
        "eps": TrainingArguments.adam_epsilon,
        "lr": TrainingArguments.learning_rate,
    }

    # Extra kwargs for torchdistx AnyPrecisionAdamW; merged with default_adam_kwargs below.
    default_anyprecision_kwargs = {
        "use_kahan_summation": False,
        "momentum_dtype": torch.float32,
        "variance_dtype": torch.float32,
        "compensation_buffer_dtype": torch.bfloat16,
    }

    # Optimizers always available when torch is installed.
    optim_test_params = [
        (
            OptimizerNames.ADAMW_TORCH,
            torch.optim.AdamW,
            default_adam_kwargs,
        ),
        (
            OptimizerNames.ADAFACTOR,
            transformers.optimization.Adafactor,
            {
                "scale_parameter": False,
                "relative_step": False,
                "lr": TrainingArguments.learning_rate,
            },
        ),
    ]

    if is_apex_available():
        import apex

        optim_test_params.append(
            (
                OptimizerNames.ADAMW_APEX_FUSED,
                apex.optimizers.FusedAdam,
                default_adam_kwargs,
            )
        )

    if is_bitsandbytes_available():
        import bitsandbytes as bnb

        # Plain, 8-bit, and paged AdamW variants all map to bnb.optim.AdamW.
        optim_test_params.append(
            (
                OptimizerNames.ADAMW_BNB,
                bnb.optim.AdamW,
                default_adam_kwargs,
            )
        )

        optim_test_params.append(
            (
                OptimizerNames.ADAMW_8BIT,
                bnb.optim.AdamW,
                default_adam_kwargs,
            )
        )

        optim_test_params.append(
            (
                OptimizerNames.PAGED_ADAMW,
                bnb.optim.AdamW,
                default_adam_kwargs,
            )
        )

        optim_test_params.append(
            (
                OptimizerNames.PAGED_ADAMW_8BIT,
                bnb.optim.AdamW,
                default_adam_kwargs,
            )
        )

        # Lion variants all map to bnb.optim.Lion.
        optim_test_params.append(
            (
                OptimizerNames.LION,
                bnb.optim.Lion,
                default_lion_kwargs,
            )
        )

        optim_test_params.append(
            (
                OptimizerNames.LION_8BIT,
                bnb.optim.Lion,
                default_lion_kwargs,
            )
        )

        optim_test_params.append(
            (
                OptimizerNames.PAGED_LION_8BIT,
                bnb.optim.Lion,
                default_lion_kwargs,
            )
        )

        # AdEMAMix was added to bitsandbytes in 0.44.0.
        if version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse("0.44.0"):
            optim_test_params.append(
                (
                    OptimizerNames.ADEMAMIX,
                    bnb.optim.AdEMAMix,
                    default_ademamix_kwargs,
                )
            )
            optim_test_params.append(
                (
                    OptimizerNames.ADEMAMIX_8BIT,
                    bnb.optim.AdEMAMix,
                    default_ademamix_kwargs,
                )
            )
            optim_test_params.append(
                (
                    OptimizerNames.PAGED_ADEMAMIX_8BIT,
                    bnb.optim.AdEMAMix,
                    default_ademamix_kwargs,
                )
            )
            optim_test_params.append(
                (
                    OptimizerNames.PAGED_ADEMAMIX,
                    bnb.optim.AdEMAMix,
                    default_ademamix_kwargs,
                )
            )

    if is_torchdistx_available():
        import torchdistx

        optim_test_params.append(
            (
                OptimizerNames.ADAMW_ANYPRECISION,
                torchdistx.optimizers.AnyPrecisionAdamW,
                # Adam defaults plus the anyprecision-specific dtype kwargs.
                dict(default_adam_kwargs, **default_anyprecision_kwargs),
            )
        )
    if is_torchao_available():
        import torchao

        optim_test_params.append(
            (
                OptimizerNames.ADAMW_TORCH_4BIT,
                torchao.prototype.low_bit_optim.AdamW4bit,
                default_adam_kwargs,
            )
        )
        optim_test_params.append(
            (
                # NOTE(review): unlike every other entry, this passes a full
                # TrainingArguments (with output_dir="None") instead of a bare
                # OptimizerNames member — confirm test_optim_supported handles
                # receiving a TrainingArguments as its `optim` argument.
                TrainingArguments(optim=OptimizerNames.ADAMW_TORCH_8BIT, output_dir="None"),
                torchao.prototype.low_bit_optim.AdamW8bit,
                default_adam_kwargs,
            )
        )
|
|
|
|
@require_torch
class TrainerOptimizerChoiceTest(unittest.TestCase):
    """Verify that each supported `optim` value resolves to the expected optimizer
    class and constructor kwargs — via mocked backends when the real package is
    absent, and via ValueError checks when the backend module cannot be imported.

    The original file repeated the same mock/patch boilerplate in ~25 tests; the
    bodies are now one-liners over two private helpers. It also fixes a typo in
    the torchdistx mock dict, whose key had a stray trailing period
    ("torchdistx.optimizers.AnyPrecisionAdamW.").
    """

    def check_optim_and_kwargs(self, training_args: TrainingArguments, expected_cls, expected_kwargs):
        """Assert that `training_args` maps to `expected_cls` and that every expected
        kwarg is present with the expected value (extra kwargs are tolerated)."""
        actual_cls, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args)
        self.assertEqual(expected_cls, actual_cls)
        self.assertIsNotNone(optim_kwargs)

        for p, v in expected_kwargs.items():
            self.assertTrue(p in optim_kwargs)
            actual_v = optim_kwargs[p]
            self.assertTrue(actual_v == v, f"Failed check for {p}. Expected {v}, but got {actual_v}.")

    def _check_mocked_optim(self, root, submodule, attr, optim, expected_kwargs):
        """Pretend `root.submodule.attr` is importable by patching sys.modules with a
        Mock, then check that `optim` resolves to the mocked class. This lets the
        mapping be tested without the real backend (apex/bnb/torchdistx) installed."""
        mock = Mock()
        sub = getattr(mock, submodule)
        optimizer_cls = getattr(sub, attr)
        modules = {
            root: mock,
            f"{root}.{submodule}": sub,
            f"{root}.{submodule}.{attr}": optimizer_cls,
        }
        with tempfile.TemporaryDirectory() as tmp_dir:
            with patch.dict("sys.modules", modules):
                self.check_optim_and_kwargs(
                    TrainingArguments(optim=optim, output_dir=tmp_dir),
                    optimizer_cls,
                    expected_kwargs,
                )

    def _check_optim_raises_without(self, missing_module, optim):
        """Force `missing_module` to be unimportable (sys.modules entry set to None
        makes `import missing_module` fail) and check that resolving `optim`
        raises ValueError."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(optim=optim, output_dir=tmp_dir)
            with patch.dict("sys.modules", {missing_module: None}):
                with self.assertRaises(ValueError):
                    Trainer.get_optimizer_cls_and_kwargs(args)

    @parameterized.expand(optim_test_params, skip_on_empty=True)
    def test_optim_supported(self, optim: str, expected_cls, expected_kwargs):
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(output_dir=tmp_dir, optim=optim)
            # Check the mapping, then do a short end-to-end train run with it.
            self.check_optim_and_kwargs(trainer.args, expected_cls, expected_kwargs)
            trainer.train()

    def test_fused_adam(self):
        self._check_mocked_optim(
            "apex", "optimizers", "FusedAdam", OptimizerNames.ADAMW_APEX_FUSED, default_adam_kwargs
        )

    def test_fused_adam_no_apex(self):
        self._check_optim_raises_without("apex.optimizers", OptimizerNames.ADAMW_APEX_FUSED)

    def test_bnb_adam8bit(self):
        self._check_mocked_optim("bitsandbytes", "optim", "AdamW", OptimizerNames.ADAMW_BNB, default_adam_kwargs)

    def test_bnb_paged_adam8bit_alias(self):
        self._check_mocked_optim("bitsandbytes", "optim", "AdamW", OptimizerNames.ADAMW_8BIT, default_adam_kwargs)

    def test_bnb_paged_adam(self):
        self._check_mocked_optim("bitsandbytes", "optim", "AdamW", OptimizerNames.PAGED_ADAMW, default_adam_kwargs)

    def test_bnb_paged_adam8bit(self):
        self._check_mocked_optim(
            "bitsandbytes", "optim", "AdamW", OptimizerNames.PAGED_ADAMW_8BIT, default_adam_kwargs
        )

    def test_bnb_ademamix(self):
        self._check_mocked_optim(
            "bitsandbytes", "optim", "AdEMAMix", OptimizerNames.ADEMAMIX, default_ademamix_kwargs
        )

    def test_bnb_ademamix8bit(self):
        self._check_mocked_optim(
            "bitsandbytes", "optim", "AdEMAMix", OptimizerNames.ADEMAMIX_8BIT, default_ademamix_kwargs
        )

    def test_bnb_paged_ademamix(self):
        self._check_mocked_optim(
            "bitsandbytes", "optim", "AdEMAMix", OptimizerNames.PAGED_ADEMAMIX, default_ademamix_kwargs
        )

    def test_bnb_paged_ademamix8bit(self):
        self._check_mocked_optim(
            "bitsandbytes", "optim", "AdEMAMix", OptimizerNames.PAGED_ADEMAMIX_8BIT, default_ademamix_kwargs
        )

    def test_bnb_lion(self):
        self._check_mocked_optim("bitsandbytes", "optim", "Lion", OptimizerNames.LION, default_lion_kwargs)

    def test_bnb_lion8bit(self):
        self._check_mocked_optim("bitsandbytes", "optim", "Lion", OptimizerNames.LION_8BIT, default_lion_kwargs)

    def test_bnb_paged_lion8bit(self):
        self._check_mocked_optim(
            "bitsandbytes", "optim", "Lion", OptimizerNames.PAGED_LION_8BIT, default_lion_kwargs
        )

    def test_bnb_paged_lion(self):
        self._check_mocked_optim("bitsandbytes", "optim", "Lion", OptimizerNames.PAGED_LION, default_lion_kwargs)

    def test_bnb_adam8bit_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.ADAMW_BNB)

    def test_bnb_paged_adam_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.PAGED_ADAMW)

    def test_bnb_paged_adam8bit_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.PAGED_ADAMW_8BIT)

    def test_bnb_ademamix_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.ADEMAMIX)

    def test_bnb_ademamix8bit_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.ADEMAMIX_8BIT)

    def test_bnb_paged_ademamix_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.PAGED_ADEMAMIX)

    def test_bnb_paged_ademamix8bit_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.PAGED_ADEMAMIX_8BIT)

    def test_bnb_paged_lion_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.PAGED_LION)

    def test_bnb_paged_lion8bit_no_bnb(self):
        self._check_optim_raises_without("bitsandbytes.optim", OptimizerNames.PAGED_LION_8BIT)

    def test_anyprecision_adamw(self):
        self._check_mocked_optim(
            "torchdistx",
            "optimizers",
            "AnyPrecisionAdamW",
            OptimizerNames.ADAMW_ANYPRECISION,
            dict(default_adam_kwargs, **default_anyprecision_kwargs),
        )

    def test_no_torchdistx_anyprecision_adamw(self):
        self._check_optim_raises_without("torchdistx.optimizers", OptimizerNames.ADAMW_ANYPRECISION)
|
|
|
|
@require_torch
@require_wandb
class TrainerHyperParameterWandbIntegrationTest(unittest.TestCase):
    """Smoke-test hyperparameter search with the wandb sweeps backend, including
    resuming an existing sweep."""

    def setUp(self):
        args = TrainingArguments("..")
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size

    def test_hyperparameter_search(self):
        def hp_space(trial):
            # wandb sweep configuration: random search over a and b.
            return {
                "method": "random",
                "metric": {},
                "parameters": {
                    "a": {"distribution": "uniform", "min": 1e-6, "max": 1e-4},
                    "b": {"distribution": "int_uniform", "min": 1, "max": 6},
                },
            }

        def model_init(config):
            # Build the regression model from a sweep config; zeros when none is given.
            if config is None:
                a = 0
                b = 0
            else:
                a = config["a"]
                b = config["b"]
            model_config = RegressionModelConfig(a=a, b=b, double_output=False)

            return RegressionPreTrainedModel(model_config)

        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                learning_rate=0.1,
                logging_steps=1,
                eval_strategy=IntervalStrategy.EPOCH,
                save_strategy=IntervalStrategy.EPOCH,
                num_train_epochs=4,
                disable_tqdm=True,
                load_best_model_at_end=True,
                logging_dir="runs",
                run_name="test",
                model_init=model_init,
            )
            sweep_kwargs = {
                "direction": "minimize",
                "hp_space": hp_space,
                "backend": "wandb",
                "n_trials": 4,
            }
            best_run = trainer.hyperparameter_search(**sweep_kwargs)

            self.assertIsNotNone(best_run.run_id)
            self.assertIsNotNone(best_run.run_summary)
            hp_keys = set(best_run.hyperparameters.keys())
            self.assertSetEqual(hp_keys, {"a", "b", "assignments", "metric"})

            # Drop any wandb env vars before resuming, so the resume path cannot
            # rely on them. Fix: the original used `del os.environ[...]`, which
            # raises KeyError when the variables were never set; pop(..., None)
            # removes them best-effort. (os is imported at module level, so the
            # redundant method-local `import os` was dropped.)
            os.environ.pop("WANDB_ENTITY", None)
            os.environ.pop("WANDB_PROJECT", None)
            # NOTE(review): run_summary is reused as the sweep_id here — confirm
            # the wandb backend stores the sweep id in run_summary.
            sweep_kwargs["sweep_id"] = best_run.run_summary
            updated_best_run = trainer.hyperparameter_search(**sweep_kwargs)

            self.assertIsNotNone(updated_best_run.run_id)
            self.assertEqual(updated_best_run.run_summary, best_run.run_summary)
            updated_hp_keys = set(updated_best_run.hyperparameters.keys())
            self.assertSetEqual(updated_hp_keys, {"a", "b", "assignments", "metric"})
|
|
|
|
class HyperParameterSearchBackendsTest(unittest.TestCase):
    """The registry of search backends must cover the HPSearchBackend enum exactly, in order."""

    def test_hyperparameter_search_backends(self):
        registered_backends = list(ALL_HYPERPARAMETER_SEARCH_BACKENDS.keys())
        expected_backends = list(HPSearchBackend)
        self.assertEqual(registered_backends, expected_backends)
|
|
|
|
| @require_torch |
| class OptimizerAndModelInspectionTest(unittest.TestCase): |
| def test_get_num_trainable_parameters(self): |
| model = nn.Sequential(nn.Linear(128, 64), nn.Linear(64, 32)) |
| |
| layer_1 = 128 * 64 + 64 |
| layer_2 = 64 * 32 + 32 |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) |
| self.assertEqual(trainer.get_num_trainable_parameters(), layer_1 + layer_2) |
| |
| for param in model[-1].parameters(): |
| param.requires_grad = False |
| self.assertEqual(trainer.get_num_trainable_parameters(), layer_1) |
|
|
| def test_get_learning_rates(self): |
| model = nn.Sequential(nn.Linear(128, 64)) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) |
| with self.assertRaises(ValueError): |
| trainer.get_learning_rates() |
| trainer.create_optimizer() |
| self.assertEqual(trainer.get_learning_rates(), [5e-05, 5e-05]) |
|
|
| def test_get_optimizer_group(self): |
| model = nn.Sequential(nn.Linear(128, 64)) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| trainer = Trainer(model=model, args=TrainingArguments(output_dir=tmp_dir, report_to="none")) |
| |
| with self.assertRaises(ValueError): |
| trainer.get_optimizer_group() |
| trainer.create_optimizer() |
| |
| num_groups = len(trainer.get_optimizer_group()) |
| self.assertEqual(num_groups, 2) |
| |
| param = next(model.parameters()) |
| group = trainer.get_optimizer_group(param) |
| self.assertIn(param, group["params"]) |
|
|
| @require_bitsandbytes |
| def test_bnb_8bit_optimizer_skip_embedding(self): |
| model = BasicTextGenerationModel(8, 4) |
| with tempfile.TemporaryDirectory() as tmp_dir: |
| for name_optim in ["rmsprop_bnb_8bit", "adamw_8bit"]: |
| args = TrainingArguments( |
| output_dir=tmp_dir, |
| report_to="none", |
| optim=name_optim, |
| ) |
| trainer = Trainer(model=model, args=args) |
| optimizer = trainer.create_optimizer() |
| modules = optimizer.mng.module_weight_config_triple |
| self.assertNotEqual(len(modules), 0) |
| module, name, config = modules[0] |
| self.assertIsInstance(module, torch.nn.Embedding) |
| self.assertEqual(name, "weight") |
| self.assertDictEqual(config, {"optim_bits": 32}) |
|
|