| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
import itertools
import os
import subprocess
import sys
from os.path import dirname
|
|
| from parameterized import parameterized |
|
|
| from tests.trainer.test_trainer import TrainerIntegrationCommon |
| from transformers import is_torch_available |
| from transformers.testing_utils import ( |
| TestCasePlus, |
| backend_device_count, |
| execute_subprocess_async, |
| get_tests_dir, |
| require_deepspeed, |
| require_torch_accelerator, |
| slow, |
| torch_device, |
| ) |
| from transformers.trainer_utils import set_seed |
|
|
|
|
| if is_torch_available(): |
| from tests.trainer.test_trainer import ( |
| RegressionModelConfig, |
| RegressionPreTrainedModel, |
| get_regression_trainer, |
| ) |
|
|
|
|
# make every test run deterministic
set_seed(42)

# fixtures shipped with the test suite (sample datasets, sample_text.txt, etc.)
FIXTURE_DIRECTORY = get_tests_dir("fixtures")
# repository root (parent of the tests dir) - used below to locate examples/pytorch scripts
ROOT_DIRECTORY = os.path.join(dirname(get_tests_dir()))
# directory this test file lives in - used to locate the vit feature-extractor json fixture
DS_TESTS_DIRECTORY = dirname(os.path.abspath(__file__))
|
|
| |
# default rendezvous port for the deepspeed launcher; can be overridden via the
# DS_TEST_PORT env var (see get_launcher)
DEFAULT_MASTER_PORT = "10999"

T5_SMALL = "google-t5/t5-small"

# Tiny model checkpoints - one per architecture - small enough to train a step quickly.
# Each <NAME>_TINY constant is looked up by name (globals()) in make_task_cmds below.
ALBERT_TINY = "hf-internal-testing/tiny-albert"
BART_TINY = "sshleifer/bart-tiny-random"
BERT_TINY = "hf-internal-testing/tiny-bert"
BIGBIRD_PEGASUS_TINY = "hf-internal-testing/tiny-random-bigbird_pegasus"
BIG_BIRD_TINY = "hf-internal-testing/tiny-random-big_bird"
BLENDERBOT_TINY = "hf-internal-testing/tiny-random-blenderbot"
BLOOM_TINY = "bigscience/bigscience-small-testing"
DEBERTA_TINY = "hf-internal-testing/tiny-random-deberta"
DEBERTA_V2_TINY = "hf-internal-testing/tiny-random-deberta-v2"
DISTILBERT_TINY = "sshleifer/tiny-distilbert-base-cased"
ELECTRA_TINY = "hf-internal-testing/tiny-electra"
FLAUBERT_TINY = "hf-internal-testing/tiny-random-flaubert"
FSMT_TINY = "stas/tiny-wmt19-en-de"
FUNNEL_TINY = "hf-internal-testing/tiny-random-funnel"
GPT2_TINY = "sshleifer/tiny-gpt2"
GPTJ_TINY = "hf-internal-testing/tiny-random-gptj"
GPT_NEO_TINY = "hf-internal-testing/tiny-random-gpt_neo"
LAYOUTLM_TINY = "hf-internal-testing/tiny-layoutlm"
LED_TINY = "hf-internal-testing/tiny-random-led"
LONGFORMER_TINY = "hf-internal-testing/tiny-random-longformer"
M2M_100_TINY = "stas/tiny-m2m_100"
MARIAN_TINY = "sshleifer/tiny-marian-en-de"
MBART_TINY = "sshleifer/tiny-mbart"
MOBILEBERT_TINY = "hf-internal-testing/tiny-random-mobilebert"
MPNET_TINY = "hf-internal-testing/tiny-random-mpnet"
PEGASUS_TINY = "stas/pegasus-cnn_dailymail-tiny-random"
PROPHETNET_TINY = "hf-internal-testing/tiny-random-prophetnet"
ROBERTA_TINY = "sshleifer/tiny-distilroberta-base"
SQUEEZEBERT_TINY = "hf-internal-testing/tiny-random-squeezebert"
T5_TINY = "patrickvonplaten/t5-tiny-random"
T5_V1_TINY = "hf-internal-testing/tiny-random-t5-v1.1"
VIT_TINY = "hf-internal-testing/tiny-random-vit"
XLM_ROBERTA_TINY = "hf-internal-testing/tiny-xlm-roberta"
XLNET_TINY = "sshleifer/tiny-xlnet-base-cased"


# NOTE(review): the checkpoints below are defined but not referenced by
# tasks2models in make_task_cmds, so they are currently unexercised -
# presumably kept for models that are not (yet) wired into a task; confirm.

MT5_TINY = "hf-internal-testing/tiny-random-mt5"
CAMEMBERT_TINY = "hf-internal-testing/tiny-random-camembert"
OPENAI_GPT_TINY = "hf-internal-testing/tiny-random-openai-gpt"

CONVBERT_TINY = "hf-internal-testing/tiny-random-convbert"
LAYOUTLMV2_TINY = "hf-internal-testing/tiny-random-layoutlmv2"
HUBERT_TINY = "hf-internal-testing/tiny-random-hubert"

CTRL_TINY = "hf-internal-testing/tiny-random-ctrl"
TRANSFO_XL_TINY = "hf-internal-testing/tiny-random-transfo-xl"

IBERT_TINY = "hf-internal-testing/tiny-random-ibert"
REFORMER_TINY = "hf-internal-testing/tiny-random-reformer"

DPR_TINY = "hf-internal-testing/tiny-random-dpr"

RAG_TINY = "hf-internal-testing/tiny-random-rag"

# NOTE(review): intentionally empty? there is no tiny luke checkpoint assigned
LUKE_TINY = ""

LXMERT_TINY = "hf-internal-testing/tiny-random-lxmert"

CLIP_TINY = "hf-internal-testing/tiny-random-clip"

SPEECH_TO_TEXT_TINY = "hf-internal-testing/tiny-random-speech_to_text"

TAPAS_TINY = "hf-internal-testing/tiny-random-tapas"
| |
| |
|
|
|
|
| |
| |
|
|
|
|
def get_launcher(distributed=False):
    """Build the ``deepspeed`` launcher argv prefix.

    When ``distributed`` is True the run uses up to 2 accelerators (capped so the
    test behaves the same on larger hosts); otherwise a single device is used.
    The master port defaults to DEFAULT_MASTER_PORT and can be overridden with
    the DS_TEST_PORT env var.
    """
    if distributed:
        num_gpus = min(2, backend_device_count(torch_device))
    else:
        num_gpus = 1
    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    return [
        "deepspeed",
        "--num_nodes",
        "1",
        "--num_gpus",
        f"{num_gpus}",
        "--master_port",
        f"{master_port}",
    ]
|
|
|
|
def make_task_cmds():
    """Build the full training command for every (task, model) combination.

    Returns a dict mapping ``"<task>_<model>"`` to a list of argv tokens that
    run the matching examples/pytorch script via the deepspeed launcher on a
    tiny checkpoint with a minimal training config (4 samples, 1 epoch, fp16).
    """
    data_dir_samples = f"{FIXTURE_DIRECTORY}/tests_samples"
    data_dir_wmt = f"{data_dir_samples}/wmt_en_ro"
    data_dir_xsum = f"{data_dir_samples}/xsum"
    # arguments shared by every task: a tiny, fast, throw-away training run
    args_main = """
        --do_train
        --max_train_samples 4
        --per_device_train_batch_size 2
        --num_train_epochs 1
        --fp16
        --report_to none
        --overwrite_output_dir
        """.split()

    # task -> model types to exercise with that task's example script; each
    # entry must have a matching <NAME>_TINY constant at module level (the
    # name is upper-cased and '-' becomes '_' for the lookup below)
    tasks2models = {
        "trans": [
            "bart",
            "fsmt",
            "m2m_100",
            "marian",
            "mbart",
            "t5",
            "t5_v1",
        ],
        "sum": [
            "pegasus",
        ],
        "clm": [
            "big_bird",
            "bigbird_pegasus",
            "blenderbot",
            "bloom",
            "gpt2",
            "gpt_neo",
            "gptj",
            "xlm-roberta",
            "prophetnet",
        ],
        "mlm": [
            "albert",
            "deberta",
            "deberta-v2",
            "distilbert",
            "electra",
            "flaubert",
            "funnel",
            "layoutlm",
        ],
        "qa": [
            "led",
            "longformer",
            "mobilebert",
            "mpnet",
            "roberta",
            "squeezebert",
        ],
        "clas": [
            "bert",
            "xlnet",
        ],
        "img_clas": [
            "vit",
        ],
    }

    scripts_dir = f"{ROOT_DIRECTORY}/examples/pytorch"

    # task -> example script plus its task-specific arguments (whitespace is
    # irrelevant inside these strings - they get .split() below)
    tasks = {
        "trans": f"""
        {scripts_dir}/translation/run_translation.py
        --train_file {data_dir_wmt}/train.json
        --source_lang en
        --target_lang ro
        --max_source_length 12
        --max_target_length 12
        """,
        "sum": f"""
        {scripts_dir}/summarization/run_summarization.py
        --train_file {data_dir_xsum}/sample.json
        --max_source_length 12
        --max_target_length 12
        --lang en
        """,
        "clm": f"""
        {scripts_dir}/language-modeling/run_clm.py
        --train_file {FIXTURE_DIRECTORY}/sample_text.txt
        --block_size 8
        """,
        "mlm": f"""
        {scripts_dir}/language-modeling/run_mlm.py
        --train_file {FIXTURE_DIRECTORY}/sample_text.txt
        """,
        "qa": f"""
        {scripts_dir}/question-answering/run_qa.py
        --train_file {data_dir_samples}/SQUAD/sample.json
        """,
        "clas": f"""
        {scripts_dir}/text-classification/run_glue.py
        --train_file {data_dir_samples}/MRPC/train.csv
        --max_seq_length 12
        --task_name MRPC
        """,
        "img_clas": f"""
        {scripts_dir}/image-classification/run_image_classification.py
        --dataset_name hf-internal-testing/cats_vs_dogs_sample
        --trust_remote_code
        --remove_unused_columns False
        --max_steps 10
        --image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json
        --label_column_name labels
        """,
    }

    launcher = get_launcher(distributed=True)

    cmds = {}
    for task, args in tasks.items():
        args = args.split()
        for model in tasks2models[task]:
            # e.g. "deberta-v2" -> module-level DEBERTA_V2_TINY
            model_name = globals()[f"{model.upper().replace('-', '_')}_TINY"]
            args_model = f"--model_name_or_path {model_name}".split()
            cmds[f"{task}_{model}"] = launcher + args + args_model + args_main

    return cmds
|
|
|
|
# build the full launcher command for every (task, model) pair once at import time
task_cmds = make_task_cmds()

# deepspeed ZeRO optimization stages under test; each name must have a matching
# ds_config_<stage>.json file next to this test file (see get_task_cmd)
ZERO2 = "zero2"
ZERO3 = "zero3"

stages = [ZERO2, ZERO3]
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
def parameterized_custom_name_func(func, param_num, param):
    """Build a readable sub-test name that embeds every parameter value.

    Used as ``name_func`` for ``parameterized.expand`` so each generated test is
    named ``<test_name>_<stage>_<task_model>`` instead of a bare index.
    """
    joined = "_".join(map(str, param.args))
    safe_suffix = parameterized.to_safe_name(joined)
    return f"{func.__name__}_{safe_suffix}"
|
|
|
|
| |
| params = list(itertools.product(stages, task_cmds.keys())) |
|
|
|
|
@slow
@require_deepspeed
@require_torch_accelerator
class TestDeepSpeedModelZoo(TestCasePlus):
    """This class is for testing via an external script - can do multiple gpus"""

    def get_task_cmd(self, task, stage):
        """Return ``(cmd, output_dir)`` for running `task` under ZeRO `stage`.

        ``cmd`` is a fresh argv list: the task's base command extended with the
        ``ds_config_<stage>.json`` config (expected next to this test file) and
        an auto-removed temp output dir.

        Raises ValueError for an unknown task.
        """
        if task not in task_cmds:
            raise ValueError(f"don't know of task {task}, have {task_cmds.keys()}")

        # copy: `cmd += ...` on the shared list would mutate the module-level
        # task_cmds entry, so repeated sub-tests for the same task (zero2 then
        # zero3) would accumulate duplicate --deepspeed/--output_dir/--save_steps
        # arguments across parameterized runs
        cmd = list(task_cmds[task])
        args_ds = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()

        output_dir = self.get_auto_remove_tmp_dir()
        args_out = f"--output_dir {output_dir}".split()

        cmd += args_ds + args_out

        return cmd, output_dir

    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    def test_zero_to_fp32(self, stage, task):
        """Train one step, then reconsolidate fp32 weights from the ZeRO checkpoint."""
        cmd, output_dir = self.get_task_cmd(task, stage)

        # 1. run training and save a checkpoint after the first step
        cmd += "--save_steps 1".split()
        execute_subprocess_async(cmd, env=self.get_env())

        # 2. deepspeed drops a zero_to_fp32.py helper into each checkpoint dir;
        # run it and verify it produces the consolidated fp32 model file.
        chkpt_dir = f"{output_dir}/checkpoint-1"
        recovered_model_path = f"{chkpt_dir}/out.bin"
        # argv list + current interpreter instead of a shell=True string: tmp-dir
        # paths with spaces/metacharacters can't break the command, and we don't
        # depend on the helper script's exec bit/shebang
        subprocess.check_call(
            [sys.executable, f"{chkpt_dir}/zero_to_fp32.py", chkpt_dir, recovered_model_path]
        )
        assert os.path.exists(recovered_model_path), f"{recovered_model_path} was not found"
|
|
| |
| |
|
|