{
  "name": "flexitok_llama",
  "dump_dir": "/scratch/gsa/flexitok/init_models",
  "seed": 777,
  "grad_acc_steps": 8,
  "gc_collect_freq": 1000,
  "probe_freq": null,
  "steps": 100000,
  "data": {
    "root_dir": "/scratch/craffel/lingua/data/flexitok/",
    "sources": {
      "fw_edu": 0.4,
      "dan_Latn": 0.0216582869670702,
      "swe_Latn": 0.0216359765418466,
      "vie_Latn": 0.0197485510268674,
      "hun_Latn": 0.0247194573562308,
      "fas_Arab": 0.0205634624231076,
      "tur_Latn": 0.0235455794841729,
      "ces_Latn": 0.0248024455266208,
      "arb_Arab": 0.0234323706569333,
      "ell_Grek": 0.0233670886888026,
      "ind_Latn": 0.0269322054593488,
      "nld_Latn": 0.0277796326621489,
      "pol_Latn": 0.0294120104572311,
      "por_Latn": 0.0301413168306825,
      "ita_Latn": 0.0324056371021865,
      "jpn_Jpan": 0.03553104151369,
      "fra_Latn": 0.0381835560678536,
      "spa_Latn": 0.0387222793083669,
      "deu_Latn": 0.0419925340453022,
      "cmn_Hani": 0.0454067521384114,
      "rus_Cyrl": 0.0500198157431261
    },
    "batch_size": 4,
    "seq_len": 4096,
    "n_views": 2,
    "seed": 42,
    "add_bos": true,
    "add_eos": true,
    "load_async": true,
    "prefetch_size": 1024,
    "tokenizer": {
      "name": "huggingface",
      "path": "meta-llama/Llama-3.2-1B",
      "tokenizers": null,
      "load_supermapping": false,
      "dropout": 0.0,
      "seed": 42,
      "superset_code_name": "super_vocab",
      "n_words": null
    },
    "routing": {
      "source_to_tokenizer": {},
      "task_to_tokenizer": {},
      "suitable_tokenizer_probability": 1.0
    }
  },
  "optim": {
    "lr": 0.001,
    "weight_decay": 0.1,
    "epsilon": 1e-08,
    "beta1": 0.9,
    "beta2": 0.95,
    "clip": 1.0,
    "scheduler": "cosine",
    "warmup": 2000,
    "lr_min_ratio": 1e-06,
    "cycle_length": 1.0,
    "cosine_theta": 1.0,
    "annealing_step": 1000,
    "decay_fraction": 0.1,
    "exp_factor": 0.5
  },
  "model": {
    "dim": 2048,
    "n_layers": 25,
    "head_dim": null,
    "n_heads": 16,
    "n_kv_heads": null,
    "ffn_dim_multiplier": null,
    "multiple_of": 256,
    "norm_eps": 1e-05,
    "rope_theta": 10000.0,
    "init_base_std": null,
    "init_std_factor": "disabled",
    "max_seqlen": 4096,
    "seed": 42,
    "vocab_size": 128256,
    "weight_tying": false,
    "sliding_window": null,
    "use_factorized_embeddings": false,
    "factorized_embedding_dim": 0
  },
  "distributed": {
    "dp_shard": 1,
    "dp_replicate": 1,
    "tp_size": 1,
    "selective_activation_checkpointing": false,
    "compile": true,
    "fsdp_type": "full_shard",
    "model_dtype": "bf16",
    "float8_recipe": null,
    "float8_filter": "layers\\.[0-9]+\\.",
    "matmul_allow_tf32": false,
    "detect_anomaly": false,
    "compile_cache_size_limit": 8,
    "spawn_method": "forkserver"
  },
  "env": {
    "MKL_SERVICE_FORCE_INTEL": "GNU",
    "OMP_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "ENABLE_INTRA_NODE_COMM": "1",
    "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
    "NCCL_IB_TIMEOUT": "22",
    "NCCL_DEBUG": "INFO",
    "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
  },
  "checkpoint": {
    "dump": {
      "every": 10000,
      "keep": -1
    },
    "eval": {
      "every": 10000,
      "keep": -1
    },
    "path": null,
    "init_ckpt_path": null,
    "load_init_optimizer_state": false,
    "save_init_ckpt": false
  },
  "profiling": {
    "run": true,
    "trace_folder": "profiling",
    "mem_warmup": 0,
    "mem_steps": 4,
    "profile_warmup": 100,
    "profile_steps": 4
  },
  "logging": {
    "freq": 1,
    "acc_freq": null,
    "wandb": null
  },
  "async_eval_gpus": 8,
  "eval": {
    "harness": {
      "tasks": [
        "hellaswag",
        "xnli_vi"
      ]
    },
    "generator": {
      "max_tokens": 16384,
      "dtype": "bf16",
      "add_bos": false
    }
  }
}