name: flexitok_llama
dump_dir: /scratch/gsa/flexitok/init_models
seed: 777
grad_acc_steps: 8
gc_collect_freq: 1000
probe_freq: null
steps: 100000
data:
  root_dir: /scratch/craffel/lingua/data/flexitok/
  sources:
    fw_edu: 0.4
    dan_Latn: 0.0216582869670702
    swe_Latn: 0.0216359765418466
    vie_Latn: 0.0197485510268674
    hun_Latn: 0.0247194573562308
    fas_Arab: 0.0205634624231076
    tur_Latn: 0.0235455794841729
    ces_Latn: 0.0248024455266208
    arb_Arab: 0.0234323706569333
    ell_Grek: 0.0233670886888026
    ind_Latn: 0.0269322054593488
    nld_Latn: 0.0277796326621489
    pol_Latn: 0.0294120104572311
    por_Latn: 0.0301413168306825
    ita_Latn: 0.0324056371021865
    jpn_Jpan: 0.03553104151369
    fra_Latn: 0.0381835560678536
    spa_Latn: 0.0387222793083669
    deu_Latn: 0.0419925340453022
    cmn_Hani: 0.0454067521384114
    rus_Cyrl: 0.0500198157431261
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  tokenizer:
    name: huggingface
    path: meta-llama/Llama-3.2-1B
    tokenizers: null
    load_supermapping: false
    dropout: 0.0
    seed: 42
    superset_code_name: super_vocab
    n_words: null
    routing:
      source_to_tokenizer: {}
      task_to_tokenizer: {}
      suitable_tokenizer_probability: 1.0
optim:
  lr: 0.001
  weight_decay: 0.1
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
model:
  dim: 2048
  n_layers: 25
  head_dim: null
  n_heads: 16
  n_kv_heads: null
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  max_seqlen: 4096
  seed: 42
  vocab_size: 128256
  weight_tying: false
  sliding_window: null
  use_factorized_embeddings: false
  factorized_embedding_dim: 0
distributed:
  dp_shard: 1
  dp_replicate: 1
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  float8_filter: 'layers\.[0-9]+\.'
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
checkpoint:
  dump:
    every: 10000
    keep: -1
  eval:
    every: 10000
    keep: -1
  path: null
  init_ckpt_path: null
  load_init_optimizer_state: false
  save_init_ckpt: false
profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
logging:
  freq: 1
  acc_freq: null
  wandb: null
async_eval_gpus: 8
eval:
  harness:
    tasks:
      - hellaswag
      - xnli_vi
  generator:
    max_tokens: 16384
    dtype: bf16
    add_bos: false