{
"name": "flexitok_llama",
"dump_dir": "/scratch/gsa/flexitok/init_models",
"seed": 777,
"grad_acc_steps": 8,
"gc_collect_freq": 1000,
"probe_freq": null,
"steps": 100000,
"data": {
"root_dir": "/scratch/craffel/lingua/data/flexitok/",
"sources": {
"fw_edu": 0.4,
"dan_Latn": 0.0216582869670702,
"swe_Latn": 0.0216359765418466,
"vie_Latn": 0.0197485510268674,
"hun_Latn": 0.0247194573562308,
"fas_Arab": 0.0205634624231076,
"tur_Latn": 0.0235455794841729,
"ces_Latn": 0.0248024455266208,
"arb_Arab": 0.0234323706569333,
"ell_Grek": 0.0233670886888026,
"ind_Latn": 0.0269322054593488,
"nld_Latn": 0.0277796326621489,
"pol_Latn": 0.0294120104572311,
"por_Latn": 0.0301413168306825,
"ita_Latn": 0.0324056371021865,
"jpn_Jpan": 0.03553104151369,
"fra_Latn": 0.0381835560678536,
"spa_Latn": 0.0387222793083669,
"deu_Latn": 0.0419925340453022,
"cmn_Hani": 0.0454067521384114,
"rus_Cyrl": 0.0500198157431261
},
"batch_size": 4,
"seq_len": 4096,
"n_views": 2,
"seed": 42,
"add_bos": true,
"add_eos": true,
"load_async": true,
"prefetch_size": 1024,
"tokenizer": {
"name": "huggingface",
"path": "meta-llama/Llama-3.2-1B",
"tokenizers": null,
"load_supermapping": false,
"dropout": 0.0,
"seed": 42,
"superset_code_name": "super_vocab",
"n_words": null
},
"routing": {
"source_to_tokenizer": {},
"task_to_tokenizer": {},
"suitable_tokenizer_probability": 1.0
}
},
"optim": {
"lr": 0.001,
"weight_decay": 0.1,
"epsilon": 1e-08,
"beta1": 0.9,
"beta2": 0.95,
"clip": 1.0,
"scheduler": "cosine",
"warmup": 2000,
"lr_min_ratio": 1e-06,
"cycle_length": 1.0,
"cosine_theta": 1.0,
"annealing_step": 1000,
"decay_fraction": 0.1,
"exp_factor": 0.5
},
"model": {
"dim": 2048,
"n_layers": 25,
"head_dim": null,
"n_heads": 16,
"n_kv_heads": null,
"ffn_dim_multiplier": null,
"multiple_of": 256,
"norm_eps": 1e-05,
"rope_theta": 10000.0,
"init_base_std": null,
"init_std_factor": "disabled",
"max_seqlen": 4096,
"seed": 42,
"vocab_size": 128256,
"weight_tying": false,
"sliding_window": null,
"use_factorized_embeddings": false,
"factorized_embedding_dim": 0
},
"distributed": {
"dp_shard": 1,
"dp_replicate": 1,
"tp_size": 1,
"selective_activation_checkpointing": false,
"compile": true,
"fsdp_type": "full_shard",
"model_dtype": "bf16",
"float8_recipe": null,
"float8_filter": "layers\\.[0-9]+\\.",
"matmul_allow_tf32": false,
"detect_anomaly": false,
"compile_cache_size_limit": 8,
"spawn_method": "forkserver"
},
"env": {
"MKL_SERVICE_FORCE_INTEL": "GNU",
"OMP_NUM_THREADS": "1",
"MKL_NUM_THREADS": "1",
"ENABLE_INTRA_NODE_COMM": "1",
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
"NCCL_IB_TIMEOUT": "22",
"NCCL_DEBUG": "INFO",
"TORCH_NCCL_ASYNC_ERROR_HANDLING": "1"
},
"checkpoint": {
"dump": {
"every": 10000,
"keep": -1
},
"eval": {
"every": 10000,
"keep": -1
},
"path": null,
"init_ckpt_path": null,
"load_init_optimizer_state": false,
"save_init_ckpt": false
},
"profiling": {
"run": true,
"trace_folder": "profiling",
"mem_warmup": 0,
"mem_steps": 4,
"profile_warmup": 100,
"profile_steps": 4
},
"logging": {
"freq": 1,
"acc_freq": null,
"wandb": null
},
"async_eval_gpus": 8,
"eval": {
"harness": {
"tasks": [
"hellaswag",
"xnli_vi"
]
},
"generator": {
"max_tokens": 16384,
"dtype": "bf16",
"add_bos": false
}
}
}