| encoder: SenseVoiceEncoderSmall |
| encoder_conf: |
| output_size: 512 |
| attention_heads: 4 |
| linear_units: 2048 |
| num_blocks: 50 |
| tp_blocks: 20 |
| dropout_rate: 0.1 |
| positional_dropout_rate: 0.1 |
| attention_dropout_rate: 0.1 |
| input_layer: pe |
| pos_enc_class: SinusoidalPositionEncoder |
| normalize_before: true |
| kernel_size: 11 |
| sanm_shfit: 0 |
| selfattention_layer_type: sanm |
|
|
|
|
| model: SenseVoiceSmall |
| model_conf: |
| length_normalized_loss: true |
| sos: 1 |
| eos: 2 |
| ignore_id: -1 |
|
|
| tokenizer: SentencepiecesTokenizer |
| tokenizer_conf: |
| bpemodel: null |
| unk_symbol: <unk> |
| split_with_space: true |
|
|
| frontend: WavFrontend |
| frontend_conf: |
| fs: 16000 |
| window: hamming |
| n_mels: 80 |
| frame_length: 25 |
| frame_shift: 10 |
| lfr_m: 7 |
| lfr_n: 6 |
| cmvn_file: null |
|
|
|
|
| dataset: SenseVoiceCTCDataset |
| dataset_conf: |
| index_ds: IndexDSJsonl |
| batch_sampler: EspnetStyleBatchSampler |
| data_split_num: 32 |
| batch_type: token |
| batch_size: 14000 |
| max_token_length: 2000 |
| min_token_length: 60 |
| max_source_length: 2000 |
| min_source_length: 60 |
| max_target_length: 200 |
| min_target_length: 0 |
| shuffle: true |
| num_workers: 4 |
| sos: ${model_conf.sos} |
| eos: ${model_conf.eos} |
| IndexDSJsonl: IndexDSJsonl |
| retry: 20 |
|
|
| train_conf: |
| accum_grad: 1 |
| grad_clip: 5 |
| max_epoch: 20 |
| keep_nbest_models: 10 |
| avg_nbest_model: 10 |
| log_interval: 100 |
| resume: true |
| validate_interval: 10000 |
| save_checkpoint_interval: 10000 |
|
|
| optim: adamw |
| optim_conf: |
| lr: 0.00002 |
| scheduler: warmuplr |
| scheduler_conf: |
| warmup_steps: 25000 |
|
|
| specaug: SpecAugLFR |
| specaug_conf: |
| apply_time_warp: false |
| time_warp_window: 5 |
| time_warp_mode: bicubic |
| apply_freq_mask: true |
| freq_mask_width_range: |
| - 0 |
| - 30 |
| lfr_rate: 6 |
| num_freq_mask: 1 |
| apply_time_mask: true |
| time_mask_width_range: |
| - 0 |
| - 12 |
| num_time_mask: 1 |
|
|