| from __gin__ import dynamic_registration |
| import cached_conv as cc |
| from cached_conv import convs |
| import rave |
| from rave import blocks |
| from rave import core |
| from rave import dataset |
| from rave import descript_discriminator |
| from rave import discriminator |
| from rave import model |
| from rave import pqmf |
| import torch |
| import torch.nn as nn |
|
|
| # Macros: |
| # ============================================================================== |
| # Shared values; the bindings below pull them in with the %NAME syntax so a |
| # single edit here propagates to every user. |
| # ACTIVATION stores a reference to the configurable itself (@ without |
| # trailing parentheses), not the result of calling it. |
| ACTIVATION = @blocks.Snake |
| CAPACITY = 96 |
| # DILATIONS and RATIOS both have four entries -- presumably one dilation list |
| # per resampling stage; confirm against blocks.EncoderV2 / blocks.GeneratorV2. |
| DILATIONS = [[1, 3, 9], [1, 3, 9], [1, 3, 9], [1, 3]] |
| KERNEL_SIZE = 3 |
| LATENT_SIZE = 128 |
| # Used for pqmf.CachedPQMF.n_band and for the data_size of the encoder, |
| # generator and noise generator. |
| N_BAND = 16 |
| # Passed, together with LATENT_SIZE, to core.get_augmented_latent_size. |
| NOISE_AUGMENTATION = 0 |
| PHASE_1_DURATION = 50000 |
| RATIOS = [4, 4, 4, 2] |
| # Used by core.MultiScaleSTFT.sample_rate and model.RAVE.sampling_rate. |
| SAMPLING_RATE = 48000 |
|
|
| # Parameters for blocks.AdaptiveInstanceNormalization: |
| # ============================================================================== |
| # None. |
|
|
| # Parameters for variational/blocks.AdaptiveInstanceNormalization: |
| # ============================================================================== |
| # None. |
|
|
| # Parameters for core.AudioDistanceV1: |
| # ============================================================================== |
| # This configurable is referenced twice by model.RAVE: as audio_distance and |
| # as multiband_audio_distance, so both share the bindings below. |
| core.AudioDistanceV1.log_epsilon = 1e-07 |
| # Bound as a reference (no trailing parentheses): the callee receives the |
| # core.MultiScaleSTFT configurable rather than an instance created here. |
| core.AudioDistanceV1.multiscale_stft = @core.MultiScaleSTFT |
|
|
| # Parameters for model.BetaWarmupCallback: |
| # ============================================================================== |
| # No other binding in this file references this callback; it is presumably |
| # instantiated by the training script -- confirm against the caller. |
| # NOTE(review): the names suggest a ramp from initial_value to target_value |
| # over warmup_len (units presumably training steps), with log = True selecting |
| # a logarithmic schedule -- verify in model.BetaWarmupCallback. |
| model.BetaWarmupCallback.initial_value = 1e-06 |
| model.BetaWarmupCallback.log = True |
| model.BetaWarmupCallback.target_value = 0.002 |
| model.BetaWarmupCallback.warmup_len = 20000 |
|
|
| # Parameters for pqmf.CachedPQMF: |
| # ============================================================================== |
| # Referenced by model.RAVE.pqmf. |
| # NOTE(review): attenuation is presumably expressed in dB -- confirm in |
| # pqmf.CachedPQMF. |
| pqmf.CachedPQMF.attenuation = 100 |
| # Band count comes from the N_BAND macro, the same value used for the |
| # encoder/generator data_size. |
| pqmf.CachedPQMF.n_band = %N_BAND |
|
|
| # Parameters for cc.Conv1d: |
| # ============================================================================== |
| # Bias is disabled for every convolution configured in this file (plain, |
| # scoped and transposed variants all bind bias = False). |
| cc.Conv1d.bias = False |
|
|
| # Parameters for variational/cc.Conv1d: |
| # ============================================================================== |
| # Scoped binding: applies to cc.Conv1d calls made while the "variational" gin |
| # scope is active. It repeats the unscoped value above. |
| variational/cc.Conv1d.bias = False |
|
|
| # Parameters for cc.ConvTranspose1d: |
| # ============================================================================== |
| cc.ConvTranspose1d.bias = False |
|
|
| # Parameters for descript_discriminator.DescriptDiscriminator: |
| # ============================================================================== |
| # Referenced by model.RAVE.discriminator. |
| # bands lists five contiguous (low, high) pairs that together span 0.0 to 1.0 |
| # -- presumably fractions of the frequency axis; confirm in the discriminator. |
| descript_discriminator.DescriptDiscriminator.bands = \ |
| [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)] |
| descript_discriminator.DescriptDiscriminator.fft_sizes = [2048, 1024, 512] |
| descript_discriminator.DescriptDiscriminator.periods = [2, 3, 5, 7, 11] |
| # Empty list: no entries are configured for rates. |
| descript_discriminator.DescriptDiscriminator.rates = [] |
| # NOTE(review): this literal (44100) differs from the SAMPLING_RATE macro |
| # (48000) used by model.RAVE and core.MultiScaleSTFT. Confirm whether the |
| # discriminator should use %SAMPLING_RATE instead. |
| descript_discriminator.DescriptDiscriminator.sample_rate = 44100 |
|
|
| # Parameters for variational/blocks.EncoderV2: |
| # ============================================================================== |
| # Scoped bindings: they apply to blocks.EncoderV2 when it is reached through |
| # the "variational" scope, which is how blocks.VariationalEncoder.encoder |
| # references it (@variational/blocks.EncoderV2). |
| # Architecture hyper-parameters are taken from the macros at the top of the |
| # file so the encoder stays in step with blocks.GeneratorV2. |
| variational/blocks.EncoderV2.activation = %ACTIVATION |
| variational/blocks.EncoderV2.adain = @blocks.AdaptiveInstanceNormalization |
| variational/blocks.EncoderV2.capacity = %CAPACITY |
| variational/blocks.EncoderV2.data_size = %N_BAND |
| variational/blocks.EncoderV2.dilations = %DILATIONS |
| variational/blocks.EncoderV2.keep_dim = False |
| variational/blocks.EncoderV2.kernel_size = %KERNEL_SIZE |
| variational/blocks.EncoderV2.latent_size = %LATENT_SIZE |
| # NOTE(review): n_out = 2 presumably yields two outputs per latent dimension |
| # (e.g. mean and scale) for the variational wrapper -- confirm in EncoderV2. |
| variational/blocks.EncoderV2.n_out = 2 |
| variational/blocks.EncoderV2.ratios = %RATIOS |
| variational/blocks.EncoderV2.recurrent_layer = None |
| variational/blocks.EncoderV2.spectrogram = None |
|
|
| # Parameters for blocks.GeneratorV2: |
| # ============================================================================== |
| # Referenced by model.RAVE.decoder. Shares ACTIVATION, CAPACITY, DILATIONS, |
| # KERNEL_SIZE, N_BAND and RATIOS with variational/blocks.EncoderV2 through the |
| # macros at the top of the file. |
| blocks.GeneratorV2.activation = %ACTIVATION |
| blocks.GeneratorV2.adain = @blocks.AdaptiveInstanceNormalization |
| blocks.GeneratorV2.amplitude_modulation = True |
| blocks.GeneratorV2.capacity = %CAPACITY |
| blocks.GeneratorV2.causal_convtranspose = True |
| blocks.GeneratorV2.data_size = %N_BAND |
| blocks.GeneratorV2.dilations = %DILATIONS |
| blocks.GeneratorV2.keep_dim = False |
| blocks.GeneratorV2.kernel_size = %KERNEL_SIZE |
| # Evaluated reference (trailing parentheses): core.get_augmented_latent_size |
| # is called and its return value is what gets bound, using the LATENT_SIZE and |
| # NOISE_AUGMENTATION macros as its arguments. |
| blocks.GeneratorV2.latent_size = @core.get_augmented_latent_size() |
| # Plain reference: the generator receives the NoiseGeneratorV2 configurable. |
| blocks.GeneratorV2.noise_module = @blocks.NoiseGeneratorV2 |
| blocks.GeneratorV2.ratios = %RATIOS |
| blocks.GeneratorV2.recurrent_layer = None |
|
|
| # Parameters for core.get_augmented_latent_size: |
| # ============================================================================== |
| # Arguments for the helper that blocks.GeneratorV2.latent_size evaluates. |
| # NOTE(review): with NOISE_AUGMENTATION = 0 the result is presumably just |
| # LATENT_SIZE -- confirm in core.get_augmented_latent_size. |
| core.get_augmented_latent_size.latent_size = %LATENT_SIZE |
| core.get_augmented_latent_size.noise_augmentation = %NOISE_AUGMENTATION |
|
|
| # Parameters for dataset.get_dataset: |
| # ============================================================================== |
| # Empty list: no augmentations are configured. |
| dataset.get_dataset.augmentations = [] |
|
|
| # Parameters for convs.get_padding: |
| # ============================================================================== |
| # Padding arguments bound for convs.get_padding; mode is 'causal' in both the |
| # unscoped and the "variational"-scoped bindings. |
| convs.get_padding.dilation = 1 |
| convs.get_padding.mode = 'causal' |
| convs.get_padding.stride = 1 |
|
|
| # Parameters for variational/convs.get_padding: |
| # ============================================================================== |
| # Scoped copy used while the "variational" gin scope is active; the values |
| # are identical to the unscoped bindings above. |
| variational/convs.get_padding.dilation = 1 |
| variational/convs.get_padding.mode = 'causal' |
| variational/convs.get_padding.stride = 1 |
|
|
| # Parameters for core.MultiScaleSTFT: |
| # ============================================================================== |
| # Referenced by core.AudioDistanceV1.multiscale_stft. |
| core.MultiScaleSTFT.magnitude = True |
| core.MultiScaleSTFT.normalized = False |
| # NOTE(review): None presumably disables mel projection -- confirm in |
| # core.MultiScaleSTFT. |
| core.MultiScaleSTFT.num_mels = None |
| core.MultiScaleSTFT.random_crop = False |
| # Follows the SAMPLING_RATE macro, the same value bound to |
| # model.RAVE.sampling_rate. |
| core.MultiScaleSTFT.sample_rate = %SAMPLING_RATE |
| # Five scales, each half the previous one (2048 down to 128). |
| core.MultiScaleSTFT.scales = [2048, 1024, 512, 256, 128] |
|
|
| # Parameters for blocks.NoiseGeneratorV2: |
| # ============================================================================== |
| # Referenced by blocks.GeneratorV2.noise_module. |
| blocks.NoiseGeneratorV2.activation = %ACTIVATION |
| blocks.NoiseGeneratorV2.data_size = %N_BAND |
| # Literal value; it equals LATENT_SIZE today but is not tied to that macro. |
| blocks.NoiseGeneratorV2.hidden_size = 128 |
| blocks.NoiseGeneratorV2.noise_bands = 5 |
| # Literal list, independent of the RATIOS macro used by encoder and generator. |
| blocks.NoiseGeneratorV2.ratios = [2, 2, 2] |
|
|
| # Parameters for blocks.normalization: |
| # ============================================================================== |
| # The same mode string is bound for both the unscoped and the |
| # "variational"-scoped calls to blocks.normalization. |
| blocks.normalization.mode = 'weight_norm' |
|
|
| # Parameters for variational/blocks.normalization: |
| # ============================================================================== |
| # Scoped copy used while the "variational" gin scope is active; it matches |
| # the unscoped binding above. |
| variational/blocks.normalization.mode = 'weight_norm' |
|
|
| # Parameters for model.RAVE: |
| # ============================================================================== |
| # Top-level model bindings. Values written as @name hand over the configurable |
| # itself; values written as %NAME resolve to the macros at the top of the file. |
| # Both distance slots point at the same core.AudioDistanceV1 configurable. |
| model.RAVE.audio_distance = @core.AudioDistanceV1 |
| model.RAVE.audio_monitor_epochs = 1 |
| model.RAVE.balancer = None |
| model.RAVE.decoder = @blocks.GeneratorV2 |
| model.RAVE.discriminator = @descript_discriminator.DescriptDiscriminator |
| model.RAVE.enable_pqmf_decode = None |
| model.RAVE.enable_pqmf_encode = None |
| model.RAVE.encoder = @blocks.VariationalEncoder |
| # Reference taken under the "feature_matching" scope. No bindings for |
| # core.mean_difference (scoped or unscoped) are visible in this file. |
| model.RAVE.feature_matching_fun = @feature_matching/core.mean_difference |
| # NOTE(review): freeze_encoder is bound to True unconditionally here; how it |
| # interacts with phase_1_duration is decided inside model.RAVE -- confirm. |
| model.RAVE.freeze_encoder = True |
| model.RAVE.gan_loss = @core.hinge_gan |
| model.RAVE.input_mode = 'pqmf' |
| model.RAVE.is_mel_input = None |
| model.RAVE.latent_size = %LATENT_SIZE |
| model.RAVE.loss_weights = None |
| model.RAVE.multiband_audio_distance = @core.AudioDistanceV1 |
| # Tied to the N_BAND macro (value 16) so it cannot drift from |
| # pqmf.CachedPQMF.n_band and the encoder/generator data_size, which all use |
| # %N_BAND. |
| model.RAVE.n_bands = %N_BAND |
| model.RAVE.n_channels = 1 |
| model.RAVE.num_skipped_features = 1 |
| model.RAVE.output_mode = 'pqmf' |
| model.RAVE.phase_1_duration = %PHASE_1_DURATION |
| model.RAVE.pqmf = @pqmf.CachedPQMF |
| model.RAVE.sampling_rate = %SAMPLING_RATE |
| model.RAVE.spectrogram = None |
| model.RAVE.update_discriminator_every = 4 |
| model.RAVE.valid_signal_crop = True |
| model.RAVE.warmup_quantize = None |
| # Only the feature_matching term is given an explicit weight here. |
| model.RAVE.weights = {'feature_matching': 20} |
|
|
| # Parameters for blocks.Snake: |
| # ============================================================================== |
| # None. |
|
|
| # Parameters for variational/blocks.Snake: |
| # ============================================================================== |
| # None. |
|
|
| # Parameters for dataset.split_dataset: |
| # ============================================================================== |
| # NOTE(review): max_residual presumably caps the size of the held-out split |
| # -- confirm in dataset.split_dataset. |
| dataset.split_dataset.max_residual = 1000 |
|
|
| # Parameters for blocks.VariationalEncoder: |
| # ============================================================================== |
| # Referenced by model.RAVE.encoder. |
| blocks.VariationalEncoder.beta = 1.0 |
| # Scoped reference: the wrapped encoder is blocks.EncoderV2 under the |
| # "variational" scope, so the variational/... bindings in this file apply |
| # to it. |
| blocks.VariationalEncoder.encoder = @variational/blocks.EncoderV2 |
|
|