szuweifu commited on
Commit
d78f08c
·
verified ·
1 Parent(s): 6380bc7

Upload 11 files

Browse files
exp/30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_peak_GAN_tel_mic/g_01134000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e27db9e1de904eb59fc627dea72c69da7ca25650a3e704b4096f89812b395fe5
3
+ size 38982886
inference.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import os
10
+ import argparse
11
+ import torch
12
+ import torchaudio
13
+ import librosa
14
+ from models.stfts import mag_phase_stft, mag_phase_istft
15
+ from models.generator_SEMamba_time_d4 import SEMamba
16
+ from utils.util import load_config, pad_or_trim_to_match
17
+
18
def get_filepaths(directory, file_type=None):
    """Recursively collect the paths of all files under *directory*.

    Args:
        directory: Root directory to walk.
        file_type: Optional extension WITHOUT the leading dot (e.g. "wav").
            When given, only files with exactly that extension are returned.

    Returns:
        List of full file paths (walk order, not sorted).
    """
    file_paths = []
    for root, _directories, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)
            # os.path.splitext is robust against dots in directory names and
            # extensionless files, unlike the naive filepath.split('.')[-1].
            if file_type is None or os.path.splitext(filename)[1] == '.' + file_type:
                file_paths.append(filepath)
    return file_paths
31
+
32
def make_even(value):
    """Round *value* to the nearest integer, bumping odd results up by one.

    Used to keep scaled STFT parameters (n_fft / hop / win) even.
    """
    rounded = int(round(value))
    if rounded % 2:
        rounded += 1
    return rounded
35
+
36
def inference(args, device):
    """Run single-pass speech enhancement over every file in args.input_folder.

    Enhanced audio is written to args.output_folder as FLAC, mirroring the
    input directory layout. Files that cannot be read are skipped with a
    warning. Requires a CUDA device (enforced by main()).

    Args:
        args: Parsed CLI namespace (input_folder, output_folder, config,
            checkpoint_file, BWE).
        device: torch.device to run the model on.
    """
    cfg = load_config(args.config)
    n_fft, hop_size, win_size = cfg['stft_cfg']['n_fft'], cfg['stft_cfg']['hop_size'], cfg['stft_cfg']['win_size']
    compress_factor = cfg['model_cfg']['compress_factor']
    sampling_rate = cfg['stft_cfg']['sampling_rate']

    SE_model = SEMamba(cfg).to(device)
    state_dict = torch.load(args.checkpoint_file, map_location=device)
    SE_model.load_state_dict(state_dict['generator'])
    SE_model.eval()

    os.makedirs(args.output_folder, exist_ok=True)
    with torch.no_grad():
        for fname in get_filepaths(args.input_folder):
            print(fname)
            try:
                # Mirror the input sub-directory layout under the output folder.
                # os.path.relpath/join replaces the fragile string concatenation
                # and '/'-rfind logic of the previous version.
                rel_path = os.path.relpath(fname, args.input_folder)
                out_dir = os.path.join(args.output_folder, os.path.dirname(rel_path))
                os.makedirs(out_dir, exist_ok=True)
                noisy_wav, noisy_sr = torchaudio.load(fname)
            except Exception as e:
                print(f"Warning: cannot read {fname}, skipping. ({e})")
                continue

            if args.BWE is not None:
                # Bandwidth extension: resample to the requested rate first.
                noisy_wav = librosa.resample(noisy_wav.cpu().numpy(), orig_sr=noisy_sr,
                                             target_sr=int(args.BWE), res_type="kaiser_best")
                noisy_sr = int(args.BWE)

            noisy_wav = torch.FloatTensor(noisy_wav).to(device)
            # Scale STFT parameters to the file's sample rate so the model
            # stays sampling-frequency independent; keep them even.
            n_fft_scaled = make_even(n_fft * noisy_sr // sampling_rate)
            hop_size_scaled = make_even(hop_size * noisy_sr // sampling_rate)
            win_size_scaled = make_even(win_size * noisy_sr // sampling_rate)

            noisy_mag, noisy_pha, noisy_com = mag_phase_stft(
                noisy_wav,
                n_fft=n_fft_scaled,
                hop_size=hop_size_scaled,
                win_size=win_size_scaled,
                compress_factor=compress_factor,
                center=True,
                addeps=False
            )
            amp_g, pha_g, _ = SE_model(noisy_mag, noisy_pha)

            audio_g = mag_phase_istft(amp_g, pha_g, n_fft_scaled, hop_size_scaled, win_size_scaled, compress_factor)
            audio_g = pad_or_trim_to_match(noisy_wav.detach(), audio_g, pad_value=1e-8)  # align lengths using epsilon padding
            assert audio_g.shape == noisy_wav.shape, audio_g.shape

            # Swap only the file's extension for .flac. splitext fixes the old
            # split('.')[0], which truncated paths containing a dot anywhere.
            base, _ext = os.path.splitext(rel_path)
            output_file = os.path.join(args.output_folder, base + '.flac')
            torchaudio.save(output_file, audio_g.cpu(), noisy_sr)
85
+
86
def main():
    """CLI entry point: parse arguments, pick the GPU device, run inference."""
    print('Initializing Inference Process...')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_folder')
    parser.add_argument('--output_folder')
    parser.add_argument('--config')
    parser.add_argument('--checkpoint_file', required=True)
    parser.add_argument('--BWE', default=None)
    args = parser.parse_args()

    global device
    # Guard clause: GPU is mandatory for this model.
    if not torch.cuda.is_available():
        raise RuntimeError("Currently, CPU mode is not supported.")
    device = torch.device('cuda')

    inference(args, device)


if __name__ == '__main__':
    main()
inference.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Single-pass enhancement: ./noisy_audio -> ./enhanced_audio (FLAC).
CUDA_VISIBLE_DEVICES='0' python ./inference.py \
    --input_folder ./noisy_audio \
    --output_folder ./enhanced_audio \
    --checkpoint_file ./exp/30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_peak_GAN_tel_mic/g_01134000.pth \
    --config ./recipes/USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml
# Optional bandwidth extension: add `--BWE 32000` as a final argument (and a
# trailing `\` to the --config line). The previous version left a dangling
# `\` pointing at this comment line, which is fragile if the comment moves.
inference_chunk.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import os
10
+ import argparse
11
+ import torch
12
+ import torchaudio
13
+ import librosa
14
+ import math
15
+ from models.stfts import mag_phase_stft, mag_phase_istft
16
+ from models.generator_SEMamba_time_d4 import SEMamba
17
+ from utils.util import load_config, pad_or_trim_to_match
18
+
19
+
20
def get_filepaths(directory, file_type=None):
    """Walk *directory* recursively and return the full paths of all files.

    When *file_type* is given, keep only paths whose text after the last dot
    equals it (e.g. "wav").
    """
    collected = []
    for root, _subdirs, names in os.walk(directory):
        for name in names:
            full_path = os.path.join(root, name)
            keep = file_type is None or full_path.split('.')[-1] == file_type
            if keep:
                collected.append(full_path)
    return collected
33
+
34
def make_even(value):
    """Nearest integer to *value*, rounded up to the next even number if odd."""
    as_int = int(round(value))
    return as_int + (as_int % 2)
37
+
38
def inference(args, device):
    """Chunk-wise speech enhancement for long recordings.

    Each file is split into overlapping chunks of args.chunk_size_in_seconds,
    enhanced independently, and recombined with Hann-weighted overlap-add.
    Results are written to args.output_folder as FLAC, mirroring the input
    directory layout.

    Args:
        args: Parsed CLI namespace (input_folder, output_folder, config,
            checkpoint_file, chunk_size_in_seconds, hop_length_portion, BWE).
        device: torch.device to run the model on.
    """
    cfg = load_config(args.config)
    n_fft, hop_size, win_size = cfg['stft_cfg']['n_fft'], cfg['stft_cfg']['hop_size'], cfg['stft_cfg']['win_size']
    compress_factor = cfg['model_cfg']['compress_factor']
    sampling_rate = cfg['stft_cfg']['sampling_rate']

    SE_model = SEMamba(cfg).to(device)
    state_dict = torch.load(args.checkpoint_file, map_location=device)
    SE_model.load_state_dict(state_dict['generator'])
    SE_model.eval()

    os.makedirs(args.output_folder, exist_ok=True)
    with torch.no_grad():
        for fname in get_filepaths(args.input_folder):
            print(fname)
            try:
                # Mirror the input sub-directory layout under the output folder;
                # relpath/join replaces the old fragile string concatenation.
                rel_path = os.path.relpath(fname, args.input_folder)
                os.makedirs(os.path.join(args.output_folder, os.path.dirname(rel_path)), exist_ok=True)
                Noisy_wav, noisy_sr = torchaudio.load(fname)
            except Exception as e:
                print(f"Warning: cannot read {fname}, skipping. ({e})")
                continue

            if args.BWE is not None:
                # Bandwidth extension: resample to the requested rate first.
                Noisy_wav = librosa.resample(Noisy_wav.cpu().numpy(), orig_sr=noisy_sr,
                                             target_sr=int(args.BWE), res_type="kaiser_best")
                noisy_sr = int(args.BWE)

            chunk_size = int(args.chunk_size_in_seconds * noisy_sr)  # samples per chunk
            hop_length = int(args.hop_length_portion * chunk_size)   # samples between chunk starts
            window = torch.hann_window(chunk_size).to(device)

            # Scale STFT parameters to the file's sample rate so the model
            # stays sampling-frequency independent; keep them even.
            n_fft_scaled = make_even(n_fft * noisy_sr // sampling_rate)
            hop_size_scaled = make_even(hop_size * noisy_sr // sampling_rate)
            win_size_scaled = make_even(win_size * noisy_sr // sampling_rate)

            Noisy_wav = torch.FloatTensor(Noisy_wav).to(device)
            audio_enhanced = torch.zeros_like(Noisy_wav)
            window_sum = torch.zeros_like(Noisy_wav)

            for c in range(Noisy_wav.shape[0]):  # process each channel independently
                noisy_wav = Noisy_wav[c:c + 1, :]
                num_chunks = max(1, math.ceil((noisy_wav.shape[1] - chunk_size) / hop_length) + 1)
                for i in range(num_chunks):
                    start = i * hop_length
                    noisy_wav_chunk = noisy_wav[:, start:start + chunk_size]

                    noisy_mag, noisy_pha, noisy_com = mag_phase_stft(
                        noisy_wav_chunk,
                        n_fft=n_fft_scaled,
                        hop_size=hop_size_scaled,
                        win_size=win_size_scaled,
                        compress_factor=compress_factor,
                        center=True,
                        addeps=False
                    )
                    amp_g, pha_g, _ = SE_model(noisy_mag, noisy_pha)

                    audio_g = mag_phase_istft(amp_g, pha_g, n_fft_scaled, hop_size_scaled, win_size_scaled, compress_factor)
                    audio_g = pad_or_trim_to_match(noisy_wav_chunk.detach(), audio_g, pad_value=1e-8)  # align lengths

                    # Hann-weighted overlap-add; the final chunk may be shorter
                    # than the window, so clip the window to the chunk length.
                    n = audio_g.shape[1]
                    audio_enhanced[c:c + 1, start:start + n] += audio_g * window[:n]
                    window_sum[c:c + 1, start:start + n] += window[:n]

            # Normalize by the accumulated window weight wherever it is nonzero
            # (the Hann window is zero at its edges, so guard against /0).
            nonzero_indices = (window_sum > 1e-8)
            audio_enhanced[:, nonzero_indices[0]] = audio_enhanced[:, nonzero_indices[0]] / window_sum[:, nonzero_indices[0]]
            assert audio_enhanced.shape == Noisy_wav.shape, audio_enhanced.shape

            # Swap only the extension for .flac; splitext fixes the old
            # split('.')[0], which truncated paths containing a dot anywhere.
            base, _ext = os.path.splitext(rel_path)
            output_file = os.path.join(args.output_folder, base + '.flac')
            torchaudio.save(output_file, audio_enhanced.cpu(), noisy_sr)
104
+
105
def main():
    """CLI entry point: parse arguments, pick the GPU device, run chunked inference."""
    print('Initializing Inference Process..')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_folder')
    parser.add_argument('--output_folder')
    parser.add_argument('--config')
    # Required, matching inference.py; previously a missing checkpoint only
    # surfaced later as an unhelpful torch.load error on None.
    parser.add_argument('--checkpoint_file', required=True)
    # Required: inference() multiplies these immediately, so omitting them
    # used to crash with a TypeError on None instead of a clear CLI error.
    parser.add_argument('--chunk_size_in_seconds', type=float, required=True)
    parser.add_argument('--hop_length_portion', type=float, required=True)
    parser.add_argument('--BWE', default=None)
    args = parser.parse_args()

    global device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        raise RuntimeError("Currently, CPU mode is not supported.")

    inference(args, device)


if __name__ == '__main__':
    main()
127
+
inference_chunk.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Chunked (overlap-add) enhancement for long recordings:
# ./long_noisy_audio -> ./long_enhanced_audio (FLAC).
CUDA_VISIBLE_DEVICES='0' python ./inference_chunk.py \
    --input_folder ./long_noisy_audio \
    --output_folder ./long_enhanced_audio \
    --checkpoint_file ./exp/30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_peak_GAN_tel_mic/g_01134000.pth \
    --config ./recipes/USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml \
    --chunk_size_in_seconds 5 \
    --hop_length_portion 0.5
# Fixed: `5\` and `0.5\` previously had no space before the continuation
# backslash, gluing the value to the next line's text.
# Optional bandwidth extension: add `--BWE 32000` as a final argument (and a
# trailing `\` to the line above).
+
models/codec_module_time_d4.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import numpy as np
12
+ from einops import rearrange
13
+
14
def get_padding_2d(kernel_size, dilation=(1, 1)):
    """Return the (height, width) zero-padding for a dilated 2D convolution.

    For odd kernel sizes this keeps the spatial output size equal to the
    input size ("same" padding).

    Args:
        kernel_size: (height, width) of the convolution kernel.
        dilation: (height, width) dilation rates. Defaults to (1, 1).

    Returns:
        Tuple of (pad_height, pad_width).
    """
    pad_h = (kernel_size[0] - 1) * dilation[0] // 2
    pad_w = (kernel_size[1] - 1) * dilation[1] // 2
    return (pad_h, pad_w)
27
+
28
class SPConvTranspose2d(nn.Module):
    """Sub-pixel "transposed" 2D convolution.

    A plain Conv2d produces r * out_channels feature maps; the extra factor r
    is then folded into the last (width) axis, upsampling it by r. Width is
    padded by one column on each side before the convolution.
    """
    def __init__(self, in_channels, out_channels, kernel_size, r=1):
        super(SPConvTranspose2d, self).__init__()
        self.pad1 = nn.ConstantPad2d((1, 1, 0, 0), value=0.)
        self.out_channels = out_channels
        self.conv = nn.Conv2d(in_channels, out_channels * r, kernel_size=kernel_size, stride=(1, 1))
        self.r = r

    def forward(self, x):
        feat = self.conv(self.pad1(x))
        batch_size, nchannels, height, width = feat.shape
        # Split channels into (r, out_channels), then interleave the r copies
        # along the width axis.
        feat = feat.view(batch_size, self.r, nchannels // self.r, height, width)
        feat = feat.permute(0, 2, 3, 4, 1).contiguous()
        return feat.view(batch_size, nchannels // self.r, height, -1)
44
+
45
class DenseBlock(nn.Module):
    """Densely connected stack of dilated 2D convolutions.

    Each layer consumes the channel-wise concatenation of the block input and
    every previous layer's output; the time-axis dilation doubles per layer
    (1, 2, 4, ...). The final layer's output (hid_feature channels) is returned.
    """
    def __init__(self, cfg, kernel_size=(3, 3), depth=4):
        super(DenseBlock, self).__init__()
        self.cfg = cfg
        self.depth = depth
        self.dense_block = nn.ModuleList()
        self.hid_feature = cfg['model_cfg']['hid_feature']

        for layer_idx in range(depth):
            dilation = 2 ** layer_idx
            self.dense_block.append(nn.Sequential(
                nn.Conv2d(self.hid_feature * (layer_idx + 1), self.hid_feature, kernel_size,
                          dilation=(dilation, 1),
                          padding=get_padding_2d(kernel_size, (dilation, 1))),
                nn.InstanceNorm2d(self.hid_feature, affine=True),
                nn.PReLU(self.hid_feature),
            ))

    def forward(self, x):
        out = x
        features = x
        for layer in self.dense_block:
            out = layer(features)
            # Grow the running feature stack for the next layer's input.
            features = torch.cat([out, features], dim=1)
        return out
72
+
73
class DenseEncoder(nn.Module):
    """Encoder: 1x1 input projection, a DenseBlock, then a strided convolution.

    The final conv has stride (4, 2) over (time, freq), downsampling time by 4
    and frequency by roughly 2.
    """
    def __init__(self, cfg):
        super(DenseEncoder, self).__init__()
        self.cfg = cfg
        self.input_channel = cfg['model_cfg']['input_channel']
        self.hid_feature = cfg['model_cfg']['hid_feature']

        # 1x1 conv lifting the input channels to the hidden width.
        self.dense_conv_1 = nn.Sequential(
            nn.Conv2d(self.input_channel, self.hid_feature, (1, 1)),
            nn.InstanceNorm2d(self.hid_feature, affine=True),
            nn.PReLU(self.hid_feature),
        )

        self.dense_block = DenseBlock(cfg, depth=4)

        # Strided (1, 3) conv that performs the (time, freq) downsampling.
        self.dense_conv_2 = nn.Sequential(
            nn.Conv2d(self.hid_feature, self.hid_feature, (1, 3), stride=(4, 2)),
            nn.InstanceNorm2d(self.hid_feature, affine=True),
            nn.PReLU(self.hid_feature),
        )

    def forward(self, x):
        """[B, input_channel, T, F] -> [B, hid_feature, ~T/4, ~F/2]."""
        hidden = self.dense_conv_1(x)
        hidden = self.dense_block(hidden)
        return self.dense_conv_2(hidden)
102
+
103
class MagDecoder(nn.Module):
    """Decoder producing the enhanced magnitude map.

    A DenseBlock is followed by two sub-pixel upsampling stages — the first
    widens the last (freq) axis by 2, the second (applied with time/freq
    swapped) widens time by 4 — and a final 1x1 projection to output_channel.
    """
    def __init__(self, cfg):
        super(MagDecoder, self).__init__()
        self.dense_block = DenseBlock(cfg, depth=4)
        self.hid_feature = cfg['model_cfg']['hid_feature']
        self.output_channel = cfg['model_cfg']['output_channel']
        self.n_fft = cfg['stft_cfg']['n_fft']
        self.beta = cfg['model_cfg']['beta']

        self.up_conv1 = nn.Sequential(
            SPConvTranspose2d(self.hid_feature, self.hid_feature, (1, 3), 2),
            nn.InstanceNorm2d(self.hid_feature, affine=True),
            nn.PReLU(self.hid_feature),
        )

        self.up_conv2 = nn.Sequential(
            SPConvTranspose2d(self.hid_feature, self.hid_feature, (1, 3), 4),
            nn.InstanceNorm2d(self.hid_feature, affine=True),
            nn.PReLU(self.hid_feature),
        )

        self.final_conv = nn.Conv2d(self.hid_feature, self.output_channel, (1, 1))

    def forward(self, x):
        feats = self.dense_block(x)
        feats = self.up_conv1(feats)  # upsample the last (freq) axis by 2
        # up_conv2 also widens the last axis; swap time/freq around it so the
        # x4 upsampling lands on the time axis instead.
        swapped = self.up_conv2(feats.permute(0, 1, 3, 2))
        feats = swapped.permute(0, 1, 3, 2)
        return self.final_conv(feats)
135
+
136
class PhaseDecoder(nn.Module):
    """Decoder producing the enhanced phase map.

    Mirrors MagDecoder's upsampling path, but ends with two parallel 1x1
    projections (real / imaginary parts) combined via atan2 into a phase in
    (-pi, pi].
    """
    def __init__(self, cfg):
        super(PhaseDecoder, self).__init__()
        self.dense_block = DenseBlock(cfg, depth=4)
        self.hid_feature = cfg['model_cfg']['hid_feature']
        self.output_channel = cfg['model_cfg']['output_channel']

        self.up_conv1 = nn.Sequential(
            SPConvTranspose2d(self.hid_feature, self.hid_feature, (1, 3), 2),
            nn.InstanceNorm2d(self.hid_feature, affine=True),
            nn.PReLU(self.hid_feature),
        )

        self.up_conv2 = nn.Sequential(
            SPConvTranspose2d(self.hid_feature, self.hid_feature, (1, 3), 4),
            nn.InstanceNorm2d(self.hid_feature, affine=True),
            nn.PReLU(self.hid_feature),
        )

        self.phase_conv_r = nn.Conv2d(self.hid_feature, self.output_channel, (1, 1))
        self.phase_conv_i = nn.Conv2d(self.hid_feature, self.output_channel, (1, 1))

    def forward(self, x):
        feats = self.dense_block(x)
        feats = self.up_conv1(feats)  # upsample the last (freq) axis by 2
        # Swap time/freq so up_conv2's x4 upsampling lands on the time axis.
        feats = self.up_conv2(feats.permute(0, 1, 3, 2)).permute(0, 1, 3, 2)
        real = self.phase_conv_r(feats)
        imag = self.phase_conv_i(feats)
        return torch.atan2(imag, real)
models/generator_SEMamba_time_d4.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from einops import rearrange
12
+ from .mamba_block2_SEMamba import TFMambaBlock
13
+ from .codec_module_time_d4 import DenseEncoder, MagDecoder, PhaseDecoder
14
+
15
class SEMamba(nn.Module):
    """Speech-enhancement network built from a dense encoder, a stack of
    time-frequency Mamba blocks, and separate magnitude / phase decoders.
    """
    def __init__(self, cfg):
        """Build the model from the configuration dictionary *cfg*."""
        super(SEMamba, self).__init__()
        self.cfg = cfg
        # Fall back to 4 TF-Mamba blocks when the config leaves it unset.
        configured_blocks = cfg['model_cfg']['num_tfmamba']
        self.num_tscblocks = 4 if configured_blocks is None else configured_blocks

        self.dense_encoder = DenseEncoder(cfg)
        self.TSMamba = nn.ModuleList(TFMambaBlock(cfg) for _ in range(self.num_tscblocks))
        self.mask_decoder = MagDecoder(cfg)
        self.phase_decoder = PhaseDecoder(cfg)

    def forward(self, noisy_mag, noisy_pha):
        """Enhance a noisy magnitude/phase pair.

        Args:
            noisy_mag: Noisy magnitude tensor [B, F, T].
            noisy_pha: Noisy phase tensor [B, F, T].

        Returns:
            Tuple of (denoised_mag [B, F, T], denoised_pha [B, F, T],
            denoised_com [B, F, T, 2] — real/imag stacked on the last axis).
        """
        # [B, F, T] -> [B, 1, T, F]
        mag_in = noisy_mag.permute(0, 2, 1).unsqueeze(1)
        pha_in = noisy_pha.permute(0, 2, 1).unsqueeze(1)

        # Stack magnitude and phase as two input channels: [B, 2, T, F].
        x = torch.cat((mag_in, pha_in), dim=1)

        # Zero-pad 2 frames and 2 bins (original note: "prevent unpredictable
        # errors" — presumably around the encoder's strided conv); the padding
        # is cropped back off after decoding.
        B, C, T, F = x.shape
        x = torch.cat((x, torch.zeros(B, C, T, 2, device=x.device)), dim=-1)
        x = torch.cat((x, torch.zeros(B, C, 2, F + 2, device=x.device)), dim=-2)

        x = self.dense_encoder(x)

        for mamba_block in self.TSMamba:
            x = mamba_block(x)

        # [B, 1, T', F'] -> [B, F', T']
        denoised_mag = self.mask_decoder(x).permute(0, 3, 2, 1).squeeze(-1)
        denoised_pha = self.phase_decoder(x).permute(0, 3, 2, 1).squeeze(-1)

        # Crop back to the un-padded size.
        denoised_mag = denoised_mag[:, :F, :T]
        denoised_pha = denoised_pha[:, :F, :T]

        # Recombine magnitude and phase into a real/imag pair.
        denoised_com = torch.stack(
            (denoised_mag * torch.cos(denoised_pha), denoised_mag * torch.sin(denoised_pha)),
            dim=-1,
        )

        return denoised_mag, denoised_pha, denoised_com
models/mamba_block2_SEMamba.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from torch.nn import init
13
+ from torch.nn.parameter import Parameter
14
+ from functools import partial
15
+ from einops import rearrange
16
+ from mamba_ssm import Mamba
17
+
18
class MambaBlock(nn.Module):
    """Bidirectional Mamba over a [B, T, D] sequence.

    One Mamba runs forward and one over the time-reversed sequence; each gets
    a residual connection, the two results are concatenated on the feature
    axis, projected back to d_model, and layer-normalized.
    """
    def __init__(self, d_model, cfg):
        super(MambaBlock, self).__init__()

        d_state = cfg['model_cfg']['d_state']    # e.g. 16
        d_conv = cfg['model_cfg']['d_conv']      # e.g. 4
        expand = cfg['model_cfg']['expand']      # e.g. 4

        self.forward_blocks = Mamba(d_model=d_model, d_state=d_state, d_conv=d_conv, expand=expand)
        self.backward_blocks = Mamba(d_model=d_model, d_state=d_state, d_conv=d_conv, expand=expand)
        self.output_proj = nn.Linear(2 * d_model, d_model)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """[B, T, D] -> [B, T, D]."""
        # Forward direction with residual.
        fwd = x + self.forward_blocks(x)

        # Backward direction: flip time, process with residual, flip back.
        flipped = torch.flip(x, dims=[1])
        bwd = torch.flip(flipped + self.backward_blocks(flipped), dims=[1])

        combined = torch.cat([fwd, bwd], dim=-1)
        return self.norm(self.output_proj(combined))
43
+
44
+
45
class TFMambaBlock(nn.Module):
    """Temporal-Frequency Mamba block.

    Applies a bidirectional MambaBlock along the time axis (one sequence per
    frequency bin) and then along the frequency axis (one sequence per time
    frame), each with a residual connection.
    """
    def __init__(self, cfg):
        super(TFMambaBlock, self).__init__()
        self.cfg = cfg
        self.hid_feature = cfg['model_cfg']['hid_feature']

        self.time_mamba = MambaBlock(d_model=self.hid_feature, cfg=cfg)
        self.freq_mamba = MambaBlock(d_model=self.hid_feature, cfg=cfg)

    def forward(self, x):
        """(batch, channels, time, freq) -> same shape.

        Folds the non-sequence axis into the batch for each Mamba pass.
        """
        b, c, t, f = x.size()

        # Time pass: sequences of length t, one per (batch, freq) pair.
        seq = x.permute(0, 3, 2, 1).contiguous().view(b * f, t, c)
        seq = seq + self.time_mamba(seq)

        # Frequency pass: sequences of length f, one per (batch, time) pair.
        seq = seq.view(b, f, t, c).permute(0, 2, 1, 3).contiguous().view(b * t, f, c)
        seq = seq + self.freq_mamba(seq)

        return seq.view(b, t, f, c).permute(0, 3, 1, 2)
models/stfts.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
def decompress_signed_log1p(y):
    """Invert signed-log1p compression: y = sign(x) * log1p(|x|)  ->  x."""
    magnitude = torch.expm1(torch.abs(y))
    return torch.sign(y) * magnitude
14
+
15
+ RELU = nn.ReLU()
16
+
17
def mag_phase_stft(y, n_fft, hop_size, win_size, compress_factor=1.0, center=True, addeps=False):
    """
    Compute magnitude and phase using STFT.

    Args:
        y (torch.Tensor): Input audio signal.
        n_fft (int): FFT size.
        hop_size (int): Hop size.
        win_size (int): Window size (Hann).
        compress_factor (float or str, optional): Magnitude compression. A
            float applies mag ** compress_factor; the strings 'log1p',
            'relu_log1p' and 'signed_log1p' all apply log1p(mag) here (they
            differ only in how mag_phase_istft decompresses). Defaults to 1.0.
        center (bool, optional): Whether to center-pad the signal. Defaults to True.
        addeps (bool, optional): Whether to add a small epsilon when computing
            magnitude and phase (for numerical stability). Defaults to False.
            (Previously documented under the wrong name 'eps'.)

    Returns:
        tuple: (mag, pha, com) — magnitude, phase, and the compressed complex
        representation stacked as (real, imag) on the last axis.
    """
    eps = 1e-10
    hann_window = torch.hann_window(win_size).to(y.device)
    stft_spec = torch.stft(
        y, n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window,
        center=center,
        pad_mode='reflect',
        normalized=False,
        return_complex=True)

    if not addeps:
        mag = torch.abs(stft_spec)
        pha = torch.angle(stft_spec)
    else:
        # Epsilon keeps the gradients of sqrt/atan2 finite at zero.
        real_part = stft_spec.real
        imag_part = stft_spec.imag
        mag = torch.sqrt(real_part.pow(2) + imag_part.pow(2) + eps)
        pha = torch.atan2(imag_part + eps, real_part + eps)

    # Compress the magnitude.
    if compress_factor in ('log1p', 'relu_log1p', 'signed_log1p'):
        mag = torch.log1p(mag)
    else:
        mag = torch.pow(mag, compress_factor)

    com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1)
    return mag, pha, com
60
+
61
+
62
def mag_phase_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=1.0, center=True):
    """
    Reconstruct a waveform from (possibly compressed) magnitude and phase.

    Args:
        mag (torch.Tensor): Compressed magnitude of the STFT.
        pha (torch.Tensor): Phase of the STFT.
        n_fft (int): FFT size.
        hop_size (int): Hop size.
        win_size (int): Window size (Hann).
        compress_factor (float or str, optional): Must match the value given
            to mag_phase_stft; selects the matching decompression. Defaults to 1.0.
        center (bool, optional): Whether the STFT was center-padded. Defaults to True.

    Returns:
        torch.Tensor: Reconstructed audio signal.
    """
    # Undo the magnitude compression applied by mag_phase_stft.
    if compress_factor == 'log1p':
        mag = torch.expm1(mag)
    elif compress_factor == 'signed_log1p':
        mag = decompress_signed_log1p(mag)
    elif compress_factor == 'relu_log1p':
        mag = torch.expm1(RELU(mag))
    else:
        # Numeric exponent: clamp negatives to zero before the inverse power.
        mag = torch.pow(RELU(mag), 1.0 / compress_factor)

    real = mag * torch.cos(pha)
    imag = mag * torch.sin(pha)
    spec = torch.complex(real, imag)

    window = torch.hann_window(win_size).to(spec.device)
    return torch.istft(
        spec,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=window,
        center=center)
recipes/USEMamba_30x1_lr_00002_norm_05_vq_065_nfft_320_hop_40_NRIR_012_pha_0005_com_04_early_001.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment Settings
2
+ # These settings specify the hardware and distributed setup for the model training.
3
+ # Adjust `num_gpus` and `dist_config` according to your distributed training environment.
4
+ env_setting:
5
+ num_gpus: 8 # Number of GPUs. Now we don't support CPU mode.
6
+ num_workers: 20 # 0 Number of worker threads for data loading.
7
+ persistent_workers: True # False If you have large RAM, turn this to be True
8
+ prefetch_factor: 8 # null
9
+ seed: 1234 # Seed for random number generators to ensure reproducibility.
10
+ stdout_interval: 5000
11
+ checkpoint_interval: 5000 # save model to ckpt every N steps
12
+ validation_interval: 5000
13
+ dist_cfg:
14
+ dist_backend: nccl # Distributed training backend, 'nccl' for NVIDIA GPUs.
15
+ dist_url: tcp://localhost:19478 # URL for initializing distributed training.
16
+ world_size: 1 # Total number of processes in the distributed training.
17
+ pin_memory: True # If you have large RAM, turn this to be True
18
+
19
+
20
+ # STFT Configuration
21
+ # Configuration for Short-Time Fourier Transform (STFT), crucial for audio processing models.
22
+ stft_cfg:
23
+ sampling_rate: 8000 # Audio sampling rate in Hz.
24
+ n_fft: 320 # FFT components for transforming audio signals.
25
+ hop_size: 40 # Samples between successive frames.
26
+ win_size: 320 # Window size used in FFT.
27
+ sfi: True # Sampling Frequency Independent
28
+
29
+ # Model Configuration
30
+ # Defines the architecture specifics of the model, including layer configurations and feature compression.
31
+ model_cfg:
32
+ hid_feature: 64 # Channels in dense layers.
33
+ compress_factor: relu_log1p # Compression factor applied to extracted features.
34
+ num_tfmamba: 30 # Number of Time-Frequency Mamba (TFMamba) blocks in the model.
35
+ d_state: 16 # Dimensionality of the state vector in Mamba blocks.
36
+ d_conv: 4 # Convolutional layer dimensionality within Mamba blocks.
37
+ expand: 4 # Expansion factor for the layers within the Mamba blocks.
38
+ norm_epsilon: 0.00001 # Numerical stability in normalization layers within the Mamba blocks.
39
+ beta: 2.0 # Hyperparameter for the Learnable Sigmoid function.
40
+ input_channel: 2 # Magnitude and Phase
41
+ output_channel: 1 # Single Channel Speech Enhancement
42
+ inner_mamba_nlayer: 1 # Number of layer of Mamba in Bidirectional Mamba
43
+ nonlinear: None # last activation function for the mag encoder. 'softplus' or 'relu'
44
+ mapping: True # Otherwise, this should be masking model
utils/util.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
8
+
9
+ import yaml
10
+ import torch
11
+ import os
12
+ import shutil
13
+ import torch.nn.functional as F
14
+
15
def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents."""
    with open(config_path, 'r') as fh:
        raw_text = fh.read()
    return yaml.safe_load(raw_text)
19
+
20
def pad_or_trim_to_match(reference: torch.Tensor, target: torch.Tensor, pad_value: float = 1e-6) -> torch.Tensor:
    """
    Make *target* the same length as *reference* along dim=1, either by
    trimming or by right-padding with *pad_value*. Padding goes through
    F.pad, which keeps autograd intact.
    """
    ref_len = reference.shape[1]
    tgt_len = target.shape[1]

    if tgt_len == ref_len:
        return target
    if tgt_len > ref_len:
        return target[:, :ref_len]

    # Constant-pad only on the right of dim=1.
    return F.pad(target, (0, ref_len - tgt_len), mode='constant', value=pad_value)