| import copy |
| from contextlib import contextmanager |
| from inspect import signature |
| from typing import List |
|
|
| import numpy as np |
| import torch |
| from flatten_dict import flatten |
| from flatten_dict import unflatten |
| from numpy.random import RandomState |
|
|
| from .. import ml |
| from ..core import AudioSignal |
| from ..core import util |
| from .datasets import AudioLoader |
|
|
# Module-level alias used by the transforms below to wrap sampled parameter
# values (scalars, arrays, booleans) as tensors.
tt = torch.tensor
"""Shorthand for converting things to torch.tensor."""
|
|
|
|
class BaseTransform:
    """Base class for all transforms implemented in this library.

    Transforms have two main operations: ``instantiate`` and ``transform``.

    ``instantiate`` sets the parameters randomly from distribution tuples
    for each parameter. For example, for the ``BackgroundNoise`` transform,
    the signal-to-noise ratio (``snr``) is chosen randomly by
    ``instantiate``. By default, it is chosen uniformly between 10.0 and
    30.0 (the tuple is set to ``("uniform", 10.0, 30.0)``).

    ``transform`` applies the transform using the instantiated parameters.
    A simple example is as follows:

    >>> seed = 0
    >>> signal = ...
    >>> transform = transforms.NoiseFloor(db = ("uniform", -50.0, -30.0))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)

    By breaking apart the instantiation of parameters from the actual audio
    processing of the transform, we can make things more reproducible, while
    also applying the transform on batches of data efficiently on GPU,
    rather than on individual audio samples.

    ..  note::
        We call ``signal.clone()`` for the input to the ``transform``
        function because signals are modified in-place! If you don't clone
        the signal, you will lose the original data.

    Parameters
    ----------
    keys : list, optional
        Extra keys that the transform looks for when calling
        ``self.transform``, by default None (no extra keys). In general
        this is set automatically, and you won't need to manipulate this
        argument.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None (the class name
        is used).
    prob : float, optional
        Probability of applying this transform, by default 1.0

    Examples
    --------

    >>> seed = 0
    >>>
    >>> audio_path = "tests/audio/spk/f10_script4_produced.wav"
    >>> signal = AudioSignal(audio_path, offset=10, duration=2)
    >>> transform = tfm.Compose(
    >>>     [
    >>>         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
    >>>         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
    >>>     ],
    >>> )
    >>>
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal, **kwargs)

    """

    def __init__(self, keys: list = None, name: str = None, prob: float = 1.0):
        # Every keyword argument of ``self._transform`` must later be found
        # in the instantiated parameters. The signal itself and a catch-all
        # ``kwargs`` are not parameters, so they are left out.
        ignore_keys = ("signal", "kwargs")
        tfm_keys = [
            k for k in signature(self._transform).parameters if k not in ignore_keys
        ]

        # ``None`` replaces the former shared mutable ``[]`` default.
        extra_keys = [] if keys is None else list(keys)

        # "mask" is always required: it decides which batch items the
        # transform is applied to.
        self.keys = extra_keys + tfm_keys + ["mask"]
        self.prob = prob
        self.name = self.__class__.__name__ if name is None else name

    def _prepare(self, batch: dict):
        """Return this transform's own parameters from ``batch``.

        ``batch`` is keyed by transform name. The entry stored under
        ``self.name`` is checked to contain every key in ``self.keys``.
        """
        sub_batch = batch[self.name]

        for k in self.keys:
            assert k in sub_batch.keys(), f"{k} not in batch"

        return sub_batch

    def _transform(self, signal):
        """Apply the actual processing. The base implementation is a no-op."""
        return signal

    def _instantiate(self, state: RandomState, signal: "AudioSignal" = None):
        """Sample this transform's parameters. The base class has none."""
        return {}

    @staticmethod
    def apply_mask(batch: dict, mask: torch.Tensor):
        """Applies a mask to the batch.

        The (possibly nested) dictionary is flattened, every leaf value is
        indexed with ``mask``, and the result is unflattened again.

        Parameters
        ----------
        batch : dict
            Batch whose values will be masked in the ``transform`` pass.
        mask : torch.Tensor
            Mask to apply to batch.

        Returns
        -------
        dict
            A dictionary that contains values only where ``mask = True``.
        """
        masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
        return unflatten(masked_batch)

    def transform(self, signal: "AudioSignal", **kwargs):
        """Apply the transform to the audio signal,
        with given keyword arguments.

        Parameters
        ----------
        signal : AudioSignal
            Signal that will be modified by the transforms in-place.
        kwargs: dict
            Dictionary keyed by transform name, as produced by
            ``self.instantiate`` or ``self.batch_instantiate``. The entry
            under ``self.name`` holds the keyword arguments for
            ``self._transform`` plus the ``mask``.

        Returns
        -------
        AudioSignal
            Transformed AudioSignal.

        Examples
        --------

        >>> for seed in range(10):
        >>>     kwargs = transform.instantiate(seed, signal)
        >>>     output = transform(signal.clone(), **kwargs)

        """
        tfm_kwargs = self._prepare(kwargs)
        mask = tfm_kwargs["mask"]

        # Nothing is touched when no item was selected for this transform.
        if torch.any(mask):
            tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
            # ``mask`` is bookkeeping, not an argument of ``_transform``.
            tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
            signal[mask] = self._transform(signal[mask], **tfm_kwargs)

        return signal

    def __call__(self, *args, **kwargs):
        """Alias for :py:meth:`transform`."""
        return self.transform(*args, **kwargs)

    def instantiate(
        self,
        state: RandomState = None,
        signal: "AudioSignal" = None,
    ):
        """Instantiates parameters for the transform.

        Parameters
        ----------
        state : RandomState, optional
            Source of randomness; it is passed through
            ``util.random_state`` before use (the examples pass an integer
            seed), by default None
        signal : AudioSignal, optional
            Signal handed to ``self._instantiate`` when that method
            declares a ``signal`` parameter, by default None

        Returns
        -------
        dict
            Dictionary of the form ``{self.name: params}``, where
            ``params`` contains instantiated arguments for every keyword
            argument to ``self._transform`` plus a boolean ``mask``.

        Examples
        --------

        >>> for seed in range(10):
        >>>     kwargs = transform.instantiate(seed, signal)
        >>>     output = transform(signal.clone(), **kwargs)

        """
        state = util.random_state(state)

        # Not every transform needs the signal to pick its parameters, so it
        # is only passed to ``_instantiate`` implementations that ask for it.
        accepts_signal = "signal" in signature(self._instantiate).parameters
        extra = {"signal": signal} if accepts_signal else {}

        params = self._instantiate(state, **extra)
        for key, value in list(params.items()):
            # Signals, tensors and nested dictionaries are stored as they
            # are; everything else is converted to a tensor.
            if not isinstance(value, (AudioSignal, torch.Tensor, dict)):
                params[key] = tt(value)

        # The mask is drawn after the parameters so that the sequence of
        # random draws stays the same as before.
        mask = state.rand() <= self.prob
        params["mask"] = tt(mask)

        # Namespace the parameters under this transform's name.
        return {self.name: params}

    def batch_instantiate(
        self,
        states: list = None,
        signal: "AudioSignal" = None,
    ):
        """Instantiates arguments for every item in a batch,
        given a list of states. Each state in the list
        corresponds to one item in the batch.

        Parameters
        ----------
        states : list, optional
            List of states, by default None
        signal : AudioSignal, optional
            AudioSignal to pass to the ``self.instantiate`` section
            if it is needed for this transform, by default None

        Returns
        -------
        dict
            Collated dictionary of arguments.

        Examples
        --------

        >>> batch_size = 4
        >>> signal = AudioSignal(audio_path, offset=10, duration=2)
        >>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)])
        >>>
        >>> states = [seed + idx for idx in list(range(batch_size))]
        >>> kwargs = transform.batch_instantiate(states, signal_batch)
        >>> batch_output = transform(signal_batch, **kwargs)
        """
        kwargs = [self.instantiate(state, signal) for state in states]
        return util.collate(kwargs)
|
|
|
|
class Identity(BaseTransform):
    """No-op transform: the signal is handed back unchanged.

    Nothing is overridden, so the inherited ``_transform`` (which returns
    its input) and ``_instantiate`` (which returns no parameters) are used.
    """
|
|
|
|
class SpectralTransform(BaseTransform):
    """Base class for transforms that manipulate the spectrogram.

    Such transforms need STFT data to exist. ``transform`` therefore calls
    ``signal.stft()`` before the regular transform logic runs and
    ``signal.istft()`` afterwards, so that the audio data is written to
    after the spectral manipulation.
    """

    def transform(self, signal, **kwargs):
        """Run the inherited ``transform`` between ``stft`` and ``istft``.

        Parameters
        ----------
        signal : AudioSignal
            Signal to process.
        kwargs : dict
            Instantiated parameters, forwarded unchanged to
            ``BaseTransform.transform``.

        Returns
        -------
        AudioSignal
            The ``signal`` object that was passed in.
        """
        # Make sure spectral data is available to the transform.
        signal.stft()
        # The return value is not needed: ``signal`` itself is returned below.
        super().transform(signal, **kwargs)
        # Write the spectral result back to the audio data.
        signal.istft()
        return signal
|
|
|
|
class Compose(BaseTransform):
    """Compose applies transforms in sequence, one after the other. The
    transforms are passed in as positional arguments or as a list like so:

    >>> transform = tfm.Compose(
    >>>     [
    >>>         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
    >>>         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
    >>>     ],
    >>> )

    This will convolve the signal with a room impulse response, and then
    add background noise to the signal. Instantiate instantiates
    all the parameters for every transform in the transform list so the
    interface for using the Compose transform is the same as everything
    else:

    >>> kwargs = transform.instantiate()
    >>> output = transform(signal.clone(), **kwargs)

    Under the hood, each transform is renamed to a unique name of the form
    ``{position}.{name}``, where ``position`` is the index of the transform
    in the list. ``Compose`` can nest within other ``Compose`` transforms,
    like so:

    >>> preprocess = transforms.Compose(
    >>>     tfm.GlobalVolumeNorm(),
    >>>     tfm.CrossTalk(),
    >>>     name="preprocess",
    >>> )
    >>> augment = transforms.Compose(
    >>>     tfm.RoomImpulseResponse(),
    >>>     tfm.BackgroundNoise(),
    >>>     name="augment",
    >>> )
    >>> postprocess = transforms.Compose(
    >>>     tfm.VolumeChange(),
    >>>     tfm.RescaleAudio(),
    >>>     tfm.ShiftPhase(),
    >>>     name="postprocess",
    >>> )
    >>> transform = transforms.Compose(preprocess, augment, postprocess)

    This defines 3 composed transforms, and then composes them in sequence
    with one another.

    Parameters
    ----------
    *transforms : list
        List of transforms to apply
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, *transforms: list, name: str = None, prob: float = 1.0):
        # Accept both ``Compose(a, b)`` and ``Compose([a, b])``. The
        # emptiness check keeps ``Compose()`` from failing on
        # ``transforms[0]``.
        if transforms and isinstance(transforms[0], list):
            transforms = transforms[0]

        # NOTE: this renames the passed-in transform objects in place.
        for i, tfm in enumerate(transforms):
            tfm.name = f"{i}.{tfm.name}"

        keys = [tfm.name for tfm in transforms]
        super().__init__(keys=keys, name=name, prob=prob)

        self.transforms = transforms
        self.transforms_to_apply = keys

    @contextmanager
    def filter(self, *names: list):
        """This can be used to skip transforms entirely when applying
        the sequence of transforms to a signal. For example, take
        the following transforms with the names ``preprocess, augment, postprocess``.

        >>> preprocess = transforms.Compose(
        >>>     tfm.GlobalVolumeNorm(),
        >>>     tfm.CrossTalk(),
        >>>     name="preprocess",
        >>> )
        >>> augment = transforms.Compose(
        >>>     tfm.RoomImpulseResponse(),
        >>>     tfm.BackgroundNoise(),
        >>>     name="augment",
        >>> )
        >>> postprocess = transforms.Compose(
        >>>     tfm.VolumeChange(),
        >>>     tfm.RescaleAudio(),
        >>>     tfm.ShiftPhase(),
        >>>     name="postprocess",
        >>> )
        >>> transform = transforms.Compose(preprocess, augment, postprocess)

        If we wanted to apply all 3 to a signal, we do:

        >>> kwargs = transform.instantiate()
        >>> output = transform(signal.clone(), **kwargs)

        But if we only wanted to apply the ``preprocess`` and ``postprocess``
        transforms to the signal, we do:

        >>> with transform.filter("preprocess", "postprocess"):
        >>>     output = transform(signal.clone(), **kwargs)

        The previous selection is restored when the ``with`` block exits,
        including when it exits through an exception.

        Parameters
        ----------
        *names : list
            List of transforms, identified by name, to apply to signal.
        """
        old_transforms = self.transforms_to_apply
        self.transforms_to_apply = names
        try:
            yield
        finally:
            # Without ``finally`` an exception raised inside the ``with``
            # body would leave the filter active on this object.
            self.transforms_to_apply = old_transforms

    def _transform(self, signal, **kwargs):
        """Apply every selected child transform, in order.

        A child is selected when any entry of ``self.transforms_to_apply``
        is a substring of the child's name. This is what lets
        ``filter("preprocess")`` match a child named ``"0.preprocess"``.
        """
        for transform in self.transforms:
            if any(x in transform.name for x in self.transforms_to_apply):
                signal = transform(signal, **kwargs)
        return signal

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        """Instantiate every child and merge the results into one dict.

        All children draw from the same ``state``, in list order.
        """
        parameters = {}
        for transform in self.transforms:
            parameters.update(transform.instantiate(state, signal=signal))
        return parameters

    def __getitem__(self, idx):
        """Return the child transform(s) at ``idx``."""
        return self.transforms[idx]

    def __len__(self):
        """Return the number of child transforms."""
        return len(self.transforms)

    def __iter__(self):
        """Iterate over the child transforms in application order."""
        yield from self.transforms
|
|
|
|
class Choose(Compose):
    """Applies exactly one of the given transforms per batch item.

    The logic is the same as :py:func:`audiotools.data.transforms.Compose`,
    but instead of applying all the transforms in sequence, a single
    transform is chosen for each item in the batch.

    Parameters
    ----------
    *transforms : list
        List of transforms to choose from
    weights : list
        Probability of choosing any specific transform. When omitted,
        every transform is equally likely.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0

    Examples
    --------

    >>> transforms.Choose(tfm.LowPass(), tfm.HighPass())
    """

    def __init__(
        self,
        *transforms: list,
        weights: list = None,
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(*transforms, name=name, prob=prob)

        if weights is None:
            # Uniform distribution over the available transforms.
            n_choices = len(self.transforms)
            weights = [1 / n_choices for _ in self.transforms]
        self.weights = np.array(weights)

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        """Instantiate all children, then keep only one of them active.

        Every child is instantiated as in ``Compose``. One index is then
        drawn according to ``self.weights``. For each child whose own mask
        came out true, the mask is replaced by whether that child is the
        drawn one. A child whose mask was already false stays switched off.
        The resulting masks are also returned together under ``"one_hot"``.
        """
        params = super()._instantiate(state, signal)

        candidates = list(range(len(self.transforms)))
        chosen = state.choice(candidates, p=self.weights)

        selection = []
        for position, child in enumerate(self.transforms):
            child_params = params[child.name]
            if child_params["mask"].item():
                child_params["mask"] = tt(position == chosen)
            selection.append(child_params["mask"])

        params["one_hot"] = selection
        return params
|
|
|
|
class Repeat(Compose):
    """Applies a given transform ``n_repeat`` times in a row.

    The transform is shallow-copied once per repetition and the copies are
    composed in sequence.

    Parameters
    ----------
    transform : BaseTransform
        Transform to repeat.
    n_repeat : int, optional
        Number of times to repeat transform, by default 1
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        transform,
        n_repeat: int = 1,
        name: str = None,
        prob: float = 1.0,
    ):
        # Each repetition gets its own copy because ``Compose`` renames the
        # transforms it receives.
        copies = []
        for _ in range(n_repeat):
            copies.append(copy.copy(transform))

        super().__init__(copies, name=name, prob=prob)

        self.n_repeat = n_repeat
|
|
|
|
class RepeatUpTo(Choose):
    """Applies a given transform a randomly chosen number of times.

    One ``Repeat`` is built for every count in ``range(1, max_repeat)``
    and ``Choose`` picks one of them per batch item.

    NOTE(review): the upper bound is exclusive, so the transform is
    repeated at most ``max_repeat - 1`` times. Confirm whether
    ``max_repeat`` itself was meant to be included.

    Parameters
    ----------
    transform : BaseTransform
        Transform to repeat.
    max_repeat : int, optional
        Exclusive upper bound on the number of repetitions, by default 5
    weights : list
        Probability of choosing each repetition count; one entry per
        count in ``range(1, max_repeat)``.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        transform,
        max_repeat: int = 5,
        weights: list = None,
        name: str = None,
        prob: float = 1.0,
    ):
        options = [Repeat(transform, n_repeat=count) for count in range(1, max_repeat)]
        super().__init__(options, name=name, prob=prob, weights=weights)

        self.max_repeat = max_repeat
|
|
|
|
class ClippingDistortion(BaseTransform):
    """Distorts the signal by clipping it. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`.

    Parameters
    ----------
    perc : tuple, optional
        Distribution of the clipping percentile. Values are between 0.0
        and 1.0; typical values are 0.1 or below,
        by default ("uniform", 0.0, 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        perc: tuple = ("uniform", 0.0, 0.1),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.perc = perc

    def _instantiate(self, state: RandomState):
        """Draw the clipping percentile from ``self.perc``."""
        sampled_perc = util.sample_from_dist(self.perc, state)
        return {"perc": sampled_perc}

    def _transform(self, signal, perc):
        """Clip ``signal`` at the instantiated percentile."""
        clipped = signal.clip_distortion(perc)
        return clipped
|
|
|
|
class Equalizer(BaseTransform):
    """Shapes the signal with a random equalization curve. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.

    Parameters
    ----------
    eq_amount : tuple, optional
        Distribution of the maximum dB cut to apply to the audio in any
        band, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in EQ, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 6,
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.eq_amount = eq_amount
        self.n_bands = n_bands

    def _instantiate(self, state: RandomState):
        """Draw one cut per band.

        Each band gets the negated sampled ``eq_amount`` scaled by its own
        random factor from ``state.rand``.
        """
        max_cut = util.sample_from_dist(self.eq_amount, state)
        curve = -max_cut * state.rand(self.n_bands)
        return {"eq": curve}

    def _transform(self, signal, eq):
        """Apply the instantiated equalization curve to ``signal``."""
        equalized = signal.equalizer(eq)
        return equalized
|
|
|
|
class Quantization(BaseTransform):
    """Quantizes the input waveform. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.quantization`.

    Parameters
    ----------
    channels : tuple, optional
        Distribution of the number of evenly spaced quantization channels
        to quantize to, by default ("choice", [8, 32, 128, 256, 1024])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.channels = channels

    def _instantiate(self, state: RandomState):
        """Draw the number of quantization channels from ``self.channels``."""
        n_channels = util.sample_from_dist(self.channels, state)
        return {"channels": n_channels}

    def _transform(self, signal, channels):
        """Quantize ``signal`` to the instantiated number of channels."""
        quantized = signal.quantization(channels)
        return quantized
|
|
|
|
class MuLawQuantization(BaseTransform):
    """Mu-law quantizes the input waveform. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.

    Parameters
    ----------
    channels : tuple, optional
        Distribution of the number of mu-law spaced quantization channels
        to quantize to, by default ("choice", [8, 32, 128, 256, 1024])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.channels = channels

    def _instantiate(self, state: RandomState):
        """Draw the number of mu-law channels from ``self.channels``."""
        n_channels = util.sample_from_dist(self.channels, state)
        return {"channels": n_channels}

    def _transform(self, signal, channels):
        """Mu-law quantize ``signal`` to the instantiated channel count."""
        quantized = signal.mulaw_quantization(channels)
        return quantized
|
|
|
|
class NoiseFloor(BaseTransform):
    """Adds Gaussian noise at a specified dB level to the signal,
    acting as a noise floor.

    Parameters
    ----------
    db : tuple, optional
        Distribution of the level of noise to add to signal,
        by default ("const", -50.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -50.0),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        """Generate the noise signal that will be added.

        The noise has the same number of channels, length and sample rate
        as ``signal`` and is normalized to the sampled dB level.
        """
        level = util.sample_from_dist(self.db, state)

        noise_shape = (signal.num_channels, signal.signal_length)
        noise = AudioSignal(state.randn(*noise_shape), signal.sample_rate)
        noise.normalize(level)

        return {"nz_signal": noise}

    def _transform(self, signal, nz_signal):
        """Add the instantiated noise to ``signal``."""
        noisy = signal + nz_signal
        return noisy
|
|
|
|
class BackgroundNoise(BaseTransform):
    """Mixes background noise into the signal. The noise audio comes from
    a set of CSV files. A valid CSV file looks like the following, and is
    typically generated by :py:func:`audiotools.data.preprocess.create_csv`:

    ..  csv-table::
        :header: path

        room_tone/m6_script2_clean.wav
        room_tone/m6_script2_cleanraw.wav
        room_tone/m6_script2_ipad_balcony1.wav
        room_tone/m6_script2_ipad_bedroom1.wav
        room_tone/m6_script2_ipad_confroom1.wav
        room_tone/m6_script2_ipad_confroom2.wav
        room_tone/m6_script2_ipad_livingroom1.wav
        room_tone/m6_script2_ipad_office1.wav

    ..  note::
        All paths are relative to an environment variable called
        ``PATH_TO_DATA``, so that CSV files are portable across machines
        where data may be located in different places.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
    hood.

    Parameters
    ----------
    snr : tuple, optional
        Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 3
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default None
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 10.0, 30.0),
        sources: List[str] = None,
        weights: List[float] = None,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 3,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = None,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.eq_amount = eq_amount
        self.n_bands = n_bands
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        """Sample the EQ curve, the SNR and a noise excerpt.

        The excerpt is requested from the loader with the sample rate,
        duration and channel count of ``signal``.
        """
        max_cut = util.sample_from_dist(self.eq_amount, state)
        eq_curve = -max_cut * state.rand(self.n_bands)
        sampled_snr = util.sample_from_dist(self.snr, state)

        loaded = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )
        noise = loaded["signal"]

        return {"eq": eq_curve, "bg_signal": noise, "snr": sampled_snr}

    def _transform(self, signal, bg_signal, snr, eq):
        """Mix a copy of the instantiated noise into ``signal``."""
        # A clone is mixed in, so the instantiated noise object itself is
        # not the one handed to ``mix``.
        noise = bg_signal.clone()
        return signal.mix(noise, snr, eq)
|
|
|
|
class CrossTalk(BaseTransform):
    """Mixes a second speaker into the signal as crosstalk. The audio of
    the second speaker is drawn from a CSV file that was produced via
    :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    under the hood.

    Parameters
    ----------
    snr : tuple, optional
        How loud cross-talk speaker is relative to original signal in dB,
        by default ("uniform", 0.0, 10.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default -40
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 0.0, 10.0),
        sources: List[str] = None,
        weights: List[float] = None,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = -40,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        """Sample the SNR and load a crosstalk excerpt.

        The excerpt is requested from the loader with the sample rate,
        duration and channel count of ``signal``.
        """
        sampled_snr = util.sample_from_dist(self.snr, state)

        loaded = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )

        return {"crosstalk_signal": loaded["signal"], "snr": sampled_snr}

    def _transform(self, signal, crosstalk_signal, snr):
        """Mix in the crosstalk, then restore the original loudness."""
        # Measure the loudness before mixing so the mix can be brought back
        # to the same level afterwards.
        original_loudness = signal.loudness()

        mixed = signal.mix(crosstalk_signal.clone(), snr)
        mixed.normalize(original_loudness)
        return mixed
|
|
|
|
class RoomImpulseResponse(BaseTransform):
    """Convolves the signal with a room impulse response, at a specified
    direct-to-reverberant ratio, with equalization applied. Room impulse
    response data is drawn from a CSV file that was produced via
    :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
    under the hood.

    Parameters
    ----------
    drr : tuple, optional
        Distribution of the direct-to-reverberant ratio passed to
        ``apply_ir``, by default ("uniform", 0.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    use_original_phase : bool, optional
        Whether or not to use the original phase, by default False
    offset : float, optional
        Offset from each impulse response file to use, by default 0.0
    duration : float, optional
        Duration of each impulse response, by default 1.0
    """

    def __init__(
        self,
        drr: tuple = ("uniform", 0.0, 30.0),
        sources: List[str] = None,
        weights: List[float] = None,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 6,
        name: str = None,
        prob: float = 1.0,
        use_original_phase: bool = False,
        offset: float = 0.0,
        duration: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.drr = drr
        self.eq_amount = eq_amount
        self.n_bands = n_bands
        self.use_original_phase = use_original_phase

        self.loader = AudioLoader(sources, weights)
        self.offset = offset
        self.duration = duration

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        """Sample the EQ curve, the DRR and an impulse response.

        ``signal`` defaults to None in the signature, but its
        ``sample_rate`` and ``num_channels`` are read here, so a signal
        has to be supplied.
        """
        max_cut = util.sample_from_dist(self.eq_amount, state)
        eq_curve = -max_cut * state.rand(self.n_bands)
        sampled_drr = util.sample_from_dist(self.drr, state)

        loaded = self.loader(
            state,
            signal.sample_rate,
            offset=self.offset,
            duration=self.duration,
            loudness_cutoff=None,
            num_channels=signal.num_channels,
        )
        impulse = loaded["signal"]
        # The pad target is ``sample_rate`` itself.
        # presumably a length in samples, i.e. one second — TODO confirm
        impulse.zero_pad_to(signal.sample_rate)

        return {"eq": eq_curve, "ir_signal": impulse, "drr": sampled_drr}

    def _transform(self, signal, ir_signal, drr, eq):
        """Apply a copy of the instantiated impulse response to ``signal``."""
        impulse = ir_signal.clone()
        return signal.apply_ir(
            impulse, drr, eq, use_original_phase=self.use_original_phase
        )
|
|
|
|
class VolumeChange(BaseTransform):
    """Raises or lowers the volume of the input signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        Distribution of the change in volume in decibels,
        by default ("uniform", -12.0, 0.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("uniform", -12.0, 0.0),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)
        self.db = db

    def _instantiate(self, state: RandomState):
        """Draw the volume change in dB from ``self.db``."""
        change = util.sample_from_dist(self.db, state)
        return {"db": change}

    def _transform(self, signal, db):
        """Change the volume of ``signal`` by ``db``."""
        adjusted = signal.volume_change(db)
        return adjusted
|
|
|
|
class VolumeNorm(BaseTransform):
    """Normalizes the volume of the excerpt to a specified decibel level.

    Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.

    Parameters
    ----------
    db : tuple, optional
        Distribution of the dB to normalize signal to,
        by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -24),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState):
        """Draw the target level in dB from ``self.db``."""
        target = util.sample_from_dist(self.db, state)
        return {"db": target}

    def _transform(self, signal, db):
        """Normalize ``signal`` to ``db``."""
        normalized = signal.normalize(db)
        return normalized
|
|
|
|
class GlobalVolumeNorm(BaseTransform):
    """Normalizes volume using the loudness of the whole source file.

    Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this
    transform also normalizes the volume of a signal, but it uses
    the volume of the entire audio file the loaded excerpt comes from,
    rather than the volume of just the excerpt. The volume of the
    entire audio file is expected in ``signal.metadata["loudness"]``.
    If loading audio from a CSV generated by
    :py:func:`audiotools.data.preprocess.create_csv`
    with ``loudness = True``, like the following:

    ..  csv-table::
        :header: path,loudness

        daps/produced/f1_script1_produced.wav,-16.299999237060547
        daps/produced/f1_script2_produced.wav,-16.600000381469727
        daps/produced/f1_script3_produced.wav,-17.299999237060547
        daps/produced/f1_script4_produced.wav,-16.100000381469727
        daps/produced/f1_script5_produced.wav,-16.700000762939453
        daps/produced/f3_script1_produced.wav,-16.5

    The ``AudioLoader`` will automatically load the loudness column into
    the metadata of the signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -24),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        """Compute the gain that moves the file loudness to the target.

        The gain is 0.0 when the metadata has no ``"loudness"`` entry or
        when that loudness is ``-inf``. A target level is only sampled
        from ``self.db`` when a finite loudness is available.
        """
        if "loudness" not in signal.metadata:
            return {"db": 0.0}

        file_loudness = float(signal.metadata["loudness"])
        if file_loudness == float("-inf"):
            return {"db": 0.0}

        target = util.sample_from_dist(self.db, state)
        return {"db": target - file_loudness}

    def _transform(self, signal, db):
        """Apply the instantiated gain to ``signal``."""
        adjusted = signal.volume_change(db)
        return adjusted
|
|
|
|
class Silence(BaseTransform):
    """Replaces the signal with all zeros, with some probability.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 0.1
    """

    def __init__(self, name: str = None, prob: float = 0.1):
        super().__init__(name=name, prob=prob)

    def _transform(self, signal):
        """Build a zero-filled ``AudioSignal`` matching the input.

        Parameters
        ----------
        signal : AudioSignal
            Signal to silence.

        Returns
        -------
        AudioSignal
            A new signal whose audio data is zeros shaped like
            ``signal.audio_data``, with the same ``sample_rate`` and
            ``stft_params`` and the input's ``_loudness`` value.
        """
        # Remember the loudness stored on the incoming signal before it
        # is replaced by the zeroed one.
        previous_loudness = signal._loudness

        zeros = torch.zeros_like(signal.audio_data)
        silent = AudioSignal(
            zeros,
            sample_rate=signal.sample_rate,
            stft_params=signal.stft_params,
        )

        # NOTE(review): presumably carried over so that later
        # loudness-dependent steps behave as if the signal had not been
        # silenced -- confirm against callers.
        silent._loudness = previous_loudness
        return silent
|
|
|
|
class LowPass(BaseTransform):
    """Low-pass filters the signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [4000, 8000, 16000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [4000, 8000, 16000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.cutoff, self.zeros = cutoff, zeros

    def _instantiate(self, state: RandomState):
        """Sample a cutoff from ``self.cutoff``.

        Returns
        -------
        dict
            Dictionary with the single key ``"cutoff"``.
        """
        sampled_cutoff = util.sample_from_dist(self.cutoff, state)
        return {"cutoff": sampled_cutoff}

    def _transform(self, signal, cutoff):
        """Call ``signal.low_pass`` with ``cutoff`` and ``self.zeros``."""
        filtered = signal.low_pass(cutoff, zeros=self.zeros)
        return filtered
|
|
|
|
class HighPass(BaseTransform):
    """High-pass filters the signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [50, 100, 250, 500, 1000])``
    zeros : int, optional
        Number of zero-crossings in filter, forwarded as the ``zeros``
        argument of ``signal.high_pass``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.cutoff, self.zeros = cutoff, zeros

    def _instantiate(self, state: RandomState):
        """Sample a cutoff from ``self.cutoff``.

        Returns
        -------
        dict
            Dictionary with the single key ``"cutoff"``.
        """
        sampled_cutoff = util.sample_from_dist(self.cutoff, state)
        return {"cutoff": sampled_cutoff}

    def _transform(self, signal, cutoff):
        """Call ``signal.high_pass`` with ``cutoff`` and ``self.zeros``."""
        filtered = signal.high_pass(cutoff, zeros=self.zeros)
        return filtered
|
|
|
|
class RescaleAudio(BaseTransform):
    """Brings the audio back into the range ``-val`` to ``val``, but
    only when the original audio exceeds those bounds. Handy after
    other transforms have caused the audio to clip.

    Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`.

    Parameters
    ----------
    val : float, optional
        Max absolute value of signal, by default 1.0
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        val: float = 1.0,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.val = val

    def _transform(self, signal):
        """Call ``signal.ensure_max_of_audio`` with ``self.val``."""
        rescaled = signal.ensure_max_of_audio(self.val)
        return rescaled
|
|
|
|
class ShiftPhase(SpectralTransform):
    """Shifts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    shift : tuple, optional
        How much to shift phase by, by default ("uniform", -np.pi, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        shift: tuple = ("uniform", -np.pi, np.pi),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.shift = shift

    def _instantiate(self, state: RandomState):
        """Sample the phase shift from ``self.shift``.

        Parameters
        ----------
        state : RandomState
            Random state passed to ``util.sample_from_dist``.

        Returns
        -------
        dict
            Dictionary with the single key ``"shift"``.
        """
        return {"shift": util.sample_from_dist(self.shift, state)}

    def _transform(self, signal, shift):
        """Call ``signal.shift_phase`` with the instantiated ``shift``."""
        return signal.shift_phase(shift)
|
|
|
|
class InvertPhase(ShiftPhase):
    """Inverts the phase of the audio, implemented as a
    ``ShiftPhase`` whose shift is the constant ``np.pi``.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, name: str = None, prob: float = 1):
        # A fixed shift of pi radians; nothing is sampled randomly.
        constant_pi_shift = ("const", np.pi)
        super().__init__(shift=constant_pi_shift, name=name, prob=prob)
|
|
|
|
class CorruptPhase(SpectralTransform):
    """Corrupts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.corrupt_phase`.

    Parameters
    ----------
    scale : tuple, optional
        How much to corrupt phase by, by default ("uniform", 0, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1
    ):
        super().__init__(name=name, prob=prob)
        self.scale = scale

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        """Draw the random phase offsets to apply to ``signal``.

        Parameters
        ----------
        state : RandomState
            Random state used to sample both the scale and the noise.
        signal : AudioSignal
            Signal whose ``phase`` determines the shape of the noise.
            Required in practice even though the signature defaults it
            to ``None``.

        Returns
        -------
        dict
            Dictionary with the single key ``"corruption"``: a float32
            array of normally distributed values whose shape is
            ``signal.phase.shape[1:]``.

        Raises
        ------
        ValueError
            If ``signal`` is ``None``.
        """
        # The noise is shaped after the signal's phase, so fail with a
        # clear message rather than an AttributeError on ``None``.
        if signal is None:
            raise ValueError(
                "CorruptPhase needs a signal to instantiate its parameters, "
                "but signal was None."
            )

        scale = util.sample_from_dist(self.scale, state)
        # ``scale`` is the standard deviation of the noise; the leading
        # dimension of the phase is dropped from the sampled shape.
        corruption = state.normal(scale=scale, size=signal.phase.shape[1:])
        return {"corruption": corruption.astype("float32")}

    def _transform(self, signal, corruption):
        """Call ``signal.shift_phase`` with ``corruption`` as the shift."""
        return signal.shift_phase(shift=corruption)
|
|
|
|
class FrequencyMask(SpectralTransform):
    """Removes a band of frequencies, located around a center
    frequency, from the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        f_center: tuple = ("uniform", 0.0, 1.0),
        f_width: tuple = ("const", 0.1),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.f_center, self.f_width = f_center, f_width

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        """Sample the band to mask and convert its edges to Hz.

        Parameters
        ----------
        state : RandomState
            Random state used to sample the center, then the width.
        signal : AudioSignal
            Signal whose ``sample_rate`` sets the Hz scale.

        Returns
        -------
        dict
            Dictionary with keys ``"fmin_hz"`` and ``"fmax_hz"``.
        """
        center = util.sample_from_dist(self.f_center, state)
        width = util.sample_from_dist(self.f_width, state)
        half_width = width / 2

        # Keep the normalized band edges inside [0.0, 1.0].
        lower = max(center - half_width, 0.0)
        upper = min(center + half_width, 1.0)

        # Scale the normalized edges by half the sample rate.
        nyquist = signal.sample_rate / 2
        return {"fmin_hz": nyquist * lower, "fmax_hz": nyquist * upper}

    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
        """Call ``signal.mask_frequencies`` on the instantiated band."""
        masked = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
        return masked
|
|
|
|
class TimeMask(SpectralTransform):
    """Removes a contiguous run of time-steps from the signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.t_center, self.t_width = t_center, t_width

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        """Sample the span to mask and scale it by the signal duration.

        Parameters
        ----------
        state : RandomState
            Random state used to sample the center, then the width.
        signal : AudioSignal
            Signal whose ``signal_duration`` scales the span.

        Returns
        -------
        dict
            Dictionary with keys ``"tmin_s"`` and ``"tmax_s"``.
        """
        center = util.sample_from_dist(self.t_center, state)
        width = util.sample_from_dist(self.t_width, state)
        half_width = width / 2

        # Keep the normalized span edges inside [0.0, 1.0].
        start_frac = max(center - half_width, 0.0)
        end_frac = min(center + half_width, 1.0)

        start_s = signal.signal_duration * start_frac
        end_s = signal.signal_duration * end_frac
        return {"tmin_s": start_s, "tmax_s": end_s}

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        """Call ``signal.mask_timesteps`` on the instantiated span."""
        masked = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)
        return masked
|
|
|
|
class MaskLowMagnitudes(SpectralTransform):
    """Removes the low magnitude regions of the signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`.

    Parameters
    ----------
    db_cutoff : tuple, optional
        Decibel value for which things below it will be masked away,
        by default ("uniform", -10, 10)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self, db_cutoff: tuple = ("uniform", -10, 10), name: str = None, prob: float = 1
    ):
        super().__init__(name=name, prob=prob)
        self.db_cutoff = db_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        """Sample the decibel cutoff from ``self.db_cutoff``.

        Parameters
        ----------
        state : RandomState
            Random state passed to ``util.sample_from_dist``.
        signal : AudioSignal, optional
            Accepted but not read by this method, by default None

        Returns
        -------
        dict
            Dictionary with the single key ``"db_cutoff"``.
        """
        sampled_cutoff = util.sample_from_dist(self.db_cutoff, state)
        return {"db_cutoff": sampled_cutoff}

    def _transform(self, signal, db_cutoff: float):
        """Call ``signal.mask_low_magnitudes`` with ``db_cutoff``."""
        masked = signal.mask_low_magnitudes(db_cutoff)
        return masked
|
|
|
|
class Smoothing(BaseTransform):
    """Convolves the signal with a smoothing window.

    Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`.

    Parameters
    ----------
    window_type : tuple, optional
        Type of window to use, by default ("const", "average")
    window_length : tuple, optional
        Length of smoothing window, by
        default ("choice", [8, 16, 32, 64, 128, 256, 512])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        window_type: tuple = ("const", "average"),
        window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.window_type = window_type
        self.window_length = window_length

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        """Sample a window type and length and build the smoothing window.

        Parameters
        ----------
        state : RandomState
            Random state used to sample the window type, then its length.
        signal : AudioSignal
            Signal that supplies ``get_window`` and ``sample_rate``.
            Required in practice even though the signature defaults it
            to ``None``.

        Returns
        -------
        dict
            Dictionary with the single key ``"window"``: an
            ``AudioSignal`` wrapping the window at ``signal.sample_rate``.

        Raises
        ------
        ValueError
            If ``signal`` is ``None``.
        """
        # The window is built through the signal, so fail with a clear
        # message rather than an AttributeError on ``None``.
        if signal is None:
            raise ValueError(
                "Smoothing needs a signal to instantiate its parameters, "
                "but signal was None."
            )

        window_type = util.sample_from_dist(self.window_type, state)
        window_length = util.sample_from_dist(self.window_length, state)
        window = signal.get_window(
            window_type=window_type, window_length=window_length, device="cpu"
        )
        return {"window": AudioSignal(window, signal.sample_rate)}

    def _transform(self, signal, window):
        """Convolve ``signal`` with ``window`` and restore its peak level.

        The maximum absolute value along the last dimension is measured
        before and after the convolution, and the output is multiplied
        by their ratio.
        """
        # Measured before convolving, so the value reflects the input.
        sscale = signal.audio_data.abs().max(dim=-1, keepdim=True).values
        # An all-zero input would give a zero scale; use 1.0 instead.
        sscale[sscale == 0.0] = 1.0

        out = signal.convolve(window)

        oscale = out.audio_data.abs().max(dim=-1, keepdim=True).values
        # Avoid dividing by zero when the output is all zeros.
        oscale[oscale == 0.0] = 1.0

        out = out * (sscale / oscale)
        return out
|
|
|
|
class TimeNoise(TimeMask):
    """Like :py:func:`audiotools.data.transforms.TimeMask`, except that
    the masked region is filled with noise rather than zeros.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(
            t_center=t_center,
            t_width=t_width,
            name=name,
            prob=prob,
        )

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        """Zero out the time span, then fill the zeroed entries with noise.

        Every entry whose magnitude and phase are both exactly ``0.0``
        after masking is replaced with values from ``torch.randn_like``.
        This includes entries that were already zero before masking.
        """
        signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0)
        magnitude, phase = signal.magnitude, signal.phase

        noise_magnitude = torch.randn_like(magnitude)
        noise_phase = torch.randn_like(phase)

        # Entries where magnitude and phase are both zero.
        zero_magnitude = magnitude == 0.0
        zero_phase = phase == 0.0
        fill = zero_magnitude * zero_phase

        magnitude[fill] = noise_magnitude[fill]
        phase[fill] = noise_phase[fill]

        signal.magnitude = magnitude
        signal.phase = phase
        return signal
|
|
|
|
class FrequencyNoise(FrequencyMask):
    """Like :py:func:`audiotools.data.transforms.FrequencyMask`, except
    that the masked band is filled with noise rather than zeros.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        f_center: tuple = ("uniform", 0.0, 1.0),
        f_width: tuple = ("const", 0.1),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(
            f_center=f_center,
            f_width=f_width,
            name=name,
            prob=prob,
        )

    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
        """Mask the frequency band, then fill the zeroed entries with noise.

        Every entry whose magnitude and phase are both exactly ``0.0``
        after masking is replaced with values from ``torch.randn_like``.
        This includes entries that were already zero before masking.
        """
        signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
        magnitude, phase = signal.magnitude, signal.phase

        noise_magnitude = torch.randn_like(magnitude)
        noise_phase = torch.randn_like(phase)

        # Entries where magnitude and phase are both zero.
        zero_magnitude = magnitude == 0.0
        zero_phase = phase == 0.0
        fill = zero_magnitude * zero_phase

        magnitude[fill] = noise_magnitude[fill]
        phase[fill] = noise_phase[fill]

        signal.magnitude = magnitude
        signal.phase = phase
        return signal
|
|
|
|
class SpectralDenoising(Equalizer):
    """Runs the denoising algorithm described in
    :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,
    with a randomly generated noise signal as the noise reference.

    Parameters
    ----------
    eq_amount : tuple, optional
        Amount of eq to apply to noise signal, by default ("const", 1.0)
    denoise_amount : tuple, optional
        Amount to denoise by, by default ("uniform", 0.8, 1.0)
    nz_volume : float, optional
        Volume of noise to denoise with, by default -40
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    n_freq : int, optional
        Number of frequency bins to smooth by, by default 3
    n_time : int, optional
        Number of time bins to smooth by, by default 5
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        denoise_amount: tuple = ("uniform", 0.8, 1.0),
        nz_volume: float = -40,
        n_bands: int = 6,
        n_freq: int = 3,
        n_time: int = 5,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(
            eq_amount=eq_amount,
            n_bands=n_bands,
            name=name,
            prob=prob,
        )
        self.denoise_amount = denoise_amount
        self.nz_volume = nz_volume
        self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time)

    def _instantiate(self, state: RandomState):
        """Extend the parent's parameters with the denoising inputs.

        Starts from the dictionary returned by the parent class's
        ``_instantiate`` and adds ``"denoise_amount"`` (sampled from
        ``self.denoise_amount``) and ``"nz"`` (an ``AudioSignal`` built
        from 22050 samples of ``state.randn`` with a sample rate of
        44100).
        """
        params = super()._instantiate(state)

        # Draw order is kept: parent parameters, amount, then noise.
        params["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state)
        noise_samples = state.randn(22050)
        params["nz"] = AudioSignal(noise_samples, 44100)
        return params

    def _transform(self, signal, nz, eq, denoise_amount):
        """Denoise ``signal`` against the normalized, equalized noise ``nz``.

        The noise is normalized to ``self.nz_volume`` and equalized with
        ``eq``. The spectral gate is moved to ``signal.device`` and then
        called with the signal, the noise and ``denoise_amount``.
        """
        reference_noise = nz.normalize(self.nz_volume)
        reference_noise = reference_noise.equalizer(eq)

        # Keep the moved gate on the instance, as later calls reuse it.
        gate = self.spectral_gate.to(signal.device)
        self.spectral_gate = gate

        return gate(signal, reference_noise, denoise_amount)
|
|