EzAudio-ControlNet

Paused

App Files Files Community

EzAudio-ControlNet / audiotools /data /transforms.py

OpenSound

Upload 211 files

9d3cb0a verified over 1 year ago

raw

history blame contribute delete

52.6 kB

	import copy
	from contextlib import contextmanager
	from inspect import signature
	from typing import List

	import numpy as np
	import torch
	from flatten_dict import flatten
	from flatten_dict import unflatten
	from numpy.random import RandomState

	from .. import ml
	from ..core import AudioSignal
	from ..core import util
	from .datasets import AudioLoader

	# Module-level alias used by the transforms below to wrap sampled
	# parameter values (floats, bools, arrays) as tensors.
	tt = torch.tensor
	"""Shorthand for converting things to torch.tensor."""


	class BaseTransform:
	    """Base class for all transforms implemented in this library.

	    Transforms have two main operations: ``instantiate`` and ``transform``.

	    ``instantiate`` sets the parameters randomly from distribution tuples
	    for each parameter. For example, for the ``BackgroundNoise`` transform,
	    the signal-to-noise ratio (``snr``) is chosen randomly by instantiate.
	    By default, it is chosen uniformly between 10.0 and 30.0 (the tuple is
	    set to ``("uniform", 10.0, 30.0)``).

	    ``transform`` applies the transform using the instantiated parameters.
	    A simple example is as follows:

	    >>> seed = 0
	    >>> signal = ...
	    >>> transform = transforms.NoiseFloor(db = ("uniform", -50.0, -30.0))
	    >>> kwargs = transform.instantiate(seed, signal)
	    >>> output = transform(signal.clone(), **kwargs)

	    By breaking apart the instantiation of parameters from the actual audio
	    processing of the transform, we can make things more reproducible, while
	    also applying the transform on batches of data efficiently on GPU,
	    rather than on individual audio samples.

	    .. note::
	        We call ``signal.clone()`` for the input to the ``transform`` function
	        because signals are modified in-place! If you don't clone the signal,
	        you will lose the original data.

	    Subclasses customize behavior through two hooks:

	    * ``_transform(signal, ...)``: every parameter name other than
	      ``signal`` and ``kwargs`` is recorded in ``self.keys`` and must be
	      present in the dictionary produced by ``instantiate``.
	    * ``_instantiate(state[, signal])``: returns a dictionary of parameter
	      values. The signal is only passed in when the method declares a
	      parameter named ``signal``.

	    Parameters
	    ----------
	    keys : list, optional
	        Keys that the transform looks for when
	        calling ``self.transform``, by default None (no extra keys). In
	        general this is set automatically, and you won't need to
	        manipulate this argument.
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None (the class name
	        is used).
	    prob : float, optional
	        Probability of applying this transform, by default 1.0

	    Examples
	    --------

	    >>> seed = 0
	    >>>
	    >>> audio_path = "tests/audio/spk/f10_script4_produced.wav"
	    >>> signal = AudioSignal(audio_path, offset=10, duration=2)
	    >>> transform = tfm.Compose(
	    ...     [
	    ...         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
	    ...         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
	    ...     ],
	    ... )
	    >>>
	    >>> kwargs = transform.instantiate(seed, signal)
	    >>> output = transform(signal, **kwargs)

	    """

	    def __init__(self, keys: list = None, name: str = None, prob: float = 1.0):
	        # ``None`` replaces the former mutable ``[]`` default so that no list
	        # object is shared between instances.
	        keys = [] if keys is None else list(keys)

	        # Collect the parameter names of ``_transform``, dropping the signal
	        # itself and any catch-all ``kwargs`` parameter.
	        ignore_keys = ("signal", "kwargs")
	        tfm_keys = [
	            k for k in signature(self._transform).parameters if k not in ignore_keys
	        ]

	        # Keys given by the child class, then the keys found in the
	        # ``_transform`` signature, then the mask key.
	        self.keys = keys + tfm_keys + ["mask"]

	        self.prob = prob
	        self.name = self.__class__.__name__ if name is None else name

	    def _prepare(self, batch: dict):
	        """Return this transform's sub-dictionary of ``batch``.

	        Looks up ``batch[self.name]`` and asserts that every key in
	        ``self.keys`` is present in it.
	        """
	        sub_batch = batch[self.name]

	        for k in self.keys:
	            assert k in sub_batch.keys(), f"{k} not in batch"

	        return sub_batch

	    def _transform(self, signal):
	        """Default processing hook: return the signal unchanged."""
	        return signal

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Default parameter hook: this transform has no parameters."""
	        return {}

	    @staticmethod
	    def apply_mask(batch: dict, mask: torch.Tensor):
	        """Applies a mask to the batch.

	        The (possibly nested) dictionary is flattened, every value is
	        indexed with ``mask``, and the result is unflattened again.

	        Parameters
	        ----------
	        batch : dict
	            Batch whose values will be masked in the ``transform`` pass.
	        mask : torch.Tensor
	            Mask to apply to batch.

	        Returns
	        -------
	        dict
	            A dictionary that contains values only where ``mask = True``.
	        """
	        masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
	        return unflatten(masked_batch)

	    def transform(self, signal: AudioSignal, **kwargs):
	        """Apply the transform to the audio signal,
	        with given keyword arguments.

	        Parameters
	        ----------
	        signal : AudioSignal
	            Signal that will be modified by the transforms in-place.
	        kwargs: dict
	            Keyword arguments to the specific transforms ``self._transform``
	            function, nested under ``self.name`` as produced by
	            ``self.instantiate``.

	        Returns
	        -------
	        AudioSignal
	            Transformed AudioSignal.

	        Examples
	        --------

	        >>> for seed in range(10):
	        ...     kwargs = transform.instantiate(seed, signal)
	        ...     output = transform(signal.clone(), **kwargs)

	        """
	        tfm_kwargs = self._prepare(kwargs)
	        mask = tfm_kwargs["mask"]

	        # Nothing is touched when the mask selects no item.
	        if torch.any(mask):
	            tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
	            # The mask itself is not a parameter of ``_transform``.
	            tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
	            signal[mask] = self._transform(signal[mask], **tfm_kwargs)

	        return signal

	    def __call__(self, *args, **kwargs):
	        """Forward all positional and keyword arguments to ``self.transform``.

	        Keyword arguments must be accepted here because the instantiated
	        parameters are passed as ``transform(signal, **kwargs)``.
	        """
	        return self.transform(*args, **kwargs)

	    def instantiate(
	        self,
	        state: RandomState = None,
	        signal: AudioSignal = None,
	    ):
	        """Instantiates parameters for the transform.

	        Parameters
	        ----------
	        state : RandomState, optional
	            Random state (the examples pass an integer seed); it is passed
	            through ``util.random_state`` before use, by default None
	        signal : AudioSignal, optional
	            Signal forwarded to ``self._instantiate`` when that method
	            declares a ``signal`` parameter, by default None

	        Returns
	        -------
	        dict
	            Dictionary containing instantiated arguments for every keyword
	            argument to ``self._transform``, plus a ``mask`` entry, nested
	            under ``self.name``.

	        Examples
	        --------

	        >>> for seed in range(10):
	        ...     kwargs = transform.instantiate(seed, signal)
	        ...     output = transform(signal.clone(), **kwargs)

	        """
	        state = util.random_state(state)

	        # Not all instantiates need the signal. Check if signal
	        # is needed before passing it in, so that the end-user
	        # doesn't need to have variables they're not using flowing
	        # into their function.
	        needs_signal = "signal" in signature(self._instantiate).parameters
	        kwargs = {"signal": signal} if needs_signal else {}

	        # Instantiate the parameters for the transform. Signals, tensors and
	        # nested dictionaries are kept as-is; everything else becomes a tensor.
	        params = self._instantiate(state, **kwargs)
	        for k, v in list(params.items()):
	            if not isinstance(v, (AudioSignal, torch.Tensor, dict)):
	                params[k] = tt(v)

	        # Drawn after the parameters so the random stream order is stable.
	        mask = state.rand() <= self.prob
	        params["mask"] = tt(mask)

	        # Put the params into a nested dictionary that will be
	        # used later when calling the transform. This is to avoid
	        # collisions in the dictionary.
	        return {self.name: params}

	    def batch_instantiate(
	        self,
	        states: list = None,
	        signal: AudioSignal = None,
	    ):
	        """Instantiates arguments for every item in a batch,
	        given a list of states. Each state in the list
	        corresponds to one item in the batch.

	        Parameters
	        ----------
	        states : list, optional
	            List of states, by default None. The list is iterated directly,
	            so a list must be supplied in practice.
	        signal : AudioSignal, optional
	            AudioSignal to pass to the ``self.instantiate`` section
	            if it is needed for this transform, by default None

	        Returns
	        -------
	        dict
	            Collated dictionary of arguments.

	        Examples
	        --------

	        >>> batch_size = 4
	        >>> signal = AudioSignal(audio_path, offset=10, duration=2)
	        >>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)])
	        >>>
	        >>> states = [seed + idx for idx in list(range(batch_size))]
	        >>> kwargs = transform.batch_instantiate(states, signal_batch)
	        >>> batch_output = transform(signal_batch, **kwargs)
	        """
	        kwargs = [self.instantiate(state, signal) for state in states]
	        return util.collate(kwargs)


	class Identity(BaseTransform):
	    """No-op transform that hands back the original signal.

	    Nothing is overridden here: the inherited ``_transform`` returns its
	    input and the inherited ``_instantiate`` produces no parameters.
	    """


	class SpectralTransform(BaseTransform):
	    """Base class for transforms that manipulate the spectrogram.

	    Such transforms need STFT data to exist. ``transform`` therefore calls
	    ``signal.stft()`` before delegating to the regular transform logic, and
	    ``signal.istft()`` afterwards so that the audio data is written back
	    after the spectral manipulation.
	    """

	    def transform(self, signal, **kwargs):
	        """Run the inherited ``transform`` between ``stft`` and ``istft``.

	        The value returned by the inherited ``transform`` is not used; the
	        ``signal`` object passed in is returned.
	        """
	        # Make the STFT available to ``_transform``.
	        signal.stft()
	        super().transform(signal, **kwargs)
	        # Write the (modified) spectrogram back into the audio data.
	        signal.istft()
	        return signal


	class Compose(BaseTransform):
	    """Compose applies transforms in sequence, one after the other. The
	    transforms are passed in as positional arguments or as a list like so:

	    >>> transform = tfm.Compose(
	    ...     [
	    ...         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
	    ...         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
	    ...     ],
	    ... )

	    This will convolve the signal with a room impulse response, and then
	    add background noise to the signal. Instantiate instantiates
	    all the parameters for every transform in the transform list so the
	    interface for using the Compose transform is the same as everything
	    else:

	    >>> kwargs = transform.instantiate()
	    >>> output = transform(signal.clone(), **kwargs)

	    Under the hood, each transform is renamed in place to a unique name
	    of the form ``{position}.{name}``, where ``position``
	    is the index of the transform in the list. ``Compose`` can nest
	    within other ``Compose`` transforms, like so:

	    >>> preprocess = transforms.Compose(
	    ...     tfm.GlobalVolumeNorm(),
	    ...     tfm.CrossTalk(),
	    ...     name="preprocess",
	    ... )
	    >>> augment = transforms.Compose(
	    ...     tfm.RoomImpulseResponse(),
	    ...     tfm.BackgroundNoise(),
	    ...     name="augment",
	    ... )
	    >>> postprocess = transforms.Compose(
	    ...     tfm.VolumeChange(),
	    ...     tfm.RescaleAudio(),
	    ...     tfm.ShiftPhase(),
	    ...     name="postprocess",
	    ... )
	    >>> transform = transforms.Compose(preprocess, augment, postprocess)

	    This defines 3 composed transforms, and then composes them in sequence
	    with one another.

	    Parameters
	    ----------
	    *transforms : list
	        List of transforms to apply
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(self, *transforms: list, name: str = None, prob: float = 1.0):
	        # Accept both ``Compose(a, b)`` and ``Compose([a, b])``. The emptiness
	        # check keeps ``Compose()`` from failing on ``transforms[0]``.
	        if transforms and isinstance(transforms[0], list):
	            transforms = transforms[0]

	        # Prefix every child name with its position so that two children of
	        # the same class do not collide in the parameter dictionary.
	        for i, tfm in enumerate(transforms):
	            tfm.name = f"{i}.{tfm.name}"

	        keys = [tfm.name for tfm in transforms]
	        super().__init__(keys=keys, name=name, prob=prob)

	        self.transforms = transforms
	        # By default every child is applied; ``filter`` narrows this.
	        self.transforms_to_apply = keys

	    @contextmanager
	    def filter(self, *names: list):
	        """This can be used to skip transforms entirely when applying
	        the sequence of transforms to a signal. For example, take
	        the following transforms with the names ``preprocess, augment, postprocess``.

	        >>> preprocess = transforms.Compose(
	        ...     tfm.GlobalVolumeNorm(),
	        ...     tfm.CrossTalk(),
	        ...     name="preprocess",
	        ... )
	        >>> augment = transforms.Compose(
	        ...     tfm.RoomImpulseResponse(),
	        ...     tfm.BackgroundNoise(),
	        ...     name="augment",
	        ... )
	        >>> postprocess = transforms.Compose(
	        ...     tfm.VolumeChange(),
	        ...     tfm.RescaleAudio(),
	        ...     tfm.ShiftPhase(),
	        ...     name="postprocess",
	        ... )
	        >>> transform = transforms.Compose(preprocess, augment, postprocess)

	        If we wanted to apply all 3 to a signal, we do:

	        >>> kwargs = transform.instantiate()
	        >>> output = transform(signal.clone(), **kwargs)

	        But if we only wanted to apply the ``preprocess`` and ``postprocess``
	        transforms to the signal, we do:

	        >>> with transform.filter("preprocess", "postprocess"):
	        ...     output = transform(signal.clone(), **kwargs)

	        The previous selection is restored when the ``with`` block exits,
	        including when it exits because of an exception.

	        Parameters
	        ----------
	        *names : list
	            List of transforms, identified by name, to apply to signal.
	        """
	        old_transforms = self.transforms_to_apply
	        self.transforms_to_apply = names
	        try:
	            yield
	        finally:
	            # Always restore, otherwise an error inside the ``with`` body
	            # would leave the filter active for all later calls.
	            self.transforms_to_apply = old_transforms

	    def _transform(self, signal, **kwargs):
	        """Apply, in order, every child whose name matches the selection.

	        A child is selected when any entry of ``self.transforms_to_apply``
	        is a substring of the child's (position-prefixed) name.
	        """
	        for transform in self.transforms:
	            if any(x in transform.name for x in self.transforms_to_apply):
	                signal = transform(signal, **kwargs)
	        return signal

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Instantiate every child with the shared ``state`` and merge the
	        resulting per-child dictionaries into one."""
	        parameters = {}
	        for transform in self.transforms:
	            parameters.update(transform.instantiate(state, signal=signal))
	        return parameters

	    def __getitem__(self, idx):
	        """Return the child transform(s) at ``idx``."""
	        return self.transforms[idx]

	    def __len__(self):
	        """Return the number of child transforms."""
	        return len(self.transforms)

	    def __iter__(self):
	        """Iterate over the child transforms in application order."""
	        yield from self.transforms


	class Choose(Compose):
	    """Same logic as :py:func:`audiotools.data.transforms.Compose`, except
	    that only a single transform is applied instead of the whole sequence.
	    The transform is chosen for each item in the batch.

	    Parameters
	    ----------
	    *transforms : list
	        List of transforms to apply
	    weights : list
	        Probability of choosing any specific transform. When omitted,
	        every transform gets the same probability.
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0

	    Examples
	    --------

	    >>> transforms.Choose(tfm.LowPass(), tfm.HighPass())
	    """

	    def __init__(
	        self,
	        *transforms: list,
	        weights: list = None,
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(*transforms, name=name, prob=prob)

	        if weights is None:
	            # Uniform distribution over the children.
	            count = len(self.transforms)
	            weights = [1 / count] * count
	        self.weights = np.array(weights)

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Instantiate all children, then keep only one of them active.

	        One child index is drawn according to ``self.weights``. For every
	        child whose own mask came out true, the mask is replaced by
	        "is this the drawn child". The per-child masks are also collected
	        under the ``one_hot`` key.
	        """
	        kwargs = super()._instantiate(state, signal)

	        candidates = list(range(len(self.transforms)))
	        chosen = state.choice(candidates, p=self.weights)

	        one_hot = []
	        for position, child in enumerate(self.transforms):
	            child_params = kwargs[child.name]
	            # A child that already masked itself out stays masked out.
	            if child_params["mask"].item():
	                child_params["mask"] = tt(position == chosen)
	            one_hot.append(child_params["mask"])

	        kwargs["one_hot"] = one_hot
	        return kwargs


	class Repeat(Compose):
	    """Applies one transform ``n_repeat`` times in a row.

	    The given transform is shallow-copied ``n_repeat`` times and the copies
	    are composed in sequence.

	    Parameters
	    ----------
	    transform : BaseTransform
	        Transform to repeat.
	    n_repeat : int, optional
	        Number of times to repeat transform, by default 1
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        transform,
	        n_repeat: int = 1,
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        # Separate copies are needed because ``Compose`` renames each child.
	        copies = []
	        for _ in range(n_repeat):
	            copies.append(copy.copy(transform))
	        super().__init__(copies, name=name, prob=prob)

	        self.n_repeat = n_repeat


	class RepeatUpTo(Choose):
	    """Applies one transform a randomly chosen number of times, bounded by
	    ``max_repeat``.

	    One ``Repeat`` is built for every ``n`` in ``range(1, max_repeat)`` and
	    a single one of them is chosen per item.

	    NOTE(review): because of ``range(1, max_repeat)`` the largest repeat
	    count is ``max_repeat - 1`` -- confirm whether ``max_repeat`` itself
	    was meant to be included.

	    Parameters
	    ----------
	    transform : BaseTransform
	        Transform to repeat.
	    max_repeat : int, optional
	        Upper bound on the number of repeats, by default 5
	    weights : list
	        Probability of choosing any specific number of repeats; one entry
	        per ``Repeat`` that is built.
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        transform,
	        max_repeat: int = 5,
	        weights: list = None,
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        options = [Repeat(transform, n_repeat=n) for n in range(1, max_repeat)]
	        super().__init__(options, name=name, prob=prob, weights=weights)

	        self.max_repeat = max_repeat


	class ClippingDistortion(BaseTransform):
	    """Distorts the signal by clipping it. Corresponds
	    to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`.

	    Parameters
	    ----------
	    perc : tuple, optional
	        Clipping percentile. Values are between 0.0 to 1.0.
	        Typical values are 0.1 or below, by default ("uniform", 0.0, 0.1)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        perc: tuple = ("uniform", 0.0, 0.1),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.perc = perc

	    def _instantiate(self, state: RandomState):
	        """Sample the clipping percentile from ``self.perc``."""
	        sampled_perc = util.sample_from_dist(self.perc, state)
	        return {"perc": sampled_perc}

	    def _transform(self, signal, perc):
	        """Clip ``signal`` at the sampled percentile."""
	        clipped = signal.clip_distortion(perc)
	        return clipped


	class Equalizer(BaseTransform):
	    """Shapes the audio signal with an equalization curve. Corresponds
	    to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.

	    Parameters
	    ----------
	    eq_amount : tuple, optional
	        The maximum dB cut to apply to the audio in any band,
	        by default ("const", 1.0)
	    n_bands : int, optional
	        Number of bands in EQ, by default 6
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        eq_amount: tuple = ("const", 1.0),
	        n_bands: int = 6,
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.n_bands = n_bands
	        self.eq_amount = eq_amount

	    def _instantiate(self, state: RandomState):
	        """Draw one cut per band.

	        Each band gets ``-amount * u`` with ``u`` drawn from
	        ``state.rand``, so every entry lies between ``-amount`` and 0.
	        """
	        max_cut = util.sample_from_dist(self.eq_amount, state)
	        band_fractions = state.rand(self.n_bands)
	        return {"eq": -max_cut * band_fractions}

	    def _transform(self, signal, eq):
	        """Apply the sampled curve to ``signal``."""
	        equalized = signal.equalizer(eq)
	        return equalized


	class Quantization(BaseTransform):
	    """Quantizes the input waveform. Corresponds
	    to :py:func:`audiotools.core.effects.EffectMixin.quantization`.

	    Parameters
	    ----------
	    channels : tuple, optional
	        Number of evenly spaced quantization channels to quantize
	        to, by default ("choice", [8, 32, 128, 256, 1024])
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.channels = channels

	    def _instantiate(self, state: RandomState):
	        """Sample the number of quantization channels."""
	        n_channels = util.sample_from_dist(self.channels, state)
	        return {"channels": n_channels}

	    def _transform(self, signal, channels):
	        """Quantize ``signal`` to the sampled number of channels."""
	        quantized = signal.quantization(channels)
	        return quantized


	class MuLawQuantization(BaseTransform):
	    """Quantizes the input waveform with mu-law spacing. Corresponds
	    to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.

	    Parameters
	    ----------
	    channels : tuple, optional
	        Number of mu-law spaced quantization channels to quantize
	        to, by default ("choice", [8, 32, 128, 256, 1024])
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.channels = channels

	    def _instantiate(self, state: RandomState):
	        """Sample the number of mu-law quantization channels."""
	        n_channels = util.sample_from_dist(self.channels, state)
	        return {"channels": n_channels}

	    def _transform(self, signal, channels):
	        """Mu-law quantize ``signal`` to the sampled number of channels."""
	        quantized = signal.mulaw_quantization(channels)
	        return quantized


	class NoiseFloor(BaseTransform):
	    """Adds Gaussian noise, normalized to a specified dB, to the signal
	    as a noise floor.

	    Parameters
	    ----------
	    db : tuple, optional
	        Level of noise to add to signal, by default ("const", -50.0)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        db: tuple = ("const", -50.0),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.db = db

	    def _instantiate(self, state: RandomState, signal: AudioSignal):
	        """Build the noise signal to be added.

	        The level is sampled first, then ``state.randn`` draws noise with
	        one row per channel and one column per sample of ``signal``. The
	        noise is wrapped in an ``AudioSignal`` at the same sample rate and
	        normalized to the sampled level.
	        """
	        level_db = util.sample_from_dist(self.db, state)
	        noise = state.randn(signal.num_channels, signal.signal_length)
	        nz_signal = AudioSignal(noise, signal.sample_rate)
	        nz_signal.normalize(level_db)
	        return {"nz_signal": nz_signal}

	    def _transform(self, signal, nz_signal):
	        """Return the sum of ``signal`` and the instantiated noise."""
	        # Note: ``nz_signal`` is used directly here; it is not cloned.
	        noisy = signal + nz_signal
	        return noisy


	class BackgroundNoise(BaseTransform):
	    """Mixes background noise into the signal. The noise audio comes from
	    a set of CSV files. A valid CSV file looks like the following, and is
	    typically generated by :py:func:`audiotools.data.preprocess.create_csv`:

	    ..  csv-table::
	        :header: path

	        room_tone/m6_script2_clean.wav
	        room_tone/m6_script2_cleanraw.wav
	        room_tone/m6_script2_ipad_balcony1.wav
	        room_tone/m6_script2_ipad_bedroom1.wav
	        room_tone/m6_script2_ipad_confroom1.wav
	        room_tone/m6_script2_ipad_confroom2.wav
	        room_tone/m6_script2_ipad_livingroom1.wav
	        room_tone/m6_script2_ipad_office1.wav

	    .. note::
	        All paths are relative to an environment variable called ``PATH_TO_DATA``,
	        so that CSV files are portable across machines where data may be
	        located in different places.

	    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
	    and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
	    hood.

	    Parameters
	    ----------
	    snr : tuple, optional
	        Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
	    sources : List[str], optional
	        Sources containing folders, or CSVs with paths to audio files,
	        by default None
	    weights : List[float], optional
	        Weights to sample audio files from each source, by default None
	    eq_amount : tuple, optional
	        Amount of equalization to apply, by default ("const", 1.0)
	    n_bands : int, optional
	        Number of bands in equalizer, by default 3
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    loudness_cutoff : float, optional
	        Loudness cutoff when loading from audio files, by default None
	    """

	    def __init__(
	        self,
	        snr: tuple = ("uniform", 10.0, 30.0),
	        sources: List[str] = None,
	        weights: List[float] = None,
	        eq_amount: tuple = ("const", 1.0),
	        n_bands: int = 3,
	        name: str = None,
	        prob: float = 1.0,
	        loudness_cutoff: float = None,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.snr = snr
	        self.eq_amount = eq_amount
	        self.n_bands = n_bands
	        self.loader = AudioLoader(sources, weights)
	        self.loudness_cutoff = loudness_cutoff

	    def _instantiate(self, state: RandomState, signal: AudioSignal):
	        """Sample the EQ curve and SNR, then load a noise excerpt.

	        The excerpt is requested from ``self.loader`` with the sample rate,
	        duration and channel count of ``signal``.
	        """
	        # The draw order (EQ amount, band values, SNR, loader) fixes how the
	        # random state is consumed, so it is kept as-is.
	        max_cut = util.sample_from_dist(self.eq_amount, state)
	        eq = -max_cut * state.rand(self.n_bands)
	        snr = util.sample_from_dist(self.snr, state)

	        loaded = self.loader(
	            state,
	            signal.sample_rate,
	            duration=signal.signal_duration,
	            loudness_cutoff=self.loudness_cutoff,
	            num_channels=signal.num_channels,
	        )
	        bg_signal = loaded["signal"]

	        return {"eq": eq, "bg_signal": bg_signal, "snr": snr}

	    def _transform(self, signal, bg_signal, snr, eq):
	        """Mix a copy of the noise into ``signal`` at the sampled SNR and EQ."""
	        # A clone of bg_signal is mixed in so that the transform can be
	        # applied repeatedly to different signals with the same effect.
	        noise = bg_signal.clone()
	        return signal.mix(noise, snr, eq)


	class CrossTalk(BaseTransform):
	    """Mixes in crosstalk from another speaker. The crosstalk audio is
	    drawn from a CSV file that was produced via
	    :py:func:`audiotools.data.preprocess.create_csv`.

	    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
	    under the hood.

	    Parameters
	    ----------
	    snr : tuple, optional
	        How loud cross-talk speaker is relative to original signal in dB,
	        by default ("uniform", 0.0, 10.0)
	    sources : List[str], optional
	        Sources containing folders, or CSVs with paths to audio files,
	        by default None
	    weights : List[float], optional
	        Weights to sample audio files from each source, by default None
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    loudness_cutoff : float, optional
	        Loudness cutoff when loading from audio files, by default -40
	    """

	    def __init__(
	        self,
	        snr: tuple = ("uniform", 0.0, 10.0),
	        sources: List[str] = None,
	        weights: List[float] = None,
	        name: str = None,
	        prob: float = 1.0,
	        loudness_cutoff: float = -40,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.snr = snr
	        self.loader = AudioLoader(sources, weights)
	        self.loudness_cutoff = loudness_cutoff

	    def _instantiate(self, state: RandomState, signal: AudioSignal):
	        """Sample the SNR, then load a crosstalk excerpt.

	        The excerpt is requested from ``self.loader`` with the sample rate,
	        duration and channel count of ``signal``.
	        """
	        snr = util.sample_from_dist(self.snr, state)

	        loaded = self.loader(
	            state,
	            signal.sample_rate,
	            duration=signal.signal_duration,
	            loudness_cutoff=self.loudness_cutoff,
	            num_channels=signal.num_channels,
	        )
	        crosstalk_signal = loaded["signal"]

	        return {"crosstalk_signal": crosstalk_signal, "snr": snr}

	    def _transform(self, signal, crosstalk_signal, snr):
	        """Mix in the crosstalk, then normalize the mix to the loudness the
	        signal had before mixing."""
	        # Measure the loudness before ``mix`` is called.
	        original_loudness = signal.loudness()
	        # A clone of crosstalk_signal is mixed in so that the transform can
	        # be applied repeatedly to different signals with the same effect.
	        mixed = signal.mix(crosstalk_signal.clone(), snr)
	        mixed.normalize(original_loudness)
	        return mixed


	class RoomImpulseResponse(BaseTransform):
	    """Convolves the signal with a room impulse response, at a specified
	    direct-to-reverberant ratio, with equalization applied. The impulse
	    responses are drawn from a CSV file that was produced via
	    :py:func:`audiotools.data.preprocess.create_csv`.

	    This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
	    under the hood.

	    Parameters
	    ----------
	    drr : tuple, optional
	        Distribution of the direct-to-reverberant ratio,
	        by default ("uniform", 0.0, 30.0)
	    sources : List[str], optional
	        Sources containing folders, or CSVs with paths to audio files,
	        by default None
	    weights : List[float], optional
	        Weights to sample audio files from each source, by default None
	    eq_amount : tuple, optional
	        Amount of equalization to apply, by default ("const", 1.0)
	    n_bands : int, optional
	        Number of bands in equalizer, by default 6
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    use_original_phase : bool, optional
	        Whether or not to use the original phase, by default False
	    offset : float, optional
	        Offset from each impulse response file to use, by default 0.0
	    duration : float, optional
	        Duration of each impulse response, by default 1.0
	    """

	    def __init__(
	        self,
	        drr: tuple = ("uniform", 0.0, 30.0),
	        sources: List[str] = None,
	        weights: List[float] = None,
	        eq_amount: tuple = ("const", 1.0),
	        n_bands: int = 6,
	        name: str = None,
	        prob: float = 1.0,
	        use_original_phase: bool = False,
	        offset: float = 0.0,
	        duration: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.drr = drr
	        self.eq_amount = eq_amount
	        self.n_bands = n_bands
	        self.use_original_phase = use_original_phase

	        self.loader = AudioLoader(sources, weights)
	        self.offset = offset
	        self.duration = duration

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Sample the EQ curve and DRR, then load an impulse response.

	        The impulse response is requested from ``self.loader`` with the
	        sample rate and channel count of ``signal`` and with the
	        configured offset and duration. No loudness cutoff is applied. It
	        is then zero-padded with ``signal.sample_rate`` as the target
	        length.
	        """
	        # The draw order (EQ amount, band values, DRR, loader) fixes how the
	        # random state is consumed, so it is kept as-is.
	        max_cut = util.sample_from_dist(self.eq_amount, state)
	        eq = -max_cut * state.rand(self.n_bands)
	        drr = util.sample_from_dist(self.drr, state)

	        loaded = self.loader(
	            state,
	            signal.sample_rate,
	            offset=self.offset,
	            duration=self.duration,
	            loudness_cutoff=None,
	            num_channels=signal.num_channels,
	        )
	        ir_signal = loaded["signal"]
	        ir_signal.zero_pad_to(signal.sample_rate)

	        return {"eq": eq, "ir_signal": ir_signal, "drr": drr}

	    def _transform(self, signal, ir_signal, drr, eq):
	        """Apply a copy of the impulse response to ``signal``."""
	        # A clone of ir_signal is used so that the transform can be applied
	        # repeatedly to different signals with the same effect.
	        impulse = ir_signal.clone()
	        return signal.apply_ir(
	            impulse, drr, eq, use_original_phase=self.use_original_phase
	        )


	class VolumeChange(BaseTransform):
	    """Raises or lowers the volume of the input signal.

	    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

	    Parameters
	    ----------
	    db : tuple, optional
	        Change in volume in decibels, by default ("uniform", -12.0, 0.0)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        db: tuple = ("uniform", -12.0, 0.0),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)
	        self.db = db

	    def _instantiate(self, state: RandomState):
	        """Sample the volume change from ``self.db``."""
	        db_change = util.sample_from_dist(self.db, state)
	        return {"db": db_change}

	    def _transform(self, signal, db):
	        """Change the volume of ``signal`` by the sampled amount."""
	        changed = signal.volume_change(db)
	        return changed


	class VolumeNorm(BaseTransform):
	    """Brings the volume of the excerpt to a specified decibel level.

	    Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.

	    Parameters
	    ----------
	    db : tuple, optional
	        dB to normalize signal to, by default ("const", -24)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        db: tuple = ("const", -24),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.db = db

	    def _instantiate(self, state: RandomState):
	        """Sample the target level from ``self.db``."""
	        target_db = util.sample_from_dist(self.db, state)
	        return {"db": target_db}

	    def _transform(self, signal, db):
	        """Normalize ``signal`` to the sampled level."""
	        normalized = signal.normalize(db)
	        return normalized


	class GlobalVolumeNorm(BaseTransform):
	    """Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this
	    transform also normalizes the volume of a signal, but it uses
	    the volume of the entire audio file the loaded excerpt comes from,
	    rather than the volume of just the excerpt. The volume of the
	    entire audio file is expected in ``signal.metadata["loudness"]``.
	    If loading audio from a CSV generated by :py:func:`audiotools.data.preprocess.create_csv`
	    with ``loudness = True``, like the following:

	    ..  csv-table::
	        :header: path,loudness

	        daps/produced/f1_script1_produced.wav,-16.299999237060547
	        daps/produced/f1_script2_produced.wav,-16.600000381469727
	        daps/produced/f1_script3_produced.wav,-17.299999237060547
	        daps/produced/f1_script4_produced.wav,-16.100000381469727
	        daps/produced/f1_script5_produced.wav,-16.700000762939453
	        daps/produced/f3_script1_produced.wav,-16.5

	    The ``AudioLoader`` will automatically load the loudness column into
	    the metadata of the signal.

	    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

	    Parameters
	    ----------
	    db : tuple, optional
	        dB to normalize signal to, by default ("const", -24)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        db: tuple = ("const", -24),
	        name: str = None,
	        prob: float = 1.0,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.db = db

	    def _instantiate(self, state: RandomState, signal: AudioSignal):
	        """Compute the volume change that moves the file loudness to the
	        sampled target.

	        The change is 0.0 when the metadata has no ``loudness`` entry or
	        when that entry is negative infinity. The target is only sampled
	        when a finite loudness is available.
	        """
	        if "loudness" not in signal.metadata:
	            return {"db": 0.0}

	        file_loudness = float(signal.metadata["loudness"])
	        if file_loudness == float("-inf"):
	            return {"db": 0.0}

	        target_db = util.sample_from_dist(self.db, state)
	        return {"db": target_db - file_loudness}

	    def _transform(self, signal, db):
	        """Change the volume of ``signal`` by the computed amount."""
	        changed = signal.volume_change(db)
	        return changed


	class Silence(BaseTransform):
	    """Replaces the signal with zeros, with some probability.

	    Parameters
	    ----------
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 0.1
	    """

	    def __init__(self, name: str = None, prob: float = 0.1):
	        super().__init__(name=name, prob=prob)

	    def _transform(self, signal):
	        """Return a new all-zero signal that keeps the input's cached
	        ``_loudness``, sample rate and STFT parameters."""
	        saved_loudness = signal._loudness
	        silent = AudioSignal(
	            torch.zeros_like(signal.audio_data),
	            sample_rate=signal.sample_rate,
	            stft_params=signal.stft_params,
	        )
	        # So that the amount of noise added is as if it wasn't silenced.
	        # TODO: improve this hack
	        silent._loudness = saved_loudness

	        return silent


	class LowPass(BaseTransform):
	    """Low-pass filters the signal.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.

	    Parameters
	    ----------
	    cutoff : tuple, optional
	        Cutoff frequency distribution,
	        by default ``("choice", [4000, 8000, 16000])``
	    zeros : int, optional
	        Number of zero-crossings in filter, argument to
	        ``julius.LowPassFilters``, by default 51
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        cutoff: tuple = ("choice", [4000, 8000, 16000]),
	        zeros: int = 51,
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.zeros = zeros
	        self.cutoff = cutoff

	    def _instantiate(self, state: RandomState):
	        """Sample the cutoff frequency from ``self.cutoff``."""
	        sampled_cutoff = util.sample_from_dist(self.cutoff, state)
	        return {"cutoff": sampled_cutoff}

	    def _transform(self, signal, cutoff):
	        """Low-pass ``signal`` at the sampled cutoff with ``self.zeros``."""
	        filtered = signal.low_pass(cutoff, zeros=self.zeros)
	        return filtered


	class HighPass(BaseTransform):
	    """High-pass filters the signal at a randomly chosen cutoff.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.

	    Parameters
	    ----------
	    cutoff : tuple, optional
	        Cutoff frequency distribution,
	        by default ``("choice", [50, 100, 250, 500, 1000])``
	    zeros : int, optional
	        Number of zero-crossings in filter, forwarded as ``zeros`` to
	        ``signal.high_pass``, by default 51
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
	        zeros: int = 51,
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)

	        self.cutoff, self.zeros = cutoff, zeros

	    def _instantiate(self, state: RandomState):
	        """Draw one cutoff from ``self.cutoff`` and return it under ``"cutoff"``."""
	        sampled_cutoff = util.sample_from_dist(self.cutoff, state)
	        return {"cutoff": sampled_cutoff}

	    def _transform(self, signal, cutoff):
	        """Return ``signal.high_pass`` at ``cutoff`` with ``self.zeros``."""
	        filtered = signal.high_pass(cutoff, zeros=self.zeros)
	        return filtered


	class RescaleAudio(BaseTransform):
	    """Rescales the audio so it lies between ``-val`` and ``val``,
	    only if the original audio exceeds those bounds. Useful when
	    earlier transforms have caused the audio to clip.

	    Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`.

	    Parameters
	    ----------
	    val : float, optional
	        Max absolute value of signal, by default 1.0
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(self, val: float = 1.0, name: str = None, prob: float = 1):
	        super().__init__(name=name, prob=prob)

	        self.val = val

	    def _transform(self, signal):
	        """Return ``signal.ensure_max_of_audio`` called with ``self.val``."""
	        bounded = signal.ensure_max_of_audio(self.val)
	        return bounded


	class ShiftPhase(SpectralTransform):
	    """Shifts the phase of the audio by a randomly chosen amount.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

	    Parameters
	    ----------
	    shift : tuple, optional
	        How much to shift phase by, by default ("uniform", -np.pi, np.pi)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        shift: tuple = ("uniform", -np.pi, np.pi),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)
	        self.shift = shift

	    def _instantiate(self, state: RandomState):
	        """Draw one value from ``self.shift`` and return it under ``"shift"``."""
	        sampled_shift = util.sample_from_dist(self.shift, state)
	        return {"shift": sampled_shift}

	    def _transform(self, signal, shift):
	        """Return ``signal.shift_phase`` called with ``shift``."""
	        shifted = signal.shift_phase(shift)
	        return shifted


	class InvertPhase(ShiftPhase):
	    """Inverts the phase of the audio.

	    This is a ``ShiftPhase`` whose shift distribution is fixed to the
	    constant ``np.pi``.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

	    Parameters
	    ----------
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(self, name: str = None, prob: float = 1):
	        # Always shift by pi rather than sampling a range.
	        fixed_shift = ("const", np.pi)
	        super().__init__(shift=fixed_shift, name=name, prob=prob)


	class CorruptPhase(SpectralTransform):
	    """Corrupts the phase of the audio with random noise.

	    The noise is generated in ``_instantiate`` and applied through
	    ``signal.shift_phase``.

	    Parameters
	    ----------
	    scale : tuple, optional
	        How much to corrupt phase by, by default ("uniform", 0, np.pi)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1
	    ):
	        super().__init__(name=name, prob=prob)
	        self.scale = scale

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Generate the phase noise for one signal.

	        A scale is drawn from ``self.scale`` and used as the ``scale``
	        argument of ``state.normal``. ``signal`` is dereferenced
	        unconditionally, so it must be provided despite the ``None``
	        default.

	        Returns
	        -------
	        dict
	            ``{"corruption": noise}`` where ``noise`` is a float32 array
	            shaped like ``signal.phase`` without its first dimension.
	        """
	        sigma = util.sample_from_dist(self.scale, state)
	        noise_shape = signal.phase.shape[1:]
	        noise = state.normal(scale=sigma, size=noise_shape)
	        return {"corruption": noise.astype("float32")}

	    def _transform(self, signal, corruption):
	        """Return ``signal.shift_phase`` with ``corruption`` as the shift."""
	        corrupted = signal.shift_phase(shift=corruption)
	        return corrupted


	class FrequencyMask(SpectralTransform):
	    """Masks a band of frequencies, located around a center frequency,
	    out of the audio.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`.

	    Parameters
	    ----------
	    f_center : tuple, optional
	        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
	    f_width : tuple, optional
	        Width of zero'd out band, by default ("const", 0.1)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        f_center: tuple = ("uniform", 0.0, 1.0),
	        f_width: tuple = ("const", 0.1),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)
	        self.f_center, self.f_width = f_center, f_width

	    def _instantiate(self, state: RandomState, signal: AudioSignal):
	        """Pick the band to mask and express its edges in Hz.

	        The center is drawn first and the width second. The band edges
	        are clamped to ``[0.0, 1.0]`` and then scaled by half of
	        ``signal.sample_rate``.

	        Returns
	        -------
	        dict
	            ``{"fmin_hz": ..., "fmax_hz": ...}``
	        """
	        center = util.sample_from_dist(self.f_center, state)
	        width = util.sample_from_dist(self.f_width, state)

	        half_width = width / 2
	        lower = max(center - half_width, 0.0)
	        upper = min(center + half_width, 1.0)

	        nyquist = signal.sample_rate / 2
	        return {"fmin_hz": nyquist * lower, "fmax_hz": nyquist * upper}

	    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
	        """Return ``signal.mask_frequencies`` over ``[fmin_hz, fmax_hz]``."""
	        masked = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
	        return masked


	class TimeMask(SpectralTransform):
	    """Masks out a contiguous span of time-steps from the signal.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`.

	    Parameters
	    ----------
	    t_center : tuple, optional
	        Center time in terms of 0.0 and 1.0 (duration of signal),
	        by default ("uniform", 0.0, 1.0)
	    t_width : tuple, optional
	        Width of dropped out portion, by default ("const", 0.025)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        t_center: tuple = ("uniform", 0.0, 1.0),
	        t_width: tuple = ("const", 0.025),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)
	        self.t_center, self.t_width = t_center, t_width

	    def _instantiate(self, state: RandomState, signal: AudioSignal):
	        """Pick the span to mask and scale its edges by the signal duration.

	        The center is drawn first and the width second. The span edges
	        are clamped to ``[0.0, 1.0]`` and then multiplied by
	        ``signal.signal_duration``.

	        Returns
	        -------
	        dict
	            ``{"tmin_s": ..., "tmax_s": ...}``
	        """
	        center = util.sample_from_dist(self.t_center, state)
	        width = util.sample_from_dist(self.t_width, state)

	        half_width = width / 2
	        start_frac = max(center - half_width, 0.0)
	        end_frac = min(center + half_width, 1.0)

	        return {
	            "tmin_s": signal.signal_duration * start_frac,
	            "tmax_s": signal.signal_duration * end_frac,
	        }

	    def _transform(self, signal, tmin_s: float, tmax_s: float):
	        """Return ``signal.mask_timesteps`` over ``[tmin_s, tmax_s]``."""
	        masked = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)
	        return masked


	class MaskLowMagnitudes(SpectralTransform):
	    """Masks low-magnitude regions out of the signal.

	    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`.

	    Parameters
	    ----------
	    db_cutoff : tuple, optional
	        Decibel value for which things below it will be masked away,
	        by default ("uniform", -10, 10)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        db_cutoff: tuple = ("uniform", -10, 10),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)
	        self.db_cutoff = db_cutoff

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Draw one cutoff from ``self.db_cutoff``; ``signal`` is not used."""
	        sampled_cutoff = util.sample_from_dist(self.db_cutoff, state)
	        return {"db_cutoff": sampled_cutoff}

	    def _transform(self, signal, db_cutoff: float):
	        """Return ``signal.mask_low_magnitudes`` called with ``db_cutoff``."""
	        masked = signal.mask_low_magnitudes(db_cutoff)
	        return masked


	class Smoothing(BaseTransform):
	    """Convolves the signal with a smoothing window and rescales the
	    result to the input's peak level.

	    Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`.

	    Parameters
	    ----------
	    window_type : tuple, optional
	        Type of window to use, by default ("const", "average")
	    window_length : tuple, optional
	        Length of smoothing window, by
	        default ("choice", [8, 16, 32, 64, 128, 256, 512])
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        window_type: tuple = ("const", "average"),
	        window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(name=name, prob=prob)
	        self.window_type, self.window_length = window_type, window_length

	    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
	        """Build the smoothing window for one signal.

	        The window type is drawn first and the length second. The
	        window is obtained from ``signal.get_window`` on the CPU and
	        wrapped in an ``AudioSignal`` at ``signal.sample_rate``.
	        ``signal`` is dereferenced unconditionally, so it must be
	        provided despite the ``None`` default.

	        Returns
	        -------
	        dict
	            ``{"window": AudioSignal}``
	        """
	        kind = util.sample_from_dist(self.window_type, state)
	        length = util.sample_from_dist(self.window_length, state)

	        kernel = signal.get_window(
	            window_type=kind, window_length=length, device="cpu"
	        )
	        return {"window": AudioSignal(kernel, signal.sample_rate)}

	    def _transform(self, signal, window):
	        """Convolve ``signal`` with ``window``, then scale the output by
	        the ratio of input peak to output peak."""

	        def peak_or_one(sig):
	            # Max absolute value along the last dimension; zeros are
	            # replaced by 1.0 so the ratio below never divides by zero.
	            peak = sig.audio_data.abs().max(dim=-1, keepdim=True).values
	            peak[peak == 0.0] = 1.0
	            return peak

	        # Measure the input peak before convolving, as the original
	        # ordering does.
	        input_peak = peak_or_one(signal)

	        smoothed = signal.convolve(window)
	        output_peak = peak_or_one(smoothed)

	        return smoothed * (input_peak / output_peak)


	class TimeNoise(TimeMask):
	    """Similar to :py:func:`audiotools.data.transforms.TimeMask`, but
	    fills the masked region with noise instead of zeros.

	    Parameters
	    ----------
	    t_center : tuple, optional
	        Center time in terms of 0.0 and 1.0 (duration of signal),
	        by default ("uniform", 0.0, 1.0)
	    t_width : tuple, optional
	        Width of dropped out portion, by default ("const", 0.025)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        t_center: tuple = ("uniform", 0.0, 1.0),
	        t_width: tuple = ("const", 0.025),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(t_center=t_center, t_width=t_width, name=name, prob=prob)

	    def _transform(self, signal, tmin_s: float, tmax_s: float):
	        """Mask ``[tmin_s, tmax_s]`` with zeros, then overwrite every bin
	        whose magnitude and phase are both exactly zero with standard
	        normal noise."""
	        signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0)

	        magnitude = signal.magnitude
	        phase = signal.phase

	        # Magnitude noise is generated before phase noise so the random
	        # stream is consumed in a fixed order.
	        noise_magnitude = torch.randn_like(magnitude)
	        noise_phase = torch.randn_like(phase)

	        # Bins count as masked when both magnitude and phase are zero.
	        zeroed = (magnitude == 0.0) & (phase == 0.0)
	        magnitude[zeroed] = noise_magnitude[zeroed]
	        phase[zeroed] = noise_phase[zeroed]

	        signal.magnitude = magnitude
	        signal.phase = phase
	        return signal


	class FrequencyNoise(FrequencyMask):
	    """Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but
	    fills the masked band with noise instead of zeros.

	    Parameters
	    ----------
	    f_center : tuple, optional
	        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
	    f_width : tuple, optional
	        Width of zero'd out band, by default ("const", 0.1)
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        f_center: tuple = ("uniform", 0.0, 1.0),
	        f_width: tuple = ("const", 0.1),
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(f_center=f_center, f_width=f_width, name=name, prob=prob)

	    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
	        """Mask ``[fmin_hz, fmax_hz]``, then overwrite every bin whose
	        magnitude and phase are both exactly zero with standard normal
	        noise."""
	        signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)

	        magnitude = signal.magnitude
	        phase = signal.phase

	        # Magnitude noise is generated before phase noise so the random
	        # stream is consumed in a fixed order.
	        noise_magnitude = torch.randn_like(magnitude)
	        noise_phase = torch.randn_like(phase)

	        # Bins count as masked when both magnitude and phase are zero.
	        zeroed = (magnitude == 0.0) & (phase == 0.0)
	        magnitude[zeroed] = noise_magnitude[zeroed]
	        phase[zeroed] = noise_phase[zeroed]

	        signal.magnitude = magnitude
	        signal.phase = phase
	        return signal


	class SpectralDenoising(Equalizer):
	    """Applies the denoising algorithm detailed in
	    :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,
	    using a randomly generated noise signal for denoising.

	    Parameters
	    ----------
	    eq_amount : tuple, optional
	        Amount of eq to apply to noise signal, by default ("const", 1.0)
	    denoise_amount : tuple, optional
	        Amount to denoise by, by default ("uniform", 0.8, 1.0)
	    nz_volume : float, optional
	        Volume of noise to denoise with, by default -40
	    n_bands : int, optional
	        Number of bands in equalizer, by default 6
	    n_freq : int, optional
	        Number of frequency bins to smooth by, by default 3
	    n_time : int, optional
	        Number of time bins to smooth by, by default 5
	    name : str, optional
	        Name of this transform, used to identify it in the dictionary
	        produced by ``self.instantiate``, by default None
	    prob : float, optional
	        Probability of applying this transform, by default 1.0
	    """

	    def __init__(
	        self,
	        eq_amount: tuple = ("const", 1.0),
	        denoise_amount: tuple = ("uniform", 0.8, 1.0),
	        nz_volume: float = -40,
	        n_bands: int = 6,
	        n_freq: int = 3,
	        n_time: int = 5,
	        name: str = None,
	        prob: float = 1,
	    ):
	        super().__init__(eq_amount=eq_amount, n_bands=n_bands, name=name, prob=prob)

	        self.nz_volume, self.denoise_amount = nz_volume, denoise_amount
	        self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time)

	    def _transform(self, signal, nz, eq, denoise_amount):
	        """Denoise ``signal`` against the prepared noise signal.

	        ``nz`` is normalized to ``self.nz_volume`` and equalized with
	        ``eq``. The spectral gate is moved to ``signal.device`` and the
	        moved gate is stored back on ``self`` before it is called.
	        """
	        shaped_noise = nz.normalize(self.nz_volume).equalizer(eq)

	        gate = self.spectral_gate.to(signal.device)
	        self.spectral_gate = gate

	        return gate(signal, shaped_noise, denoise_amount)

	    def _instantiate(self, state: RandomState):
	        """Extend the parent's parameters with denoising-specific ones.

	        Returns
	        -------
	        dict
	            The parent's dictionary plus ``"denoise_amount"``, drawn from
	            ``self.denoise_amount``, and ``"nz"``, an ``AudioSignal``
	            built from 22050 samples of ``state.randn`` with 44100
	            passed as its second argument.
	        """
	        params = super()._instantiate(state)
	        # Draw order matters for reproducibility: denoise amount first,
	        # noise samples second.
	        params["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state)
	        params["nz"] = AudioSignal(state.randn(22050), 44100)
	        return params