Spaces:

FINAL-Bench
/

LiteRT-LM

Running

App Files Files Community

LiteRT-LM / runtime /components /preprocessor /audio_preprocessor.h

SeaWolf-AI

Upload full LiteRT-LM codebase

5f923cd verified 5 days ago

raw

history blame contribute delete

17.3 kB

	// Copyright 2025 The ODML Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_
	#define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_

	#include <array>
	#include <ostream>

	#include "absl/status/statusor.h" // from @com_google_absl
	#include "runtime/engine/io_types.h"
	#include "runtime/util/status_macros.h" // IWYU pragma: keep

	namespace litert::lm {

	// Configuration for audio preprocessing.
	//
	// Bundles the audio decoding, framing/FFT and Mel spectrogram parameters. The
	// constructor is private: instances are obtained from CreateDefaultUsmConfig()
	// or Create() and may afterwards be adjusted through the setters.
	class AudioPreprocessorConfig {
	 public:
	  // The padding type for FFT frames.
	  enum FftPaddingType {
	    // Right padding. The resulting FFT frame is right-padded with zeros or
	    // truncated to the given FFT frame length.
	    kRight = 0,
	    // Center padding. The resulting FFT frame is padded with the same amount
	    // of zeros on the left and right, or truncated by the same amount on the
	    // left and right, to the given FFT frame length.
	    kCenter = 1
	  };

	  // Creates Google's Universal Speech Model (USM) audio preprocessing
	  // configuration.
	  static AudioPreprocessorConfig CreateDefaultUsmConfig() {
	    return AudioPreprocessorConfig(
	        /* sample_rate_hz= */ 16000,
	        /* num_channels= */ 1,
	        /* frame_length= */ 512,
	        /* hop_length= */ 160,
	        /* fft_length= */ 1024,
	        /* input_scale= */ 32768,
	        /* pre_emphasis_factor= */ 0.97,
	        /* num_mel_bins= */ 128,
	        /* mel_low_hz= */ 125.0,
	        /* mel_high_hz= */ 7500.0,
	        /* mel_floor= */ 1e-6,
	        /* normalize_mel= */ true,
	        /* add_floor_to_mel_before_log= */ false,
	        /* semicausal_padding= */ false,
	        /* non_zero_hanning= */ true,
	        /* periodic_hanning= */ true,
	        /* fft_padding_type= */ FftPaddingType::kRight);
	  }

	  // Creates a configuration from explicitly supplied values. The arguments are
	  // forwarded unchanged to the private constructor; no validation is
	  // performed here.
	  static AudioPreprocessorConfig Create(
	      int sample_rate_hz, int num_channels, int frame_length, int hop_length,
	      int fft_length, float input_scale, float pre_emphasis_factor,
	      int num_mel_bins, float mel_low_hz, float mel_high_hz, float mel_floor,
	      bool normalize_mel, bool add_floor_to_mel_before_log,
	      bool semicausal_padding, bool non_zero_hanning, bool periodic_hanning,
	      FftPaddingType fft_padding_type) {
	    return AudioPreprocessorConfig(
	        sample_rate_hz, num_channels, frame_length, hop_length, fft_length,
	        input_scale, pre_emphasis_factor, num_mel_bins, mel_low_hz, mel_high_hz,
	        mel_floor, normalize_mel, add_floor_to_mel_before_log,
	        semicausal_padding, non_zero_hanning, periodic_hanning,
	        fft_padding_type);
	  }

	  // Streams a human-readable name for the padding type: "right", "center", or
	  // "unknown" for any value that is not a declared enumerator.
	  friend std::ostream& operator<<(std::ostream& os,
	                                  const FftPaddingType& padding_type) {
	    switch (padding_type) {
	      case FftPaddingType::kRight:
	        os << "right";
	        break;
	      case FftPaddingType::kCenter:
	        os << "center";
	        break;
	      default:
	        os << "unknown";
	        break;
	    }
	    return os;
	  }

	  // Allows logging of the config. Emits one "name: value" line per field
	  // between an opening "AudioPreprocessorConfig {" line and a closing "}".
	  // Boolean fields are streamed with the stream's current bool formatting.
	  friend std::ostream& operator<<(std::ostream& os,
	                                  const AudioPreprocessorConfig& config) {
	    os << "AudioPreprocessorConfig {\n";
	    os << " sample_rate_hz: " << config.GetSampleRateHz() << "\n";
	    os << " num_channels: " << config.GetNumChannels() << "\n";
	    os << " input_scale: " << config.GetInputScale() << "\n";
	    os << " pre_emphasis_factor: " << config.GetPreEmphasisFactor() << "\n";
	    os << " fft_length: " << config.GetFftLength() << "\n";
	    os << " fft_bins: " << config.GetFftBins() << "\n";
	    os << " frame_length: " << config.GetFrameLength() << "\n";
	    os << " hop_length: " << config.GetHopLength() << "\n";
	    os << " num_mel_bins: " << config.GetNumMelBins() << "\n";
	    os << " mel_low_hz: " << config.GetMelLowHz() << "\n";
	    os << " mel_high_hz: " << config.GetMelHighHz() << "\n";
	    os << " mel_floor: " << config.GetMelFloor() << "\n";
	    os << " normalize_mel: " << config.GetNormalizeMel() << "\n";
	    os << " add_floor_to_mel_before_log: "
	       << config.GetAddFloorToMelBeforeLog() << "\n";
	    os << " semicausal_padding: " << config.GetSemicausalPadding() << "\n";
	    os << " non_zero_hanning: " << config.GetNonZeroHanning() << "\n";
	    os << " periodic_hanning: " << config.GetPeriodicHanning() << "\n";
	    os << " fft_padding_type: " << config.GetFftPaddingType() << "\n";
	    os << "}";
	    return os;
	  }

	  // Getter APIs.
	  // The sample rate while loading the audio. The audio should be resampled to
	  // the configured sample rate.
	  int GetSampleRateHz() const { return sample_rate_hz_; }
	  // The number of audio channels the preprocessor expects from the audio
	  // content.
	  int GetNumChannels() const { return num_channels_; }
	  // The scale applied to the audio PCM frames before processing to
	  // spectrogram.
	  float GetInputScale() const { return input_scale_; }
	  // The pre-emphasis factor applied to the audio before processing to
	  // spectrogram.
	  float GetPreEmphasisFactor() const { return pre_emphasis_factor_; }
	  // The FFT length used for processing the audio.
	  int GetFftLength() const { return fft_length_; }
	  // The number of FFT bins used for real-sequence Fourier transform (RFFT) and
	  // Mel spectrogram processing. It is derived from the FFT length as FFT
	  // length / 2 + 1.
	  int GetFftBins() const { return fft_bins_; }
	  // The frame length used for each frame of the Short-Time Fourier Transform
	  // (STFT).
	  int GetFrameLength() const { return frame_length_; }
	  // The hop length used in the sliding window of the Short-Time Fourier
	  // Transform (STFT).
	  int GetHopLength() const { return hop_length_; }
	  // The number of Mel bins used for Mel spectrogram processing.
	  int GetNumMelBins() const { return num_mel_bins_; }
	  // The lower bound of the Mel frequency range.
	  float GetMelLowHz() const { return mel_low_hz_; }
	  // The upper bound of the Mel frequency range.
	  float GetMelHighHz() const { return mel_high_hz_; }
	  // The floor value of the Mel spectrogram.
	  float GetMelFloor() const { return mel_floor_; }
	  // Whether to normalize the Mel spectrogram with precalculated mean and std
	  // dev.
	  bool GetNormalizeMel() const { return normalize_mel_; }
	  // Whether to add the floor value to the Mel spectrogram before taking the
	  // logarithm.
	  bool GetAddFloorToMelBeforeLog() const {
	    return add_floor_to_mel_before_log_;
	  }
	  // Whether to use semicausal padding for the audio frames.
	  bool GetSemicausalPadding() const { return semicausal_padding_; }
	  // Whether to use non-zero Hanning window for FFT.
	  bool GetNonZeroHanning() const { return non_zero_hanning_; }
	  // Whether to use the periodic Hanning window for FFT.
	  bool GetPeriodicHanning() const { return periodic_hanning_; }
	  // The padding type used for FFT.
	  FftPaddingType GetFftPaddingType() const { return fft_padding_type_; }

	  // Setter APIs. Each setter overwrites the corresponding field; only
	  // SetFftLength() touches a second, derived field.
	  void SetSampleRateHz(int sample_rate_hz) { sample_rate_hz_ = sample_rate_hz; }
	  void SetNumChannels(int num_channels) { num_channels_ = num_channels; }
	  void SetInputScale(float input_scale) { input_scale_ = input_scale; }
	  void SetPreEmphasisFactor(float pre_emphasis_factor) {
	    pre_emphasis_factor_ = pre_emphasis_factor;
	  }
	  // The FFT length must be even for real FFT optimization. The FFT bins will be
	  // derived from the FFT length as FFT length / 2 + 1. Evenness is not checked
	  // here; for an odd value the integer division truncates.
	  void SetFftLength(int fft_length) {
	    fft_length_ = fft_length;
	    fft_bins_ = fft_length / 2 + 1;
	  }
	  void SetFrameLength(int frame_length) { frame_length_ = frame_length; }
	  void SetHopLength(int hop_length) { hop_length_ = hop_length; }
	  void SetNumMelBins(int num_mel_bins) { num_mel_bins_ = num_mel_bins; }
	  void SetMelLowHz(float mel_low_hz) { mel_low_hz_ = mel_low_hz; }
	  void SetMelHighHz(float mel_high_hz) { mel_high_hz_ = mel_high_hz; }
	  void SetMelFloor(float mel_floor) { mel_floor_ = mel_floor; }
	  void SetNormalizeMel(bool normalize_mel) { normalize_mel_ = normalize_mel; }
	  void SetAddFloorToMelBeforeLog(bool add_floor_to_mel_before_log) {
	    add_floor_to_mel_before_log_ = add_floor_to_mel_before_log;
	  }
	  void SetSemicausalPadding(bool semicausal_padding) {
	    semicausal_padding_ = semicausal_padding;
	  }
	  void SetNonZeroHanning(bool non_zero_hanning) {
	    non_zero_hanning_ = non_zero_hanning;
	  }
	  void SetPeriodicHanning(bool periodic_hanning) {
	    periodic_hanning_ = periodic_hanning;
	  }
	  void SetFftPaddingType(FftPaddingType fft_padding_type) {
	    fft_padding_type_ = fft_padding_type;
	  }

	  // The Mel Spectrogram means used for Universal Speech Model (USM) during
	  // preprocessing. One entry per Mel bin (128 entries).
	  static constexpr std::array<float, 128> kUsmMelMean{
	      6.398797734146062,  6.5292966718485665, 6.636971307272159,
	      6.73283598251503,   6.83729192594687,   6.955722303271236,
	      7.102944890730766,  7.114182036087843,  7.1506544101153,
	      7.174958993259514,  7.1890256978077804, 7.196835788986042,
	      7.211737590554171,  7.365040287042535,  7.350661707754529,
	      7.34752702412618,   7.370936184320344,  7.552167274579683,
	      7.4736985912567455, 7.461733145619613,  7.655010083032587,
	      7.537023586741711,  7.59332033698754,   7.678828995158089,
	      7.573545549481997,  7.721706263812856,  7.548489195294597,
	      7.647480899467908,  7.546350507038094,  7.552359044394656,
	      7.60142267532906,   7.510803537242497,  7.547512749381739,
	      7.5734628575808145, 7.516065818981327,  7.544310572169082,
	      7.556128732606547,  7.578428971230521,  7.565946473157099,
	      7.565821431053628,  7.582146705201401,  7.5917054493764775,
	      7.59647680034444,   7.612909043144701,  7.642191074647679,
	      7.682020208604412,  7.669657702288002,  7.636762908696176,
	      7.645613169792156,  7.687786852309006,  7.733375349074729,
	      7.705414197270183,  7.773851002316419,  7.767855696186511,
	      7.804625030416079,  7.8095583241565505, 7.845300151068656,
	      7.832030482713495,  7.876477438621265,  7.886595835981996,
	      7.907747879286325,  7.926010325946424,  7.927971987569718,
	      7.94765994925662,   7.9609369675109205, 7.977485334083968,
	      7.995276449058029,  8.020093867153456,  8.026893789702653,
	      8.036394113138993,  8.072079269745391,  8.072009510709744,
	      8.15832987882215,   8.169035932109242,  8.201262910500471,
	      8.203176911295596,  8.237251381186532,  8.265968214462914,
	      8.278791003594298,  8.279921657260331,  8.303751782080207,
	      8.323985266369666,  8.358499418073363,  8.368121771923692,
	      8.392162333974197,  8.40529917133684,   8.421934604788884,
	      8.43307981480797,   8.416437732709245,  8.380481381138022,
	      8.313028108945332,  8.172698101608145,  7.987087868524417,
	      7.775018865353218,  7.587469885918491,  7.485680948258058,
	      7.425561455270659,  7.426161453764725,  7.500171657170674,
	      7.473711809407939,  7.497915553109761,  7.555291079941853,
	      7.5404297094497155, 7.554637855844384,  7.5536294881940025,
	      7.597411437015373,  7.620857310821611,  7.622024042245356,
	      7.643684482318661,  7.651806604022742,  7.647768200868812,
	      7.619968160658521,  7.663675433728041,  7.770133777809638,
	      7.775737195054957,  7.756637821283381,  7.7958903182806445,
	      7.824714343764584,  7.8699194044250325, 7.857690367947652,
	      7.854133456399421,  7.83057312917979,   7.780062155284722,
	      7.687571300835443,  7.626255596158039,  7.475138444832542,
	      7.31241576045514,   7.162930372619685,
	  };

	  // The Mel Spectrogram standard deviation constants used for Universal Speech
	  // Model (USM) during preprocessing. One entry per Mel bin (128 entries).
	  static constexpr std::array<float, 128> kUsmMelStdDev{
	      1.6785894541269812, 1.6687138672328043, 1.6906522689607268,
	      1.7375192957945016, 1.7755335232132188, 1.7945350399969586,
	      1.8160038735261768, 1.8455822079478754, 1.854889301328728,
	      1.8544058257314018, 1.8531530795826658, 1.8568193392072,
	      1.8568580559801775, 1.8403822120311448, 1.8311156303932052,
	      1.8381223837390877, 1.8582757939740133, 1.8751353033960765,
	      1.8940031697532662, 1.9045566324594227, 1.9114104933328382,
	      1.9234409916967738, 1.932244372950416,  1.9354540832886058,
	      1.9196173248258872, 1.8884371698304272, 1.8666212011400265,
	      1.851852265212217,  1.8466309429379515, 1.8370433682382064,
	      1.8312948374209728, 1.8233918348681029, 1.8162900339615862,
	      1.813554336166136,  1.7988012203002604, 1.7783664628243725,
	      1.762995373099593,  1.754638830337111,  1.7562192553046327,
	      1.7570134298011308, 1.748103676233597,  1.7420266564237143,
	      1.7433799765791382, 1.7405273444710188, 1.7681605535143332,
	      1.7928765468247894, 1.7832784911754684, 1.7556019331853459,
	      1.734978397119943,  1.7251193027145706, 1.711577677561937,
	      1.7077475454470532, 1.702793505675667,  1.7087228728780646,
	      1.7055479598955696, 1.7048659481569446, 1.7136985315687527,
	      1.7003759527643025, 1.7038510617369829, 1.712407460050622,
	      1.7195395708962748, 1.715985369102956,  1.7047382463157097,
	      1.6858892841332958, 1.6803980138770978, 1.6883086163746897,
	      1.678822586089551,  1.6704169259147215, 1.6824154866833487,
	      1.7002006169486261, 1.7095077608591729, 1.7127719919531275,
	      1.7007540237588394, 1.7007030789334565, 1.7006801726721705,
	      1.7084333739135957, 1.7080081837410785, 1.7088852843730529,
	      1.7058124003569382, 1.7104967128913229, 1.7017088898161998,
	      1.6946290530635235, 1.6886895951157692, 1.6913609136330663,
	      1.6802034976166595, 1.6778644057956866, 1.6844856225324205,
	      1.6919889285341483, 1.6918548241011255, 1.6771215766236411,
	      1.6753742459089904, 1.6732896439517075, 1.665104739745144,
	      1.682512689327978,  1.7001049276791989, 1.71496232533367,
	      1.751371703351037,  1.7589949482516734, 1.7274831977280356,
	      1.7428303906628124, 1.7427952258580872, 1.7072930970436015,
	      1.72696991469254,   1.7128335116767701, 1.7266508365456639,
	      1.699287147275948,  1.6860698274507981, 1.6862991003373358,
	      1.683393071329867,  1.687619365543026,  1.7100825041856975,
	      1.7407356256589301, 1.7218710733945026, 1.6776658140019411,
	      1.6864518015922916, 1.7273244787326472, 1.6992470398169233,
	      1.6800806970795965, 1.6579370965601807, 1.6647055065206582,
	      1.65766768806214,   1.6294301234765352, 1.5918612004781831,
	      1.5335441292387613, 1.3949765253217616, 1.2628815962896491,
	      1.1053653031914006, 0.9263256925938697,
	  };

	 private:
	  // Stores every argument in the matching member. fft_bins_ is not a
	  // parameter; it is derived from fft_length as fft_length / 2 + 1. The
	  // initializer list follows the member declaration order below.
	  explicit AudioPreprocessorConfig(
	      // Audio decoding parameters.
	      int sample_rate_hz, int num_channels,
	      // FFT parameters.
	      int frame_length, int hop_length, int fft_length, float input_scale,
	      float pre_emphasis_factor,
	      // Mel spectrogram parameters.
	      int num_mel_bins, float mel_low_hz, float mel_high_hz, float mel_floor,
	      bool normalize_mel, bool add_floor_to_mel_before_log,
	      bool semicausal_padding, bool non_zero_hanning, bool periodic_hanning,
	      FftPaddingType fft_padding_type)
	      : sample_rate_hz_(sample_rate_hz),
	        num_channels_(num_channels),
	        fft_length_(fft_length),
	        fft_bins_(fft_length / 2 + 1),
	        frame_length_(frame_length),
	        hop_length_(hop_length),
	        num_mel_bins_(num_mel_bins),
	        mel_low_hz_(mel_low_hz),
	        mel_high_hz_(mel_high_hz),
	        mel_floor_(mel_floor),
	        input_scale_(input_scale),
	        pre_emphasis_factor_(pre_emphasis_factor),
	        normalize_mel_(normalize_mel),
	        add_floor_to_mel_before_log_(add_floor_to_mel_before_log),
	        semicausal_padding_(semicausal_padding),
	        non_zero_hanning_(non_zero_hanning),
	        periodic_hanning_(periodic_hanning),
	        fft_padding_type_(fft_padding_type) {}

	  int sample_rate_hz_;
	  int num_channels_;
	  int fft_length_;
	  // Always kept equal to fft_length_ / 2 + 1 (see SetFftLength()).
	  int fft_bins_;
	  int frame_length_;
	  int hop_length_;
	  int num_mel_bins_;
	  float mel_low_hz_;
	  float mel_high_hz_;
	  float mel_floor_;
	  float input_scale_;
	  float pre_emphasis_factor_;
	  bool normalize_mel_;
	  bool add_floor_to_mel_before_log_;
	  bool semicausal_padding_;
	  bool non_zero_hanning_;
	  bool periodic_hanning_;
	  FftPaddingType fft_padding_type_;
	};

	// Interface for audio preprocessing.
	//
	// Abstract base class: Preprocess() and Reset() are pure virtual, so a
	// concrete preprocessor must derive from this class and implement both.
	class AudioPreprocessor {
	public:
	// Virtual so that an implementation can be destroyed through a pointer to
	// this interface.
	virtual ~AudioPreprocessor() = default;

	// Preprocesses the undecoded audio bytes and returns the preprocessed audio.
	// The input is taken by const reference and the result is returned by value
	// inside an absl::StatusOr, so a failure is reported as a non-OK status
	// instead of a value.
	// NOTE(review): the concrete error conditions are defined by each
	// implementation and are not visible from this header — confirm there.
	virtual absl::StatusOr<InputAudio> Preprocess(
	const InputAudio& input_audio) = 0;

	// Resets the audio preprocessor to the initial state.
	// NOTE(review): presumably implementations keep state between Preprocess()
	// calls that this clears — confirm against the concrete classes.
	virtual void Reset() = 0;
	};

	// Namespace-scope declaration of the FftPaddingType stream operator whose
	// definition is the friend function inside AudioPreprocessorConfig. A friend
	// defined only inside a class is found solely through argument-dependent
	// lookup; this declaration additionally makes it visible to ordinary
	// (qualified or unqualified) name lookup in this namespace.
	std::ostream& operator<<(
	std::ostream& os,
	const AudioPreprocessorConfig::FftPaddingType& padding_type);

	} // namespace litert::lm

	#endif // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_