// LiteRT-LM / runtime / components / preprocessor / audio_preprocessor.h
// SeaWolf-AI's picture
// Upload full LiteRT-LM codebase
// 5f923cd verified
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_
#define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_
#include <array>
#include <ostream>
#include "absl/status/statusor.h" // from @com_google_absl
#include "runtime/engine/io_types.h"
#include "runtime/util/status_macros.h" // IWYU pragma: keep
namespace litert::lm {
// Configuration for audio preprocessing.
//
// Bundles the audio decoding parameters, the framing / FFT parameters and the
// Mel spectrogram parameters. Instances are obtained through the static
// factories (`CreateDefaultUsmConfig`, `Create`) and can be tweaked afterwards
// through the setters. The number of FFT bins is never set directly: it is
// always derived from the FFT length as `fft_length / 2 + 1`.
class AudioPreprocessorConfig {
 public:
  // The padding type for FFT frames.
  enum FftPaddingType {
    // Right padding. The resulting FFT frame is padded with zeros on the
    // right, or truncated, to the given FFT frame length.
    kRight = 0,
    // Center padding. The resulting FFT frame is padded with the same amount
    // of zeros on the left and on the right, or truncated by the same amount
    // on both sides, to the given FFT frame length.
    kCenter = 1
  };

  // Builds the audio preprocessing configuration of Google's Universal Speech
  // Model (USM).
  static AudioPreprocessorConfig CreateDefaultUsmConfig() {
    return Create(
        /*sample_rate_hz=*/16000,
        /*num_channels=*/1,
        /*frame_length=*/512,
        /*hop_length=*/160,
        /*fft_length=*/1024,
        /*input_scale=*/32768,
        /*pre_emphasis_factor=*/0.97,
        /*num_mel_bins=*/128,
        /*mel_low_hz=*/125.0,
        /*mel_high_hz=*/7500.0,
        /*mel_floor=*/1e-6,
        /*normalize_mel=*/true,
        /*add_floor_to_mel_before_log=*/false,
        /*semicausal_padding=*/false,
        /*non_zero_hanning=*/true,
        /*periodic_hanning=*/true,
        /*fft_padding_type=*/FftPaddingType::kRight);
  }

  // Builds a configuration from explicitly supplied values. The arguments are
  // stored as given; the FFT bin count is derived from `fft_length`.
  static AudioPreprocessorConfig Create(
      int sample_rate_hz, int num_channels, int frame_length, int hop_length,
      int fft_length, float input_scale, float pre_emphasis_factor,
      int num_mel_bins, float mel_low_hz, float mel_high_hz, float mel_floor,
      bool normalize_mel, bool add_floor_to_mel_before_log,
      bool semicausal_padding, bool non_zero_hanning, bool periodic_hanning,
      FftPaddingType fft_padding_type) {
    return AudioPreprocessorConfig(
        sample_rate_hz, num_channels, frame_length, hop_length, fft_length,
        input_scale, pre_emphasis_factor, num_mel_bins, mel_low_hz,
        mel_high_hz, mel_floor, normalize_mel, add_floor_to_mel_before_log,
        semicausal_padding, non_zero_hanning, periodic_hanning,
        fft_padding_type);
  }

  // Writes a human-readable name for the padding type ("right", "center", or
  // "unknown" for any other value).
  friend std::ostream& operator<<(std::ostream& os,
                                  const FftPaddingType& padding_type) {
    if (padding_type == FftPaddingType::kRight) return os << "right";
    if (padding_type == FftPaddingType::kCenter) return os << "center";
    return os << "unknown";
  }

  // Allows logging of the config: one "name: value" line per field, wrapped in
  // "AudioPreprocessorConfig { ... }".
  friend std::ostream& operator<<(std::ostream& os,
                                  const AudioPreprocessorConfig& config) {
    os << "AudioPreprocessorConfig {\n"
       << " sample_rate_hz: " << config.sample_rate_hz_ << "\n"
       << " num_channels: " << config.num_channels_ << "\n"
       << " input_scale: " << config.input_scale_ << "\n"
       << " pre_emphasis_factor: " << config.pre_emphasis_factor_ << "\n"
       << " fft_length: " << config.fft_length_ << "\n"
       << " fft_bins: " << config.fft_bins_ << "\n"
       << " frame_length: " << config.frame_length_ << "\n"
       << " hop_length: " << config.hop_length_ << "\n"
       << " num_mel_bins: " << config.num_mel_bins_ << "\n"
       << " mel_low_hz: " << config.mel_low_hz_ << "\n"
       << " mel_high_hz: " << config.mel_high_hz_ << "\n"
       << " mel_floor: " << config.mel_floor_ << "\n"
       << " normalize_mel: " << config.normalize_mel_ << "\n"
       << " add_floor_to_mel_before_log: "
       << config.add_floor_to_mel_before_log_ << "\n"
       << " semicausal_padding: " << config.semicausal_padding_ << "\n"
       << " non_zero_hanning: " << config.non_zero_hanning_ << "\n"
       << " periodic_hanning: " << config.periodic_hanning_ << "\n"
       << " fft_padding_type: " << config.fft_padding_type_ << "\n"
       << "}";
    return os;
  }

  // Getter APIs.

  // The sample rate used while loading the audio. The audio should be
  // resampled to this rate.
  int GetSampleRateHz() const { return sample_rate_hz_; }

  // The number of audio channels the preprocessor expects from the audio
  // content.
  int GetNumChannels() const { return num_channels_; }

  // The scale applied to the audio PCM frames before they are turned into a
  // spectrogram.
  float GetInputScale() const { return input_scale_; }

  // The pre-emphasis factor applied to the audio before it is turned into a
  // spectrogram.
  float GetPreEmphasisFactor() const { return pre_emphasis_factor_; }

  // The FFT length used for processing the audio.
  int GetFftLength() const { return fft_length_; }

  // The number of FFT bins used for the real-sequence Fourier transform (RFFT)
  // and Mel spectrogram processing. Derived from the FFT length as
  // FFT length / 2 + 1.
  int GetFftBins() const { return fft_bins_; }

  // The length of each frame of the Short-Time Fourier Transform (STFT).
  int GetFrameLength() const { return frame_length_; }

  // The hop length of the sliding window of the Short-Time Fourier Transform
  // (STFT).
  int GetHopLength() const { return hop_length_; }

  // The number of Mel bins used for Mel spectrogram processing.
  int GetNumMelBins() const { return num_mel_bins_; }

  // The lower bound of the Mel frequency range.
  float GetMelLowHz() const { return mel_low_hz_; }

  // The upper bound of the Mel frequency range.
  float GetMelHighHz() const { return mel_high_hz_; }

  // The floor value of the Mel spectrogram.
  float GetMelFloor() const { return mel_floor_; }

  // Whether the Mel spectrogram is normalized with precalculated mean and
  // standard deviation.
  bool GetNormalizeMel() const { return normalize_mel_; }

  // Whether the floor value is added to the Mel spectrogram before taking the
  // logarithm.
  bool GetAddFloorToMelBeforeLog() const {
    return add_floor_to_mel_before_log_;
  }

  // Whether semicausal padding is used for the audio frames.
  bool GetSemicausalPadding() const { return semicausal_padding_; }

  // Whether a non-zero Hanning window is used for the FFT.
  bool GetNonZeroHanning() const { return non_zero_hanning_; }

  // Whether the periodic Hanning window is used for the FFT.
  bool GetPeriodicHanning() const { return periodic_hanning_; }

  // The padding type used for the FFT.
  FftPaddingType GetFftPaddingType() const { return fft_padding_type_; }

  // Setter APIs. Each setter overwrites the corresponding field with the given
  // value; no validation is performed.

  void SetSampleRateHz(int sample_rate_hz) { sample_rate_hz_ = sample_rate_hz; }

  void SetNumChannels(int num_channels) { num_channels_ = num_channels; }

  void SetInputScale(float input_scale) { input_scale_ = input_scale; }

  void SetPreEmphasisFactor(float pre_emphasis_factor) {
    pre_emphasis_factor_ = pre_emphasis_factor;
  }

  // The FFT length must be even for real FFT optimization. The FFT bins are
  // re-derived from the new FFT length as FFT length / 2 + 1.
  void SetFftLength(int fft_length) {
    fft_bins_ = fft_length / 2 + 1;
    fft_length_ = fft_length;
  }

  void SetFrameLength(int frame_length) { frame_length_ = frame_length; }

  void SetHopLength(int hop_length) { hop_length_ = hop_length; }

  void SetNumMelBins(int num_mel_bins) { num_mel_bins_ = num_mel_bins; }

  void SetMelLowHz(float mel_low_hz) { mel_low_hz_ = mel_low_hz; }

  void SetMelHighHz(float mel_high_hz) { mel_high_hz_ = mel_high_hz; }

  void SetMelFloor(float mel_floor) { mel_floor_ = mel_floor; }

  void SetNormalizeMel(bool normalize_mel) { normalize_mel_ = normalize_mel; }

  void SetAddFloorToMelBeforeLog(bool add_floor_to_mel_before_log) {
    add_floor_to_mel_before_log_ = add_floor_to_mel_before_log;
  }

  void SetSemicausalPadding(bool semicausal_padding) {
    semicausal_padding_ = semicausal_padding;
  }

  void SetNonZeroHanning(bool non_zero_hanning) {
    non_zero_hanning_ = non_zero_hanning;
  }

  void SetPeriodicHanning(bool periodic_hanning) {
    periodic_hanning_ = periodic_hanning;
  }

  void SetFftPaddingType(FftPaddingType fft_padding_type) {
    fft_padding_type_ = fft_padding_type;
  }

  // Per-bin means of the Mel spectrogram used by the Universal Speech Model
  // (USM) during preprocessing. One entry per Mel bin (128 entries).
  static constexpr std::array<float, 128> kUsmMelMean{
      6.398797734146062,  6.5292966718485665, 6.636971307272159,
      6.73283598251503,   6.83729192594687,   6.955722303271236,
      7.102944890730766,  7.114182036087843,  7.1506544101153,
      7.174958993259514,  7.1890256978077804, 7.196835788986042,
      7.211737590554171,  7.365040287042535,  7.350661707754529,
      7.34752702412618,   7.370936184320344,  7.552167274579683,
      7.4736985912567455, 7.461733145619613,  7.655010083032587,
      7.537023586741711,  7.59332033698754,   7.678828995158089,
      7.573545549481997,  7.721706263812856,  7.548489195294597,
      7.647480899467908,  7.546350507038094,  7.552359044394656,
      7.60142267532906,   7.510803537242497,  7.547512749381739,
      7.5734628575808145, 7.516065818981327,  7.544310572169082,
      7.556128732606547,  7.578428971230521,  7.565946473157099,
      7.565821431053628,  7.582146705201401,  7.5917054493764775,
      7.59647680034444,   7.612909043144701,  7.642191074647679,
      7.682020208604412,  7.669657702288002,  7.636762908696176,
      7.645613169792156,  7.687786852309006,  7.733375349074729,
      7.705414197270183,  7.773851002316419,  7.767855696186511,
      7.804625030416079,  7.8095583241565505, 7.845300151068656,
      7.832030482713495,  7.876477438621265,  7.886595835981996,
      7.907747879286325,  7.926010325946424,  7.927971987569718,
      7.94765994925662,   7.9609369675109205, 7.977485334083968,
      7.995276449058029,  8.020093867153456,  8.026893789702653,
      8.036394113138993,  8.072079269745391,  8.072009510709744,
      8.15832987882215,   8.169035932109242,  8.201262910500471,
      8.203176911295596,  8.237251381186532,  8.265968214462914,
      8.278791003594298,  8.279921657260331,  8.303751782080207,
      8.323985266369666,  8.358499418073363,  8.368121771923692,
      8.392162333974197,  8.40529917133684,   8.421934604788884,
      8.43307981480797,   8.416437732709245,  8.380481381138022,
      8.313028108945332,  8.172698101608145,  7.987087868524417,
      7.775018865353218,  7.587469885918491,  7.485680948258058,
      7.425561455270659,  7.426161453764725,  7.500171657170674,
      7.473711809407939,  7.497915553109761,  7.555291079941853,
      7.5404297094497155, 7.554637855844384,  7.5536294881940025,
      7.597411437015373,  7.620857310821611,  7.622024042245356,
      7.643684482318661,  7.651806604022742,  7.647768200868812,
      7.619968160658521,  7.663675433728041,  7.770133777809638,
      7.775737195054957,  7.756637821283381,  7.7958903182806445,
      7.824714343764584,  7.8699194044250325, 7.857690367947652,
      7.854133456399421,  7.83057312917979,   7.780062155284722,
      7.687571300835443,  7.626255596158039,  7.475138444832542,
      7.31241576045514,   7.162930372619685,
  };

  // Per-bin standard deviations of the Mel spectrogram used by the Universal
  // Speech Model (USM) during preprocessing. One entry per Mel bin (128
  // entries).
  static constexpr std::array<float, 128> kUsmMelStdDev{
      1.6785894541269812, 1.6687138672328043, 1.6906522689607268,
      1.7375192957945016, 1.7755335232132188, 1.7945350399969586,
      1.8160038735261768, 1.8455822079478754, 1.854889301328728,
      1.8544058257314018, 1.8531530795826658, 1.8568193392072,
      1.8568580559801775, 1.8403822120311448, 1.8311156303932052,
      1.8381223837390877, 1.8582757939740133, 1.8751353033960765,
      1.8940031697532662, 1.9045566324594227, 1.9114104933328382,
      1.9234409916967738, 1.932244372950416,  1.9354540832886058,
      1.9196173248258872, 1.8884371698304272, 1.8666212011400265,
      1.851852265212217,  1.8466309429379515, 1.8370433682382064,
      1.8312948374209728, 1.8233918348681029, 1.8162900339615862,
      1.813554336166136,  1.7988012203002604, 1.7783664628243725,
      1.762995373099593,  1.754638830337111,  1.7562192553046327,
      1.7570134298011308, 1.748103676233597,  1.7420266564237143,
      1.7433799765791382, 1.7405273444710188, 1.7681605535143332,
      1.7928765468247894, 1.7832784911754684, 1.7556019331853459,
      1.734978397119943,  1.7251193027145706, 1.711577677561937,
      1.7077475454470532, 1.702793505675667,  1.7087228728780646,
      1.7055479598955696, 1.7048659481569446, 1.7136985315687527,
      1.7003759527643025, 1.7038510617369829, 1.712407460050622,
      1.7195395708962748, 1.715985369102956,  1.7047382463157097,
      1.6858892841332958, 1.6803980138770978, 1.6883086163746897,
      1.678822586089551,  1.6704169259147215, 1.6824154866833487,
      1.7002006169486261, 1.7095077608591729, 1.7127719919531275,
      1.7007540237588394, 1.7007030789334565, 1.7006801726721705,
      1.7084333739135957, 1.7080081837410785, 1.7088852843730529,
      1.7058124003569382, 1.7104967128913229, 1.7017088898161998,
      1.6946290530635235, 1.6886895951157692, 1.6913609136330663,
      1.6802034976166595, 1.6778644057956866, 1.6844856225324205,
      1.6919889285341483, 1.6918548241011255, 1.6771215766236411,
      1.6753742459089904, 1.6732896439517075, 1.665104739745144,
      1.682512689327978,  1.7001049276791989, 1.71496232533367,
      1.751371703351037,  1.7589949482516734, 1.7274831977280356,
      1.7428303906628124, 1.7427952258580872, 1.7072930970436015,
      1.72696991469254,   1.7128335116767701, 1.7266508365456639,
      1.699287147275948,  1.6860698274507981, 1.6862991003373358,
      1.683393071329867,  1.687619365543026,  1.7100825041856975,
      1.7407356256589301, 1.7218710733945026, 1.6776658140019411,
      1.6864518015922916, 1.7273244787326472, 1.6992470398169233,
      1.6800806970795965, 1.6579370965601807, 1.6647055065206582,
      1.65766768806214,   1.6294301234765352, 1.5918612004781831,
      1.5335441292387613, 1.3949765253217616, 1.2628815962896491,
      1.1053653031914006, 0.9263256925938697,
  };

 private:
  // Stores every argument in the matching field and derives the FFT bin count
  // from `fft_length`. Private: use the static factories instead.
  explicit AudioPreprocessorConfig(
      // Audio decoding parameters.
      int sample_rate_hz, int num_channels,
      // FFT parameters.
      int frame_length, int hop_length, int fft_length, float input_scale,
      float pre_emphasis_factor,
      // Mel spectrogram parameters.
      int num_mel_bins, float mel_low_hz, float mel_high_hz, float mel_floor,
      bool normalize_mel, bool add_floor_to_mel_before_log,
      bool semicausal_padding, bool non_zero_hanning, bool periodic_hanning,
      FftPaddingType fft_padding_type)
      : sample_rate_hz_{sample_rate_hz},
        num_channels_{num_channels},
        fft_length_{fft_length},
        fft_bins_{fft_length / 2 + 1},
        frame_length_{frame_length},
        hop_length_{hop_length},
        num_mel_bins_{num_mel_bins},
        mel_low_hz_{mel_low_hz},
        mel_high_hz_{mel_high_hz},
        mel_floor_{mel_floor},
        input_scale_{input_scale},
        pre_emphasis_factor_{pre_emphasis_factor},
        normalize_mel_{normalize_mel},
        add_floor_to_mel_before_log_{add_floor_to_mel_before_log},
        semicausal_padding_{semicausal_padding},
        non_zero_hanning_{non_zero_hanning},
        periodic_hanning_{periodic_hanning},
        fft_padding_type_{fft_padding_type} {}

  // Audio decoding parameters.
  int sample_rate_hz_;
  int num_channels_;
  // Framing / FFT parameters. `fft_bins_` is always fft_length_ / 2 + 1.
  int fft_length_;
  int fft_bins_;
  int frame_length_;
  int hop_length_;
  // Mel spectrogram parameters.
  int num_mel_bins_;
  float mel_low_hz_;
  float mel_high_hz_;
  float mel_floor_;
  // Input conditioning parameters.
  float input_scale_;
  float pre_emphasis_factor_;
  // Behavior switches.
  bool normalize_mel_;
  bool add_floor_to_mel_before_log_;
  bool semicausal_padding_;
  bool non_zero_hanning_;
  bool periodic_hanning_;
  FftPaddingType fft_padding_type_;
};
// Interface for audio preprocessing.
//
// Pure abstract base class: concrete preprocessors implement `Preprocess` and
// `Reset`. This header declares the interface only.
class AudioPreprocessor {
public:
// Virtual so that an implementation can be destroyed through a pointer to this
// interface.
virtual ~AudioPreprocessor() = default;
// Preprocesses the undecoded audio bytes and returns the preprocessed audio.
//
// `input_audio` is taken by const reference and is not modified through this
// call. The result is wrapped in absl::StatusOr, so the call yields either a
// new InputAudio value or an error status.
// NOTE(review): the method is non-const, so implementations presumably keep
// state across calls (see Reset) - confirm against the concrete classes.
virtual absl::StatusOr<InputAudio> Preprocess(
const InputAudio& input_audio) = 0;
// Reset the audio preprocessor to the initial state.
virtual void Reset() = 0;
};
// Namespace-scope declaration of the FftPaddingType stream operator. The
// operator is defined inline as a friend inside AudioPreprocessorConfig, where
// it is only reachable through argument-dependent lookup; redeclaring it here
// also makes it visible to ordinary name lookup in this namespace.
std::ostream& operator<<(
std::ostream& os,
const AudioPreprocessorConfig::FftPaddingType& padding_type);
} // namespace litert::lm
#endif // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_