| import os |
| import math |
|
|
| import librosa |
| import numpy as np |
| from transformers import Wav2Vec2FeatureExtractor |
|
|
|
|
class DataProcessor:
    """Convert an audio file into wav2vec 2.0 feature-extractor input values.

    Holds a ``Wav2Vec2FeatureExtractor`` loaded from a local path together
    with the sampling rate that is handed to ``librosa.load``.
    """

    def __init__(self, sampling_rate, wav2vec_model_path):
        """Load the feature extractor and remember the sampling rate.

        Args:
            sampling_rate: Value passed as ``sr`` to ``librosa.load`` when
                audio is read.
            wav2vec_model_path: Location given to
                ``Wav2Vec2FeatureExtractor.from_pretrained``. The call uses
                ``local_files_only=True``, so the files must already be
                available locally.
        """
        self._processor = Wav2Vec2FeatureExtractor.from_pretrained(
            wav2vec_model_path,
            local_files_only=True,
        )
        self._sampling_rate = sampling_rate

    def extract_feature(self, audio_path):
        """Read ``audio_path`` and return its extractor input values.

        Args:
            audio_path: Audio file to load with ``librosa.load``.

        Returns:
            The extractor's ``input_values`` after ``np.squeeze`` has removed
            every length-1 axis (presumably the leading batch axis for a
            single clip -- confirm against the extractor's output).
        """
        # Use the rate reported by librosa rather than the stored one, so the
        # extractor is told the rate of the samples it actually receives.
        waveform, loaded_rate = librosa.load(audio_path, sr=self._sampling_rate)
        encoded = self._processor(waveform, sampling_rate=loaded_rate)
        return np.squeeze(encoded.input_values)
|
|
|
|
def prepare_audio_feature(wav_file, fps=30, sampling_rate=16000, wav2vec_model_path=None):
    """Extract audio features from a file and compute the matching frame count.

    Args:
        wav_file: Path of the audio file to process.
        fps: Frames per second used to turn the audio duration into a
            sequence length.
        sampling_rate: Sampling rate given to ``DataProcessor`` and used to
            convert the number of feature samples into seconds.
        wav2vec_model_path: Location of the wav2vec feature extractor,
            forwarded unchanged to ``DataProcessor``.

    Returns:
        A dict with two keys: ``"audio_feature"`` holds the extracted input
        values, and ``"seq_len"`` holds
        ``ceil(len(audio_feature) / sampling_rate * fps)``.
    """
    extractor = DataProcessor(sampling_rate, wav2vec_model_path)
    features = extractor.extract_feature(wav_file)

    # Samples -> seconds -> frames; round up so a trailing partial frame
    # still counts.
    duration_seconds = len(features) / sampling_rate
    frame_count = math.ceil(duration_seconds * fps)

    return {"audio_feature": features, "seq_len": frame_count}
|
|
|
|
|
|