|
|
|
|
|
|
|
|
|
|
|
|
| #include <string.h>
|
| #include <stdlib.h>
|
| #include <algorithm>
|
| #include <math.h>
|
| #include "aed.h"
|
| #include "aed_st.h"
|
| #include "coeff.h"
|
| #include "pitch_est.h"
|
| #include "stft.h"
|
| #include <assert.h>
|
|
|
/* Rounds `o` up to the next multiple of 8 (used for sizing sub-buffers). */
#define AUP_AED_ALIGN8(o) (((o) + 7) & (~7))

/* Larger / smaller of two values. Each argument may be evaluated twice, so
 * do not pass expressions with side effects. */
#define AUP_AED_MAX(x, y) (((x) > (y)) ? (x) : (y))
#define AUP_AED_MIN(x, y) (((x) > (y)) ? (y) : (x))

/* Tiny positive constant added to log arguments and divisors in this file. */
#define AUP_AED_EPS (1e-20f)
|
|
|
|
|
|
|
|
|
|
|
| AUP_MODULE_AIVAD::AUP_MODULE_AIVAD(char* onnx_path) {
|
| ort_api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
|
| OrtStatus* status =
|
| ort_api->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "TEN-VAD", &ort_env);
|
| if (status) {
|
| printf("Failed to create env: %s\n", ort_api->GetErrorMessage(status));
|
| ort_api->ReleaseStatus(status);
|
| ort_api->ReleaseEnv(ort_env);
|
| ort_env = NULL;
|
| return;
|
| }
|
|
|
| OrtSessionOptions* session_options;
|
| ort_api->CreateSessionOptions(&session_options);
|
| ort_api->SetIntraOpNumThreads(session_options, 1);
|
| status =
|
| ort_api->CreateSession(ort_env, onnx_path, session_options, &ort_session);
|
| ort_api->ReleaseSessionOptions(session_options);
|
| if (status) {
|
| printf("Failed to create ort_session: %s\n",
|
| ort_api->GetErrorMessage(status));
|
| ort_api->ReleaseStatus(status);
|
| ort_api->ReleaseEnv(ort_env);
|
| ort_env = NULL;
|
| return;
|
| }
|
|
|
| ort_api->GetAllocatorWithDefaultOptions(&ort_allocator);
|
| size_t num_inputs;
|
| ort_api->SessionGetInputCount(ort_session, &num_inputs);
|
| assert(num_inputs == AUP_AED_MODEL_IO_NUM);
|
| for (size_t i = 0; i < num_inputs; i++) {
|
| char* input_name;
|
| ort_api->SessionGetInputName(ort_session, i, ort_allocator, &input_name);
|
| strncpy(input_names_buf[i], input_name, sizeof(input_names_buf[i]));
|
| input_names[i] = input_names_buf[i];
|
| ort_api->AllocatorFree(ort_allocator, input_name);
|
| }
|
|
|
| size_t num_outputs;
|
| ort_api->SessionGetOutputCount(ort_session, &num_outputs);
|
| assert(num_outputs == AUP_AED_MODEL_IO_NUM);
|
| for (size_t i = 0; i < num_outputs; i++) {
|
| char* output_name;
|
| ort_api->SessionGetOutputName(ort_session, i, ort_allocator, &output_name);
|
| strncpy(output_names_buf[i], output_name, sizeof(output_names_buf[i]));
|
| output_names[i] = output_names_buf[i];
|
| ort_api->AllocatorFree(ort_allocator, output_name);
|
| }
|
|
|
| OrtMemoryInfo* memory_info;
|
| status = ort_api->CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault,
|
| &memory_info);
|
| if (status != NULL) {
|
| printf("Failed to create memory info: %s\n",
|
| ort_api->GetErrorMessage(status));
|
| ort_api->ReleaseStatus(status);
|
| ort_api->ReleaseSession(ort_session);
|
| ort_api->ReleaseEnv(ort_env);
|
| ort_session = NULL;
|
| ort_env = NULL;
|
| return;
|
| }
|
| int64_t input_shapes0[] = {1, AUP_AED_CONTEXT_WINDOW_LEN, AUP_AED_FEA_LEN};
|
| int64_t input_shapes1234[] = {1, AUP_AED_MODEL_HIDDEN_DIM};
|
| for (int i = 0; i < num_inputs; i++) {
|
| status = ort_api->CreateTensorWithDataAsOrtValue(
|
| memory_info, i == 0 ? input_data_buf_0 : input_data_buf_1234[i - 1],
|
| i == 0 ? sizeof(input_data_buf_0) : sizeof(input_data_buf_1234[i - 1]),
|
| i == 0 ? input_shapes0 : input_shapes1234,
|
| i == 0 ? sizeof(input_shapes0) / sizeof(input_shapes0[0])
|
| : sizeof(input_shapes1234) / sizeof(input_shapes1234[0]),
|
| ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &ort_input_tensors[i]);
|
| if (status != NULL) {
|
| printf("Failed to create input tensor %d: %s\n", i,
|
| ort_api->GetErrorMessage(status));
|
| ort_api->ReleaseStatus(status);
|
| ort_api->ReleaseSession(ort_session);
|
| ort_api->ReleaseEnv(ort_env);
|
| ort_session = NULL;
|
| ort_env = NULL;
|
| return;
|
| }
|
| }
|
|
|
| int64_t output_shapes0[] = {1, 1, 1};
|
| int64_t output_shapes1234[] = {1, AUP_AED_MODEL_HIDDEN_DIM};
|
| for (int i = 0; i < num_outputs; i++) {
|
| status = ort_api->CreateTensorAsOrtValue(
|
| ort_allocator, i == 0 ? output_shapes0 : output_shapes1234,
|
| i == 0 ? sizeof(output_shapes0) / sizeof(output_shapes0[0])
|
| : sizeof(output_shapes1234) / sizeof(output_shapes1234[0]),
|
| ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, &ort_output_tensors[i]);
|
| if (status != NULL) {
|
| printf("Failed to create output tensor %d: %s\n", i,
|
| ort_api->GetErrorMessage(status));
|
| ort_api->ReleaseStatus(status);
|
| ort_api->ReleaseSession(ort_session);
|
| ort_api->ReleaseEnv(ort_env);
|
| ort_session = NULL;
|
| ort_env = NULL;
|
| return;
|
| }
|
| }
|
| inited = 1;
|
| }
|
|
|
| AUP_MODULE_AIVAD::~AUP_MODULE_AIVAD() {
|
| for (int i = 0; i < AUP_AED_MODEL_IO_NUM; i++) {
|
| if (ort_output_tensors[i]) {
|
| ort_api->ReleaseValue(ort_output_tensors[i]);
|
| }
|
| }
|
| if (ort_session) {
|
| ort_api->ReleaseSession(ort_session);
|
| }
|
| if (ort_env) {
|
| ort_api->ReleaseEnv(ort_env);
|
| }
|
| }
|
|
|
| int AUP_MODULE_AIVAD::Process(float* input, float* output) {
|
| if (!inited) {
|
| printf("not inited!\n");
|
| return -1;
|
| }
|
|
|
| memcpy(input_data_buf_0, input, sizeof(input_data_buf_0));
|
| if (clear_hidden) {
|
| memset(input_data_buf_1234, 0, sizeof(input_data_buf_1234));
|
| clear_hidden = 0;
|
| }
|
| OrtStatus* status = ort_api->Run(
|
| ort_session, NULL, input_names, ort_input_tensors, AUP_AED_MODEL_IO_NUM,
|
| output_names, AUP_AED_MODEL_IO_NUM, ort_output_tensors);
|
| float* output_data;
|
| ort_api->GetTensorMutableData(ort_output_tensors[0], (void**)&output_data);
|
| *output = output_data[0];
|
| for (int i = 1; i < AUP_AED_MODEL_IO_NUM; i++) {
|
| ort_api->GetTensorMutableData(ort_output_tensors[i], (void**)&output_data);
|
| memcpy(input_data_buf_1234[i - 1], output_data,
|
| sizeof(input_data_buf_1234[i - 1]));
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_MODULE_AIVAD::Reset() {
|
| if (!inited) {
|
| return -1;
|
| }
|
|
|
| clear_hidden = 1;
|
| return 0;
|
| }
|
|
|
| static int AUP_Aed_checkStatCfg(Aed_StaticCfg* pCfg) {
|
| if (pCfg == NULL) {
|
| return -1;
|
| }
|
|
|
| #if AUP_AED_FEA_LEN < AUP_AED_MEL_FILTER_BANK_NUM
|
| return -1;
|
| #endif
|
|
|
| if (pCfg->hopSz < 32) {
|
| return -1;
|
| }
|
|
|
| if (pCfg->frqInputAvailableFlag == 1) {
|
| if (pCfg->fftSz < 128 || pCfg->fftSz < pCfg->hopSz) {
|
| return -1;
|
| }
|
| if (pCfg->anaWindowSz > pCfg->fftSz || pCfg->anaWindowSz < pCfg->hopSz) {
|
| return -1;
|
| }
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| static int AUP_Aed_publishStaticCfg(Aed_St* stHdl) {
|
| const Aed_StaticCfg* pStatCfg;
|
|
|
| if (stHdl == NULL) {
|
| return -1;
|
| }
|
| pStatCfg = (const Aed_StaticCfg*)(&(stHdl->stCfg));
|
|
|
| stHdl->extFftSz = 0;
|
| stHdl->extNBins = 0;
|
| stHdl->extWinSz = 0;
|
| if (pStatCfg->frqInputAvailableFlag == 1) {
|
| stHdl->extFftSz = pStatCfg->fftSz;
|
| stHdl->extNBins = (stHdl->extFftSz >> 1) + 1;
|
| stHdl->extWinSz = pStatCfg->anaWindowSz;
|
| }
|
| stHdl->extHopSz = pStatCfg->hopSz;
|
|
|
| stHdl->intFftSz = AUP_AED_ASSUMED_FFTSZ;
|
| stHdl->intHopSz = AUP_AED_ASSUMED_HOPSZ;
|
| stHdl->intWinSz = AUP_AED_ASSUMED_WINDOWSZ;
|
| stHdl->intNBins = (stHdl->intFftSz >> 1) + 1;
|
| stHdl->intAnalyWindowPtr = AUP_AED_STFTWindow_Hann768;
|
|
|
| if (pStatCfg->frqInputAvailableFlag == 0 ||
|
| stHdl->extHopSz != stHdl->intHopSz) {
|
|
|
| stHdl->intAnalyFlag =
|
| 2;
|
| } else if (stHdl->extFftSz == stHdl->intFftSz) {
|
|
|
|
|
| stHdl->intAnalyFlag = 0;
|
| } else {
|
|
|
| stHdl->intAnalyFlag =
|
| 1;
|
| }
|
| stHdl->inputTimeFIFOLen = stHdl->extHopSz + stHdl->intHopSz;
|
|
|
|
|
|
|
|
|
| stHdl->intAnalyFlag =
|
| 2;
|
|
|
| stHdl->feaSz = (size_t)AUP_AED_FEA_LEN;
|
| stHdl->melFbSz = (size_t)AUP_AED_MEL_FILTER_BANK_NUM;
|
| stHdl->algDelay = (size_t)AUP_AED_LOOKAHEAD_NFRM;
|
| stHdl->algCtxtSz = (size_t)AUP_AED_CONTEXT_WINDOW_LEN;
|
| stHdl->frmRmsBufLen = AUP_AED_MAX(1, stHdl->algDelay);
|
|
|
| return 0;
|
| }
|
|
|
| static int AUP_Aed_publishDynamCfg(Aed_St* stHdl) {
|
| const Aed_DynamCfg* pDynmCfg;
|
| PE_DynamCfg peDynmCfg;
|
| if (stHdl == NULL) {
|
| return -1;
|
| }
|
|
|
| pDynmCfg = (const Aed_DynamCfg*)(&(stHdl->dynamCfg));
|
| stHdl->aivadResetFrmNum = pDynmCfg->resetFrameNum;
|
| stHdl->voiceDecideThresh = pDynmCfg->extVoiceThr;
|
|
|
| if (stHdl->pitchEstStPtr != NULL) {
|
| peDynmCfg.voicedThr = pDynmCfg->pitchEstVoicedThr;
|
| AUP_PE_setDynamCfg(stHdl->pitchEstStPtr, &peDynmCfg);
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| static int AUP_Aed_resetVariables(Aed_St* stHdl) {
|
| if (stHdl == NULL) {
|
| return -1;
|
| }
|
|
|
|
|
|
|
| memset(stHdl->dynamMemPtr, 0, stHdl->dynamMemSize);
|
|
|
| float* melFbCoef = stHdl->melFilterBankCoef;
|
| size_t* melBinBuff = stHdl->melFilterBinBuff;
|
| size_t i, j;
|
| size_t nBins = stHdl->intNBins;
|
| size_t melFbSz = stHdl->melFbSz;
|
|
|
| stHdl->aedProcFrmCnt = 0;
|
| stHdl->inputTimeFIFOIdx = 0;
|
| stHdl->aivadResetCnt = 0;
|
| stHdl->timeSignalPre = 0.0f;
|
| stHdl->aivadScore =
|
| -1.0f;
|
| stHdl->aivadScorePre = -1.0f;
|
|
|
| stHdl->pitchFreq = 0.0f;
|
|
|
|
|
| float low_mel = 2595.0f * log10f(1.0f + 0.0f / 700.0f);
|
| float high_mel = 2595.0f * log10f(1.0f + 8000.0f / 700.0f);
|
| float mel_points = 0.0f;
|
| float hz_points = 0.0f;
|
| size_t idx = 0;
|
|
|
| for (i = 0; i < melFbSz + 2; i++) {
|
| mel_points = i * (high_mel - low_mel) / ((float)melFbSz + 1.0f) + low_mel;
|
| hz_points = 700.0f * (powf(10.0f, mel_points / 2595.0f) - 1.0f);
|
| melBinBuff[i] =
|
| (size_t)((stHdl->intFftSz + 1.0f) * hz_points / (float)AUP_AED_FS);
|
| if (i > 0 && melBinBuff[i] == melBinBuff[i - 1]) {
|
| return -1;
|
| }
|
| }
|
|
|
| for (j = 0; j < melFbSz; j++) {
|
| for (i = melBinBuff[j]; i < melBinBuff[j + 1]; i++) {
|
| idx = j * nBins + i;
|
| melFbCoef[idx] = (float)(i - melBinBuff[j]) /
|
| (float)(melBinBuff[j + 1] - melBinBuff[j]);
|
| }
|
| for (i = melBinBuff[j + 1]; i < melBinBuff[j + 2]; i++) {
|
| idx = j * nBins + i;
|
| melFbCoef[idx] = (float)(melBinBuff[j + 2] - i) /
|
| (float)(melBinBuff[j + 2] - melBinBuff[j + 1]);
|
| }
|
| }
|
|
|
| if (stHdl->pitchEstStPtr != NULL) {
|
| if (AUP_PE_init(stHdl->pitchEstStPtr) < 0) {
|
| return -1;
|
| }
|
| }
|
|
|
| if (stHdl->aivadInf != NULL) {
|
| stHdl->aivadInf->Reset();
|
| }
|
|
|
| if (stHdl->timeInAnalysis != NULL) {
|
| if (AUP_Analyzer_init(stHdl->timeInAnalysis) < 0) {
|
| return -1;
|
| }
|
| }
|
|
|
| return 0;
|
| }
|
|
|
/**
 * Returns `cnter + 1`, wrapping to 0 once the incremented value reaches
 * 1000000000.
 */
static int AUP_Aed_addOneCnter(int cnter) {
  const int kWrapAt = 1000000000;
  const int next = cnter + 1;
  return (next < kWrapAt) ? next : 0;
}
|
|
|
/**
 * Resamples a per-bin power spectrum from `srcNBins` to `tgtNBins` bins by
 * nearest-lower-bin lookup: target bin t reads source bin
 * floor(t * (srcNBins - 1) / (tgtNBins - 1)), clamped to the source range.
 * Equal bin counts are copied verbatim.
 *
 * @param src      Source spectrum, srcNBins values.
 * @param tgt      Destination, tgtNBins values.
 * @param srcNBins Number of source bins.
 * @param tgtNBins Number of target bins.
 *
 * Does nothing when a pointer is NULL or a bin count is not positive.
 */
static void AUP_Aed_binPowerConvert(const float* src, float* tgt, int srcNBins,
                                    int tgtNBins) {
  if (src == NULL || tgt == NULL || srcNBins <= 0 || tgtNBins <= 0) {
    return;
  }

  if (srcNBins == tgtNBins) {
    memcpy(tgt, src, sizeof(float) * tgtNBins);
    return;
  }

  // A single target bin would make the rate below a division by zero.
  if (tgtNBins == 1) {
    tgt[0] = src[0];
    return;
  }

  const int lastSrcIdx = srcNBins - 1;
  const float rate = (float)lastSrcIdx / (float)(tgtNBins - 1);
  for (int tgtIdx = 0; tgtIdx < tgtNBins; tgtIdx++) {
    int srcIdx = (int)(tgtIdx * rate);
    if (srcIdx < 0) {
      srcIdx = 0;
    } else if (srcIdx > lastSrcIdx) {
      srcIdx = lastSrcIdx;
    }
    tgt[tgtIdx] = src[srcIdx];
  }
}
|
|
|
/**
 * Converts a packed complex spectrum into per-bin power.
 *
 * Slot 0 of `cmplxSpctr` feeds bin 0 and slot 1 feeds the last bin
 * (presumably the purely real DC and Nyquist terms of a packed real FFT —
 * confirm against the analyzer). Every other bin k reads the pair at
 * [2k, 2k + 1] and stores re^2 + im^2.
 *
 * @param nBins      Number of output bins.
 * @param cmplxSpctr Packed spectrum, 2 * (nBins - 1) floats are read.
 * @param binPow     Output power, nBins floats.
 */
static void AUP_Aed_CalcBinPow(int nBins, const float* cmplxSpctr,
                               float* binPow) {
  const int lastBin = nBins - 1;

  binPow[0] = cmplxSpctr[0] * cmplxSpctr[0];
  binPow[lastBin] = cmplxSpctr[1] * cmplxSpctr[1];

  const float* pair = cmplxSpctr + 2;
  for (int bin = 1; bin < lastBin; ++bin, pair += 2) {
    const float re = pair[0];
    const float im = pair[1];
    binPow[bin] = re * re + im * im;
  }
}
|
|
|
| static int AUP_Aed_pitch_proc(void* pitchModule, const float* timeSignal,
|
| size_t timeLen, const float* binPow, size_t nBins,
|
| PE_OutputData* pOut) {
|
| PE_InputData peInData;
|
|
|
| peInData.timeSignal = timeSignal;
|
| peInData.hopSz = (int)timeLen;
|
| peInData.inBinPow = binPow;
|
| peInData.nBins = (int)nBins;
|
| pOut->pitchFreq = 0;
|
| pOut->voiced = -1;
|
| return AUP_PE_proc(pitchModule, &peInData, pOut);
|
| }
|
|
|
| static int AUP_Aed_aivad_proc(Aed_St* stHdl, const float* inBinPow,
|
| float* aivadScore) {
|
| if (stHdl == NULL || inBinPow == NULL || aivadScore == NULL) {
|
| return -1;
|
| }
|
|
|
| size_t i, j;
|
| size_t nBins = stHdl->intNBins;
|
| size_t melFbSz = stHdl->melFbSz;
|
| size_t srcOffset;
|
| size_t srcLen;
|
|
|
| float* aivadInputFeatStack = stHdl->aivadInputFeatStack;
|
| float* melFbCoef = stHdl->melFilterBankCoef;
|
| const float* aivadFeatMean = AUP_AED_FEATURE_MEANS;
|
| const float* aivadFeatStd = AUP_AED_FEATURE_STDS;
|
| float* curMelFbCoefPtr = NULL;
|
| float* curInputFeatPtr = NULL;
|
| float perBandValue = 0.0f;
|
| float powerNormal = 32768.0f * 32768.0f;
|
|
|
|
|
| srcOffset = stHdl->feaSz;
|
| srcLen = (stHdl->algCtxtSz - 1) * stHdl->feaSz;
|
| memmove(aivadInputFeatStack, aivadInputFeatStack + srcOffset,
|
| sizeof(float) * srcLen);
|
| curInputFeatPtr = aivadInputFeatStack + srcLen;
|
|
|
|
|
| for (i = 0; i < melFbSz; i++) {
|
| perBandValue = 0.0f;
|
| curMelFbCoefPtr = melFbCoef + i * nBins;
|
| for (j = 0; j < nBins; j++) {
|
| perBandValue += (inBinPow[j] * curMelFbCoefPtr[j]);
|
| }
|
| perBandValue = perBandValue / powerNormal;
|
| perBandValue = logf(perBandValue + AUP_AED_EPS);
|
| curInputFeatPtr[i] =
|
| (perBandValue - aivadFeatMean[i]) / (aivadFeatStd[i] + AUP_AED_EPS);
|
| }
|
|
|
|
|
| for (i = melFbSz; i < stHdl->feaSz; i++) {
|
| curInputFeatPtr[i] =
|
| (stHdl->pitchFreq - aivadFeatMean[i]) / (aivadFeatStd[i] + AUP_AED_EPS);
|
| }
|
|
|
|
|
|
|
| float aivadOutput;
|
| if (stHdl->aivadInf != NULL &&
|
| stHdl->aivadInf->Process(stHdl->aivadInputFeatStack, &aivadOutput) != 0) {
|
| return -1;
|
| }
|
|
|
| (*aivadScore) = aivadOutput;
|
|
|
| stHdl->aivadResetCnt += 1;
|
| if (stHdl->aivadResetCnt >= stHdl->aivadResetFrmNum) {
|
| if (stHdl->aivadInf != NULL && stHdl->aivadInf->Reset() != 0) {
|
| }
|
| stHdl->aivadResetCnt = 0;
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| static int AUP_Aed_dynamMemPrepare(Aed_St* stHdl, void* memPtrExt,
|
| size_t memSize) {
|
| if (stHdl == NULL) {
|
| return -1;
|
| }
|
| size_t pitchInNBins = stHdl->intNBins;
|
| size_t totalMemSize = 0;
|
| size_t inputTimeFIFOMemSize = 0;
|
| size_t inputEmphTimeFIFOMemSize = 0;
|
| size_t aivadInputCmplxSptrmMemSize = 0;
|
| size_t aivadInputBinPowMemSize = 0;
|
| size_t frameRmsBuffMemSize = 0;
|
| size_t aivadInputFeatStackMemSize = 0;
|
| size_t aimdInputFeatStackMemSize = 0;
|
| size_t melFilterBankCoefMemSize = 0;
|
| size_t melFilterBinBuffMemSize = 0;
|
| size_t inputFloatBuffMemSize = 0;
|
|
|
|
|
| char* memPtr = NULL;
|
|
|
|
|
|
|
|
|
| inputTimeFIFOMemSize =
|
| AUP_AED_ALIGN8(sizeof(float) * stHdl->inputTimeFIFOLen);
|
| totalMemSize += inputTimeFIFOMemSize;
|
|
|
| inputEmphTimeFIFOMemSize =
|
| AUP_AED_ALIGN8(sizeof(float) * stHdl->inputTimeFIFOLen);
|
| totalMemSize += inputEmphTimeFIFOMemSize;
|
|
|
| aivadInputCmplxSptrmMemSize = AUP_AED_ALIGN8(sizeof(float) * stHdl->intFftSz);
|
| totalMemSize += aivadInputCmplxSptrmMemSize;
|
|
|
| aivadInputBinPowMemSize = AUP_AED_ALIGN8(sizeof(float) * stHdl->intNBins);
|
| totalMemSize += aivadInputBinPowMemSize;
|
|
|
| aivadInputFeatStackMemSize =
|
| AUP_AED_ALIGN8(sizeof(float) * stHdl->algCtxtSz * stHdl->feaSz);
|
| totalMemSize += aivadInputFeatStackMemSize;
|
|
|
| aimdInputFeatStackMemSize =
|
| AUP_AED_ALIGN8(sizeof(float) * stHdl->algCtxtSz * stHdl->feaSz);
|
| totalMemSize += aimdInputFeatStackMemSize;
|
|
|
| melFilterBankCoefMemSize =
|
| AUP_AED_ALIGN8(sizeof(float) * pitchInNBins * stHdl->feaSz);
|
| totalMemSize += melFilterBankCoefMemSize;
|
|
|
| melFilterBinBuffMemSize = AUP_AED_ALIGN8(sizeof(size_t) * (stHdl->feaSz + 2));
|
| totalMemSize += melFilterBinBuffMemSize;
|
|
|
| frameRmsBuffMemSize = AUP_AED_ALIGN8(stHdl->frmRmsBufLen * sizeof(float));
|
| totalMemSize += frameRmsBuffMemSize;
|
|
|
| inputFloatBuffMemSize = AUP_AED_ALIGN8(stHdl->extHopSz * sizeof(float));
|
| totalMemSize += inputFloatBuffMemSize;
|
|
|
| if (memPtrExt == NULL) {
|
| return ((int)totalMemSize);
|
| }
|
|
|
| if (totalMemSize > memSize) {
|
| return -1;
|
| }
|
|
|
| memPtr = (char*)memPtrExt;
|
|
|
| stHdl->inputTimeFIFO = (float*)memPtr;
|
| memPtr += inputTimeFIFOMemSize;
|
|
|
| stHdl->inputEmphTimeFIFO = (float*)memPtr;
|
| memPtr += inputEmphTimeFIFOMemSize;
|
|
|
| stHdl->aivadInputCmplxSptrm = (float*)memPtr;
|
| memPtr += aivadInputCmplxSptrmMemSize;
|
|
|
| stHdl->aivadInputBinPow = (float*)memPtr;
|
| memPtr += aivadInputBinPowMemSize;
|
|
|
| stHdl->aivadInputFeatStack = (float*)memPtr;
|
| memPtr += aivadInputFeatStackMemSize;
|
|
|
| stHdl->melFilterBankCoef = (float*)memPtr;
|
| memPtr += melFilterBankCoefMemSize;
|
|
|
| stHdl->melFilterBinBuff = (size_t*)memPtr;
|
| memPtr += melFilterBinBuffMemSize;
|
|
|
| stHdl->frameRmsBuff = (float*)memPtr;
|
| memPtr += frameRmsBuffMemSize;
|
|
|
| stHdl->inputFloatBuff = (float*)memPtr;
|
| memPtr += inputFloatBuffMemSize;
|
|
|
| if (((size_t)(memPtr - (char*)memPtrExt)) > totalMemSize) {
|
| return -1;
|
| }
|
|
|
| return ((int)totalMemSize);
|
| }
|
|
|
| static int AUP_Aed_runOneFrm(Aed_St* stHdl, const float* tSignal, int hopSz,
|
| const float* binPowPtr, int nBins) {
|
| PE_OutputData peOutData = {0, 0};
|
| float aivadScore = -1.0f;
|
| float mediaFilterout = 0;
|
| int mediaIdx = (int)(AUP_AED_OUTPUT_SMOOTH_FILTER_LEN) / 2;
|
| int i;
|
|
|
| if (AUP_Aed_pitch_proc(stHdl->pitchEstStPtr, tSignal, hopSz, binPowPtr, nBins,
|
| &peOutData) < 0) {
|
| return -1;
|
| }
|
| stHdl->pitchFreq = peOutData.pitchFreq;
|
| if (AUP_Aed_aivad_proc(stHdl, binPowPtr, &aivadScore) < 0) {
|
| return -1;
|
| }
|
| stHdl->aivadScore = aivadScore;
|
|
|
| return 0;
|
| }
|
|
|
|
|
|
|
|
|
|
|
| int AUP_Aed_create(void** stPtr) {
|
| if (stPtr == NULL) {
|
| return -1;
|
| }
|
| Aed_St* tmpPtr = (Aed_St*)malloc(sizeof(Aed_St));
|
| if (tmpPtr == NULL) {
|
| return -1;
|
| }
|
| memset(tmpPtr, 0, sizeof(Aed_St));
|
|
|
| if (AUP_PE_create(&(tmpPtr->pitchEstStPtr)) < 0) {
|
| return -1;
|
| }
|
| if (AUP_Analyzer_create(&(tmpPtr->timeInAnalysis)) < 0) {
|
| return -1;
|
| }
|
|
|
| tmpPtr->stCfg.enableFlag = 1;
|
| tmpPtr->stCfg.fftSz = 1024;
|
| tmpPtr->stCfg.hopSz = 256;
|
| tmpPtr->stCfg.anaWindowSz = 768;
|
| tmpPtr->stCfg.frqInputAvailableFlag = 0;
|
|
|
| tmpPtr->dynamCfg.extVoiceThr = 0.5f;
|
| tmpPtr->dynamCfg.extMusicThr = 0.5f;
|
| tmpPtr->dynamCfg.extEnergyThr = 10.0f;
|
| tmpPtr->dynamCfg.resetFrameNum = 1875;
|
| tmpPtr->dynamCfg.pitchEstVoicedThr = AUP_AED_PITCH_EST_DEFAULT_VOICEDTHR;
|
|
|
| (*stPtr) = (void*)tmpPtr;
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_destroy(void** stPtr) {
|
| if (stPtr == NULL || (*stPtr) == NULL) {
|
| return -1;
|
| }
|
| Aed_St* stHdl = (Aed_St*)(*stPtr);
|
|
|
| if (stHdl->aivadInf != NULL) {
|
| delete stHdl->aivadInf;
|
| }
|
| stHdl->aivadInf = NULL;
|
|
|
| if (AUP_PE_destroy(&(stHdl->pitchEstStPtr)) < 0) {
|
| return -1;
|
| }
|
| if (AUP_Analyzer_destroy(&(stHdl->timeInAnalysis)) < 0) {
|
| return -1;
|
| }
|
|
|
| if (stHdl->dynamMemPtr != NULL) {
|
| free(stHdl->dynamMemPtr);
|
| }
|
| stHdl->dynamMemPtr = NULL;
|
|
|
| if (stHdl != NULL) {
|
| free(stHdl);
|
| }
|
| (*stPtr) = NULL;
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_memAllocate(void* stPtr, const Aed_StaticCfg* pCfg) {
|
| Aed_St* stHdl = (Aed_St*)(stPtr);
|
| Aed_StaticCfg aedStatCfg;
|
| PE_StaticCfg pitchStatCfg;
|
| Analyzer_StaticCfg analyzerStatCfg;
|
| int totalMemSize = 0;
|
|
|
| if (stPtr == NULL || pCfg == NULL) {
|
| return -1;
|
| }
|
|
|
|
|
| memcpy(&aedStatCfg, pCfg, sizeof(Aed_StaticCfg));
|
| if (AUP_Aed_checkStatCfg(&aedStatCfg) < 0) {
|
| return -1;
|
| }
|
|
|
| memcpy(&(stHdl->stCfg), &aedStatCfg, sizeof(Aed_StaticCfg));
|
|
|
|
|
|
|
| if (AUP_Aed_publishStaticCfg(stHdl) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| if (stHdl->aivadInf == NULL) {
|
| stHdl->aivadInf = new AUP_MODULE_AIVAD("onnx_model/ten-vad.onnx");
|
| if (stHdl->aivadInf == NULL) {
|
| return -1;
|
| }
|
| }
|
| stHdl->aivadInf->Reset();
|
|
|
|
|
| if (AUP_PE_getStaticCfg(stHdl->pitchEstStPtr, &pitchStatCfg) < 0) {
|
| return -1;
|
| }
|
| pitchStatCfg.fftSz = stHdl->intFftSz;
|
| pitchStatCfg.anaWindowSz = stHdl->intWinSz;
|
| pitchStatCfg.hopSz = stHdl->intHopSz;
|
| pitchStatCfg.useLPCPreFiltering = AUP_AED_PITCH_EST_USE_LPC;
|
| pitchStatCfg.procFs = AUP_AED_PITCH_EST_PROCFS;
|
| if (AUP_PE_memAllocate(stHdl->pitchEstStPtr, &pitchStatCfg) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| AUP_Analyzer_getStaticCfg(stHdl->timeInAnalysis, &analyzerStatCfg);
|
| analyzerStatCfg.win_len = (int)stHdl->intWinSz;
|
| analyzerStatCfg.hop_size = (int)stHdl->intHopSz;
|
| analyzerStatCfg.fft_size = (int)stHdl->intFftSz;
|
| analyzerStatCfg.ana_win_coeff = stHdl->intAnalyWindowPtr;
|
| if (AUP_Analyzer_memAllocate(stHdl->timeInAnalysis, &analyzerStatCfg) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| totalMemSize = AUP_Aed_dynamMemPrepare(stHdl, NULL, 0);
|
| if (totalMemSize < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| if (totalMemSize > (int)stHdl->dynamMemSize) {
|
| if (stHdl->dynamMemPtr != NULL) {
|
| free(stHdl->dynamMemPtr);
|
| stHdl->dynamMemPtr = NULL;
|
| stHdl->dynamMemSize = 0;
|
| }
|
| stHdl->dynamMemPtr = malloc(totalMemSize);
|
| if (stHdl->dynamMemPtr == NULL) {
|
| return -1;
|
| }
|
| stHdl->dynamMemSize = totalMemSize;
|
| }
|
| memset(stHdl->dynamMemPtr, 0, stHdl->dynamMemSize);
|
|
|
|
|
| if (AUP_Aed_dynamMemPrepare(stHdl, stHdl->dynamMemPtr, stHdl->dynamMemSize) <
|
| 0) {
|
| return -1;
|
| }
|
|
|
|
|
| if (AUP_Aed_publishDynamCfg(stHdl) < 0) {
|
| return -1;
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_init(void* stPtr) {
|
| Aed_St* stHdl = (Aed_St*)(stPtr);
|
| if (stPtr == NULL) {
|
| return -1;
|
| }
|
|
|
|
|
| if (AUP_Aed_publishDynamCfg(stHdl) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| if (AUP_Aed_resetVariables(stHdl) < 0) {
|
| return -1;
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_setDynamCfg(void* stPtr, const Aed_DynamCfg* pCfg) {
|
| Aed_St* stHdl = (Aed_St*)(stPtr);
|
|
|
| if (stPtr == NULL || pCfg == NULL) {
|
| return -1;
|
| }
|
|
|
| memcpy(&(stHdl->dynamCfg), pCfg, sizeof(Aed_DynamCfg));
|
|
|
|
|
| if (AUP_Aed_publishDynamCfg(stHdl) < 0) {
|
| return -1;
|
| }
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_getStaticCfg(const void* stPtr, Aed_StaticCfg* pCfg) {
|
| const Aed_St* stHdl = (const Aed_St*)(stPtr);
|
|
|
| if (stPtr == NULL || pCfg == NULL) {
|
| return -1;
|
| }
|
|
|
| memcpy(pCfg, &(stHdl->stCfg), sizeof(Aed_StaticCfg));
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_getDynamCfg(const void* stPtr, Aed_DynamCfg* pCfg) {
|
| const Aed_St* stHdl = (const Aed_St*)(stPtr);
|
|
|
| if (stPtr == NULL || pCfg == NULL) {
|
| return -1;
|
| }
|
|
|
| memcpy(pCfg, &(stHdl->dynamCfg), sizeof(Aed_DynamCfg));
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_getAlgDelay(const void* stPtr, int* delayInFrms) {
|
| const Aed_St* stHdl = (const Aed_St*)(stPtr);
|
|
|
| if (stPtr == NULL || delayInFrms == NULL) {
|
| return -1;
|
| }
|
|
|
| (*delayInFrms) = (int)stHdl->algDelay;
|
|
|
| return 0;
|
| }
|
|
|
| int AUP_Aed_proc(void* stPtr, const Aed_InputData* pIn, Aed_OutputData* pOut) {
|
| Analyzer_InputData analyzerInput;
|
| Analyzer_OutputData analyzerOutput;
|
| Aed_St* stHdl = (Aed_St*)(stPtr);
|
|
|
| const float* binPowPtr = NULL;
|
| float frameRms = 0.0f;
|
| float frameEnergy = 0.0f;
|
| float powerNormal = 32768.0f * 32768.0f;
|
| int idx;
|
|
|
| if (stPtr == NULL) {
|
| return -1;
|
| }
|
| if (stHdl->stCfg.enableFlag == 0) {
|
| return 0;
|
| }
|
| if (pIn == NULL || pIn->timeSignal == NULL || pOut == NULL) {
|
| return -1;
|
| }
|
|
|
| if (stHdl->intAnalyFlag != 2) {
|
| if (pIn->binPower == NULL) {
|
| return -1;
|
| }
|
| if (pIn->nBins != (int)((stHdl->stCfg.fftSz >> 1) + 1) ||
|
| pIn->hopSz != (int)(stHdl->stCfg.hopSz)) {
|
| return -1;
|
| }
|
| }
|
|
|
|
|
| for (idx = 0; idx < pIn->hopSz; idx++) {
|
| frameRms += (pIn->timeSignal[idx] * pIn->timeSignal[idx]);
|
| }
|
| frameEnergy = frameRms;
|
| frameRms = sqrtf(frameRms / (float)pIn->hopSz);
|
| memmove(stHdl->frameRmsBuff, stHdl->frameRmsBuff + 1,
|
| sizeof(float) * (stHdl->frmRmsBufLen - 1));
|
| stHdl->frameRmsBuff[stHdl->frmRmsBufLen - 1] = frameRms;
|
|
|
|
|
| if ((stHdl->inputTimeFIFOIdx + pIn->hopSz) > (int)stHdl->inputTimeFIFOLen) {
|
| return -1;
|
| }
|
|
|
|
|
| float* timeSigEphaPtr = stHdl->inputEmphTimeFIFO + stHdl->inputTimeFIFOIdx;
|
| for (idx = 0; idx < pIn->hopSz; idx++) {
|
| timeSigEphaPtr[idx] = pIn->timeSignal[idx] - 0.97f * stHdl->timeSignalPre;
|
| stHdl->timeSignalPre = pIn->timeSignal[idx];
|
| }
|
|
|
| memcpy(stHdl->inputTimeFIFO + stHdl->inputTimeFIFOIdx, pIn->timeSignal,
|
| sizeof(float) * (pIn->hopSz));
|
| stHdl->inputTimeFIFOIdx += pIn->hopSz;
|
|
|
| if (stHdl->intAnalyFlag == 0) {
|
| if (stHdl->inputTimeFIFOIdx != (int)(stHdl->intHopSz) ||
|
| (int)(stHdl->intNBins) != pIn->nBins) {
|
| return -1;
|
| }
|
|
|
|
|
| stHdl->aedProcFrmCnt = AUP_Aed_addOneCnter(stHdl->aedProcFrmCnt);
|
| binPowPtr = pIn->binPower;
|
|
|
|
|
| if (AUP_Aed_runOneFrm(stHdl, stHdl->inputTimeFIFO, (int)stHdl->intHopSz,
|
| binPowPtr, (int)stHdl->intNBins) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| stHdl->inputTimeFIFOIdx = 0;
|
| } else if (stHdl->intAnalyFlag ==
|
| 1) {
|
| if (stHdl->inputTimeFIFOIdx != (int)(stHdl->intHopSz) ||
|
| (int)(stHdl->extNBins) != pIn->nBins) {
|
| return -1;
|
| }
|
|
|
|
|
| stHdl->aedProcFrmCnt = AUP_Aed_addOneCnter(stHdl->aedProcFrmCnt);
|
| AUP_Aed_binPowerConvert(pIn->binPower, stHdl->aivadInputBinPow,
|
| (int)stHdl->extNBins, (int)stHdl->intNBins);
|
| binPowPtr = stHdl->aivadInputBinPow;
|
|
|
|
|
| if (AUP_Aed_runOneFrm(stHdl, stHdl->inputTimeFIFO, (int)stHdl->intHopSz,
|
| binPowPtr, (int)stHdl->intNBins) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| stHdl->inputTimeFIFOIdx = 0;
|
| } else {
|
| if (stHdl->timeInAnalysis == NULL) {
|
| return -1;
|
| }
|
|
|
|
|
| while (stHdl->inputTimeFIFOIdx >= (int)stHdl->intHopSz) {
|
| stHdl->aedProcFrmCnt = AUP_Aed_addOneCnter(stHdl->aedProcFrmCnt);
|
|
|
| analyzerInput.input = stHdl->inputEmphTimeFIFO;
|
| analyzerInput.iLength = (int)stHdl->intHopSz;
|
| analyzerOutput.output = stHdl->aivadInputCmplxSptrm;
|
| analyzerOutput.oLength = (int)stHdl->intFftSz;
|
| if (AUP_Analyzer_proc(stHdl->timeInAnalysis, &analyzerInput,
|
| &analyzerOutput) < 0) {
|
| return -1;
|
| }
|
|
|
| AUP_Aed_CalcBinPow((int)stHdl->intNBins, stHdl->aivadInputCmplxSptrm,
|
| stHdl->aivadInputBinPow);
|
| binPowPtr = stHdl->aivadInputBinPow;
|
|
|
|
|
| if (AUP_Aed_runOneFrm(stHdl, stHdl->inputTimeFIFO, (int)stHdl->intHopSz,
|
| binPowPtr, (int)stHdl->intNBins) < 0) {
|
| return -1;
|
| }
|
|
|
|
|
| if (stHdl->inputTimeFIFOIdx > (int)stHdl->intHopSz) {
|
| memcpy(stHdl->inputTimeFIFO, stHdl->inputTimeFIFO + stHdl->intHopSz,
|
| sizeof(float) * (stHdl->inputTimeFIFOIdx - stHdl->intHopSz));
|
| memcpy(stHdl->inputEmphTimeFIFO,
|
| stHdl->inputEmphTimeFIFO + stHdl->intHopSz,
|
| sizeof(float) * (stHdl->inputTimeFIFOIdx - stHdl->intHopSz));
|
| }
|
| stHdl->inputTimeFIFOIdx -= (int)stHdl->intHopSz;
|
| }
|
| }
|
|
|
|
|
| pOut->frameEnergy = frameEnergy / powerNormal;
|
| pOut->frameRms = stHdl->frameRmsBuff[0];
|
| pOut->pitchFreq = stHdl->pitchFreq;
|
| pOut->voiceProb = stHdl->aivadScore;
|
| if (pOut->voiceProb < 0.0f) {
|
| pOut->vadRes = -1;
|
| } else if (pOut->voiceProb <= stHdl->voiceDecideThresh) {
|
| pOut->vadRes = 0;
|
| } else {
|
| pOut->vadRes = 1;
|
| }
|
|
|
| return 0;
|
| }
|
|
|