Abstract:
The VAD (voice activity detector) module in unimrcp has always struck me as rather crude, and the unimrcp project has not been updated by its community for a long time. The stock detector is fine for a demo if you tune its parameters by hand, but it is still a long way from production quality.
This article shows how to replace the original algorithm with the WebRTC VAD module.
[Aside: I opened this topic yesterday but could not write it up due to other commitments; here is the follow-up.]
The unimrcp VAD module lives in libs/mpf/src/mpf_activity_detector.c; its core algorithm function is shown below:
static apr_size_t mpf_activity_detector_level_calculate(const mpf_frame_t *frame)
{
    apr_size_t sum = 0;
    apr_size_t count = frame->codec_frame.size/2;
    const apr_int16_t *cur = frame->codec_frame.buffer;
    const apr_int16_t *end = cur + count;

    for(; cur < end; cur++) {
        if(*cur < 0) {
            sum -= *cur;
        }
        else {
            sum += *cur;
        }
    }

    return sum / count;
}
As you can see, this algorithm is extremely simple and crude: it sums the absolute sample values, takes the average, and treats the frame as voice if the average exceeds a threshold, otherwise as silence. There is no noise handling at all, so for real-world audio it is essentially unusable, as the small experiment below illustrates.
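To make the shortcoming concrete, here is a small standalone experiment (not from the original post; the frame size, amplitudes, and signals are made-up numbers chosen only for illustration). It computes the same average-magnitude "level" for a steady-noise frame and for a quiet speech-like frame; both come out around 1250-1300, so no single fixed threshold can reliably separate them.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

/* Same level metric as mpf_activity_detector_level_calculate: mean absolute amplitude. */
static unsigned int level_calculate(const short *buf, size_t count)
{
    unsigned long sum = 0;
    size_t i;
    for (i = 0; i < count; i++) {
        sum += (buf[i] < 0) ? -buf[i] : buf[i];
    }
    return (unsigned int)(sum / count);
}

int main(void)
{
    enum { N = 160 };                 /* one 10 ms frame at 16 kHz */
    const double pi = 3.14159265358979;
    short noise[N], speech[N];
    size_t i;

    for (i = 0; i < N; i++) {
        /* steady background noise, uniformly distributed in [-2500, 2500] */
        noise[i] = (short)((rand() % 5001) - 2500);
        /* quiet speech-like 300 Hz tone with peak amplitude 2000 */
        speech[i] = (short)(2000 * sin(2 * pi * 300 * i / 16000.0));
    }

    printf("noise level:  %u\n", level_calculate(noise, N));   /* roughly 1250 */
    printf("speech level: %u\n", level_calculate(speech, N));  /* roughly 1270 */
    return 0;
}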
The previous post covered the WebRTC VAD algorithm; here we use it to replace the function above. The steps are the same as those described in that WebRTC post.
static apr_size_t mpf_activity_detector_level_calculate(const mpf_frame_t *frame)
{
    /* number of 16-bit samples in the frame */
    apr_size_t samplesCount = frame->codec_frame.size/2;
    /* process the frame in 10 ms sub-frames */
    int per_ms_frames = 10;
    apr_size_t sampleRate = 16000;
    /* samples per 10 ms sub-frame */
    size_t samples = sampleRate * per_ms_frames / 1000;
    if (samples == 0) return -1;
    /* number of sub-frames in this frame */
    size_t nTotal = samplesCount / samples;
    int16_t *input = frame->codec_frame.buffer;

    /* create and initialize the WebRTC VAD instance */
    VadInst *vadInst = WebRtcVad_Create();
    if (vadInst == NULL) {
        return -1;
    }
    int status = WebRtcVad_Init(vadInst);
    if (status != 0) {
        WebRtcVad_Free(vadInst);
        return -1;
    }
    /* aggressiveness mode, default 1 */
    int16_t vad_mode = 1;
    status = WebRtcVad_set_mode(vadInst, vad_mode);
    if (status != 0) {
        WebRtcVad_Free(vadInst);
        return -1;
    }

    /* run the VAD over each 10 ms sub-frame and count voiced results */
    int cnt = 0;
    size_t i;
    for (i = 0; i < nTotal; i++) {
        int keep_weight = 0;
        int nVadRet = WebRtcVad_Process(vadInst, sampleRate, input, samples, keep_weight);
        if (nVadRet == -1) {
            WebRtcVad_Free(vadInst);
            return -1;
        }
        if (nVadRet >= 1) {
            cnt++;
        }
        /* debug output of the per-sub-frame VAD decision */
        printf(" %d \t", nVadRet);
        input += samples;
    }
    WebRtcVad_Free(vadInst);

    /* if voiced sub-frames < nTotal/10, treat the whole frame as silence. maybe ... */
    /* FIXME */
    if (cnt < nTotal/10) {
        return 0;
    }
    else {
        return 1;
    }
}
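One detail worth noting in the function above: it creates, initializes, and frees a VadInst for every single frame, which is wasted work at a 10 ms frame rate. A minimal sketch of a cheaper variant follows, assuming the header is named webrtc_vad.h as in the WebRTC sources; the global handle and the helper name are my own illustrative choices, not part of the original post.

#include <webrtc_vad.h>   /* assumed header name from the extracted WebRTC VAD code */

/* Illustrative: one shared, lazily created handle instead of per-frame create/free. */
static VadInst *g_vad_inst = NULL;

static VadInst* vad_instance_get(void)
{
    if (g_vad_inst == NULL) {
        g_vad_inst = WebRtcVad_Create();
        if (g_vad_inst != NULL &&
            (WebRtcVad_Init(g_vad_inst) != 0 || WebRtcVad_set_mode(g_vad_inst, 1) != 0)) {
            WebRtcVad_Free(g_vad_inst);
            g_vad_inst = NULL;
        }
    }
    return g_vad_inst;
}

With this in place, mpf_activity_detector_level_calculate would call vad_instance_get() once per frame and drop its own create/free calls, and the handle would be released once at detector teardown instead.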
Next, update the main processing function while keeping its original intermediate TRANSITION state logic:
/** Process current frame */
MPF_DECLARE(mpf_detector_event_e) mpf_activity_detector_process(mpf_activity_detector_t *detector, const mpf_frame_t *frame)
{
    mpf_detector_event_e det_event = MPF_DETECTOR_EVENT_NONE;
    apr_size_t level = 0;
    if((frame->type & MEDIA_FRAME_TYPE_AUDIO) == MEDIA_FRAME_TYPE_AUDIO) {
        /* first, calculate current activity level of processed frame */
        level = mpf_activity_detector_level_calculate(frame);
#if 0
        apt_log(APT_LOG_MARK,APT_PRIO_INFO,"Activity Detector --------------------- [%"APR_SIZE_T_FMT"]",level);
#endif
    }

    if(detector->state == DETECTOR_STATE_INACTIVITY) {
        /* was: if(level >= detector->level_threshold) */
        if(level >= 1) {
            /* start to detect activity */
            mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY_TRANSITION);
        }
        else {
            detector->duration += CODEC_FRAME_TIME_BASE;
            if(detector->duration >= detector->noinput_timeout) {
                /* detected noinput */
                det_event = MPF_DETECTOR_EVENT_NOINPUT;
            }
        }
    }
    else if(detector->state == DETECTOR_STATE_ACTIVITY_TRANSITION) {
        /* was: if(level >= detector->level_threshold) */
        if(level >= 1) {
            detector->duration += CODEC_FRAME_TIME_BASE;
            if(detector->duration >= detector->speech_timeout) {
                /* finally detected activity */
                det_event = MPF_DETECTOR_EVENT_ACTIVITY;
                mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
            }
        }
        else {
            /* fallback to inactivity */
            mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
        }
    }
    else if(detector->state == DETECTOR_STATE_ACTIVITY) {
        /* was: if(level >= detector->level_threshold) */
        if(level >= 1) {
            detector->duration += CODEC_FRAME_TIME_BASE;
        }
        else {
            /* start to detect inactivity */
            mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY_TRANSITION);
        }
    }
    else if(detector->state == DETECTOR_STATE_INACTIVITY_TRANSITION) {
        /* was: if(level >= detector->level_threshold) */
        if(level >= 1) {
            /* fallback to activity */
            mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
        }
        else {
            detector->duration += CODEC_FRAME_TIME_BASE;
            if(detector->duration >= detector->silence_timeout) {
                /* detected inactivity */
                det_event = MPF_DETECTOR_EVENT_INACTIVITY;
                mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
            }
        }
    }

    return det_event;
}
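With level_calculate now returning a binary 0/1, the old level_threshold comparison is effectively bypassed and the state machine is driven purely by the WebRTC decision plus the existing noinput/speech/silence timeouts. For context, here is a rough sketch of how a caller typically consumes the detector events; the function name and the actions in the comments are illustrative, not from the original post.

/* Illustrative caller: feed each media frame to the detector and react to the event. */
static void demo_on_frame(mpf_activity_detector_t *detector, const mpf_frame_t *frame)
{
    mpf_detector_event_e det_event = mpf_activity_detector_process(detector, frame);
    switch (det_event) {
        case MPF_DETECTOR_EVENT_ACTIVITY:
            /* speech started: begin forwarding audio to the recognizer */
            break;
        case MPF_DETECTOR_EVENT_INACTIVITY:
            /* speech ended: finalize the utterance */
            break;
        case MPF_DETECTOR_EVENT_NOINPUT:
            /* nothing heard within noinput_timeout: raise a no-input condition */
            break;
        default:
            /* MPF_DETECTOR_EVENT_NONE: keep going */
            break;
    }
}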
With these replacements the algorithm update is complete. You also need to adjust the related CMake configuration so that the WebRTC VAD sources and headers are built and linked in, along the lines of the sketch below.
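A minimal sketch of that build change, assuming the WebRTC VAD code has been extracted into a standalone static library: the target name mpf, the third_party paths, and the library name libwebrtcvad.a are assumptions for illustration, not the actual layout from the original post.

# Illustrative CMake fragment: make the MPF library see and link the extracted WebRTC VAD.
# Target, paths, and library name are placeholders; adjust them to the real tree layout.
target_include_directories(mpf PRIVATE ${CMAKE_SOURCE_DIR}/third_party/webrtc_vad/include)
target_link_libraries(mpf PRIVATE ${CMAKE_SOURCE_DIR}/third_party/webrtc_vad/lib/libwebrtcvad.a)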