語音識別—前端錄音上傳服務器進行語音識別

本文轉載自查看原文 2019-08-23 10:31 1347 node.js/ 語音識別

采用前端錄音，基於Node搭建Websocket服務器，音頻對象Blob使用Websocket傳給后端服務器后寫入本地音頻文件，然后調用百度AI語音識別本地音頻文件，最后將識別結果傳給前端顯示。

在這里插入圖片描述

百度語音識別

查看文檔知道了我想要的信息，如果想要實現實時語音識別、長時間段的語音、喚醒詞功能、語義解析功能，需要使用Android、IOS SDK或者Linux C++ SDK版本，而我使用的Nodejs SDK是不支持的。

1、規格參數要求

語音時長上限為60s，超出將返回錯誤
原始錄音文件為pcm（不壓縮）、wav（不壓縮，pcm編碼）或者amr（壓縮）格式，不區分大小寫，推薦使用`pcm`
錄音采樣率為16000，聲道為單通道
支持普通話、英文、粵語、四川話

項目結構

調用百度AI平台語音識別的Nodejs SDK，查看文檔快速入門，可以查看如何調用。

首先將nodejs-sdk下載下來，下載后將目錄里的speech文件夾拷貝到你的項目文件夾中，其中assets是存放錄音音頻的地方，然后進入node文件夾下的位置進行安裝依賴包：

npm install

我的項目文件夾目錄如下：

audio_asr_baidu
├─ package-lock.json
└─ speech
       ├─ .gitignore
       ├─ assets
       │    ├─ 16k_test.pcm
       │    └─ recorder.wav
       ├─ cpp
       │    ├─ .gitignore
       │    ├─ README.md
       │    ├─ build.sh
       │    └─ main.cpp
       └─ node
              ├─ .gitignore
              ├─ README.md
              ├─ RecordRTC.js
              ├─ index.html
              ├─ main.js
              ├─ node_modules
              ├─ package-lock.json
              ├─ package.json
              └─ style.css

然后在node文件夾里的index.html是我的客戶端文件，main.js是我的服務端文件。

搭建Websocket服務器

在main.js文件里搭建websocket服務器，首先安裝相關依賴模塊：

npm i ws -S

然后搭建：

const Server = require('ws').Server;

// WebSocket server listening on port 9001.
const wss = new Server({
    port: 9001
});

// Fired once per client connection; `ws` is the socket for that client.
// All per-client handlers (message / error / close) are registered here,
// because the server itself has no per-client "disconnection" event.
wss.on('connection', ws => {
    console.log('server connected');

    // Data sent by this client.
    ws.on('message', msg => {
        console.log('server recived msg:' + msg);
    });

    ws.on('error', error => {
        console.log('Error:' + error);
    });

    // This client disconnected.
    ws.on('close', () => {
        console.log('Websocket is closed');
    });
});

然后在index.html中：

// Open a connection to the local WebSocket server and log once it is ready.
const ws = new WebSocket('ws://localhost:9001');
ws.onopen = function () {
    console.log('Connection to server opened');
};

啟動服務：

node main.js

就可以在控制台看見這樣的打印信息：

// 客戶端的打印信息：
Connection to server opened

// 服務端的打印信息：
server connected

語音識別

客戶端錄音傳給服務端，然后寫入本地.wav文件的操作有了個雛形，但是還需要再修改一下，所以這里只是大概演示一下：將百度AI提供的錄音文件進行語音識別之后，把識別結果傳給前端顯示。

index.html:

<div>
  <p>You said: </p>
  <p id="txt"></p>
 </div>

我使用的是RecordRTC錄音：

<script src="./RecordRTC.js"></script>
    <script>
        // Page elements used by the recorder: the <audio> player and the
        // paragraph that displays the recognized text.
        const audio = document.querySelector('audio');
        const audioTxt = document.getElementById('txt');

        // Connection to the local WebSocket server.
        const ws = new WebSocket('ws://localhost:9001');
        ws.onopen = function () {
            console.log('Connection to server opened');
        };
            /**
             * @name: captureMicrophone
             * @description: Ask the browser for microphone access and pass the
             *               resulting stream to `callback`. If access fails, or
             *               `callback` throws, the user is alerted and the
             *               error is logged.
             * @param {Function} callback receives the value getUserMedia resolves with
             * @return: none
             */
        function captureMicrophone(callback) {
            const constraints = { audio: true };

            const onFailure = (error) => {
                alert('Unable to access your microphone.');
                console.error(error);
            };

            navigator.mediaDevices
                .getUserMedia(constraints)
                .then(callback)
                .catch(onFailure);
        }
        /**
         * @name: stopRecordingCallback
         * @description: Called when recording has stopped: loads the recording
         *               into the <audio> element for playback, stops the
         *               microphone and sends the audio blob to the server.
         * @param {type} none
         * @return: none
         */
        function stopRecordingCallback() {
            // Detach the live stream before switching the player to the recording.
            audio.srcObject = null;

            const recording = recorder.getBlob();
            console.log(recording);

            // Playback of the recording.
            const playbackUrl = URL.createObjectURL(recording);
            audio.src = playbackUrl;

            // Stop using the microphone.
            const { microphone } = recorder;
            microphone.stop();

            // Transmit the blob over the WebSocket.
            ws.send(recording);
        }
        // Display the recognition result that the server sends back.
        ws.onmessage = e => {
            console.log(e.data);

            let data;
            try {
                data = JSON.parse(e.data);
            } catch (err) {
                // Not JSON: nothing sensible to display.
                console.error('Unexpected message from server:', err);
                return;
            }

            // `result` is missing when recognition fails, so fall back to the
            // service's `err_msg` instead of printing "undefined".
            const text = data.result != null ? String(data.result) : (data.err_msg || '');

            // textContent, not innerHTML: the recognized text must never be
            // interpreted as markup.
            audioTxt.textContent = text;
        };
        ws.onclose = e => {
            console.log('Connection to server closed');
        };

        // RecordRTC instance; shared with the stop handler and stopRecordingCallback.
        let recorder;

        /**
         * @name: btn-start-recording click handler
         * @description: Start speaking: request the microphone, attach its
         *               stream to the <audio> element and start a RecordRTC
         *               recording at a 16000 sample rate.
         * @param {type} none
         * @return: none
         */
        document.getElementById('btn-start-recording').onclick = function onStartClick() {
            captureMicrophone((stream) => {
                audio.srcObject = stream;

                const recorderOptions = {
                    type: 'audio',
                    recorderType: StereoAudioRecorder,
                    desiredSampRate: 16000
                };
                recorder = RecordRTC(stream, recorderOptions);
                recorder.startRecording();

                // Keep the stream on the recorder so it can be stopped
                // when the user clicks "stop".
                recorder.microphone = stream;

                const stopButton = document.getElementById('btn-stop-recording');
                stopButton.disabled = false;
            });
        };
        /**
         * @name: btn-stop-recording click handler
         * @description: Stop speaking: stop the current recording and hand over
         *               to stopRecordingCallback. Does nothing (apart from a
         *               console warning) if no recording has been started.
         * @param {type} none
         * @return: none
         */
        document.getElementById('btn-stop-recording').onclick = function() {
            // `recorder` is only assigned after the start button has been
            // clicked and microphone access granted.
            if (!recorder) {
                console.warn('Recording has not been started yet.');
                return;
            }
            recorder.stopRecording(stopRecordingCallback);
        };
    </script>

服務端將音頻對象blob寫入本地.wav文件，然后進行語音識別：

const AipSpeech = require("baidu-aip-sdk").speech;
const fs = require('fs');
const path = require('path');
const Server = require('ws').Server;

const wss = new Server({
    port: 9001
});

// Be sure to replace these with the Api Key and Secret Key of the Baidu
// speech application created in the Baidu Cloud console.
// Created once and reused for every message.
const client = new AipSpeech(0, 'Api Key', 'Secret Key');

// Sample recording used by this demo. Resolved relative to this file so the
// server works no matter which directory it is started from.
const SAMPLE_AUDIO = path.join(__dirname, '..', 'assets', '16k_test.pcm');

wss.on('connection', ws => {
    console.log('server connected');

    ws.on('message', data => {
        console.log('server recived audio blob');

        // NOTE: demo only. The received `data` is not used yet; the local
        // sample file is recognized instead.
        fs.promises.readFile(SAMPLE_AUDIO)
            // readFile already yields a Buffer, which is what recognize() is given.
            .then(voice => client.recognize(voice, 'pcm', 16000))
            .then(result => {
                const resTxt = JSON.stringify(result);
                console.log('語音識別本地音頻文件結果: ' + resTxt);

                // Reply only once the result exists, and only to the client
                // that asked, provided it is still connected.
                if (ws.readyState === ws.OPEN) {
                    ws.send(resTxt);
                }
            })
            .catch(err => {
                console.log(err);
            });
    });

    ws.on('error', error => {
        console.log('Error:' + error);
    });

    ws.on('close', () => {
        console.log('Websocket is closed');
    });
});

然后啟動服務器：

node main.js

再打開index.html在瀏覽器上，就可以看到連接服務器了，然后語音識別的結果是這樣：

語音識別本地音頻文件結果: {"corpus_no":"6728179649062189023","err_msg":"success.","err_no":0,"result":["北京科技館。"],"sn":"727133992011566526398"}

這是前端顯示的結果：
在這里插入圖片描述

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 html5網頁錄音和語音識別 Azure AI 服務之語音識別微信小程序語音識別服務搭建全過程解析（https api開放，支持新接口mp3錄音、老接口silk錄音） Unity3d在Window上使用SAPI進行語音識別利用微軟認知服務實現語音識別功能 Android開發之語音識別 Android studio語音識別 [轉]Kaldi語音識別 DTW算法（語音識別）人工智能 - 基於火狐瀏覽器的語音識別,語音自動回復