ROS中階筆記(六):機器人感知—機器語音
1 語音識別理論

2 常用語音功能包
- pocketsphinx:集成CMU Sphinx和Festival開源項目中的代碼,實現語音識別的功能
- audio-common:提供了文本轉語音的功能,實現"機器人說話"的想法
- AIML:人工智能標記語言,Artificial Intelligence Markup Language是一種創建自然語言軟件代理的XML語言
3 科大訊飛SDK
登錄科大訊飛開放平台的官方網站:http://www.xfyun.cn/,注冊一個賬戶並下載SDK
下載的SDK文件(Linux_lat···文件)——解壓——示例在samples文件夾下面(比如iat_record_sample,剛下載完成功能包之后,需要在當前目錄下面編譯,通過make編譯)

編譯之后的文件放在bin文件夾下面的,可以看到有一個生成的可執行文件iat_record_sample;

3.1 使用前提
將科大訊飛SDK的庫文件拷貝到系統目錄下
$ sudo cp libmsc.so /usr/lib/libmsc.so # (注意選擇相應的處理器架構)
# SDK的庫文件在libs中有(x86和x64,在這里面放置的庫文件)
科大訊飛的SDK帶有ID號,每個人每次下載后的ID都不相同,更換SDK之后需要修改代碼中的APPID,你也可以直接使用本課程的libmsc.so文件,否則需要將源碼中的APPID修改為自己下載SDK中的ID。
3.2 語音聽寫
iat_publish.cpp
/*
* 語音聽寫(iFly Auto Transform)技術能夠實時地將語音轉換成對應的文字。
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "robot_voice/qisr.h"
#include "robot_voice/msp_cmn.h"
#include "robot_voice/msp_errors.h"
#include "robot_voice/speech_recognizer.h"
#include <iconv.h>
#include "ros/ros.h"
#include "std_msgs/String.h"
#define FRAME_LEN 640
#define BUFFER_SIZE 4096
int wakeupFlag = 0 ; // set by the /voiceWakeup callback; triggers one recognition session in main()
int resultFlag = 0 ; // set by show_result() when recognized text is ready to publish
static void show_result(char *string, char is_over)
{
    // Flag the main loop that a recognition result is ready to publish,
    // then echo the partial/final text on the current console line.
    resultFlag = 1;
    printf("\rResult: [ %s ]", string);
    if (is_over) {
        putchar('\n');
    }
}
static char *g_result = NULL; // heap buffer accumulating the recognized text of one utterance
static unsigned int g_buffersize = BUFFER_SIZE; // current capacity of g_result in bytes
/**
 * SDK callback: append a newly recognized text chunk to g_result and
 * display the accumulated text.
 *
 * @param result  UTF-8 text chunk from the recognizer (may be NULL).
 * @param is_last non-zero when this is the final chunk of the utterance.
 */
void on_result(const char *result, char is_last)
{
    if (!result)
        return;
    size_t have = strlen(g_result);
    size_t add = strlen(result);
    // Grow the buffer in BUFFER_SIZE steps until chunk + NUL fits.
    // BUG FIX: the original grew by exactly one BUFFER_SIZE step (too small
    // when a chunk exceeds BUFFER_SIZE) and assigned realloc's result
    // directly to g_result, leaking the old buffer on failure.
    if (g_buffersize < have + add + 1) {
        unsigned int newsize = g_buffersize;
        while (newsize < have + add + 1)
            newsize += BUFFER_SIZE;
        char *grown = (char*)realloc(g_result, newsize);
        if (grown == NULL) {
            printf("mem alloc failed\n");
            return;
        }
        g_result = grown;
        g_buffersize = newsize;
    }
    strncat(g_result, result, add);
    show_result(g_result, is_last);
}
/**
 * SDK callback: an utterance is starting — reset the accumulation buffer.
 */
void on_speech_begin()
{
    if (g_result)
    {
        free(g_result);
    }
    g_result = (char*)malloc(BUFFER_SIZE);
    // BUG FIX: the original memset() the buffer without checking the malloc
    // result, dereferencing NULL on allocation failure.
    if (g_result == NULL) {
        g_buffersize = 0;
        printf("mem alloc failed\n");
        return;
    }
    g_buffersize = BUFFER_SIZE;
    memset(g_result, 0, g_buffersize);
    printf("Start Listening...\n");
}
// SDK callback: the utterance ended. END_REASON_VAD_DETECT means the voice
// activity detector saw normal end-of-speech; any other reason is an error.
void on_speech_end(int reason)
{
    if (reason != END_REASON_VAD_DETECT)
        printf("\nRecognizer error %d\n", reason);
    else
        printf("\nSpeaking done \n");
}
/* Run one recognition session on the microphone: init the recognizer with
 * the given session parameters, record for a fixed 10 seconds, then stop
 * and release it. Results arrive through the notifier callbacks. */
static void demo_mic(const char* session_begin_params)
{
    struct speech_rec iat;
    struct speech_rec_notifier recnotifier = {
        on_result,
        on_speech_begin,
        on_speech_end
    };

    int errcode = sr_init(&iat, session_begin_params, SR_MIC, &recnotifier);
    if (errcode) {
        printf("speech recognizer init failed\n");
        return;
    }

    errcode = sr_start_listening(&iat);
    if (errcode) {
        printf("start listen failed %d\n", errcode);
    }

    /* Record for 10 seconds. */
    for (int sec = 0; sec < 10; ++sec)
        sleep(1);

    errcode = sr_stop_listening(&iat);
    if (errcode) {
        printf("stop listening failed %d\n", errcode);
    }
    sr_uninit(&iat);
}
/* main thread: start/stop record ; query the result of recgonization.
* record thread: record callback(data write)
* helper thread: ui(keystroke detection)
*/
// Callback for /voiceWakeup: any message arms one recognition session by
// setting wakeupFlag (picked up by the main loop).
void WakeUp(const std_msgs::String::ConstPtr& msg)
{
    printf("waking up\r\n");
    usleep(700*1000); // brief pause before the session starts
    wakeupFlag = 1;
}
/**
 * Node entry point: waits for a wake-up message, then runs one 10-second
 * iFlytek recognition session and publishes the recognized text on
 * /voiceWords.
 */
int main(int argc, char* argv[])
{
    // Initialize ROS.
    ros::init(argc, argv, "voiceRecognition");
    ros::NodeHandle n;
    ros::Rate loop_rate(10);

    // Subscriber: wake-up signal that triggers one recognition session.
    ros::Subscriber wakeUpSub = n.subscribe("voiceWakeup", 1000, WakeUp);
    // Publisher: the recognized text.
    ros::Publisher voiceWordsPub = n.advertise<std_msgs::String>("voiceWords", 1000);
    ROS_INFO("Sleeping...");
    int count = 0;
    while (ros::ok())
    {
        // A wake-up message arrived: run one recognition session.
        if (wakeupFlag) {
            ROS_INFO("Wakeup...");
            int ret = MSP_SUCCESS;
            const char* login_params = "appid = 594a7b46, work_dir = .";
            const char* session_begin_params =
                "sub = iat, domain = iat, language = zh_cn, "
                "accent = mandarin, sample_rate = 16000, "
                "result_type = plain, result_encoding = utf8";
            ret = MSPLogin(NULL, NULL, login_params);
            if (MSP_SUCCESS != ret) {
                // BUG FIX: on login failure, skip the recognition session
                // instead of calling the SDK while not logged in.
                MSPLogout();
                printf("MSPLogin failed , Error code %d.\n",ret);
                wakeupFlag = 0;
            } else {
                printf("Demo recognizing the speech from microphone\n");
                printf("Speak in 10 seconds\n");
                demo_mic(session_begin_params);
                printf("10 sec passed\n");
                wakeupFlag = 0;
                MSPLogout();
            }
        }
        // Recognition finished: publish the accumulated text.
        if (resultFlag) {
            resultFlag = 0;
            std_msgs::String msg;
            msg.data = g_result;
            voiceWordsPub.publish(msg);
        }
        ros::spinOnce();
        loop_rate.sleep();
        count++;
    }
    MSPLogout(); // Logout before exit.
    return 0;
}
subscriber(訂閱者):用來接收語音喚醒信號,接收到喚醒信號之后,會將wakeupFlag變量置位;
主循環中調用SDK的語音聽寫功能,識別成功后置位resultFlag變量,通過Publisher將識別出來的字符串發布。
第一步,在CMakeLists.txt中加入編譯規則
add_executable(iat_publish
src/iat_publish.cpp
src/speech_recognizer.c
src/linuxrec.c)
target_link_libraries(
iat_publish
${catkin_LIBRARIES}
libmsc.so -ldl -lpthread -lm -lrt -lasound
)
編譯過程需要鏈接SDK中的libmsc.so庫,此前已經將此庫拷貝到系統路徑下,所以此處不需要添加完整路徑。
第二步,語音聽寫示例
$ roscore
$ rosrun robot_voice iat_publish
$ rostopic pub /voiceWakeup std_msgs/String "data:'any string'"
'any string' ''里面可以隨便填
3.3 語音合成
語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音信息服務的高效便捷手段,非常符合信息時代海量數據、動態更新和個性化查詢的需求。
tts_subscribe.cpp
/*
* 語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的
* 自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音信息服務的
* 高效便捷手段,非常符合信息時代海量數據、動態更新和個性化查詢的需求。
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include "robot_voice/qtts.h"
#include "robot_voice/msp_cmn.h"
#include "robot_voice/msp_errors.h"
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <sstream>
#include <sys/types.h>
#include <sys/stat.h>
/* WAV audio file header layout (standard 44-byte RIFF/PCM header) */
typedef struct _wave_pcm_hdr
{
char riff[4]; // = "RIFF"
int size_8; // = FileSize - 8
char wave[4]; // = "WAVE"
char fmt[4]; // = "fmt "
int fmt_size; // = size of the following fmt data : 16
short int format_tag; // = PCM : 1
short int channels; // = number of channels : 1
int samples_per_sec; // = sample rate : 8000 | 6000 | 11025 | 16000
int avg_bytes_per_sec; // = bytes per second : samples_per_sec * bits_per_sample / 8
short int block_align; // = bytes per sample frame : wBitsPerSample / 8
short int bits_per_sample; // = bits per sample : 8 | 16
char data[4]; // = "data";
int data_size; // = audio payload length : FileSize - 44
} wave_pcm_hdr;
/* Default WAV header: 16 kHz, mono, 16-bit PCM; size fields are patched
 * by text_to_speech() once the payload length is known. */
wave_pcm_hdr default_wav_hdr =
{
{ 'R', 'I', 'F', 'F' },
0,                      // size_8: patched after synthesis
{'W', 'A', 'V', 'E'},
{'f', 'm', 't', ' '},
16,                     // fmt chunk size
1,                      // PCM
1,                      // mono
16000,                  // sample rate
32000,                  // byte rate = 16000 * 16 / 8
2,                      // block align
16,                     // bits per sample
{'d', 'a', 't', 'a'},
0                       // data_size: patched after synthesis
};
/**
 * Synthesize src_text into a 16 kHz mono 16-bit PCM WAV file.
 *
 * @param src_text UTF-8 text to synthesize.
 * @param des_path output WAV file path.
 * @param params   iFlytek TTS session parameters.
 * @return MSP_SUCCESS on success, an SDK error code or -1 otherwise.
 */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
    int ret = -1;
    FILE* fp = NULL;
    const char* sessionID = NULL;
    unsigned int audio_len = 0;
    wave_pcm_hdr wav_hdr = default_wav_hdr;
    int synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;

    if (NULL == src_text || NULL == des_path)
    {
        printf("params is error!\n");
        return ret;
    }
    fp = fopen(des_path, "wb");
    if (NULL == fp)
    {
        printf("open %s error.\n", des_path);
        return ret;
    }
    /* Start the synthesis session. */
    sessionID = QTTSSessionBegin(params, &ret);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionBegin failed, error code: %d.\n", ret);
        fclose(fp);
        return ret;
    }
    ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSTextPut failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "TextPutError");
        fclose(fp);
        return ret;
    }
    printf("正在合成 ...\n");
    /* Write a provisional WAV header; size fields are patched below. */
    fwrite(&wav_hdr, sizeof(wav_hdr), 1, fp);
    while (1)
    {
        /* Fetch the next chunk of synthesized audio. */
        const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
        if (MSP_SUCCESS != ret)
            break;
        if (NULL != data)
        {
            fwrite(data, audio_len, 1, fp);
            wav_hdr.data_size += audio_len; /* accumulate payload size */
        }
        if (MSP_TTS_FLAG_DATA_END == synth_status)
            break;
        printf(">");
        usleep(150*1000); /* avoid busy-polling the SDK */
    }
    printf("\n");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSAudioGet failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "AudioGetError");
        fclose(fp);
        return ret;
    }
    /* Patch the RIFF and data chunk sizes now that the payload length is known. */
    wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
    /* FIX (idiom): use SEEK_SET instead of the magic whence constant 0. */
    fseek(fp, 4, SEEK_SET);
    fwrite(&wav_hdr.size_8, sizeof(wav_hdr.size_8), 1, fp);
    fseek(fp, 40, SEEK_SET); /* offset of data_size within the 44-byte header */
    fwrite(&wav_hdr.data_size, sizeof(wav_hdr.data_size), 1, fp);
    fclose(fp);
    fp = NULL;
    /* Close the session normally. */
    ret = QTTSSessionEnd(sessionID, "Normal");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionEnd failed, error code: %d.\n",ret);
    }
    return ret;
}
/* Callback for /voiceWords: synthesize the received text into
 * tts_sample.wav and play it with mplayer. */
void voiceWordsCallback(const std_msgs::String::ConstPtr& msg)
{
    // FIX: removed unused local `char cmd[2000]`.
    const char* text;
    int ret = MSP_SUCCESS;
    const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
    const char* filename = "tts_sample.wav"; // output file for the synthesized speech

    std::cout<<"I heard :"<<msg->data.c_str()<<std::endl;
    text = msg->data.c_str();

    /* Synthesize the text to a WAV file. */
    printf("開始合成 ...\n");
    ret = text_to_speech(text, filename, session_begin_params);
    if (MSP_SUCCESS != ret)
    {
        printf("text_to_speech failed, error code: %d.\n", ret);
    }
    printf("合成完畢\n");

    // Recreate the mplayer slave-mode control FIFO and start playback.
    unlink("/tmp/cmd");
    mkfifo("/tmp/cmd", 0777);
    // NOTE(review): the FILE* returned by popen() is never pclose()d; the
    // player is left running while we wait a fixed 3 seconds.
    popen("mplayer -quiet -slave -input file=/tmp/cmd 'tts_sample.wav'","r");
    sleep(3);
}
// Wait for a keypress, then log out of the iFlytek SDK before exiting.
void toExit()
{
    printf("按任意鍵退出 ...\n");
    getchar();       // block until the user presses a key
    MSPLogout();     // release the SDK session
}
/**
 * Node entry point: log in to the iFlytek SDK, then subscribe to
 * /voiceWords and synthesize every received string to speech.
 */
int main(int argc, char* argv[])
{
    int ret = MSP_SUCCESS;
    // Login parameters; the appid is bound to the msc library — do not change it.
    const char* login_params = "appid = 594a7b46, work_dir = .";
    /*
     * rdn:           digit pronunciation mode
     * volume:        output volume
     * pitch:         output pitch
     * speed:         speaking rate
     * voice_name:    speaker voice
     * sample_rate:   output sample rate
     * text_encoding: encoding of the input text
     * See the iFlytek MSC API documentation for details.
     */
    /* Log in (username/password are NULL when authenticating by appid). */
    ret = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS != ret)
    {
        printf("MSPLogin failed, error code: %d.\n", ret);
        // BUG FIX: the original fell through and ran the node even though
        // login failed; stop here instead.
        toExit();
        return 1;
    }
    printf("\n###########################################################################\n");
    printf("## 語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的 ##\n");
    printf("## 自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音信息服務的 ##\n");
    printf("## 高效便捷手段,非常符合信息時代海量數據、動態更新和個性化查詢的需求。 ##\n");
    printf("###########################################################################\n\n");

    ros::init(argc, argv, "TextToSpeech");
    ros::NodeHandle n;
    ros::Subscriber sub = n.subscribe("voiceWords", 1000, voiceWordsCallback);
    ros::spin();

    printf("按任意鍵退出 ...\n");
    getchar();
    MSPLogout(); // log out before exit
    return 0;
}
- main函數中聲明了一個訂閱voiceWords話題的subscriber,接收輸入的語音字符串。
- 回調函數voiceWordsCallback中使用SDK接口將字符串轉換成中文語音。
第一步,在CMakeLists.txt中加入編譯規則
add_executable(tts_subscribe src/tts_subscribe.cpp)
target_link_libraries(
tts_subscribe
${catkin_LIBRARIES}
libmsc.so -ldl -pthread
)
第二步,語音合成示例
$ roscore
$ rosrun robot_voice tts_subscribe
$ rostopic pub /voiceWords std_msgs/String "data:'你好,我是機器人'"
語音合成啟動后可能產生的錯誤

原因:mplayer配置導致
解決方法:在$HOME/.mplayer/config文件中添加如下設置:lirc=no
3.4 智能語音助手
voice_assistant.cpp
#include <iostream>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include "robot_voice/qtts.h"
#include "robot_voice/msp_cmn.h"
#include "robot_voice/msp_errors.h"
#include "ros/ros.h"
#include "std_msgs/String.h"
#include <sstream>
#include <sys/types.h>
#include <sys/stat.h>
/* WAV audio file header layout (standard 44-byte RIFF/PCM header) */
typedef struct _wave_pcm_hdr
{
char riff[4]; // = "RIFF"
int size_8; // = FileSize - 8
char wave[4]; // = "WAVE"
char fmt[4]; // = "fmt "
int fmt_size; // = size of the following fmt data : 16
short int format_tag; // = PCM : 1
short int channels; // = number of channels : 1
int samples_per_sec; // = sample rate : 8000 | 6000 | 11025 | 16000
int avg_bytes_per_sec; // = bytes per second : samples_per_sec * bits_per_sample / 8
short int block_align; // = bytes per sample frame : wBitsPerSample / 8
short int bits_per_sample; // = bits per sample : 8 | 16
char data[4]; // = "data";
int data_size; // = audio payload length : FileSize - 44
} wave_pcm_hdr;
/* Default WAV header: 16 kHz, mono, 16-bit PCM; size fields are patched
 * by text_to_speech() once the payload length is known. */
wave_pcm_hdr default_wav_hdr =
{
{ 'R', 'I', 'F', 'F' },
0,                      // size_8: patched after synthesis
{'W', 'A', 'V', 'E'},
{'f', 'm', 't', ' '},
16,                     // fmt chunk size
1,                      // PCM
1,                      // mono
16000,                  // sample rate
32000,                  // byte rate = 16000 * 16 / 8
2,                      // block align
16,                     // bits per sample
{'d', 'a', 't', 'a'},
0                       // data_size: patched after synthesis
};
/**
 * Synthesize src_text into a 16 kHz mono 16-bit PCM WAV file.
 *
 * @param src_text UTF-8 text to synthesize.
 * @param des_path output WAV file path.
 * @param params   iFlytek TTS session parameters.
 * @return MSP_SUCCESS on success, an SDK error code or -1 otherwise.
 */
int text_to_speech(const char* src_text, const char* des_path, const char* params)
{
    int ret = -1;
    FILE* fp = NULL;
    const char* sessionID = NULL;
    unsigned int audio_len = 0;
    wave_pcm_hdr wav_hdr = default_wav_hdr;
    int synth_status = MSP_TTS_FLAG_STILL_HAVE_DATA;

    if (NULL == src_text || NULL == des_path)
    {
        printf("params is error!\n");
        return ret;
    }
    fp = fopen(des_path, "wb");
    if (NULL == fp)
    {
        printf("open %s error.\n", des_path);
        return ret;
    }
    /* Start the synthesis session. */
    sessionID = QTTSSessionBegin(params, &ret);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionBegin failed, error code: %d.\n", ret);
        fclose(fp);
        return ret;
    }
    ret = QTTSTextPut(sessionID, src_text, (unsigned int)strlen(src_text), NULL);
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSTextPut failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "TextPutError");
        fclose(fp);
        return ret;
    }
    printf("正在合成 ...\n");
    /* Write a provisional WAV header; size fields are patched below. */
    fwrite(&wav_hdr, sizeof(wav_hdr), 1, fp);
    while (1)
    {
        /* Fetch the next chunk of synthesized audio. */
        const void* data = QTTSAudioGet(sessionID, &audio_len, &synth_status, &ret);
        if (MSP_SUCCESS != ret)
            break;
        if (NULL != data)
        {
            fwrite(data, audio_len, 1, fp);
            wav_hdr.data_size += audio_len; /* accumulate payload size */
        }
        if (MSP_TTS_FLAG_DATA_END == synth_status)
            break;
        printf(">");
        usleep(150*1000); /* avoid busy-polling the SDK */
    }
    printf("\n");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSAudioGet failed, error code: %d.\n",ret);
        QTTSSessionEnd(sessionID, "AudioGetError");
        fclose(fp);
        return ret;
    }
    /* Patch the RIFF and data chunk sizes now that the payload length is known. */
    wav_hdr.size_8 += wav_hdr.data_size + (sizeof(wav_hdr) - 8);
    /* FIX (idiom): use SEEK_SET instead of the magic whence constant 0. */
    fseek(fp, 4, SEEK_SET);
    fwrite(&wav_hdr.size_8, sizeof(wav_hdr.size_8), 1, fp);
    fseek(fp, 40, SEEK_SET); /* offset of data_size within the 44-byte header */
    fwrite(&wav_hdr.data_size, sizeof(wav_hdr.data_size), 1, fp);
    fclose(fp);
    fp = NULL;
    /* Close the session normally. */
    ret = QTTSSessionEnd(sessionID, "Normal");
    if (MSP_SUCCESS != ret)
    {
        printf("QTTSSessionEnd failed, error code: %d.\n",ret);
    }
    return ret;
}
/**
 * Format an int as decimal text.
 *
 * FIX: use snprintf to bound the write (INT_MIN needs 12 bytes including
 * sign and NUL, so 20 is ample, but sprintf documents no bound at all).
 */
std::string to_string(int val)
{
    char buf[20];
    snprintf(buf, sizeof(buf), "%d", val);
    return std::string(buf);
}
/* Callback for /voiceWords: answer a few known questions (identity, help,
 * current time), otherwise echo the received text; then synthesize the
 * reply to tts_sample.wav and play it with mplayer. */
void voiceWordsCallback(const std_msgs::String::ConstPtr& msg)
{
    // FIX: removed unused local `char cmd[2000]`.
    const char* text;
    int ret = MSP_SUCCESS;
    const char* session_begin_params = "voice_name = xiaoyan, text_encoding = utf8, sample_rate = 16000, speed = 50, volume = 50, pitch = 50, rdn = 2";
    const char* filename = "tts_sample.wav"; // output file for the synthesized speech

    std::cout<<"I heard :"<<msg->data.c_str()<<std::endl;

    std::string dataString = msg->data;
    // BUG FIX: the reply must live at function scope. The original pointed
    // `text` at char arrays local to each if-branch (dangling once the
    // branch closed) and copied the time string with sizeof(std::string) —
    // the size of the string OBJECT, not its length — without adding a NUL.
    std::string reply;
    if (dataString.compare("你是誰?") == 0)
    {
        reply = "我是你的語音小助手";
    }
    else if (dataString.compare("你可以做什么?") == 0)
    {
        reply = "你可以問我現在時間";
    }
    else if (dataString.compare("現在時間。") == 0)
    {
        // Build the current local time as "現在時間H點M分".
        time_t ts = time(NULL); // FIX: time_t, not long, for localtime()
        struct tm *ptm = localtime(&ts);
        reply = "現在時間" + to_string(ptm->tm_hour) + "點" + to_string(ptm->tm_min) + "分";
    }
    if (!reply.empty())
    {
        std::cout<<reply<<std::endl;
        text = reply.c_str();
    }
    else
    {
        text = msg->data.c_str();
    }

    /* Synthesize the reply to a WAV file. */
    printf("開始合成 ...\n");
    ret = text_to_speech(text, filename, session_begin_params);
    if (MSP_SUCCESS != ret)
    {
        printf("text_to_speech failed, error code: %d.\n", ret);
    }
    printf("合成完畢\n");

    // Recreate the mplayer slave-mode control FIFO and start playback.
    unlink("/tmp/cmd");
    mkfifo("/tmp/cmd", 0777);
    // NOTE(review): the FILE* returned by popen() is never pclose()d; the
    // player is left running while we wait a fixed 3 seconds.
    popen("mplayer -quiet -slave -input file=/tmp/cmd 'tts_sample.wav'","r");
    sleep(3);
}
// Wait for a keypress, then log out of the iFlytek SDK before exiting.
void toExit()
{
    printf("按任意鍵退出 ...\n");
    getchar();       // block until the user presses a key
    MSPLogout();     // release the SDK session
}
/**
 * Node entry point: log in to the iFlytek SDK, then subscribe to
 * /voiceWords and synthesize/answer every received string.
 */
int main(int argc, char* argv[])
{
    int ret = MSP_SUCCESS;
    // Login parameters; the appid is bound to the msc library — do not change it.
    const char* login_params = "appid = 594a7b46, work_dir = .";
    /*
     * rdn:           digit pronunciation mode
     * volume:        output volume
     * pitch:         output pitch
     * speed:         speaking rate
     * voice_name:    speaker voice
     * sample_rate:   output sample rate
     * text_encoding: encoding of the input text
     * See the iFlytek MSC API documentation for details.
     */
    /* Log in (username/password are NULL when authenticating by appid). */
    ret = MSPLogin(NULL, NULL, login_params);
    if (MSP_SUCCESS != ret)
    {
        printf("MSPLogin failed, error code: %d.\n", ret);
        // BUG FIX: the original fell through and ran the node even though
        // login failed; stop here instead.
        toExit();
        return 1;
    }
    printf("\n###########################################################################\n");
    printf("## 語音合成(Text To Speech,TTS)技術能夠自動將任意文字實時轉換為連續的 ##\n");
    printf("## 自然語音,是一種能夠在任何時間、任何地點,向任何人提供語音信息服務的 ##\n");
    printf("## 高效便捷手段,非常符合信息時代海量數據、動態更新和個性化查詢的需求。 ##\n");
    printf("###########################################################################\n\n");

    ros::init(argc, argv, "TextToSpeech");
    ros::NodeHandle n;
    ros::Subscriber sub = n.subscribe("voiceWords", 1000, voiceWordsCallback);
    ros::spin();

    printf("按任意鍵退出 ...\n");
    getchar();
    MSPLogout(); // log out before exit
    return 0;
}
第一步,在CMakeLists.txt中加入編譯規則
add_executable(voice_assistant src/voice_assistant.cpp)
target_link_libraries(
voice_assistant
${catkin_LIBRARIES}
libmsc.so -ldl -pthread
)
第二步,智能語音助手示例
$ roscore
$ rosrun robot_voice iat_publish # 啟動語音識別節點
$ rosrun robot_voice voice_assistant # 啟動語音助手作為語音判斷
$ rostopic pub /voiceWakeup std_msgs/String "data:'any string'" # 喚醒詞
4 參考資料
1.ROS探索總結(二十八)——機器聽覺
2.ROS Kinetic使用PocketSphinx進行語音識別
https://blog.csdn.net/seeseeatre/article/details/79228816
3.訊飛開放平台-以語音交互為核心的人工智能開放平台
