java實現微軟文本轉語音(TTS)經驗總結


一、使用背景

公司項目之前一直是採用人工錄音,然而上線一段時間之後發現,人工錄音成本太高,而且每周上線的音頻不多,老板發現問題後,甚至把音頻功能暫停了一段時間。直到最近項目要向海外擴展,需要內容做國際化,就想到了用機器合成語音(TTS,文本轉語音)。目前語音合成技術已經相對成熟,做得好的國內有科大訊飛,國外有微軟。既然項目主要面對海外用戶,就決定採用微軟的TTS。(PS:這裡不是打廣告,微軟的TTS是真的不錯,自己可以去官網試聽下,雖然無法像人一樣很有感情地朗讀詩歌什麼的,但是朗讀新聞資訊類文章還是抑揚頓挫的。)

二、上代碼

使用背景已經啰嗦了一大堆,我覺得讀者還是會關注的,但作為資深CV碼農,我想你們更關注的還是如何應用,所以還是老規矩,簡簡單單地上代碼。(申請賬號這些就不介紹了)

1.依賴

<!-- Maven coordinates of the Microsoft Cognitive Services Speech client SDK.
     NOTE(review): the Java snippets in this article call the REST endpoints through
     HttpsURLConnection; confirm whether this SDK artifact is actually required. -->
<dependency>
    <groupId>com.microsoft.cognitiveservices.speech</groupId>
    <artifactId>client-sdk</artifactId>
    <version>1.12.1</version>
</dependency>

2.配置常量

/**
 * Constants used by the TTS (text-to-speech) integration: service endpoints, the
 * subscription key placeholder, the audio output format and the token cache lifetime.
 *
 * <p>This class only holds constants, so it is {@code final} and cannot be instantiated.
 */
public final class TtsConst {
    /**
     * Audio output format requested from the TTS service: 24 kHz, 48 kbit/s, mono MP3.
     * The author found this format to give the best result; other formats can be tried.
     */
    public static final String AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3";
    /**
     * URL of the token-issuing (authorization) endpoint.
     */
    public static final String ACCESS_TOKEN_URI = "https://eastasia.api.cognitive.microsoft.com/sts/v1.0/issuetoken";
    /**
     * Subscription (API) key. Placeholder value: replace it with your own key, ideally
     * loaded from external configuration rather than committed to source control.
     */
    public static final String API_KEY = "你自己的 api key";
    /**
     * Lifetime of the cached access token: 9 * 60, i.e. nine minutes when interpreted as seconds.
     * NOTE(review): the unit is whatever the cache's {@code set} method expects; confirm it is seconds.
     */
    public static final Integer ACCESS_TOKEN_EXPIRE_TIME = 9 * 60;
    /**
     * Gender label for a male voice.
     */
    public static final String MALE = "Male";
    /**
     * URL of the TTS synthesis endpoint.
     */
    public static final String TTS_SERVICE_URI = "https://eastasia.tts.speech.microsoft.com/cognitiveservices/v1";

    /** Not instantiable: this class is a constant holder only. */
    private TtsConst() {
    }

}

3.https連接

/**
 * Small factory helper that opens {@link HttpsURLConnection}s.
 *
 * <p>Utility class: all members are static, so it is {@code final} and not instantiable.
 */
public final class HttpsConnection {

    private HttpsConnection() {
    }

    /**
     * Creates an {@link HttpsURLConnection} for the given URL. The connection object is
     * only created here; the caller configures it and triggers the actual exchange.
     *
     * @param connectingUrl absolute URL; must use the {@code https} scheme
     * @return a connection object for {@code connectingUrl}
     * @throws IllegalArgumentException if the URL does not use the {@code https} scheme
     *         (previously this surfaced as an uninformative {@code ClassCastException})
     * @throws Exception if the URL is malformed or the connection object cannot be created
     */
    public static HttpsURLConnection getHttpsConnection(String connectingUrl) throws Exception {
        URL url = new URL(connectingUrl);
        // The cast below is only valid for https URLs, so reject anything else up front
        // with a message that names the offending URL.
        if (!"https".equalsIgnoreCase(url.getProtocol())) {
            throw new IllegalArgumentException("Expected an https URL but got: " + connectingUrl);
        }
        return (HttpsURLConnection) url.openConnection();
    }
}

3.授權

@Component
@Slf4j
public class Authentication {
    @Resource
    private RedisCache redisCache;

    public String genAccessToken() {
        InputStream inSt;
        HttpsURLConnection webRequest;

        try {
            String accessToken = redisCache.get(RedisKey.KEY_TTS_ACCESS_TOKEN);
            if (StringUtils.isEmpty(accessToken)) {
                webRequest = HttpsConnection.getHttpsConnection(TtsConst.ACCESS_TOKEN_URI);
                webRequest.setDoInput(true);
                webRequest.setDoOutput(true);
                webRequest.setConnectTimeout(5000);
                webRequest.setReadTimeout(5000);
                webRequest.setRequestMethod("POST");

                byte[] bytes = new byte[0];
                webRequest.setRequestProperty("content-length", String.valueOf(bytes.length));
                webRequest.setRequestProperty("Ocp-Apim-Subscription-Key", TtsConst.API_KEY);
                webRequest.connect();

                DataOutputStream dop = new DataOutputStream(webRequest.getOutputStream());
                dop.write(bytes);
                dop.flush();
                dop.close();

                inSt = webRequest.getInputStream();
                InputStreamReader in = new InputStreamReader(inSt);
                BufferedReader bufferedReader = new BufferedReader(in);
                StringBuilder strBuffer = new StringBuilder();
                String line = null;
                while ((line = bufferedReader.readLine()) != null) {
                    strBuffer.append(line);
                }

                bufferedReader.close();
                in.close();
                inSt.close();
                webRequest.disconnect();

                accessToken = strBuffer.toString();
                //設置accessToken的過期時間為9分鍾
                redisCache.set(RedisKey.KEY_TTS_ACCESS_TOKEN, accessToken, TtsConst.ACCESS_TOKEN_EXPIRE_TIME);
                log.info("New tts access token {}", accessToken);
            }
            return accessToken;
        } catch (Exception e) {
            log.error("Generate tts access token failed {}", e.getMessage());
        }
        return null;
    }
}

4.字節數組處理

/**
 * Minimal growable byte buffer used to accumulate and concatenate byte arrays.
 *
 * <p>No synchronization is performed; an instance is meant to be used by one thread.
 */
public class ByteArray {
    /** Upper bound for the backing array; stays a little below {@link Integer#MAX_VALUE}. */
    private static final int MAX_CAPACITY = Integer.MAX_VALUE - 8;

    /** Backing storage; only the first {@link #length} bytes are valid. */
    private byte[] data;
    /** Number of valid bytes in {@link #data}. */
    private int length;

    /** Creates an empty buffer. */
    public ByteArray(){
        length = 0;
        data = new byte[length];
    }

    /**
     * Creates a buffer whose initial content is {@code ba}. The array is adopted as the
     * backing storage without being copied.
     *
     * @param ba initial content; must not be {@code null}
     */
    public ByteArray(byte[] ba){
        data = ba;
        length = ba.length;
    }

    /**
     * Appends {@code length} bytes of {@code second}, starting at {@code offset}.
     *
     * @param second source array; must not be {@code null}
     * @param offset index of the first byte to copy
     * @param length number of bytes to copy
     * @throws ArrayIndexOutOfBoundsException if {@code offset}/{@code length} do not
     *         describe a valid range of {@code second}
     * @throws OutOfMemoryError if the result would not fit into a single array
     */
    public  void cat(byte[] second, int offset, int length){
        // Validate before touching internal state so a bad call leaves the buffer unchanged.
        if (offset < 0 || length < 0 || offset > second.length - length) {
            throw new ArrayIndexOutOfBoundsException(
                    "offset=" + offset + ", length=" + length + ", array length=" + second.length);
        }

        ensureCapacity(length);
        System.arraycopy(second, offset, data, this.length, length);
        this.length += length;
    }

    /**
     * Appends the whole of {@code second}.
     *
     * @param second source array; must not be {@code null}
     */
    public  void cat(byte[] second){
        cat(second, 0, second.length);
    }

    /**
     * Returns the content as an array of exactly {@link #getLength()} bytes. When the
     * backing array is already exactly full it is returned directly; otherwise a trimmed
     * copy is made, installed as the new backing array and returned.
     */
    public byte[] getArray(){
        if(length == data.length){
            return data;
        }

        byte[] ba = new byte[length];
        System.arraycopy(data, 0, ba, 0, this.length);
        data = ba;
        return ba;
    }

    /** Returns the number of bytes currently stored. */
    public int getLength(){
        return length;
    }

    /**
     * Makes sure {@code extra} more bytes fit into the backing array, growing it to twice
     * the larger of the current capacity and {@code extra} when needed. The sizes are
     * computed in {@code long} so that doubling cannot overflow {@code int} and produce a
     * negative array size.
     */
    private void ensureCapacity(int extra) {
        long required = (long) this.length + extra;
        if (required <= data.length) {
            return;
        }
        if (required > MAX_CAPACITY) {
            throw new OutOfMemoryError("ByteArray would need " + required + " bytes");
        }
        long doubled = 2L * Math.max(data.length, extra);
        byte[] grown = new byte[(int) Math.min(doubled, MAX_CAPACITY)];
        System.arraycopy(data, 0, grown, 0, this.length);
        data = grown;
    }
}

5.創建SSML文件

@Slf4j
public class XmlDom {
    public static String createDom(String locale, String genderName, String voiceName, String textToSynthesize){
        Document doc = null;
        Element speak, voice;
        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = dbf.newDocumentBuilder();
            doc = builder.newDocument();
            if (doc != null){
                speak = doc.createElement("speak");
                speak.setAttribute("version", "1.0");
                speak.setAttribute("xml:lang", "en-us");
                voice = doc.createElement("voice");
                voice.setAttribute("xml:lang", locale);
                voice.setAttribute("xml:gender", genderName);
                voice.setAttribute("name", voiceName);

                voice.appendChild(doc.createTextNode(textToSynthesize));
                speak.appendChild(voice);
                doc.appendChild(speak);
            }
        } catch (ParserConfigurationException e) {
            log.error("Create ssml document failed: {}",e.getMessage());
            return null;
        }
        return transformDom(doc);
    }

    private static String transformDom(Document doc){
        StringWriter writer = new StringWriter();
        try {
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer;
            transformer = tf.newTransformer();
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
            transformer.transform(new DOMSource(doc), new StreamResult(writer));
        } catch (TransformerException e) {
            log.error("Transform ssml document failed: {}",e.getMessage());
            return null;
        }
        return writer.getBuffer().toString().replaceAll("\n|\r", "");
    }
}

6.正主來了!TTS服務

@Slf4j
@Component
public class TtsService {

    @Resource
    private Authentication authentication;

    /**
     * 合成音頻
     */
    public byte[] genAudioBytes(String textToSynthesize, String locale, String gender, String voiceName) {

        String accessToken = authentication.genAccessToken();
        if (StringUtils.isEmpty(accessToken)) {
            return new byte[0];
        }
        try {
            HttpsURLConnection webRequest = HttpsConnection.getHttpsConnection(TtsConst.TTS_SERVICE_URI);
            webRequest.setDoInput(true);
            webRequest.setDoOutput(true);
            webRequest.setConnectTimeout(5000);
            webRequest.setReadTimeout(300000);
            webRequest.setRequestMethod("POST");

            webRequest.setRequestProperty("Content-Type", "application/ssml+xml");
            webRequest.setRequestProperty("X-Microsoft-OutputFormat", TtsConst.AUDIO_24KHZ_48KBITRATE_MONO_MP3);
            webRequest.setRequestProperty("Authorization", "Bearer " + accessToken);
            webRequest.setRequestProperty("X-Search-AppId", "07D3234E49CE426DAA29772419F436CC");
            webRequest.setRequestProperty("X-Search-ClientID", "1ECFAE91408841A480F00935DC390962");
            webRequest.setRequestProperty("User-Agent", "TTSAndroid");
            webRequest.setRequestProperty("Accept", "*/*");

            String body = XmlDom.createDom(locale, gender, voiceName, textToSynthesize);
            if (StringUtils.isEmpty(body)) {
                return new byte[0];
            }
            byte[] bytes = body.getBytes();
            webRequest.setRequestProperty("content-length", String.valueOf(bytes.length));
            webRequest.connect();
            DataOutputStream dop = new DataOutputStream(webRequest.getOutputStream());
            dop.write(bytes);
            dop.flush();
            dop.close();

            InputStream inSt = webRequest.getInputStream();
            ByteArray ba = new ByteArray();

            int rn2 = 0;
            int bufferLength = 4096;
            byte[] buf2 = new byte[bufferLength];
            while ((rn2 = inSt.read(buf2, 0, bufferLength)) > 0) {
                ba.cat(buf2, 0, rn2);
            }

            inSt.close();
            webRequest.disconnect();

            return ba.getArray();
        } catch (Exception e) {
            log.error("Synthesis tts speech failed {}", e.getMessage());
        }
        return null;
    }
}

由於項目中需要將音頻上傳到OSS,所以這裡生成的是字節數組(byte[]),你也可以選擇下載或保存為音頻文件。

三、問題及總結

1.問題

由於項目中需要生成超過10分鐘的音頻,我在調試中發現tts不能生成超過10分鐘的音頻,尷尬了呀,在微軟官網中摸索了半天也沒找到生成超過10分鐘音頻的辦法,放棄了嗎?不可能的。在我感覺到無計可施的時候,我的腦海中蹦出了四個字,那就是“斷點續傳”。我就想能不能通過tts把內容分成兩段,分別生成字節數組,然後拼接起來再上傳到OSS。說幹就幹,沒想到最後真的可行。成功那一瞬間的感覺無法言喻呀。不廢話了,嗯,上大媽,哦不是,上代碼。太激動了。

    /**
     * Generates the audio for a Chinese text.
     *
     * <p>Texts of at most 2600 characters are synthesized with a single request. Longer
     * texts are cut into the smallest number of roughly equal parts that each stay within
     * that limit (two halves for up to 5200 characters), every part is synthesized on its
     * own and the resulting audio bytes are concatenated in order.
     *
     * @param gender         gender passed on to the TTS service
     * @param chapterContent text to synthesize; must not be {@code null}
     * @param locale         locale passed on to the TTS service
     * @param voiceName      voice name passed on to the TTS service
     * @return the audio bytes; if synthesis of any part fails, that part's failure result
     *         ({@code null} or an empty array) is returned instead of partial audio
     */
    public byte[] getZHAudioBuffer(String gender, String chapterContent, String locale, String voiceName) {
        final int maxChars = 2600;
        int total = chapterContent.length();
        if (total <= maxChars) {
            return ttsService.genAudioBytes(chapterContent, locale, gender, voiceName);
        }

        int parts = (total + maxChars - 1) / maxChars;
        ByteArray byteArray = new ByteArray();
        int start = 0;
        for (int i = 1; i <= parts; i++) {
            int end = (int) ((long) total * i / parts);
            // Never cut between the two halves of a surrogate pair; move the cut one
            // char back so the whole character goes into the next part.
            if (end < total
                    && Character.isHighSurrogate(chapterContent.charAt(end - 1))
                    && Character.isLowSurrogate(chapterContent.charAt(end))) {
                end--;
            }
            byte[] part = ttsService.genAudioBytes(chapterContent.substring(start, end), locale, gender, voiceName);
            // A missing part would silently produce truncated audio, so fail as a whole.
            if (part == null || part.length == 0) {
                return part;
            }
            byteArray.cat(part);
            start = end;
        }
        return byteArray.getArray();
    }

    /**
     * Generates the audio for an English text.
     *
     * <p>The text is split on single spaces. Texts of at most 1500 words are synthesized
     * with a single request. Longer texts are processed in consecutive groups of at most
     * 1500 words; every group is synthesized on its own and the resulting audio bytes are
     * concatenated in order.
     *
     * @param gender         gender passed on to the TTS service
     * @param chapterContent text to synthesize; must not be {@code null}
     * @param locale         locale passed on to the TTS service
     * @param voiceName      voice name passed on to the TTS service
     * @return the audio bytes; if synthesis of any group fails, that group's failure result
     *         ({@code null} or an empty array) is returned instead of partial audio
     */
    public byte[] getUSAudioBuffer(String gender, String chapterContent, String locale, String voiceName) {
        String[] words = chapterContent.split(" ");
        int maxLength = 1500;
        if (words.length <= maxLength) {
            return ttsService.genAudioBytes(chapterContent, locale, gender, voiceName);
        }

        ByteArray byteArray = new ByteArray();
        for (int start = 0; start < words.length; start += maxLength) {
            int end = Math.min(start + maxLength, words.length);
            String chunk = String.join(" ", java.util.Arrays.copyOfRange(words, start, end));
            byte[] part = ttsService.genAudioBytes(chunk, locale, gender, voiceName);
            // A missing part would silently produce truncated audio, so fail as a whole.
            if (part == null || part.length == 0) {
                return part;
            }
            byteArray.cat(part);
        }
        return byteArray.getArray();
    }

我要說的都在代碼裡了,你細品。(PS:中文的2600個字符和英文的1500個單詞(代碼中按空格切分後計數),是我調試出來的,生成的音頻肯定是在10分鐘以內的)

2.總結

微軟TTS還是挺香的,嗯,總結很到位,我繼續摸索其他功能去了。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM