頻域特征-Fbank


Fbank是一種前端處理方法,以類似人耳的方式對音頻進行處理,可以提高語音識別的性能。fbank的計算流程與語譜圖類似,唯一的區別就在於加了個Mel濾波器,從而使得得到的特征更逼近人耳特性。有關於Mel濾波器的相關內容可以查閱https://mp.weixin.qq.com/s/pGwO_27x8ddQF55wTSQlmA。接下來就介紹一下fbank的求取過程。
• 預加重
因為語音信號的功率譜隨頻率的增加而減小,語音的大部分能量都集中在低頻部分,導致高頻部分的信噪比很低。因此一般使用一階高通濾波器 $y[n] = x[n] - \alpha\,x[n-1]$(係數 $\alpha$ 一般取 0.95~0.98)去提升信號在高頻部分的信噪比。

def preemphasis(signal, coeff=0.95):
    """Apply a first-order pre-emphasis (high-pass) filter to a signal.

    Computes ``y[0] = x[0]`` and ``y[n] = x[n] - coeff * x[n - 1]`` for
    ``n >= 1``.

    :param signal: input samples (array-like, treated as 1-D)
    :param coeff: pre-emphasis coefficient; 0 leaves the signal unchanged
    :return: filtered signal with the same number of samples as the input
    """
    signal = np.asarray(signal)
    # The first sample has no predecessor, so it is passed through as-is.
    # Slicing with [:1] (rather than indexing [0]) also keeps empty and
    # single-sample inputs working.
    return np.append(signal[:1], signal[1:] - coeff * signal[:-1])

• 分幀及加窗
有關分幀的具體原理可以參見https://mp.weixin.qq.com/s/PKBZgFXicNHghb39iyPfow。

def frame_sig(sig, frame_len, frame_step, win_func):
    """Split a signal into overlapping frames and apply a window to each.

    The signal is zero-padded at the end so that the last frame is full
    length.

    :param sig: input speech signal
    :param frame_len: frame length, in samples
    :param frame_step: hop between the starts of consecutive frames, in samples
    :param win_func: callable that takes ``frame_len`` and returns the window
    :return: array of frames, num_frames * frame_len
    """
    n_samples = len(sig)
    # One frame always exists; further frames are counted with a ceiling so
    # the tail of the signal is not dropped.
    num_frames = 1
    if n_samples > frame_len:
        num_frames += int(np.ceil((n_samples - frame_len) / frame_step))
    # Pad with zeros up to the end of the last frame.
    padded_len = int((num_frames - 1) * frame_step + frame_len)
    padded = np.concatenate((sig, np.zeros((padded_len - n_samples,))))
    # Row i of the index matrix = start offset of frame i + position in frame.
    starts = np.arange(0, num_frames * frame_step, frame_step)
    within = np.arange(0, frame_len)
    indices = (starts[:, np.newaxis] + within[np.newaxis, :]).astype(np.int32)
    # The window broadcasts across all frames.
    return padded[indices] * win_func(frame_len)

• FFT
對提取出來的幀信號進行傅里葉變換

# Real-input FFT of the framed signal, taken with NFFT points per frame.
complex_spec = np.fft.rfft(frames, NFFT)

• 幅值平方

# Squared magnitude of the complex spectrum (unnormalised power spectrum);
# the complete listing further down additionally scales this by 1 / NFFT.
np.square(np.abs(complex_spec))

• Mel濾波器
在先前的章節中,已經介紹了Mel濾波器的求法,在此不再贅述。具體可參考https://mp.weixin.qq.com/s/pGwO_27x8ddQF55wTSQlmA。

def filterbank(nfilt=40, nfft=512, samplerate=16000, lowfreq=20, highfreq=None):
    """Build a bank of triangular filters spaced evenly on the mel scale.

    :param nfilt: number of filters
    :param nfft: FFT size; each filter has ``nfft / 2 + 1`` coefficients
    :param samplerate: sampling rate of the signal, in Hz
    :param lowfreq: lower edge of the first filter, in Hz
    :param highfreq: upper edge of the last filter, in Hz; defaults to
        ``samplerate // 2``
    :return: array of shape (nfilt, nfft / 2 + 1), one filter per row
    """
    if highfreq is None:
        highfreq = samplerate // 2
    # nfilt filters need nfilt + 2 edge points, equally spaced in mel.
    mel_edges = np.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt + 2)
    # Map every edge back to Hz and then onto an FFT bin number.
    bins = np.floor((nfft + 1) * mel2hz(mel_edges) / samplerate)
    weights = np.zeros([nfilt, int(nfft / 2 + 1)])
    for row, (left, center, right) in enumerate(zip(bins[:-2], bins[1:-1], bins[2:])):
        # Rising slope from the left edge up to the centre bin.
        for col in range(int(left), int(center)):
            weights[row, col] = (col - left) / (center - left)
        # Falling slope from the centre bin down to the right edge.
        for col in range(int(center), int(right)):
            weights[row, col] = (right - col) / (right - center)
    return weights

• 對數功率

# Convert the (transposed) filter-bank energies to decibels.
librosa.power_to_db(feature.T)

然後就能得到fbank特征。完整的代碼如下:

import numpy as np
import soundfile as sf
import python_speech_features as psf
import librosa
import librosa.display
import matplotlib.pyplot as plt
def frame_sig(sig, frame_len, frame_step, win_func):
    """Cut a signal into overlapping frames and window every frame.

    Zeros are appended to the signal so that the final frame is complete.

    :param sig: input speech signal
    :param frame_len: frame length, in samples
    :param frame_step: frame shift (hop), in samples
    :param win_func: callable that takes ``frame_len`` and returns the window
    :return: array of frames, num_frames * frame_len
    """
    total = len(sig)
    # A short signal yields a single frame; otherwise round the frame count
    # up so that trailing samples still land in a frame.
    extra_frames = 0
    if total > frame_len:
        extra_frames = int(np.ceil((total - frame_len) / frame_step))
    num_frames = 1 + extra_frames
    # Extend the signal with zeros to the end of the last frame.
    target_len = int((num_frames - 1) * frame_step + frame_len)
    padded = np.concatenate((sig, np.zeros((target_len - total,))))
    # Build the (num_frames, frame_len) sample-index matrix by broadcasting
    # frame start offsets against positions inside a frame.
    frame_starts = np.arange(0, num_frames * frame_step, frame_step).reshape(-1, 1)
    positions = np.arange(0, frame_len).reshape(1, -1)
    sample_idx = np.array(frame_starts + positions, dtype=np.int32)
    window = win_func(frame_len)
    return padded[sample_idx] * window
def preemphasis(signal, coeff=0.95):
    """Apply a first-order pre-emphasis (high-pass) filter to a signal.

    Computes ``y[0] = x[0]`` and ``y[n] = x[n] - coeff * x[n - 1]`` for
    ``n >= 1``.

    :param signal: input samples (array-like, treated as 1-D)
    :param coeff: pre-emphasis coefficient; 0 leaves the signal unchanged
    :return: filtered signal with the same number of samples as the input
    """
    signal = np.asarray(signal)
    # The first sample has no predecessor, so it is passed through as-is.
    # Slicing with [:1] (rather than indexing [0]) also keeps empty and
    # single-sample inputs working.
    return np.append(signal[:1], signal[1:] - coeff * signal[:-1])
def pow_spec(frames, NFFT):
    """Return the power spectrum of each frame.

    :param frames: array of frames, one frame per row
    :param NFFT: number of FFT points
    :return: squared magnitude of the real-input FFT, scaled by 1 / NFFT
    """
    scale = 1 / NFFT
    magnitude = np.abs(np.fft.rfft(frames, NFFT))
    return scale * np.square(magnitude)
def hz2mel(hz):
    """Convert a frequency in Hz to the mel scale.

    :param hz: frequency in Hz (scalar or NumPy array)
    :return: ``2595 * log10(1 + hz / 700)``
    """
    ratio = hz / 700.
    return 2595 * np.log10(1 + ratio)
def mel2hz(mel):
    """Convert a mel-scale value back to a frequency in Hz.

    :param mel: value on the mel scale (scalar or NumPy array)
    :return: ``700 * (10 ** (mel / 2595) - 1)``
    """
    exponent = mel / 2595.0
    return 700 * (10 ** exponent - 1)
def filterbank(nfilt=40, nfft=512, samplerate=16000, lowfreq=20, highfreq=None):
    """Construct triangular filters whose edges are equally spaced in mel.

    :param nfilt: number of filters
    :param nfft: FFT size; each filter has ``nfft / 2 + 1`` coefficients
    :param samplerate: sampling rate of the signal, in Hz
    :param lowfreq: lower edge of the first filter, in Hz
    :param highfreq: upper edge of the last filter, in Hz; defaults to
        ``samplerate // 2``
    :return: array of shape (nfilt, nfft / 2 + 1), one filter per row
    """
    if highfreq is None:
        highfreq = samplerate // 2
    mel_lo = hz2mel(lowfreq)
    mel_hi = hz2mel(highfreq)
    # nfilt + 2 points: each filter uses three consecutive points as its
    # left edge, centre and right edge.
    edge_bins = np.floor(
        (nfft + 1) * mel2hz(np.linspace(mel_lo, mel_hi, nfilt + 2)) / samplerate
    )
    bank = np.zeros([nfilt, int(nfft / 2 + 1)])
    for k in range(nfilt):
        lo, mid, hi = edge_bins[k], edge_bins[k + 1], edge_bins[k + 2]
        # Ramp up over [lo, mid).
        for b in range(int(lo), int(mid)):
            bank[k, b] = (b - lo) / (mid - lo)
        # Ramp down over [mid, hi).
        for b in range(int(mid), int(hi)):
            bank[k, b] = (hi - b) / (hi - mid)
    return bank
# Load the recording. soundfile returns a (num_samples, num_channels) array
# for multi-channel files, so average the channels to get the 1-D signal the
# rest of the pipeline expects.
y, sr = sf.read('q1.wav')
if y.ndim > 1:
    y = y.mean(axis=1)
# Pre-emphasis
'預加重'
y = preemphasis(y, coeff=0.98)
# Framing and windowing; the frame length doubles as the FFT size.
'分幀'
nfft = 2048
frame_step = 512
frames = frame_sig(y, frame_len=nfft, frame_step=frame_step, win_func=np.hanning)
# Power spectrum of every frame
features = pow_spec(frames, NFFT=nfft)
# Mel filter bank and filter-bank energies
nfilt = 26
lowfreq = 20
highfreq = sr // 2
fb = filterbank(nfilt, nfft, sr, lowfreq=lowfreq, highfreq=highfreq)
feature = np.dot(features, fb.T)
# The rows are mel bands, so label the frequency axis on the mel scale
# (HTK formula, matching hz2mel) instead of a linear one, and pass the hop
# size so the time axis matches the framing used above.
librosa.display.specshow(librosa.power_to_db(feature.T), sr=sr, hop_length=frame_step,
                         x_axis='time', y_axis='mel',
                         fmin=lowfreq, fmax=highfreq, htk=True)
plt.title('Spectrogram')
plt.colorbar(format='%+2.0f dB')
plt.tight_layout()
plt.show()


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM