語音預處理(二):時域轉頻域


一.時域轉頻域

這節主要介紹如何經過傅里葉變換將音頻轉到頻域,以便於後續的特徵提取和識別。依次進行分幀、加窗(漢明窗)、FFT 和取 log 操作。
輸入:音頻矩陣 wavsignal,採樣率 fs(即 wave 模塊返回的幀率)
例:[[1507 1374 1218 ... -78 -127 -43]],16000
輸出:轉成頻域後的音頻矩陣 data_input(形狀為:幀數 × 200)

二.代碼:

#coding=utf-8
import os
import wave
import numpy as np
import matplotlib.pyplot as plt
import math 
import time
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
from scipy.fftpack import fft

def read_wav_data(filename):
    """Read a PCM WAV file and return its samples and sampling rate.

    :param filename: path of the WAV file to read (directory + file name).
    :return: ``(wave_data, framerate)`` where ``wave_data`` is an integer
        array of shape ``(num_channels, num_frames)`` (one row per channel,
        e.g. ``[[1507 1374 1218 ... -78 -127 -43]]`` for a mono file) and
        ``framerate`` is the sampling rate in Hz as reported by the file.
    :raises ValueError: if the sample width is not 1, 2 or 4 bytes.
    """
    # Sample formats stored in PCM WAV files, keyed by bytes per sample:
    # 8-bit data is unsigned, wider samples are signed little-endian.
    sample_dtypes = {1: np.uint8, 2: "<i2", 4: "<i4"}

    # The context manager closes the file even if reading raises.
    with wave.open(filename, "rb") as wav:
        num_frame = wav.getnframes()  # number of frames (samples per channel)
        num_channel = wav.getnchannels()  # number of channels
        framerate = wav.getframerate()  # sampling rate in Hz
        num_sample_width = wav.getsampwidth()  # bytes per sample
        str_data = wav.readframes(num_frame)  # all frames as raw bytes

    if num_sample_width not in sample_dtypes:
        raise ValueError(
            "unsupported sample width: {} bytes".format(num_sample_width)
        )

    # np.fromstring is deprecated for binary input; np.frombuffer is the
    # replacement. It returns a read-only view of the bytes object, so copy
    # to hand callers a writable array as before.
    wave_data = np.frombuffer(
        str_data, dtype=sample_dtypes[num_sample_width]
    ).copy()

    # Frames are interleaved by channel: reshape to (frames, channels) and
    # transpose so that each row holds one channel.
    wave_data = wave_data.reshape(-1, num_channel).T

    return wave_data, framerate
# 400-point Hamming window (25 ms at a 16 kHz sampling rate), computed once
# at import time and applied to every frame in GetFrequencyFeature.
x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
w = 0.54 - 0.46 * np.cos(2 * np.pi * (x) / (400 - 1))


def GetFrequencyFeature(wavsignal, fs):
    """Convert a time-domain signal into a log-magnitude spectrogram.

    The first channel of the signal is cut into 400-sample frames with a
    160-sample hop (25 ms window / 10 ms shift at 16 kHz). Each frame is
    multiplied by the module-level Hamming window ``w``, transformed with an
    FFT, and the magnitudes of the first 200 bins are kept (the spectrum of
    a real signal is symmetric). Magnitudes are divided by the total number
    of samples in the signal and mapped through ``log(x + 1)``.

    The frame size, hop and bin count are fixed, so the framing only matches
    the 25 ms / 10 ms timing when ``fs`` is 16000.

    :param wavsignal: 2-D audio matrix, one row per channel; only row 0 is
        used. Example: ``[[1507 1374 1218 ... -78 -127 -43]]``
    :param fs: sampling rate in Hz, e.g. 16000; used only to derive the
        number of frames.
    :return: float64 array of shape ``(num_frames, 200)``. A signal too
        short to hold a single frame yields an empty ``(0, 200)`` array.
    """
    time_window = 25  # window duration in ms
    frame_length = 400  # samples per frame, equals len(w)
    frame_shift = 160  # samples between consecutive frame starts
    num_bins = frame_length // 2  # keep the non-redundant half of the FFT

    wav_arr = np.array(wavsignal)
    wav_length = wav_arr.shape[1]  # samples per channel

    # Number of 10 ms steps that fit once one window duration is reserved.
    # Clamp at zero: a signal shorter than one window would otherwise give
    # a negative count and make np.zeros raise.
    range0_end = max(0, int(wav_length / fs * 1000 - time_window) // 10)

    # np.float was removed from NumPy; np.float64 is the same dtype.
    data_input = np.zeros((range0_end, num_bins), dtype=np.float64)

    for i in range(range0_end):
        p_start = i * frame_shift  # 0, 160, 320, 480, ...
        p_end = p_start + frame_length  # 400, 560, 720, 880, ...
        frame = wav_arr[0, p_start:p_end] * w  # framing + Hamming window
        # Magnitude spectrum, scaled by the total signal length.
        spectrum = np.abs(fft(frame)) / wav_length
        data_input[i] = spectrum[0:num_bins]

    data_input = np.log(data_input + 1)  # log compression; +1 avoids log(0)
    return data_input

if __name__ == '__main__':
    # Demo run: load one sample recording, convert it to the frequency
    # domain, and print the raw samples, the feature matrix and its shape.
    sample_path = "D:\\GitHub\\wav\\dae\\train\\A2_1.wav"
    signal, sample_rate = read_wav_data(sample_path)
    print("wave_data:{}".format(signal))

    spectrogram = GetFrequencyFeature(signal, sample_rate)
    print("freimg:{}".format(spectrogram))
    print("freimg shape:{}".format(spectrogram.shape))

三.程序輸出:

wave_data:[[1507 1374 1218 ... -78 -127 -43]]
freimg:
[[2.42781686e-01 3.74806474e-01 1.57395831e-01 ... 2.44499564e-03 3.97955672e-03 2.58199104e-03]
[4.53051376e-01 5.93472204e-01 2.75825723e-01 ... 4.04953736e-03 2.93284484e-03 2.91292203e-03]
[7.36420845e-01 5.12987026e-01 3.08367617e-01 ... 3.05346426e-03 1.78931565e-03 1.43100353e-03]
...
[2.66626609e-02 1.66521460e-01 1.30265491e-01 ... 3.11282926e-04 1.49347950e-03 2.64074732e-03]
[3.03173655e-01 2.10056810e-01 5.29090247e-02 ... 2.23561051e-03 1.90130764e-03 2.44626778e-03]
[2.42738928e-01 4.61532521e-02 1.34021807e-01 ... 7.12491485e-04 8.06380446e-04 1.50869641e-03]]
freimg shape:(1016, 200)


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM