1. Time Domain to Frequency Domain
This section describes how to use the Fourier transform to convert audio into the frequency domain so that features can be extracted and recognized later. The signal is split into overlapping frames, each frame is multiplied by a window, an FFT is applied, and finally the log is taken.
Input: audio matrix wavsignal, sampling rate fs
Example: [[1507 1374 1218 ... -78 -127 -43]], 16000
Output: audio matrix data_input after conversion to the frequency domain
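The framing arithmetic follows directly from these numbers: at fs = 16000, a 25 ms window covers 400 samples and a 10 ms hop covers 160 samples. A minimal sketch of this calculation, using the example signal length of 163000 samples that appears in the code comments below:
# Sketch of the framing arithmetic used in GetFrequencyFeature below.
# fs = 16000 and num_samples = 163000 are the example values from this article.
fs = 16000
num_samples = 163000
window_ms, hop_ms = 25, 10
window_samples = fs // 1000 * window_ms                          # 400 samples per frame
hop_samples = fs // 1000 * hop_ms                                # 160 samples between frame starts
num_frames = int(num_samples / fs * 1000 - window_ms) // hop_ms  # same formula as range0_end below
print(window_samples, hop_samples, num_frames)                   # 400 160 1016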
2. Code:
#coding=utf-8
import os
import wave
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
from scipy.fftpack import fft
def read_wav_data(filename):
    """
    Read a wav file and return the time-domain signal matrix and the sampling rate.
    :param filename: absolute path of the input audio (path + file name), e.g. D:\\GitHub\\wav\\dae\\train\\A2_1.wav
    :return: wave_data, framerate: audio matrix and sampling rate, e.g. A2_1 [[1507 1374 1218 ... -78 -127 -43]]
    """
    wav = wave.open(filename, "rb")                      # open the wav file
    audioname = filename.split('\\')[-1]                 # file name of the audio
    num_frame = wav.getnframes()                         # number of sample frames
    # print("{} frame count: {}".format(audioname, num_frame))
    num_channel = wav.getnchannels()                     # number of channels
    # print("{} channel count: {}".format(audioname, num_channel))
    framerate = wav.getframerate()                       # sampling rate
    # print("{} sampling rate: {}".format(audioname, framerate))
    num_sample_width = wav.getsampwidth()                # sample width in bytes
    # print("{} sample width: {}".format(audioname, num_sample_width))
    str_data = wav.readframes(num_frame)                 # read all frames as one byte string
    wav.close()                                          # close the stream
    wave_data = np.frombuffer(str_data, dtype=np.short)  # convert the byte string into a numpy array
    # print("{} shape: {}".format("wave_data", wave_data.shape))
    # print("{} type: {}".format("wave_data", wave_data.dtype))
    # print("{}: {}".format("wave_data", wave_data))
    wave_data = wave_data.reshape(-1, num_channel)       # reshape by channel count: one column for mono, two for stereo
    # print("{} shape (reshaped): {}".format("wave_data", wave_data.shape))
    # print("{} (reshaped): {}".format("wave_data", wave_data))
    wave_data = wave_data.T                              # transpose so that each row is one channel
    # print("{} shape (transposed): {}".format("wave_data", wave_data.shape))
    # print("{} (transposed): {}".format("wave_data", wave_data))
    # print("{} len: {}".format("wave_data[0]", len(wave_data[0])))
    return wave_data, framerate
x = np.linspace(0, 400 - 1, 400, dtype=np.int64)     # sample indices 0..399 for a 400-point window
w = 0.54 - 0.46 * np.cos(2 * np.pi * x / (400 - 1))  # Hamming window
def GetFrequencyFeature(wavsignal, fs):
    """
    :param wavsignal: audio matrix, e.g. [[1507 1374 1218 ... -78 -127 -43]]
    :param fs: sampling rate, e.g. 16000
    :return data_input: audio matrix converted to the frequency domain
    """
    # slide a 25 ms window over the waveform with a 10 ms hop
    time_window = 25                          # window length in ms
    window_length = fs / 1000 * time_window   # window length in samples; 400 for fs = 16000
    wav_arr = np.array(wavsignal)             # wav_arr: [[1507 1374 1218 ... -78 -127 -43]]
    wav_length = wav_arr.shape[1]             # wav_arr.shape[0]: 1, wav_arr.shape[1]: e.g. 163000
    range0_end = int(len(wavsignal[0]) / fs * 1000 - time_window) // 10  # where the loop stops, i.e. the number of windows produced
    data_input = np.zeros((range0_end, 200), dtype=np.float64)  # holds the final frequency features
    data_line = np.zeros((1, 400), dtype=np.float64)
    for i in range(0, range0_end):
        p_start = i * 160                                # 0, 160, 320, 480, ...
        p_end = p_start + 400                            # 400, 560, 720, 880, ...
        data_line = wav_arr[0, p_start:p_end]            # framing
        data_line = data_line * w                        # windowing (Hamming window)
        data_line = np.abs(fft(data_line)) / wav_length  # Fourier transform (magnitude)
        data_input[i] = data_line[0:200]                 # keep only 400 / 2 = 200 bins, since the spectrum is symmetric
    data_input = np.log(data_input + 1)                  # take the log
    return data_input
if __name__ == '__main__':
    wave_data, fs = read_wav_data("D:\\GitHub\\wav\\dae\\train\\A2_1.wav")
    print("wave_data:{}".format(wave_data))
    freimg = GetFrequencyFeature(wave_data, fs)
    print("freimg:{}".format(freimg))
    print("freimg shape:{}".format(freimg.shape))
3. Program output:
wave_data:[[1507 1374 1218 ... -78 -127 -43]]
freimg:
[[2.42781686e-01 3.74806474e-01 1.57395831e-01 ... 2.44499564e-03 3.97955672e-03 2.58199104e-03]
[4.53051376e-01 5.93472204e-01 2.75825723e-01 ... 4.04953736e-03 2.93284484e-03 2.91292203e-03]
[7.36420845e-01 5.12987026e-01 3.08367617e-01 ... 3.05346426e-03 1.78931565e-03 1.43100353e-03]
...
[2.66626609e-02 1.66521460e-01 1.30265491e-01 ... 3.11282926e-04 1.49347950e-03 2.64074732e-03]
[3.03173655e-01 2.10056810e-01 5.29090247e-02 ... 2.23561051e-03 1.90130764e-03 2.44626778e-03]
[2.42738928e-01 4.61532521e-02 1.34021807e-01 ... 7.12491485e-04 8.06380446e-04 1.50869641e-03]]
freimg shape:(1016, 200)
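The python_speech_features functions imported at the top (mfcc, delta, logfbank) are not used in the script itself. For comparison, a minimal sketch of extracting log Mel filterbank features over the same 25 ms window and 10 ms hop with logfbank, assuming wave_data and fs come from read_wav_data above:
from python_speech_features import logfbank

# wave_data[0] is the first (mono) channel; winlen and winstep match the 25 ms window and 10 ms hop.
fbank_feat = logfbank(wave_data[0], samplerate=fs, winlen=0.025, winstep=0.01, nfilt=26)
print(fbank_feat.shape)  # (number_of_frames, 26)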