背景
最近下載了一批類似百家講壇的音頻文件。這些文件前面部分是演講類的音頻,主要講歷史的,后面一部分是音樂。
但是我只想聽演講類部分,不想聽音樂。所以希望把文件切割,把音樂部分切走,只留下演講部分。
觀察文件,發現每個文件的音樂都不一樣,演講和音樂的長度也不一樣。
這里一個技術難點就是怎么識別哪些音頻是演講,哪些音頻是音樂。
通過KNN算法,1s的音頻文件的預測正確率是92%。
音頻文件和源碼
音頻文件和源碼可以在這里下載
一、把音頻文件轉換為數字
# encoding=gbk
import random
import wave
import matplotlib.pyplot as plt
import numpy as np
import os
# nchannels 聲道
# sampwidth 樣本寬度
# framerate 幀率,也就是一秒有多少幀
# nframes 文件一共有多少幀
def pre_deal(file_path):
    """Parse a WAV file and return a down-sampled, normalized mono signal.

    Args:
        file_path: Path of a WAV file. Samples are decoded as 16-bit
            little-endian integers; other sample widths are not handled.

    Returns:
        A tuple ``(waveData, framerate, nframes)``:
        ``waveData`` is the first channel, decimated by a factor of 20 and
        scaled into [-1, 1] as a float array; ``framerate`` and ``nframes``
        are the file's frame rate and frame count, each divided by 20
        (both are floats).
    """
    f = wave.open(file_path, 'rb')
    try:
        nchannels, sampwidth, framerate, nframes = f.getparams()[:4]
        strData = f.readframes(nframes)  # raw interleaved PCM bytes
    finally:
        # Always release the file handle, even if reading fails.
        f.close()

    # frombuffer replaces the deprecated binary mode of np.fromstring.
    waveData = np.frombuffer(strData, dtype='<i2')
    waveData = waveData[::nchannels]  # keep only the first channel

    rate = 20.00
    framerate = framerate / rate  # reduced frame rate
    nframes = nframes / rate  # reduced frame count
    # Convert to float BEFORE taking abs(): abs(int16(-32768)) overflows.
    waveData = waveData[::int(rate)].astype(np.float64)

    # Amplitude normalization; silent or empty input is returned unscaled
    # instead of dividing by zero.
    max_ = np.abs(waveData).max() if waveData.size else 0.0
    if max_ > 0:
        waveData = waveData / max_
    return waveData, framerate, nframes
def plpot(waveData):
    """Plot the waveform: sample index on the x axis, amplitude on the y axis.

    Args:
        waveData: Sequence of amplitude values (e.g. the array returned by
            ``pre_deal``).

    Opens a matplotlib window and blocks until it is closed.
    """
    time = np.arange(len(waveData))  # one x position per sample
    plt.plot(time, waveData)
    plt.xlabel("Time")
    plt.ylabel("Amplitude")
    plt.title("Single channel wavedata")
    # grid() takes a boolean; the string 'on' only worked by being truthy.
    plt.grid(True)
    plt.show()
def mp3towav(file_path, to_file_path):
    """Convert an MP3 file to WAV, skipping the work if the target exists.

    Args:
        file_path: Path of the source MP3 file.
        to_file_path: Path the WAV file is written to.

    Returns:
        ``to_file_path``, whether the file was just created or already there.
    """
    if os.path.exists(to_file_path):
        return to_file_path
    # Imported lazily so pydub is only needed when a conversion really runs.
    from pydub import AudioSegment
    print(file_path)  # single-argument form works on Python 2 and 3
    song1 = AudioSegment.from_mp3(file_path)
    song1.export(to_file_path, 'wav')
    return to_file_path
if __name__ == '__main__':
    # Demo: convert one MP3 to WAV, parse it and show the waveform.
    # Raw string avoids the invalid '\B' escape; the path value is unchanged.
    file_path = r'D:\BaiduNetdiskDownload\a.mp3'
    # Swap only the extension; replace('mp3', 'wav') would also rewrite any
    # "mp3" that happens to appear in a directory or file name.
    wav_path = os.path.splitext(file_path)[0] + '.wav'
    file_path = mp3towav(file_path, wav_path)
    data, _, _ = pre_deal(file_path)
    plpot(data)
- 通過wave庫,可以識別音頻文件的聲道、樣本寬度、幀率、幀數等
- 由於文件的左右聲道的值都一樣,所以簡單處理只要其中一個聲道
- 為了提升機器學習速度,修改采樣率為20分之一,來降低數據量
- wave庫只支持wav文件,所以需要把mp3轉換為wav,這里用到了pydub庫
- 解析音頻數據后,通過matplotlib庫來畫圖,顯示出波紋圖
二、人工標記數據
使用音頻處理軟件goldwave,采用人工聽的方法來把音頻文件的音樂部分剪掉,保存的文件放在chg目錄里面,剪之前的文件放在raw目錄下面。一共剪了18個文件。
三、獲取訓練數據
class LeaningTest():
    """Build speech/music training samples from manually labelled audio.

    ``raw_path`` holds the untouched recordings and ``chg_path`` holds the
    manually trimmed copies; the two folders use the same file names.
    """

    chg_path = r'D:\BaiduNetdiskDownload\test\chg'
    raw_path = r'D:\BaiduNetdiskDownload\test\raw'
    model = None  # set by load_model()

    @classmethod
    def load_model(cls):
        """Load the pickled KNN model into ``cls.model``."""
        cls.model = pickle_utils.load('knn.model.pkl')

    @classmethod
    def chg(cls):
        """Copy each labelled file to a numbered name (0.mp3, 1.mp3, ...).

        The same index is used in both folders so that ``get_path`` can
        address a trimmed/raw pair by number.
        """
        for i, f in enumerate(os.listdir(cls.chg_path)):
            numbered = '%s.mp3' % i
            shutil.copy(cls.chg_path + '\\' + f, cls.chg_path + '\\' + numbered)
            shutil.copy(cls.raw_path + '\\' + f, cls.raw_path + '\\' + numbered)

    @classmethod
    def get_path(cls, i, t):
        """Return the path of numbered file ``i``.

        Args:
            i: File index, as assigned by ``chg``.
            t: ``'chg'`` for the trimmed folder; anything else selects the
                raw folder.
        """
        p = cls.chg_path if t == 'chg' else cls.raw_path
        return p + '\\' + '%s.mp3' % i

    @classmethod
    def sample_cnt(cls, sample):
        """Turn amplitude values into a histogram of counts per interval.

        Intervals are 0.025 wide and start at 0. Values below 0 or at/above
        the last upper bound fall in no interval and are not counted.
        Illustration with a step of 0.5: [0.1, 0.1, 0.8] -> [2, 1].

        Args:
            sample: Iterable of numeric amplitude values.

        Returns:
            A list of ints, one count per interval.
        """
        step = 0.025
        # Accumulate the edges by repeated addition (not i * step) so the
        # intervals stay bit-identical to the ones earlier features used.
        starts = []
        start = 0
        while start < 1:
            starts.append(start)
            start += step
        upper = start  # exclusive upper bound of the last interval

        values = np.asarray(list(sample), dtype=float)
        values = values[(values >= starts[0]) & (values < upper)]
        # Intervals are contiguous, so the interval of v is the last start <= v.
        edges = np.asarray(starts, dtype=float)
        idx = np.searchsorted(edges, values, side='right') - 1
        return np.bincount(idx, minlength=len(starts)).tolist()

    @classmethod
    def get_sample(cls, i):
        """Build labelled one-second samples for file pair ``i``.

        The trimmed file's duration marks where speech ends in the raw file:
        seconds before it are labelled 0, seconds after it 1.

        Returns:
            A list of ``(histogram, label)`` tuples, e.g. ``[([100, 200], 0)]``.
        """
        chg = cls.to_wav(cls.get_path(i, 'chg'))
        raw = cls.to_wav(cls.get_path(i, 'raw'))
        _, framerate_chg, n_frames_chg = pre_deal(chg)
        total_sec_chg = int(n_frames_chg / framerate_chg)
        data_raw, framerate_raw, n_frames_raw = pre_deal(raw)
        total_sec_raw = int(n_frames_raw / framerate_raw)

        length = 1  # sample length in seconds
        margin = 5  # seconds ignored on each side of the manual cut
        samples = []
        # The first 60 seconds are skipped.
        for sec in range(60, total_sec_raw, length):
            # The manual cut is imprecise, so drop the seconds around it.
            # (The previous bounds were both "+ 5", so nothing was skipped.)
            if total_sec_chg - margin < sec < total_sec_chg + margin:
                continue
            flag = 0 if sec < total_sec_chg else 1
            # get_index is defined outside this class; presumably it maps
            # (framerate, minute, second) to an array index -- TODO confirm.
            window = data_raw[get_index(framerate_raw, 0, sec):
                              get_index(framerate_raw, 0, sec + length)]
            samples.append((cls.sample_cnt(window), flag))
        return samples

    @classmethod
    def to_wav(cls, file_path):
        """Return a WAV path for ``file_path``, converting MP3 input if needed.

        Only the ``.mp3`` extension is swapped; "mp3" appearing elsewhere in
        the path is left alone. Non-MP3 paths are returned unchanged.
        """
        base, ext = os.path.splitext(file_path)
        if ext.lower() == '.mp3':
            to_file_path = base + '.wav'
            mp3towav(file_path, to_file_path)
            file_path = to_file_path
        return file_path

    @classmethod
    def get_all_sample(cls, file_count=1):
        """Return all training samples, cached in ``sample4.json``.

        Args:
            file_count: Number of numbered file pairs (0 .. file_count - 1)
                to process when the cache is rebuilt. Ignored when the cache
                file already exists; delete the file to rebuild.

        Returns:
            A list of ``(histogram, label)`` pairs (lists when read back
            from the JSON cache).
        """
        file_name = 'sample4.json'
        if os.path.exists(file_name):
            with open(file_name, 'r') as f:
                return json.load(f)
        samples = []
        for i in range(file_count):
            print('get sample %s' % i)
            samples.extend(cls.get_sample(i))
        with open(file_name, 'w') as f:
            json.dump(samples, f)
        return samples

    @classmethod
    def train_wrapper(cls):
        """Split the samples 70/30 per label and print the training-set size.

        Each label is shuffled and split separately, so both labels keep the
        same train/test proportion.
        """
        samples = cls.get_all_sample()
        label0 = [s for s in samples if s[1] == 0]
        label1 = [s for s in samples if s[1] == 1]
        random.shuffle(label0)
        random.shuffle(label1)

        train_part, test_part = [], []
        for group in (label0, label1):
            cut = int(len(group) * 0.7)
            train_part.extend(group[:cut])
            test_part.extend(group[cut:])

        train_datas_sets = [s[0] for s in train_part]
        train_labels_set = [s[1] for s in train_part]
        test_datas_set = [s[0] for s in test_part]
        test_labels_set = [s[1] for s in test_part]
        print(len(train_datas_sets))
        # cls.train_knn(train_datas_sets, train_labels_set, test_datas_set, test_labels_set)
if __name__ == '__main__':
    # Training-data step: load (or build and cache) the samples, split them
    # 70/30 and print the size of the training set.
    LeaningTest.train_wrapper()
- 以1秒鍾為一個樣本,然后對數據進行計數,返回每個區間的計數,區間間隔是0.025,所以一個樣本的向量長度是40
- 由於前60s都是前奏,所以不作為訓練數據
- 由於是人工分割,所以可能有誤差,所以把分割點前后5s的都不作為訓練數據
四、訓練
@classmethod
def train(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
    """Fit several scikit-learn models with default settings and print scores.

    Args:
        train_datas_sets: Feature vectors used for fitting.
        train_labels_set: Labels (0 or 1) matching ``train_datas_sets``.
        test_datas_set: Feature vectors used for scoring.
        test_labels_set: Labels matching ``test_datas_set``.

    NOTE: for the classifiers ``score`` is the mean accuracy, but for
    ``LinearRegression`` it is the R^2 coefficient, so that one line is not
    comparable with the others.
    """
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LinearRegression
    from sklearn import tree
    from sklearn import svm
    from sklearn.neural_network import MLPClassifier
    from sklearn import neighbors

    candidates = [svm.SVC, LogisticRegression, LinearRegression, tree.DecisionTreeClassifier,
                  neighbors.KNeighborsClassifier, MLPClassifier, GaussianNB]
    for mechine in candidates:
        clf = mechine()
        clf.fit(train_datas_sets, train_labels_set)  # train
        score = clf.score(test_datas_set, test_labels_set)  # evaluate on the test set
        # %-formatting keeps the output identical on Python 2 and 3.
        print('score %s %s' % (mechine, score))
訓練結果:
score <class 'sklearn.svm.classes.SVC'> 0.7203252032520325
score <class 'sklearn.linear_model.logistic.LogisticRegression'> 0.8886178861788618
score <class 'sklearn.linear_model.base.LinearRegression'> 0.40864632529611417
score <class 'sklearn.tree.tree.DecisionTreeClassifier'> 0.8888888888888888
score <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 0.9224932249322493
score <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'> 0.835230352303523
score <class 'sklearn.naive_bayes.GaussianNB'> 0.8035230352303523
- 使用多種模型進行訓練,得到的結果為knn的准確率最高,達到了0.92
所以訓練knn模型,並保存為pickle
@classmethod
def train_knn(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
    """Fit a default KNeighborsClassifier, print its score and pickle it.

    Args:
        train_datas_sets: Feature vectors used for fitting.
        train_labels_set: Labels matching ``train_datas_sets``.
        test_datas_set: Feature vectors used for scoring.
        test_labels_set: Labels matching ``test_datas_set``.

    The fitted model is written to ``knn.model.pkl`` through ``pickle_utils``.
    """
    from sklearn import neighbors

    mechine = neighbors.KNeighborsClassifier
    clf = mechine()
    clf.fit(train_datas_sets, train_labels_set)  # train
    score = clf.score(test_datas_set, test_labels_set)  # accuracy on the test set
    # %-formatting keeps the output identical on Python 2 and 3.
    print('score %s %s' % (mechine, score))
    pickle_utils.dump(clf, 'knn.model.pkl')
五、分割文件
@classmethod
def get_cut_sce(cls, file_path, model):
    """Return the second at which to cut the file, or None if none is found.

    Starting at second 60, every one-second window is classified by
    ``model``. The result is the first second that completes three
    consecutive windows predicted as 1, i.e. the third second of that run.

    Args:
        file_path: Audio file to analyse; converted through ``cls.to_wav``.
        model: Object whose ``predict([features])`` result compares equal
            to 1 for a music window.
    """
    wav_path = cls.to_wav(file_path)
    signal, framerate, n_frames = pre_deal(wav_path)
    total_sec = int(n_frames / framerate)
    length = 1
    music_streak = 0  # number of consecutive windows predicted as 1
    for sec in range(60, total_sec, length):
        window = signal[get_index(framerate, 0, sec):get_index(framerate, 0, sec + length)]
        features = cls.sample_cnt(window)
        if model.predict([features]) == 1:
            music_streak += 1
        else:
            music_streak = 0
        if music_streak >= 3:
            return sec
    return None
@classmethod
def get_min(cls, sec):
    """Format a number of seconds as ``minutes:seconds``.

    Args:
        sec: Non-negative number of seconds (int or float).

    Returns:
        The formatted text, e.g. ``'2:05'`` for 125. The text is also
        printed, as existing callers rely on that.
    """
    minutes, seconds = divmod(int(sec), 60)
    # Zero-pad the seconds so 125 reads "2:05" rather than the ambiguous "2:5".
    text = '%d:%02d' % (minutes, seconds)
    print(text)
    return text
@classmethod
def predict(cls, ):
    """Print the predicted cut point of one hard-coded MP3 file.

    Loads the pickled KNN model, searches for the cut second and prints it,
    followed by its minutes:seconds form (printed by ``get_min``).
    """
    file_path = r'D:\BaiduNetdiskDownload\c.mp3'
    model = pickle_utils.load('knn.model.pkl')
    sec = cls.get_cut_sce(file_path, model)
    if sec is None:
        # No cut point found; get_min(None) would raise TypeError.
        print('sec None')
        return
    print('sec %s' % sec)
    cls.get_min(sec)
@classmethod
def cut_song(cls, file_path, to_file_path, file_name):
    """Cut one MP3 at the predicted point and export the leading part.

    Args:
        file_path: Path of the MP3 to cut.
        to_file_path: Path the trimmed MP3 (64k bitrate) is written to.
        file_name: Original file name, used only in log output.

    Returns:
        1 when the trimmed file was written, 0 when no cut point was found.
    """
    def as_text(value):
        # Python 2 passes GBK byte strings around; Python 3 already has text.
        return value.decode('gbk') if isinstance(value, bytes) else value

    print('cut_song %s %s' % (as_text(file_name), as_text(file_path)))
    sec = cls.get_cut_sce(file_path, cls.model)
    if sec is None:
        print('error can not find sec %s %s' % (as_text(file_path), as_text(file_name)))
        return 0
    song = AudioSegment.from_mp3(file_path)
    song = song[:sec * 1000]  # pydub slices in milliseconds
    song.export(to_file_path, 'mp3', bitrate='64k')
    return 1
@classmethod
def cut_songs(cls, ):
    """Cut every MP3 in the hard-coded album folder.

    Files whose name contains "cut", or that already have a
    ``<name>.cut.mp3`` next to them, are skipped. Each remaining file is
    copied to an ASCII-only temporary path, cut there, and the result is
    copied back as ``<name>.cut.mp3``. All temporary files end up in the
    ``to_del`` folder, which must already exist.
    """
    root_path = r'D:\BaiduNetdiskDownload\聽世界-戰國5(156集)64kbps'
    del_path = r'D:\BaiduNetdiskDownload\to_del'
    # pydub cannot open non-ASCII paths on Windows, so the work is done on
    # fixed ASCII temp paths. Raw string avoids the invalid '\B' escape.
    tmp_file_path = r'D:\BaiduNetdiskDownload\test.mp3'
    tmp_wav_path = tmp_file_path.replace('mp3', 'wav')  # left behind by the WAV conversion
    tmp_to_file_path = tmp_file_path + '.cut.mp3'

    def as_text(value):
        # Python 2 yields GBK byte strings here; Python 3 already has text.
        return value.decode('gbk') if isinstance(value, bytes) else value

    for f in os.listdir(root_path):
        if 'mp3' not in f or 'cut' in f:
            continue
        file_path = root_path + '\\' + f
        cut_path = file_path + '.cut.mp3'
        if os.path.exists(cut_path):
            print('exist %s' % as_text(cut_path))
            continue

        shutil.copy(file_path, tmp_file_path)
        ret = cls.cut_song(tmp_file_path, tmp_to_file_path, f)
        shutil.move(tmp_file_path, del_path + '\\del1_' + f)
        shutil.move(tmp_wav_path, del_path + '\\del3_' + f)
        if not ret:
            # No cut point was found, so there is no output file to collect
            # (cut_song has already reported it).
            continue
        try:
            shutil.copy(tmp_to_file_path, cut_path)
            shutil.move(tmp_to_file_path, del_path + '\\del2_' + f)
        except EnvironmentError:
            # Report file-system failures and carry on with the next file.
            import traceback
            print(traceback.format_exc())
@classmethod
def test(cls):
    """Load an MP3 through pydub from a GBK-encoded, non-ASCII path.

    The loaded segment is discarded; presumably this is a manual check of
    pydub's handling of Chinese paths.
    """
    # Backslashes are escaped explicitly; the path value is unchanged.
    AudioSegment.from_mp3(u'D:\\BaiduNetdiskDownload\\測試\\a.mp3'.encode('gbk'))
if __name__ == '__main__':
    # Cutting step: load the pickled KNN model once, then process every
    # file in the album folder.
    LeaningTest.load_model()
    LeaningTest.cut_songs()
- 即使准確率達到0.92,但是還沒有到100%,為了提升正確率,所以連續3s都判斷為音樂,才分割。
- 由於pydub不支持windows的中文路徑,所以只能把源文件複製到一個臨時的英文目錄,然后執行分割,然后把臨時文件移走
未經同意,請不要轉載