使用機器學習方法識別音頻文件的音樂和演講

本文轉載自查看原文 2020-03-24 18:29 1488 大數據

背景

最近下載了一批類似百家講壇的音頻文件。這些文件的前面部分是演講類的音頻，主要講歷史，後面一部分是音樂。
但是我只想聽演講類部分，不想聽音樂。所以希望把文件切割，把音樂部分切走，只留下演講部分。
觀察文件，發現每個文件的音樂都不一樣，演講和音樂的長度也不一樣。
這裡的技術難點是怎麼識別哪些音頻是演講，哪些音頻是音樂。
通過KNN算法，1s的音頻文件的預測正確率是92%。

音頻文件和源碼

音頻文件和源碼可以在這里下載

一、把音頻文件轉換為數字

# encoding=gbk
import json
import os
import random
import shutil
import wave

import matplotlib.pyplot as plt
import numpy as np

# nchannels 聲道
# sampwidth 樣本寬度
# framerate 幀率，也就是一秒有多少幀
# nframes 文件一共有多少幀

def pre_deal(file_path):
    """Decode a WAV file into a normalised, downsampled mono signal.

    Only the first channel is kept, every 20th frame of that channel is
    retained to reduce the data volume, and the amplitudes are scaled by the
    peak absolute value so they fall in [-1, 1].

    Args:
        file_path: Path of a PCM WAV file with 16-bit samples.

    Returns:
        A ``(waveData, framerate, nframes)`` tuple: the float sample array,
        the file's frame rate divided by 20 and its frame count divided by 20
        (both floats).

    Raises:
        ValueError: If the file does not use 16-bit samples, because the
            bytes are decoded as 16-bit integers.
    """
    f = wave.open(file_path, 'rb')
    try:
        nchannels, sampwidth, framerate, nframes = f.getparams()[:4]
        raw_bytes = f.readframes(nframes)
    finally:
        f.close()  # always release the file handle

    if sampwidth != 2:
        raise ValueError(
            'only 16-bit PCM WAV is supported, got sample width %s' % sampwidth)

    # WAV stores little-endian samples; frombuffer replaces the deprecated
    # binary mode of np.fromstring.
    waveData = np.frombuffer(raw_bytes, dtype='<i2')

    waveData = waveData[::nchannels]  # frames are interleaved: keep channel 0
    rate = 20.00
    framerate = framerate / rate  # report the reduced frame rate
    nframes = nframes / rate  # report the reduced frame count
    # Convert to float before taking abs(): abs(-32768) overflows in int16.
    waveData = waveData[::int(rate)].astype(np.float64)

    # Normalise by the peak amplitude; skip it for empty or all-zero audio,
    # where the division would raise or produce NaN.
    peak = np.abs(waveData).max() if waveData.size else 0.0
    if peak > 0:
        waveData = waveData / peak

    return waveData, framerate, nframes


def plpot(waveData):
    """Plot the waveform of a decoded audio signal and show the figure.

    Args:
        waveData: Sequence of amplitude values; the x axis is the sample
            index, not a time in seconds.
    """
    sample_index = np.arange(len(waveData))
    plt.plot(sample_index, waveData)
    plt.xlabel("Time")
    plt.ylabel("Amplitude")
    plt.title("Single channel wavedata")
    # Pass a real boolean: a string such as 'on' only enables the grid
    # because it is truthy, and 'off' would enable it as well.
    plt.grid(True)
    plt.show()


def mp3towav(file_path, to_file_path):
    """Convert an MP3 file to WAV, reusing an existing conversion.

    Args:
        file_path: Path of the source MP3 file.
        to_file_path: Path the WAV file is written to.

    Returns:
        ``to_file_path``, whether the WAV was just written or already existed.
    """
    if os.path.exists(to_file_path):
        return to_file_path  # already converted; nothing to do
    # Imported here so pydub is only needed when a conversion actually runs.
    from pydub import AudioSegment
    print(file_path)  # function-call form works on both Python 2 and 3
    song1 = AudioSegment.from_mp3(file_path)
    song1.export(to_file_path, 'wav')
    return to_file_path


if __name__ == '__main__':
    # Demo: convert one MP3 to WAV, decode it and plot the waveform.
    # Raw string: the non-raw spelling relied on '\B' not being an escape.
    mp3_path = r'D:\BaiduNetdiskDownload\a.mp3'
    # Swap only the extension; replace('mp3', 'wav') would also rewrite any
    # 'mp3' that appears in a directory name.
    wav_path = os.path.splitext(mp3_path)[0] + '.wav'
    file_path = mp3towav(mp3_path, wav_path)
    data, _, _ = pre_deal(file_path)
    plpot(data)

通過wave庫，可以識別音頻文件，聲道，樣本寬度，幀率，幀數等
由於文件的左右聲道的值都一樣，所以簡單處理只要其中一個聲道
為了提升機器學習速度，把采樣率降為原來的二十分之一，以降低數據量
wave庫只支持wav文件，所以需要把mp3轉換為wav，這里用到了pydub庫
解析音頻數據后，通過matplotlib庫來畫圖，顯示出波紋圖

二、人工標記數據

使用音頻處理軟件goldwave，采用人工聽的方法來把音頻文件的音樂部分剪掉，保存的文件放在chg目錄里面，剪之前的文件放在raw目錄下面。一共剪了18個文件。

三、獲取訓練數據

class LeaningTest(object):
    """Build labelled samples from hand-cut recordings and split them for training.

    Files are addressed by index as ``<i>.mp3`` inside ``chg_path`` and
    ``raw_path``. The length of the chg file is used as the point where the
    raw recording switches from label 0 to label 1.
    """
    # Directory of the manually trimmed recordings.
    chg_path = r'D:\BaiduNetdiskDownload\test\chg'
    # Directory of the untrimmed recordings.
    raw_path = r'D:\BaiduNetdiskDownload\test\raw'
    # Classifier instance; filled in by load_model().
    model = None

    @classmethod
    def load_model(cls):
        """Load the pickled classifier from ``knn.model.pkl`` into ``cls.model``."""
        # NOTE(review): pickle_utils is not defined in this section; it is
        # presumably a project helper module -- confirm it is imported.
        cls.model = pickle_utils.load('knn.model.pkl')

    @classmethod
    def chg(cls):
        """Copy each file in the chg directory, and its same-named raw file, to ``<index>.mp3``."""
        for i, f in enumerate(os.listdir(cls.chg_path)):
            shutil.copy(cls.chg_path + '\\' + f, cls.chg_path + '\\' + '%s.mp3' % i)
            shutil.copy(cls.raw_path + '\\' + f, cls.raw_path + '\\' + '%s.mp3' % i)

    @classmethod
    def get_path(cls, i, t):
        """Return the path of ``<i>.mp3`` in the chg directory (``t == 'chg'``) or the raw one."""
        p = cls.chg_path if t == 'chg' else cls.raw_path
        return p + '\\' + '%s.mp3' % i

    @classmethod
    def sample_cnt(cls, sample):
        """Turn a window of amplitudes into counts per value interval.

        Consecutive intervals of width 0.025 are laid out from 0 for as long
        as the lower edge is below 1. Each value is counted in the interval
        with ``low <= value < high``; values that fall in no interval
        (negative amplitudes, for example) are not counted.

        Illustration with a step of 0.5 instead of 0.025:
        [0.1, 0.1, 0.8] becomes [2, 1].

        Args:
            sample: Iterable of numeric amplitudes.

        Returns:
            A list of integer counts, one per interval.
        """
        step = 0.025
        qujians = []
        start = 0
        while start < 1:
            qujians.append((start, start + step))
            start += step
        new_sample = [0] * len(qujians)
        for s in sample:
            for i, (low, high) in enumerate(qujians):
                if low <= s < high:
                    new_sample[i] += 1
                    break  # intervals do not overlap, stop at the first hit
        return new_sample

    @classmethod
    def _is_near_cut(cls, sec, cut_sec, margin=5):
        """Return True when ``sec`` lies strictly within ``margin`` seconds of ``cut_sec``."""
        return cut_sec - margin < sec < cut_sec + margin

    @classmethod
    def get_sample(cls, i):
        """Build the labelled one-second samples for recording number ``i``.

        Seconds before the end of the chg file get label 0, later seconds get
        label 1. The first 60 seconds are skipped, and so are the seconds
        within 5 s of the cut point, where the label is least reliable.

        Args:
            i: Index of the recording (``<i>.mp3`` in both directories).

        Returns:
            A list of ``(counts, label)`` tuples, e.g. ``[([100, 200], 0)]``.
        """
        chg = cls.to_wav(cls.get_path(i, 'chg'))
        raw = cls.to_wav(cls.get_path(i, 'raw'))

        # Only the duration of the trimmed file is needed: it marks the cut.
        _, framerate_chg, n_frames_chg = pre_deal(chg)
        total_sec_chg = int(n_frames_chg / framerate_chg)

        data_raw, framerate_raw, n_frames_raw = pre_deal(raw)
        total_sec_raw = int(n_frames_raw / framerate_raw)

        length = 1
        samples = []
        # A separate loop name keeps the file index ``i`` from being overwritten.
        for sec in range(60, total_sec_raw, length):
            # The previous test compared against ``cut + 5`` on both sides
            # and therefore never excluded anything.
            if cls._is_near_cut(sec, total_sec_chg):
                continue

            flag = 0 if sec < total_sec_chg else 1
            # NOTE(review): get_index is not defined in this section; it is
            # assumed to map a second offset to an index into data_raw.
            sample = data_raw[get_index(framerate_raw, 0, sec):get_index(framerate_raw, 0, sec + length)]
            samples.append((cls.sample_cnt(sample), flag))
        return samples

    @classmethod
    def to_wav(cls, file_path):
        """Return a WAV path for ``file_path``, converting it first if it is an MP3.

        Only the file extension is inspected and replaced, so an ``mp3``
        occurring elsewhere in the path is left untouched.
        """
        base, ext = os.path.splitext(file_path)
        if ext.lower() == '.mp3':
            to_file_path = base + '.wav'
            mp3towav(file_path, to_file_path)
            file_path = to_file_path
        return file_path

    @classmethod
    def get_all_sample(cls, ):
        """Return all samples, cached in ``sample4.json``.

        When the cache file exists it is returned as parsed JSON (the tuples
        written earlier come back as lists). Otherwise the samples are built,
        written to the cache and returned.
        """
        file_name = 'sample4.json'
        if os.path.exists(file_name):
            with open(file_name, 'r') as f:
                return json.load(f)

        samples = []
        # NOTE(review): only recording 0 is processed here -- confirm whether
        # the range should cover every labelled recording.
        for i in range(1):
            print('get sample %s' % i)
            samples.extend(cls.get_sample(i))
        with open(file_name, 'w') as f:
            json.dump(samples, f)
        return samples

    @classmethod
    def train_wrapper(cls):
        """Shuffle the samples, split each label 70/30 into train/test and print the train size."""
        samples = cls.get_all_sample()
        label0 = [s for s in samples if s[1] == 0]
        label1 = [s for s in samples if s[1] == 1]
        random.shuffle(label0)
        random.shuffle(label1)

        # Split each label separately so both sets keep the label ratio.
        cut0 = int(len(label0) * 0.7)
        cut1 = int(len(label1) * 0.7)
        train_part = label0[:cut0] + label1[:cut1]
        test_part = label0[cut0:] + label1[cut1:]

        train_datas_sets = [s[0] for s in train_part]
        train_labels_set = [s[1] for s in train_part]
        test_datas_set = [s[0] for s in test_part]
        test_labels_set = [s[1] for s in test_part]
        print(len(train_datas_sets))
        # Training is switched off here; the four lists above are its inputs.
        # cls.train_knn(train_datas_sets, train_labels_set, test_datas_set, test_labels_set)
 

# Script entry point: build (or load cached) samples and run the train/test split.
if __name__ == '__main__':
    LeaningTest.train_wrapper()

以1秒鐘為一個樣本，然後對數據進行計數，返回每個區間的計數，區間間隔是0.025，所以一個樣本的向量長度是40
由於前60s都是前奏，所以不作為訓練數據
由於是人工分割，所以可能有誤差，所以把分割點前后5s的都不作為訓練數據

四、訓練

@classmethod
def train(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
    """Fit several scikit-learn estimators with default settings and print each test score.

    Args:
        train_datas_sets: Training feature vectors.
        train_labels_set: Labels matching ``train_datas_sets``.
        test_datas_set: Held-out feature vectors.
        test_labels_set: Labels matching ``test_datas_set``.
    """
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LinearRegression
    from sklearn import tree
    from sklearn import svm
    from sklearn.neural_network import MLPClassifier
    from sklearn import neighbors
    # NOTE: LinearRegression is a regressor, so its score() is R^2 rather
    # than classification accuracy and is not comparable with the others.
    candidates = [svm.SVC, LogisticRegression, LinearRegression, tree.DecisionTreeClassifier,
                  neighbors.KNeighborsClassifier, MLPClassifier, GaussianNB]
    for mechine in candidates:
        clf = mechine()
        clf.fit(train_datas_sets, train_labels_set)  # train
        score = clf.score(test_datas_set, test_labels_set)  # evaluate on the held-out set
        # Single formatted string prints identically on Python 2 and 3.
        print('score %s %s' % (mechine, score))

訓練結果:

score <class 'sklearn.svm.classes.SVC'> 0.7203252032520325
score <class 'sklearn.linear_model.logistic.LogisticRegression'> 0.8886178861788618
score <class 'sklearn.linear_model.base.LinearRegression'> 0.40864632529611417
score <class 'sklearn.tree.tree.DecisionTreeClassifier'> 0.8888888888888888
score <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 0.9224932249322493
score <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'> 0.835230352303523
score <class 'sklearn.naive_bayes.GaussianNB'> 0.8035230352303523

使用多種模型進行訓練，得到的結果為knn的准確率最高，達到了0.92

所以訓練knn模型，並保存為pickle

@classmethod
def train_knn(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
    """Fit a default KNeighborsClassifier, print its test score and pickle it.

    The fitted classifier is written to ``knn.model.pkl``.

    Args:
        train_datas_sets: Training feature vectors.
        train_labels_set: Labels matching ``train_datas_sets``.
        test_datas_set: Held-out feature vectors.
        test_labels_set: Labels matching ``test_datas_set``.
    """
    from sklearn import neighbors
    mechine = neighbors.KNeighborsClassifier
    clf = mechine()
    clf.fit(train_datas_sets, train_labels_set)  # train
    score = clf.score(test_datas_set, test_labels_set)  # mean accuracy on the held-out set
    # Single formatted string prints identically on Python 2 and 3.
    print('score %s %s' % (mechine, score))
    # NOTE(review): pickle_utils is not defined in this section -- confirm it is imported.
    pickle_utils.dump(clf, 'knn.model.pkl')

五、分割文件

@classmethod
def get_cut_sce(cls, file_path, model):
    """Locate the second at which to cut the recording; None when no cut is found.

    Starting at second 60, each one-second window is turned into interval
    counts and classified by ``model``. The second of the first window that
    completes three consecutive predictions of 1 is returned.
    """
    wav_path = cls.to_wav(file_path)
    signal, framerate, n_frames = pre_deal(wav_path)
    total_sec = int(n_frames / framerate)

    window = 1
    music_run = 0  # number of consecutive windows predicted as 1 so far
    for sec in range(60, total_sec, window):
        begin = get_index(framerate, 0, sec)
        end = get_index(framerate, 0, sec + window)
        features = cls.sample_cnt(signal[begin:end])
        prediction = model.predict([features])
        if prediction == 1:
            music_run += 1
            if music_run >= 3:
                return sec
        else:
            music_run = 0

    return None

@classmethod
def get_min(cls, sec):
    """Format a number of seconds as ``minutes:seconds``, print it and return it.

    Args:
        sec: Number of seconds, or None when no position is known.

    Returns:
        The formatted text (no zero padding, e.g. ``'2:5'`` for 125), or None
        when ``sec`` is None, in which case nothing is printed.
    """
    if sec is None:
        return None
    minutes, seconds = divmod(int(sec), 60)
    text = '%s:%s' % (minutes, seconds)
    print(text)  # printing is kept for callers that relied on it
    return text

@classmethod
def predict(cls, file_path=r'D:\BaiduNetdiskDownload\c.mp3'):
    """Print the predicted cut position of one recording.

    Args:
        file_path: Recording to analyse; defaults to the previously
            hard-coded sample file.
    """
    # NOTE(review): pickle_utils is not defined in this section -- confirm it is imported.
    model = pickle_utils.load('knn.model.pkl')
    sec = cls.get_cut_sce(file_path, model)
    if sec is None:
        # get_cut_sce returns None when no cut point is found.
        print('error can not find sec %s' % file_path)
        return
    cls.get_min(sec)  # prints the position as minutes:seconds
    print('sec %s' % sec)

@classmethod
def cut_song(cls, file_path, to_file_path, file_name):
    """Trim a recording at the predicted cut point and export it as MP3.

    Args:
        file_path: MP3 file to analyse and trim.
        to_file_path: Destination of the trimmed MP3 (exported at 64k bitrate).
        file_name: Original file name, used only in the printed messages;
            byte strings are decoded as GBK for display.

    Returns:
        1 when a trimmed file was written, 0 when no cut point was found.
    """
    # Decode only byte strings so the message also works with text names.
    shown_name = file_name.decode('gbk') if isinstance(file_name, bytes) else file_name
    print('cut_song %s %s' % (shown_name, file_path))
    sec = cls.get_cut_sce(file_path, cls.model)
    if sec is None:
        print('error can not find sec %s %s' % (file_path, shown_name))
        return 0
    # Local import, as in mp3towav, so the name is in scope in this function.
    from pydub import AudioSegment
    song = AudioSegment.from_mp3(file_path)
    song = song[:sec * 1000]  # pydub slices are in milliseconds
    song.export(to_file_path, 'mp3', bitrate='64k')
    return 1


@classmethod
def cut_songs(cls, ):
    """Trim every MP3 in the configured folder, writing ``<name>.cut.mp3`` next to it.

    Each source file is copied to a fixed ASCII-only temp path, trimmed
    there, and the temp files are then moved to the ``to_del`` folder.
    Files that already have a ``.cut.mp3`` companion are skipped.
    """
    import traceback

    root_path = r'D:\BaiduNetdiskDownload\聽世界-戰國5(156集)64kbps'
    del_path = r'D:\BaiduNetdiskDownload\to_del'
    # pydub does not handle Chinese paths on Windows, so all work is done on
    # an ASCII-only temp path (raw string: '\B' is not a valid escape).
    tmp_file_path = r'D:\BaiduNetdiskDownload\test.mp3'
    tmp_wav_path = os.path.splitext(tmp_file_path)[0] + '.wav'
    tmp_to_file_path = tmp_file_path + '.cut.mp3'

    for f in os.listdir(root_path):
        # Match on the extension; a substring test would also skip any title
        # that merely contains 'cut'.
        if not f.endswith('.mp3') or f.endswith('.cut.mp3'):
            continue
        file_path = root_path + '\\' + f
        cut_path = file_path + '.cut.mp3'
        if os.path.exists(cut_path):
            shown = cut_path.decode('gbk') if isinstance(cut_path, bytes) else cut_path
            print('exist %s' % shown)
            continue

        # mp3towav reuses an existing WAV, so a leftover from an interrupted
        # run would be analysed in place of this song.
        if os.path.exists(tmp_wav_path):
            os.remove(tmp_wav_path)

        shutil.copy(file_path, tmp_file_path)
        ret = cls.cut_song(tmp_file_path, tmp_to_file_path, f)
        shutil.move(tmp_file_path, del_path + '\\del1_' + f)
        shutil.move(tmp_wav_path, del_path + '\\del3_' + f)
        if not ret:
            # No cut point was found, so no trimmed file exists to copy.
            continue
        try:
            shutil.copy(tmp_to_file_path, cut_path)
            shutil.move(tmp_to_file_path, del_path + '\\del2_' + f)
        except EnvironmentError:
            # Best effort: report the file-system failure and carry on.
            print(traceback.format_exc())




@classmethod
def test(cls):
    """Load an MP3 through pydub from a GBK-encoded path containing Chinese characters.

    The decoded audio is assigned to a local variable and then discarded.
    NOTE(review): presumably a manual probe of how pydub copes with
    non-ASCII Windows paths -- confirm before relying on it.
    """
    song = AudioSegment.from_mp3(u'D:\BaiduNetdiskDownload\測試\\a.mp3'.encode('gbk'))


# Script entry point: load the pickled model, then trim every song in the configured folder.
if __name__ == '__main__':
    LeaningTest.load_model()
    LeaningTest.cut_songs()

即使准確率達到0.92，但是還沒有到100%，為了提升正確率，所以連續3s都判斷為音樂，才分割。
由於pydub不支持Windows的中文路徑，所以只能把源文件移到一個臨時的英文目錄，然後執行分割，最後把臨時文件移走

未經同意，請不要轉載

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 常用機器學習方法總結各種機器學習方法的優缺點基於手寫數字識別數據集的機器學習方法對比研究《統計學習方法》：統計學習機器學習統計機器學習文本分類(機器學習方法) 機器學習方法--分類、回歸、聚類 Python基於機器學習方法實現的電影推薦系統 R語言進行機器學習方法及實例（一）機器學習丨《機器學習》、《統計學習方法》思維導圖把音頻文件壓縮變小的方法