hanlp學習六：文本分類

本文轉載自查看原文 2020-01-26 21:04 1101 自然語言處理

一、概念：

文本分類：將一個文檔歸類到一個或多個類別中的自然語言處理任務

類別即標籤

多標籤分類：一篇文檔可能屬於多個類別

二、流程：

a. 人工標註文檔的類別，生成文本分類語料庫

代碼：

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-03 19:36
# 《自然語言處理入門》11.2 文本分類語料庫
# 配套書籍：http://nlp.hankcs.com/book.php
# 討論答疑：https://bbs.hankcs.com/
from pyhanlp import *
import zipfile
import os

from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the test data directory, creating it when it does not exist.

    The directory is ``<HANLP_DATA_PATH>/test``; per the original note the
    data root itself is determined by the HanLP configuration file.

    :return: path of the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs also creates a missing parent directory, and exist_ok=True
    # removes the window between an "is it there?" check and the creation
    # in which another process could create the directory first.
    os.makedirs(data_path, exist_ok=True)
    return data_path


def ensure_data(data_name, data_url):
    """
    Make sure a test data set is available locally and return its path.

    When ``<test data dir>/<data_name>`` already exists it is returned as
    is. Otherwise ``data_url`` is downloaded. A URL ending in ``.zip`` is
    saved as ``<data_name>.zip``, extracted into the test data directory,
    and the archive is removed afterwards.

    :param data_name: file or directory name of the data set
    :param data_url: URL the data set is downloaded from
    :return: ``<test data dir>/<data_name>``
    """
    data_dir = test_data_path()
    target = os.path.join(data_dir, data_name)
    if os.path.exists(target):
        return target

    if not data_url.endswith('.zip'):
        # Plain file: download straight to its final location.
        download(data_url, target)
        return target

    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as zipped:
        zipped.extractall(data_dir)
    remove_file(archive_path)
    return target



# The data set name has to equal the top-level folder contained in the zip:
# ensure_data() extracts the archive into the test data directory and then
# returns <test data dir>/<name> without checking what was extracted.
# NOTE(review): the archive is expected to ship that folder under its
# Simplified Chinese name, so the Traditional Chinese spelling would yield
# a path that never exists -- confirm against the downloaded archive.
sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
                                'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')

# Java classes from HanLP's classification corpus package.
AbstractDataSet = JClass('com.hankcs.hanlp.classification.corpus.AbstractDataSet')
Document = JClass('com.hankcs.hanlp.classification.corpus.Document')
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')

# Demo: load a text classification corpus
if __name__ == '__main__':
    corpus = MemoryDataSet()  # (1) keep the whole data set in memory
    corpus.load(sogou_corpus_path)  # (2) load the Sogou mini corpus under data/test
    corpus.add("自然語言處理", "自然語言處理很有趣")  # (3) add one more sample
    catalog = corpus.getCatalog()
    categories = catalog.getCategories()  # (4) fetch the label set
    print("標注集：%s" % (categories))
    # Only the first document is reported, hence the immediate break.
    for first_document in corpus.iterator():
        category_name = categories.get(first_document.category)
        print("第一篇文檔的類別：" + category_name)
        break

b.利用語料訓練模型

特徵提取以及分類器處理

特徵提取步驟：

（1）分詞

（2）分詞等預處理結束後，從這些詞語中挑出有用的子集作為特徵，即利用卡方特徵選擇篩選詞語

（3）確定特徵之後，將文檔轉化為詞袋向量

分類器包括：樸素貝葉斯以及支持向量機

支持向量機代碼

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-06 14:09
# 《自然語言處理入門》11.5.2 線性支持向量機文本分類器實現
# 配套書籍：http://nlp.hankcs.com/book.php
# 討論答疑：https://bbs.hankcs.com/
from pyhanlp.static import STATIC_ROOT, download
import os

from tests.demos.demo_text_classification import sogou_corpus_path


def install_jar(name, url):
    """
    Make sure the jar ``name`` exists under ``STATIC_ROOT``.

    The jar is downloaded from ``url`` only when no regular file with that
    name is present yet.

    :param name: file name of the jar inside ``STATIC_ROOT``
    :param url: URL the jar is downloaded from when it is missing
    :return: path of the jar file
    """
    jar_path = os.path.join(STATIC_ROOT, name)
    if not os.path.isfile(jar_path):
        download(url, jar_path)
    return jar_path


# Fetch the two jars into STATIC_ROOT; install_jar() skips the download when
# the file is already there.
install_jar('text-classification-svm-1.0.2.jar', 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', 'http://file.hankcs.com/bin/liblinear-1.95.jar')
# NOTE(review): this wildcard import sits after the install_jar() calls,
# presumably so the jars are on disk before the Java classes below are
# resolved -- confirm before reordering these statements.
from pyhanlp import *

# Java classes used by the SVM demo: the classifier itself and the I/O
# helper used to save and load its model.
LinearSVMClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')


def train_or_load_classifier():
    """
    Return a linear SVM classifier for the Sogou corpus.

    The model is cached next to the corpus as ``<corpus path>.svm.ser``.
    When that file exists the model is read from it; otherwise a classifier
    is trained on the corpus and its model is saved to that file.

    :return: a ``LinearSVMClassifier`` wrapping the loaded or trained model
    """
    model_path = sogou_corpus_path + '.svm.ser'
    if not os.path.isfile(model_path):
        trainer = LinearSVMClassifier()
        trainer.train(sogou_corpus_path)
        model = trainer.getModel()
        IOUtil.saveObjectTo(model, model_path)
    else:
        model = IOUtil.readObjectFrom(model_path)
    return LinearSVMClassifier(model)


def predict(classifier, text):
    """
    Classify ``text`` and print the text together with its category.

    :param classifier: object whose ``classify(text)`` returns the category
    :param text: the document to classify
    """
    label = classifier.classify(text)
    template = "《%16s》\t屬於分類\t【%s】"
    print(template % (text, label))
    # To obtain the distribution over all categories instead of a single
    # label, call classifier.predict(text) rather than classify(text).


if __name__ == '__main__':
    # Train (or load the cached model) once, then classify a few headlines.
    svm_classifier = train_or_load_classifier()
    headlines = (
        "C羅獲2018環球足球獎最佳球員 德尚榮膺最佳教練",
        "潛艇具有很強的戰略威懾能力與實戰能力",
        "研究生考錄模式亟待進一步專業化",
        "如果真想用食物解壓,建議可以食用燕麥",
        "通用及其部分競爭對手目前正在考慮解決庫存問題",
    )
    for headline in headlines:
        predict(svm_classifier, headline)

c.利用模型預測文檔的類別

三、情感分析：

a.概念：提取文本的主觀信息,找出文本所對應的正負情感態度

b.流程：

（1）準備情感語料庫

（2）利用情感語料庫，訓練分類模型

c.利用酒店評論語料庫進行的情感分類試驗

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53

from pyhanlp import *
from pyhanlp import *
import zipfile
import os

from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the test data directory, creating it when it does not exist.

    The directory is ``<HANLP_DATA_PATH>/test``; per the original note the
    data root itself is determined by the HanLP configuration file.

    :return: path of the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs also creates a missing parent directory, and exist_ok=True
    # removes the window between an "is it there?" check and the creation
    # in which another process could create the directory first.
    os.makedirs(data_path, exist_ok=True)
    return data_path


def ensure_data(data_name, data_url):
    """
    Make sure a test data set is available locally and return its path.

    When ``<test data dir>/<data_name>`` already exists it is returned as
    is. Otherwise ``data_url`` is downloaded. A URL ending in ``.zip`` is
    saved as ``<data_name>.zip``, extracted into the test data directory,
    and the archive is removed afterwards.

    :param data_name: file or directory name of the data set
    :param data_url: URL the data set is downloaded from
    :return: ``<test data dir>/<data_name>``
    """
    data_dir = test_data_path()
    target = os.path.join(data_dir, data_name)
    if os.path.exists(target):
        return target

    if not data_url.endswith('.zip'):
        # Plain file: download straight to its final location.
        download(data_url, target)
        return target

    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as zipped:
        zipped.extractall(data_dir)
    remove_file(archive_path)
    return target

# Java classes from HanLP's classification package used by this demo.
IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# Chinese sentiment mining corpus ChnSentiCorp (Tan Songbo), hotel reviews.
# The data set name has to equal the top-level folder contained in the zip:
# ensure_data() extracts the archive and returns <test data dir>/<name>
# without checking what was extracted.
# NOTE(review): the archive is expected to ship that folder under its
# Simplified Chinese name, so the Traditional Chinese spelling would yield
# a path that never exists -- confirm against the downloaded archive.
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论", "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    """
    Classify ``text`` and print the text together with its sentiment label.

    :param classifier: object whose ``classify(text)`` returns the label
    :param text: the review to classify
    """
    polarity = classifier.classify(text)
    template = "《%s》 情感極性是 【%s】"
    print(template % (text, polarity))


if __name__ == '__main__':
    # Create the classifier; see the IClassifier interface definition for
    # more advanced features.
    sentiment_classifier = NaiveBayesClassifier()
    sentiment_classifier.train(chn_senti_corp)
    # A trained model can be persisted so that later runs need not retrain.
    reviews = (
        "前台客房服務態度非常好！早餐很豐富，房價很干凈。再接再厲！",
        "結果大失所望，燈光昏暗，空間極其狹小，床墊質量惡劣，房間還伴着一股霉味。",
        "可利用文本分類實現情感分析，效果不是不行",
    )
    for review in reviews:
        predict(sentiment_classifier, review)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 機器學習-文本分類（2）-新聞文本分類文本分類學習（二）文本表示深度學習之文本分類模型-基於transformer NLP學習（2）----文本分類模型文本分類學習（六） AdaBoost和SVM 深度學習在文本分類中的應用 NLP文本分類學習筆記7.1：基於ERNIE的文本分類文本分類實戰基於paddlepaddle的文本分類文本分類概述