hanlp学习六：文本分类

本文转载自查看原文 2020-01-26 21:04 1101 自然语言处理

一概念：

文本分类：将一个文档归类到一个或多个类别中的自然语言处理任务

类别即标签

多标签分类：一篇文档可能属于多个类别

二流程：

a.人工标注文档的类别生成文本分类语料库

代码：

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-03 19:36
# 《自然语言处理入门》11.2 文本分类语料库
# 配套书籍：http://nlp.hankcs.com/book.php
# 讨论答疑：https://bbs.hankcs.com/
from pyhanlp import *
import zipfile
import os

from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    获取测试数据路径，位于$root/data/test，根目录由配置文件指定。
    :return:
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path



sogou_corpus_path = ensure_data('搜狗文本分类语料库迷你版',
                                'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')

AbstractDataSet = JClass('com.hankcs.hanlp.classification.corpus.AbstractDataSet')
Document = JClass('com.hankcs.hanlp.classification.corpus.Document')
FileDataSet = JClass('com.hankcs.hanlp.classification.corpus.FileDataSet')
MemoryDataSet = JClass('com.hankcs.hanlp.classification.corpus.MemoryDataSet')

# 演示加载文本分类语料库
if __name__ == '__main__':
    dataSet = MemoryDataSet()  # ①将数据集加载到内存中
    dataSet.load(sogou_corpus_path)  # ②加载data/test/搜狗文本分类语料库迷你版
    dataSet.add("自然语言处理", "自然语言处理很有趣")  # ③新增样本
    allClasses = dataSet.getCatalog().getCategories()  # ④获取标注集
    print("标注集：%s" % (allClasses))
    for document in dataSet.iterator():
        print("第一篇文档的类别：" + allClasses.get(document.category))
        break

b.利用语料训练模型

特征提取以及分类器处理

特征提取步骤：

（1）分词

（2）分词等预处理结束后，从这些词语中挑出有用的子集作为特征，利用卡方特征选择筛选词语

（3）确定特征之后，将文档转化为词袋向量

分类器包括：朴素贝叶斯以及支持向量机

支持向量机代码

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-06 14:09
# 《自然语言处理入门》11.5.2 线性支持向量机文本分类器实现
# 配套书籍：http://nlp.hankcs.com/book.php
# 讨论答疑：https://bbs.hankcs.com/
from pyhanlp.static import STATIC_ROOT, download
import os

from tests.demos.demo_text_classification import sogou_corpus_path


def install_jar(name, url):
    dst = os.path.join(STATIC_ROOT, name)
    if os.path.isfile(dst):
        return dst
    download(url, dst)
    return dst


install_jar('text-classification-svm-1.0.2.jar', 'http://file.hankcs.com/bin/text-classification-svm-1.0.2.jar')
install_jar('liblinear-1.95.jar', 'http://file.hankcs.com/bin/liblinear-1.95.jar')
from pyhanlp import *

LinearSVMClassifier = SafeJClass('com.hankcs.hanlp.classification.classifiers.LinearSVMClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')


def train_or_load_classifier():
    model_path = sogou_corpus_path + '.svm.ser'
    if os.path.isfile(model_path):
        return LinearSVMClassifier(IOUtil.readObjectFrom(model_path))
    classifier = LinearSVMClassifier()
    classifier.train(sogou_corpus_path)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return LinearSVMClassifier(model)


def predict(classifier, text):
    print("《%16s》\t属于分类\t【%s】" % (text, classifier.classify(text)))
    # 如需获取离散型随机变量的分布，请使用predict接口
    # print("《%16s》\t属于分类\t【%s】" % (text, classifier.predict(text)))


if __name__ == '__main__':
    classifier = train_or_load_classifier()
    predict(classifier, "C罗获2018环球足球奖最佳球员 德尚荣膺最佳教练")
    predict(classifier, "潜艇具有很强的战略威慑能力与实战能力")
    predict(classifier, "研究生考录模式亟待进一步专业化")
    predict(classifier, "如果真想用食物解压,建议可以食用燕麦")
    predict(classifier, "通用及其部分竞争对手目前正在考虑解决库存问题")

c.利用模型预测文档的类别

三情感分析：

a.概念：提取文本的主观信息,找出文本所对应的正负情感态度

b.流程：

（1）:准备情感语料库

（2）利用情感语料库，训练分类模型

c.利用酒店评论语料库进行的情感分类试验

# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53

from pyhanlp import *
from pyhanlp import *
import zipfile
import os

from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    获取测试数据路径，位于$root/data/test，根目录由配置文件指定。
    :return:
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path
    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# 中文情感挖掘语料-ChnSentiCorp 谭松波
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论", "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 情感极性是 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    classifier = NaiveBayesClassifier()
    #  创建分类器，更高级的功能请参考IClassifier的接口定义
    classifier.train(chn_senti_corp)
    #  训练后的模型支持持久化，下次就不必训练了
    predict(classifier, "前台客房服务态度非常好！早餐很丰富，房价很干净。再接再厉！")
    predict(classifier, "结果大失所望，灯光昏暗，空间极其狭小，床垫质量恶劣，房间还伴着一股霉味。")
    predict(classifier, "可利用文本分类实现情感分析，效果不是不行")

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找 机器学习-文本分类（2）-新闻文本分类文本分类学习（二）文本表示深度学习之文本分类模型-基于transformer NLP学习（2）----文本分类模型文本分类学习（六） AdaBoost和SVM 深度学习在文本分类中的应用 NLP文本分类学习笔记7.1：基于ERNIE的文本分类文本分类实战基于paddlepaddle的文本分类文本分类概述