NLP Part-of-Speech Tagging in Python


1. Key Points

Covers part-of-speech (POS) tagging for both Chinese and English.
The main libraries used are nltk and jieba.
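Both libraries are available from PyPI (for example, pip install nltk jieba); the NLTK resources mentioned in the FAQ below are downloaded separately with nltk.download.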

2. Code

# coding=utf-8

import nltk
from nltk.corpus import stopwords
"""
標注步驟:
    1、清洗,分詞
    2、標注
    
FAQ:
    1、 Resource punkt not found.
        請安裝punkt模塊 
    2、安裝average_perceptron tagger
    3、Resource sinica_treebank not found
        請安裝sinica_treebank模塊
"""
def english_label():
    """
    English POS tagging
    :return:
    """
    # Tokenize the sample text
    text = "Sentiment analysis is a challenging subject in machine learning.\
     People express their emotions in language that is often obscured by sarcasm,\
      ambiguity, and plays on words, all of which could be very misleading for \
      both humans and computers.".lower()
    text_list = nltk.word_tokenize(text)
    # Remove punctuation tokens
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    text_list = [word for word in text_list if word not in english_punctuations]
    # Remove English stopwords
    stops = set(stopwords.words("english"))
    text_list = [word for word in text_list if word not in stops]

    tagged = nltk.pos_tag(text_list)  # POS-tag: returns a list of (token, Penn Treebank tag) pairs
    print(tagged)


def chinese_label():
    """
    Chinese POS tagging.
    fool (foolnltk) can also be used for Chinese POS tagging, and HanLP has
    its own POS tagset; this example uses jieba.
    :return:
    """
    import jieba.posseg as pseg
    import re

    text = "我愛你,是粉色,舒服 ,舒服,士大夫"
    # Replace fullwidth commas with spaces before tagging
    cleaned = re.sub(r'[,]', " ", text)
    posseg_list = pseg.cut(cleaned)  # generator of (word, flag) pairs
    print(' '.join('%s/%s' % (word, tag) for (word, tag) in posseg_list))
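
To run both examples as a script, a standard entry point can be appended (a minimal sketch; the original code defines the functions but never calls them):

if __name__ == '__main__':
    english_label()
    chinese_label()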

 

