I. English Data Cleaning
English data cleaning consists of expanding contractions, removing non-alphabetic symbols, normalizing abbreviations of proper nouns, stemming, and lemmatization.
1. The regular cleaning approach
Remove non-alphabetic symbols and expand common contractions.
#coding=utf-8
import re


class rule:
    # Regular expressions: special symbols (double quotes, periods, commas, etc.)
    # are replaced with a space; letters and the apostrophe are kept
    pat_letter = re.compile(r"[^a-zA-Z \']+")
    # Expand common English contractions
    pat_is = re.compile(r"(it|he|she|that|this|there|here)(\'s)", re.I)
    pat_s = re.compile(r"([a-zA-Z])(\'s)")      # possessives such as today's
    pat_not = re.compile(r"([a-zA-Z])(n\'t)")   # contraction of not
    pat_would = re.compile(r"([a-zA-Z])(\'d)")  # contraction of would
    pat_will = re.compile(r"([a-zA-Z])(\'ll)")  # contraction of will
    pat_am = re.compile(r"([Ii])(\'m)")         # contraction of am
    pat_are = re.compile(r"([a-zA-Z])(\'re)")   # contraction of are
    pat_ve = re.compile(r"([a-zA-Z])(\'ve)")    # contraction of have


def replace_abbreviations(text):
    new_text = text
    new_text = rule.pat_letter.sub(' ', new_text).strip().lower()
    new_text = rule.pat_is.sub(r"\1 is", new_text)  # \1 is the first captured group
    new_text = rule.pat_s.sub(r"\1 ", new_text)
    new_text = rule.pat_not.sub(r"\1 not", new_text)
    new_text = rule.pat_would.sub(r"\1 would", new_text)
    new_text = rule.pat_will.sub(r"\1 will", new_text)
    new_text = rule.pat_am.sub(r"\1 am", new_text)
    new_text = rule.pat_are.sub(r"\1 are", new_text)
    new_text = rule.pat_ve.sub(r"\1 have", new_text)
    new_text = new_text.replace('\'', ' ')
    return new_text


if __name__ == '__main__':
    text = 'there\'re many recen\'t \'t extensions of this basic idea to include attention. 120,yes\'s it\'s'
    text = replace_abbreviations(text)
    print(text)  # there are many rece not t extensions of this basic idea to include attention yes it is
2. A more detailed approach
Besides expanding common contractions, this version also normalizes some proper nouns and handles punctuation.
import re


def clean_text(text):
    """
    Clean text
    :param text: the string of text
    :return: text string after cleaning
    """
    # acronym
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"cannot", "can not ", text)
    text = re.sub(r"what\'s", "what is", text)
    text = re.sub(r"What\'s", "what is", text)
    text = re.sub(r"\'ve ", " have ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"i\'m", "i am ", text)
    text = re.sub(r"I\'m", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e mail ", " email ", text)
    text = re.sub(r" e \- mail ", " email ", text)
    text = re.sub(r" e\-mail ", " email ", text)

    # spelling correction
    text = re.sub(r"ph\.d", "phd", text)
    text = re.sub(r"PhD", "phd", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" fb ", " facebook ", text)
    text = re.sub(r"facebooks", " facebook ", text)
    text = re.sub(r"facebooking", " facebook ", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" us ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" U\.S\. ", " america ", text)
    text = re.sub(r" US ", " america ", text)
    text = re.sub(r" American ", " america ", text)
    text = re.sub(r" America ", " america ", text)
    text = re.sub(r" mbp ", " macbook-pro ", text)
    text = re.sub(r" mac ", " macbook ", text)
    text = re.sub(r"macbook pro", "macbook-pro", text)
    text = re.sub(r"macbook-pros", "macbook-pro", text)
    text = re.sub(r" 1 ", " one ", text)
    text = re.sub(r" 2 ", " two ", text)
    text = re.sub(r" 3 ", " three ", text)
    text = re.sub(r" 4 ", " four ", text)
    text = re.sub(r" 5 ", " five ", text)
    text = re.sub(r" 6 ", " six ", text)
    text = re.sub(r" 7 ", " seven ", text)
    text = re.sub(r" 8 ", " eight ", text)
    text = re.sub(r" 9 ", " nine ", text)
    text = re.sub(r"googling", " google ", text)
    text = re.sub(r"googled", " google ", text)
    text = re.sub(r"googleable", " google ", text)
    text = re.sub(r"googles", " google ", text)
    text = re.sub(r"dollars", " dollar ", text)

    # punctuation
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"-", " - ", text)
    text = re.sub(r"/", " / ", text)
    text = re.sub(r"\\", r" \\ ", text)
    text = re.sub(r"=", " = ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r"\.", " . ", text)
    text = re.sub(r",", " , ", text)
    text = re.sub(r"\?", " ? ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\"", " \" ", text)
    text = re.sub(r"&", " & ", text)
    text = re.sub(r"\|", " | ", text)
    text = re.sub(r";", " ; ", text)
    text = re.sub(r"\(", " ( ", text)
    text = re.sub(r"\)", " ) ", text)

    # symbol replacement
    text = re.sub(r"&", " and ", text)
    text = re.sub(r"\|", " or ", text)
    text = re.sub(r"=", " equal ", text)
    text = re.sub(r"\+", " plus ", text)
    text = re.sub(r"\$", " dollar ", text)

    # remove extra space
    text = ' '.join(text.split())
    return text


if __name__ == '__main__':
    text = 'there\'re many recen\'t \'t extensions of this basic idea to include attention. 120,yes\'s it\'s'
    text = clean_text(text)
    print(text)  # there are many rece not t extensions of this basic idea to include attention . 120 , yes s it s
3. An approach that also handles word roots and affixes
Remove symbols, expand contractions, and reduce each word to its root form (lemma).
#coding=utf-8
# Requires the NLTK data packages 'punkt', 'averaged_perceptron_tagger' and 'wordnet',
# e.g. nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')
import re

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize


class rule:
    # Regular expressions: special symbols (double quotes, periods, commas, etc.)
    # are replaced with a space; letters and the apostrophe are kept
    pat_letter = re.compile(r"[^a-zA-Z \']+")
    # Expand common English contractions
    pat_is = re.compile(r"(it|he|she|that|this|there|here)(\'s)", re.I)
    pat_s = re.compile(r"([a-zA-Z])(\'s)")      # possessives such as today's
    pat_not = re.compile(r"([a-zA-Z])(n\'t)")   # contraction of not
    pat_would = re.compile(r"([a-zA-Z])(\'d)")  # contraction of would
    pat_will = re.compile(r"([a-zA-Z])(\'ll)")  # contraction of will
    pat_am = re.compile(r"([Ii])(\'m)")         # contraction of am
    pat_are = re.compile(r"([a-zA-Z])(\'re)")   # contraction of are
    pat_ve = re.compile(r"([a-zA-Z])(\'ve)")    # contraction of have


def replace_abbreviations(text):
    new_text = text
    new_text = rule.pat_letter.sub(' ', new_text).strip().lower()
    new_text = rule.pat_is.sub(r"\1 is", new_text)  # \1 is the first captured group
    new_text = rule.pat_s.sub(r"\1 ", new_text)
    new_text = rule.pat_not.sub(r"\1 not", new_text)
    new_text = rule.pat_would.sub(r"\1 would", new_text)
    new_text = rule.pat_will.sub(r"\1 will", new_text)
    new_text = rule.pat_am.sub(r"\1 am", new_text)
    new_text = rule.pat_are.sub(r"\1 are", new_text)
    new_text = rule.pat_ve.sub(r"\1 have", new_text)
    new_text = new_text.replace('\'', ' ')
    return new_text


# Treebank tags and WordNet POS labels overlap; map a treebank tag to a WordNet POS
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):  # adverbs
        return nltk.corpus.wordnet.ADV
    else:
        return ''


def merge(words):
    lmtzr = WordNetLemmatizer()
    new_words = ''
    words = nltk.pos_tag(word_tokenize(words))  # tag is like [('bigger', 'JJR')]
    for word in words:
        pos = get_wordnet_pos(word[1])
        if pos:
            # lemmatize() reduces the word to its base form for the given POS
            word = lmtzr.lemmatize(word[0], pos)
            new_words += ' ' + word
        else:
            new_words += ' ' + word[0]
    return new_words


def clear_data(text):
    text = replace_abbreviations(text)
    text = merge(text)
    text = text.strip()
    return text


if __name__ == '__main__':
    text = 'there\'re many recen\'t \'t extensions of this basic had idea to include attention. 120,had'
    text = clear_data(text)
    print(text)  # there be many rece not t extension of this basic have idea to include attention have
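The code above lemmatizes: each word is mapped to its dictionary form according to its part of speech. The overview also mentions stemming; as a small comparison sketch (not part of the original code, assuming NLTK is installed), a rule-based stemmer such as PorterStemmer simply strips suffixes without consulting a dictionary:

# Minimal comparison sketch (assumes NLTK is available); PorterStemmer
# strips suffixes by rule instead of looking up dictionary forms.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
words = 'there are many extensions of this basic idea to include attention'.split()
print(' '.join(stemmer.stem(w) for w in words))
# e.g. "extensions" becomes "extens", whereas the lemmatizer above gives "extension"

A lemmatizer keeps real words but needs POS tags; a stemmer is faster but can produce non-words, so which to use depends on the downstream task.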
II. Chinese Data Cleaning
Chinese data cleaning mainly removes stop words. Stop words are high-frequency pronouns, conjunctions, prepositions and similar words that carry no useful signal for text classification. The usual practice is to maintain a stop-word list and delete any word that appears in it during feature extraction, which is essentially a form of feature selection. For reference, see HanLP's stop-word list: https://github.com/hankcs/HanLP
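As a minimal sketch of this step (the file name stopwords.txt and the sample sentence are placeholders, not from the original text), the usual flow is to segment the sentence with jieba and then drop every token that appears in the stop-word list:

# coding=utf-8
# Minimal sketch: segment with jieba, then drop tokens found in a stop-word list.
# 'stopwords.txt' is a placeholder path; e.g. save the HanLP stop-word list there.
import jieba


def load_stopwords(path='stopwords.txt'):
    # one stop word per line, UTF-8 encoded
    with open(path, encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())


def remove_stopwords(text, stopwords):
    words = jieba.lcut(text)  # word segmentation
    return [w for w in words if w.strip() and w not in stopwords]


if __name__ == '__main__':
    stopwords = load_stopwords()
    print(remove_stopwords('這是一個關於文本分類的簡單例子', stopwords))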
