spaCy简单入门


安装spaCy

pip install spacy

导入工具包和英文模型

#python -m spacy download en

文本处理

import spacy

# Load the English pipeline by its full package name. The short 'en'
# alias relied on shortcut links, which spaCy v3 no longer supports, so
# spacy.load('en') fails there.
nlp = spacy.load('en_core_web_sm')

# Tokenization: iterating over the parsed document yields its tokens.
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
for token in doc:
    print(token)

# Sentence segmentation: doc.sents yields one span per sentence.
for sent in doc.sents:
    print(sent)

Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.
Weather is good, very windy and sunny.
We have no classes in the afternoon.

词性

# Print every token joined to its part-of-speech tag by a hyphen.
for tok in doc:
    print(tok, tok.pos_, sep='-')
Weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
.-PUNCT
We-PRON
have-AUX
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT

命名实体识别

from spacy import displacy

# Parse a sentence that mentions a place and a person.
doc = nlp('I went to beijing where I met my old friend Jack from uni.')

# Print each recognised entity joined to its label by a hyphen.
for entity in doc.ents:
    print(entity, entity.label_, sep='-')

# Render the entity highlighting inline (jupyter=True targets a notebook).
displacy.render(doc, style='ent', jupyter=True)
    
beijing-GPE
Jack-PERSON

I went to

beijing
GPE

where I met my old friend

Jack
PERSON

from uni.

找出文中所有的人名

def getFileContent(path, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        The decoded contents of the file.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

# Read the novel and run the pipeline over it in a single call.
novel_text = getFileContent('./data/pride_and_prejudice.txt')
doc = nlp(novel_text)

# Collect the sentence spans into a list so they can be counted.
sents = list(doc.sents)
print(len(sents))
from collections import Counter,defaultdict

def find_person(doc, top_n=10):
    """Count PERSON entities in a parsed document and return the most frequent.

    Args:
        doc: Parsed document whose ``ents`` yield entities exposing
            ``label_`` and ``lemma_``.
        top_n: How many of the most frequent names to return. Defaults to
            10; ``None`` returns every name.

    Returns:
        A list of ``(lemma, count)`` tuples ordered from the most to the
        least frequent.
    """
    # Mentions are keyed by the lemma exactly as the entity reports it.
    people = Counter(ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON')
    return people.most_common(top_n)
# Show the most frequently mentioned people found in the parsed novel.
top_people = find_person(doc)
print(top_people)
7153
[('Elizabeth', 600), ('Darcy', 355), ('Jane', 277), ('Bingley', 260), ('Bennet', 258), ('Collins', 166), ('Wickham', 108), ('Lizzy', 94), ('Gardiner', 90), ('Lady Catherine', 76)]

恐怖袭击分析

def read_lines(path, encoding='utf-8'):
    """Return the lines of a text file as a list of strings.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        One string per line, each keeping its trailing newline when the
        file has one.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.readlines()

# Load the dataset as a list of lines.
text = read_lines('./data/rand-terrorism-dataset.txt')

# Run the pipeline on every line separately, keeping one parsed result per line.
nlp_list = list(map(nlp, text))

# Group names to look for, written in lower case because entity lemmas
# are lower-cased before being checked against this list. The spaced
# hyphen mirrors how those lemmas appear in the results (e.g. 'Al - Qaeda').
common_terrorist_groups = [
    'taliban',
    'al - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad al - rafidayn',
]

# Place names to look for, written in lower case because entity lemmas
# are lower-cased before being checked against this list.
common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afghanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan',
]
# Maps a group name to a Counter of the place names found in the same article.
location_entity_dict = defaultdict(Counter)

for parsed in nlp_list:
    # People or organisations whose lower-cased lemma is a known group.
    groups_here = [
        ent.lemma_
        for ent in parsed.ents
        if ent.label_ in ('PERSON', 'ORG')
        and ent.lemma_.lower() in common_terrorist_groups
    ]
    # GPE entities whose lower-cased lemma is a known location.
    places_here = [
        ent.lemma_
        for ent in parsed.ents
        if ent.label_ == 'GPE' and ent.lemma_.lower() in common_locations
    ]

    # Count every (group, place) pairing found within the same article.
    # Keys keep the lemma's original casing, so case variants such as
    # 'HAMAS' and 'Hamas' are tallied under separate keys.
    for group in groups_here:
        for place in places_here:
            location_entity_dict[group][place] += 1

location_entity_dict
defaultdict(collections.Counter,
            {'PLO': Counter({'Beirut': 9,
                      'ISRAEL': 17,
                      'Israel': 21,
                      'Iraq': 8,
                      'Palestine': 1}),
             'Fatah': Counter({'Israel': 18,
                      'Beirut': 1,
                      'Iraq': 1,
                      'ISRAEL': 4,
                      'Gaza': 11}),
             'Hamas': Counter({'ISRAEL': 7,
                      'Israel': 19,
                      'Beirut': 1,
                      'Gaza': 70}),
             'Taliban': Counter({'AFGHANISTAN': 3,
                      'Kabul': 45,
                      'Pakistan': 17,
                      'Afghanistan': 263}),
             'HAMAS': Counter({'ISRAEL': 1}),
             'Al - Qaeda': Counter({'Kabul': 1,
                      'Iraq': 4,
                      'Israel': 1,
                      'Baghdad': 5,
                      'Pakistan': 1,
                      'Mosul': 16,
                      'Kirkuk': 2}),
             'al - Qaeda': Counter({'Iraq': 46,
                      'Afghanistan': 6,
                      'Kabul': 2,
                      'Istanbul': 3,
                      'Baghdad': 14,
                      'Palestine': 3,
                      'Mosul': 1,
                      'Kirkuk': 3,
                      'Pakistan': 5}),
             'Bilad al - Rafidayn': Counter({'Iraq': 21,
                      'Baghdad': 32,
                      'Basra': 4,
                      'Mosul': 4,
                      'Palestine': 6}),
             'taliban': Counter({'Kabul': 1})})
import pandas as pd

# Build a table with one column per group and one row per location.
# No dtype is requested here: pairs that never co-occurred start out as
# NaN, which integer columns cannot hold, so the integer conversion is
# done once below, after the gaps have been filled.
df = pd.DataFrame.from_dict(dict(location_entity_dict))

# Treat missing pairs as zero co-occurrences and store whole-number counts.
df = df.fillna(value=0).astype(int)

df
PLO Fatah Hamas Taliban HAMAS Al - Qaeda al - Qaeda Bilad al - Rafidayn taliban
Beirut 9 1 1 0 0 0 0 0 0
ISRAEL 17 4 7 0 1 0 0 0 0
Israel 21 18 19 0 0 1 0 0 0
Iraq 8 1 0 0 0 4 46 21 0
Palestine 1 0 0 0 0 0 3 6 0
Gaza 0 11 70 0 0 0 0 0 0
AFGHANISTAN 0 0 0 3 0 0 0 0 0
Kabul 0 0 0 45 0 1 2 0 1
Pakistan 0 0 0 17 0 1 5 0 0
Afghanistan 0 0 0 263 0 0 6 0 0
Baghdad 0 0 0 0 0 5 14 32 0
Mosul 0 0 0 0 0 16 1 4 0
Kirkuk 0 0 0 0 0 2 3 0 0
Istanbul 0 0 0 0 0 0 3 0 0
Basra 0 0 0 0 0 0 0 4 0
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 10))
# Annotated heatmap of the counts; fmt='d' prints each cell as an integer.
hmap = sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
plt.title('terror')
# Rotate the x-axis tick labels by 30 degrees.
plt.xticks(rotation=30)
plt.show()

output_18_0


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM