安装Spacy
pip install spacy
导入工具包和英文模型
#python -m spacy download en
文本处理
import spacy
# Load the English pipeline.
# NOTE: the 'en' shortcut only resolves on spaCy 2.x; spaCy 3 removed shortcut
# links and expects the full package name instead (e.g. 'en_core_web_sm').
nlp = spacy.load('en')

# Tokenization: iterating over the parsed document yields its tokens.
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
for token in doc:
    print(token)

# Sentence segmentation: doc.sents yields one span per detected sentence.
for sent in doc.sents:
    print(sent)
Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.
Weather is good, very windy and sunny.
We have no classes in the afternoon.
词性
# Part-of-speech tagging: print every token next to its POS tag.
for token in doc:
    print('{}-{}'.format(token, token.pos_))
Weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
.-PUNCT
We-PRON
have-AUX
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT
命名实体识别
from spacy import displacy

# Named-entity recognition: list each detected entity with its label.
doc = nlp('I went to beijing where I met my old friend Jack from uni.')
for ent in doc.ents:
    print('{}-{}'.format(ent, ent.label_))

# Render the sentence with its entities highlighted in the notebook output.
displacy.render(doc, style='ent', jupyter=True)
beijing-GPE
Jack-PERSON
I went to
beijing
GPE
where I met my old friend
Jack
PERSON
from uni.
beijing
GPE
where I met my old friend
Jack
PERSON
from uni.
找出文中所有的人名
def getFileContent(path):
    """Return the entire contents of the text file at ``path`` as one string.

    The file is decoded as UTF-8 explicitly so that the result does not
    depend on the platform's default encoding.

    Args:
        path: Location of the text file to read.

    Returns:
        The file's full text.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
# Run the pipeline over the whole novel, then report how many sentences
# were detected in it.
doc = nlp(getFileContent('./data/pride_and_prejudice.txt'))
sents = list(doc.sents)
print(len(sents))
from collections import Counter,defaultdict
def find_person(doc, top_n=10):
    """Return the most frequently mentioned people in ``doc``.

    Args:
        doc: Object whose ``ents`` attribute iterates over entities exposing
            ``label_`` and ``lemma_`` (a parsed spaCy document in this file).
        top_n: How many names to return. Defaults to 10.

    Returns:
        A list of ``(lemma, count)`` tuples ordered from most to least
        frequent.
    """
    # Only entities labelled PERSON are counted, keyed by their lemma.
    person_counts = Counter(
        ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON'
    )
    return person_counts.most_common(top_n)
# Show the most frequently mentioned people found in the parsed novel.
print(find_person(doc))
7153
[('Elizabeth', 600), ('Darcy', 355), ('Jane', 277), ('Bingley', 260), ('Bennet', 258), ('Collins', 166), ('Wickham', 108), ('Lizzy', 94), ('Gardiner', 90), ('Lady Catherine', 76)]
恐怖袭击分析
def read_lines(path):
    """Return the lines of the text file at ``path`` as a list of strings.

    Each returned line keeps its trailing newline. The file is decoded as
    UTF-8 explicitly so that the result does not depend on the platform's
    default encoding.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per line of the file.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()
# Load the dataset line by line and run the pipeline on every line,
# keeping one parsed document per line.
text = read_lines('./data/rand-terrorism-dataset.txt')
nlp_list = list(map(nlp, text))
# Lower-case names of the groups to look for. Candidate entity names are
# lower-cased before being compared against this list.
common_terrorist_groups = [
    'taliban', 'al - qaeda', 'hamas',
    'fatah', 'plo', 'bilad al - rafidayn',
]
# Lower-case names of the locations to look for. Candidate location names
# are lower-cased before being compared against this list.
common_locations = [
    'iraq', 'baghdad', 'kirkuk', 'mosul',
    'afghanistan', 'kabul', 'basra',
    'palestine', 'gaza', 'israel',
    'istanbul', 'beirut', 'pakistan',
]
# Maps a group name to a Counter of the locations mentioned in the same
# parsed line of the dataset.
location_entity_dict = defaultdict(Counter)

for article in nlp_list:
    # Candidate groups: entities labelled as a person or an organisation.
    article_terrorist_groups = [
        ent.lemma_ for ent in article.ents if ent.label_ in ('PERSON', 'ORG')
    ]
    # Candidate locations: entities labelled GPE.
    article_locations = [
        ent.lemma_ for ent in article.ents if ent.label_ == 'GPE'
    ]

    # Keep only names on the two watch lists; the comparison is done on the
    # lower-cased name.
    terrorist_common = [
        ent for ent in article_terrorist_groups
        if ent.lower() in common_terrorist_groups
    ]
    locations_common = [
        ent for ent in article_locations
        if ent.lower() in common_locations
    ]

    # Count every (group, location) pair that appears in this article.
    # NOTE(review): the keys keep their original casing, so spellings such as
    # 'Hamas' and 'HAMAS' are tallied under separate keys.
    for found_entity in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1

location_entity_dict
defaultdict(collections.Counter,
{'PLO': Counter({'Beirut': 9,
'ISRAEL': 17,
'Israel': 21,
'Iraq': 8,
'Palestine': 1}),
'Fatah': Counter({'Israel': 18,
'Beirut': 1,
'Iraq': 1,
'ISRAEL': 4,
'Gaza': 11}),
'Hamas': Counter({'ISRAEL': 7,
'Israel': 19,
'Beirut': 1,
'Gaza': 70}),
'Taliban': Counter({'AFGHANISTAN': 3,
'Kabul': 45,
'Pakistan': 17,
'Afghanistan': 263}),
'HAMAS': Counter({'ISRAEL': 1}),
'Al - Qaeda': Counter({'Kabul': 1,
'Iraq': 4,
'Israel': 1,
'Baghdad': 5,
'Pakistan': 1,
'Mosul': 16,
'Kirkuk': 2}),
'al - Qaeda': Counter({'Iraq': 46,
'Afghanistan': 6,
'Kabul': 2,
'Istanbul': 3,
'Baghdad': 14,
'Palestine': 3,
'Mosul': 1,
'Kirkuk': 3,
'Pakistan': 5}),
'Bilad al - Rafidayn': Counter({'Iraq': 21,
'Baghdad': 32,
'Basra': 4,
'Mosul': 4,
'Palestine': 6}),
'taliban': Counter({'Kabul': 1})})
import pandas as pd
# Build a table of co-occurrence counts: one column per group, one row per
# location.
# dtype=int is not passed to the constructor: group/location pairs that were
# never seen together are still NaN at that point, and NaN cannot be stored
# in an integer column. The integer cast is done after the gaps are filled.
df = pd.DataFrame.from_dict(dict(location_entity_dict))
df = df.fillna(value=0).astype(int)
df
| | PLO | Fatah | Hamas | Taliban | HAMAS | Al - Qaeda | al - Qaeda | Bilad al - Rafidayn | taliban |
---|---|---|---|---|---|---|---|---|---|
Beirut | 9 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
ISRAEL | 17 | 4 | 7 | 0 | 1 | 0 | 0 | 0 | 0 |
Israel | 21 | 18 | 19 | 0 | 0 | 1 | 0 | 0 | 0 |
Iraq | 8 | 1 | 0 | 0 | 0 | 4 | 46 | 21 | 0 |
Palestine | 1 | 0 | 0 | 0 | 0 | 0 | 3 | 6 | 0 |
Gaza | 0 | 11 | 70 | 0 | 0 | 0 | 0 | 0 | 0 |
AFGHANISTAN | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 0 |
Kabul | 0 | 0 | 0 | 45 | 0 | 1 | 2 | 0 | 1 |
Pakistan | 0 | 0 | 0 | 17 | 0 | 1 | 5 | 0 | 0 |
Afghanistan | 0 | 0 | 0 | 263 | 0 | 0 | 6 | 0 | 0 |
Baghdad | 0 | 0 | 0 | 0 | 0 | 5 | 14 | 32 | 0 |
Mosul | 0 | 0 | 0 | 0 | 0 | 16 | 1 | 4 | 0 |
Kirkuk | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 0 | 0 |
Istanbul | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 |
Basra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 |
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the co-occurrence table as an annotated heatmap.
plt.figure(figsize=(12, 10))
# fmt='d' formats the cell annotations as integers; the colour bar is hidden.
hmap = sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
plt.title('terror')
# Rotate the x-axis tick labels by 30 degrees.
plt.xticks(rotation=30)
plt.show()