spaCy 基本使用



關於 spaCy

官網:https://spacy.io

相比 NLTK

  • spacy 速度更快
  • 可以和深度學習結合

安裝

(base) $ python -m spacy download en

import spacy

加載英文模型,需要先安裝模型 $ sudo python -m spacy download en

如果顯示 ✘ Couldn't link model to 'en'
則注意執行命令前加上 sudo

# Load the English model.
# NOTE(review): the 'en' shortcut link was removed in spaCy v3; on modern
# installs this must be spacy.load('en_core_web_sm') — confirm the installed
# spaCy version before running.
nlp = spacy.load('en')
 
# Running the pipeline on raw text returns a Doc object.
doc = nlp('Start using Neo4j with built-in guides.Learn the basics of graph database technology ')
 
type(doc) 
# spacy.tokens.doc.Doc

分詞

# Tokenization: a Doc is iterable, yielding one Token per iteration.
for tok in doc:
    print(tok)

'''
    Start
    using
    Neo4j
    with
    built
    -
    in
    guides
    .
    Learn
    the
    basics
    of
    graph
    database
    technology
'''

分句

# Sentence segmentation: Doc.sents is a generator of Span objects.
for sentence in doc.sents:
    print(sentence)

'''
    Start using Neo4j within seconds, with built-in guides and sample datasets for popular use cases.
    Learn the basics of graph database technology, from building a data model to deploying a graph-powered application.
'''

詞性


# Part-of-speech tagging: pos_ holds the coarse-grained POS label.
for tok in doc:
    print(f'{tok} {tok.pos_}')
'''

    Start VERB
    using VERB
    Neo4j PROPN
    within ADP
    seconds NOUN
    , PUNCT
    with ADP
    built VERB
    - PUNCT
    in ADP
    guides NOUN
    and CCONJ
    sample NOUN
    datasets NOUN
    for ADP
    popular ADJ
    use NOUN
    cases NOUN
    . PUNCT
    Learn VERB
    the DET
    basics NOUN
    of ADP
    graph PROPN
    database NOUN
    technology NOUN
    , PUNCT
    from ADP
    building VERB
    a DET
    data NOUN
    model NOUN
    to ADP
    deploying VERB
    a DET
    graph NOUN
    - PUNCT
    powered VERB
    application NOUN
    . PUNCT
'''

命名實體識別(NER)


# Named-entity recognition: entities detected by the pipeline live on Doc.ents.
doc2 = nlp('arXiv is a free distribution service and an open-access archive for 1,812,439 scholarly articles. Materials on this site are not peer-reviewed by arXiv.')

for ent in doc2.ents:
    print(f'{ent}, {ent.label_} ')  # entity text, entity type
'''
arXiv, ORG 
1,812,439, CARDINAL 
arXiv, ORG 
'''

from spacy import displacy
# Render named entities inline (intended for a Jupyter notebook).
displacy.render(doc2, style='ent')


# NOTE(review): this 'ent' render duplicates the call above — one can be removed.
displacy.render(doc2, style='ent')
# Dependency-parse visualization.
displacy.render(doc2, style='dep')

(圖:displacy 依存關係(dep)可視化結果截圖,原圖未能載入)


頻次統計

# 找到一本書中 人名出現頻次

def read_file(filename, encoding='utf-8'):
    """Return the entire contents of *filename* as one string.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding; defaults to UTF-8 so the result does not
            depend on the platform's locale default (the original relied on
            the locale encoding, which breaks on e.g. Windows cp936).

    Returns:
        The full file contents as a single string.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
    
def read_file_to_list(filename, encoding='utf-8'):
    """Return the contents of *filename* as a list of lines.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding; defaults to UTF-8 so the result does not
            depend on the platform's locale default.

    Returns:
        List of lines, each keeping its trailing newline (``readlines``).
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.readlines()
 
text = read_file('data/pride_and_prejudice.txt')
 
precessed_text = nlp(text) # processing the whole book takes a while
 
# Count the sentences.
# Items yielded by .sents are spacy.tokens.span.Span objects, not plain strings.
sents = [sent for sent in precessed_text.sents]
 
print(len(sents))
#  7153



from collections import Counter
 
 def find_person(doc):
    c = Counter()
    names = []
    for ent in precessed_text.ents:
        if ent.label_ == 'PERSON':
#             print(ent)
            c[ent.lemma_] += 1
#             names.append(ent)
    
    return c.most_common(10)
        
find_person(precessed_text)
'''
    [('Elizabeth', 600),
     ('Darcy', 355),
     ('Jane', 277),
     ('Bingley', 260),
     ('Bennet', 258),
     ('Collins', 166),
     ('Wickham', 108),
     ('Lizzy', 94),
     ('Gardiner', 90),
     ('Lady Catherine', 76)]
'''


恐怖襲擊分析


 
text2 = read_file_to_list('data/rand-terrorism-dataset.txt')
text2[:5]
'''
    ['CHILE.  An explosion from a single stick of dynamite went off on the patio of the Santiago Binational Center, causing $21,000 in damages.\n',
     'ISRAEL.  Palestinian terrorists fired five mortar shells into the collective settlement at Masada, causing slight damage but no injuries.\n',
     'GUATEMALA.  A bomb was thrown over the wall surrounding the U.S. Marines guards house in Guatemala City, causing damage but no injuries.\n',
     'FRANCE.  Five French students bombed the Paris offices of   Chase Manhattan Bank before dawn.  Trans-World Airways and the Bank of America were also bombed.   They claimed to be protesting the U.S. involvement in the Vietnam war.\n',
     'UNITED STATES - Unidentified anti-Castro Cubans attempted to bomb the Miami branch of the Spanish National Tourist Office.\n']
'''


# 哪些恐怖組織在哪些國家造成了多少次恐怖襲擊
 
nlp2 = [nlp(art)  for art in text2]
# nlp2.sents[:3]
 
# 選取下面常見數據來展示

# Well-known groups to tally.
# NOTE(review): entries appear to be written the way the lemmatizer joins
# tokens ('al - qaeda' keeps the token spacing) — verify against actual
# ent.lemma_ output before changing.
common_terrorist_groups = [
    'taliban', 
    'al - qaeda', 
    'hamas',  
    'fatah', 
    'plo', 
    'bilad al - rafidayn'
]

# Locations of interest (all lowercase).
common_locations = [
    'iraq',
    'baghdad', 
    'kirkuk', 
    'mosul', 
    'afghanistan', 
    'kabul',
    'basra', 
    'palestine', 
    'gaza', 
    'israel', 
    'istanbul', 
    'beirut', 
    'pakistan'
]
from collections import defaultdict
 
# 構建計數器

# Nested counter: location_entity_dict[group][location] = attack count.
location_entity_dict = defaultdict(Counter)

# Scan every processed article, pairing groups with locations.
for art in nlp2:
    # BUG FIX: proper-noun lemmas keep their capitalization (e.g. 'Iraq',
    # 'ISRAEL'), so they never matched the all-lowercase keyword lists and
    # the dict came out empty. Lowercase before matching.
    art_terrorist_group = [ent.lemma_.lower() for ent in art.ents
                           if ent.label_ in ('PERSON', 'ORG')]  # group or person
    art_locations = [ent.lemma_.lower() for ent in art.ents
                     if ent.label_ == 'GPE']  # places

    # Keep only the groups/locations we care about.
    terrorist_common = [ent for ent in art_terrorist_group
                        if ent in common_terrorist_groups]
    locations_common = [ent for ent in art_locations
                        if ent in common_locations]

    # Tally every (group, location) pair mentioned in the same article.
    for found_ent in terrorist_common:
        for found_loc in locations_common:
            location_entity_dict[found_ent][found_loc] += 1


print(location_entity_dict)
    
    
# 原始輸出為空:defaultdict(<class 'collections.Counter'>, {}) —— 因為實體的
# lemma 保留大小寫(如 'Iraq'),與全小寫的關鍵詞列表不匹配;比對前需先 .lower()。
location_entity_dict
# defaultdict(collections.Counter, {})

 
# Nested counter: location_entity_dict[group][location] = attack count.
location_entity_dict = defaultdict(Counter)

for article in nlp2:
    # BUG FIX: lowercase the lemmas before matching — proper-noun lemmas keep
    # their capitalization, while the keyword lists are all lowercase, so the
    # original version always produced an empty dict.
    article_terrorist_groups = [ent.lemma_.lower() for ent in article.ents
                                if ent.label_ in ('PERSON', 'ORG')]  # person or org
    article_locations = [ent.lemma_.lower() for ent in article.ents
                         if ent.label_ == 'GPE']
    terrorist_common = [ent for ent in article_terrorist_groups
                        if ent in common_terrorist_groups]
    locations_common = [ent for ent in article_locations
                        if ent in common_locations]

    for found_entity in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1


location_entity_dict
# With the case fix this shows per-group, per-location counts instead of {}.


相關資料




免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM