原文鏈接:http://www.one2know.cn/nlp6/
- 內置分塊器
分塊:從文本中抽取短語
import nltk
text = 'Lalbagh Botanical Garden is a well known botanical garden in Bengaluru, India.'
# Split the text into sentences.
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    # Tokenize, POS-tag, then run the built-in chunker on each sentence.
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(tags)  # performs the chunking; returns a tree structure
    print(chunks)
輸出:
(S
(PERSON Lalbagh/NNP)
(PERSON Botanical/NNP Garden/NNP)
is/VBZ
a/DT
well/RB
known/VBN
botanical/JJ
garden/NN
in/IN
(GPE Bengaluru/NNP)
,/,
(GPE India/NNP)
./.)
- 編寫簡單的RE分塊器
import nltk
text = 'Ravi is the CEO of a company. He is very powerful public speaker also.'
# Chunk grammar written over POS tags: three NP rules joined into one
# multi-line string.
grammar = '\n'.join([
    'NP: {<DT>*<NNP>}',  # zero or more DT followed by one NNP
    'NP: {<JJ>*<NN>}',   # zero or more JJ followed by one NN
    'NP: {<NNP>+}',      # one or more NNP
])
# Wrap the grammar in a RegexpParser once; it does not depend on the sentence,
# so there is no need to rebuild it on every loop iteration.
chunkparser = nltk.RegexpParser(grammar)
sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    result = chunkparser.parse(tags)
    print(result)
輸出:
(S
(NP Ravi/NNP)
is/VBZ
(NP the/DT CEO/NNP)
of/IN
a/DT
(NP company/NN)
./.)
(S
He/PRP
is/VBZ
very/RB
(NP powerful/JJ public/JJ speaker/NN)
also/RB
./.)
- 訓練分塊器
IOB標注格式:
列 | 描述 |
---|---|
IOB第一列 | 輸入句子中的單詞 |
IOB第二列 | 單詞對應的詞性 |
IOB第三列 | I(內部詞),O(外部詞),B(開始詞);I和B後面加上塊類型的後綴(如 B-NP、I-NP) |
例子:
Rockwell NNP B-NP
International NNP I-NP
Corp. NNP I-NP
's POS B-NP
Tulsa NNP I-NP
unit NN I-NP
said VBD B-VP
it PRP B-NP
代碼:
import nltk
from nltk.corpus import conll2000,treebank_chunk # 兩個數據集
# Simple chunker that extracts NNP (proper nouns).
def mySimpleChunker():
    """Return an nltk.RegexpParser whose only rule chunks one or more NNP tags as NP."""
    grammar = 'NP: {<NNP>+}'
    return nltk.RegexpParser(grammar)
# Extracts nothing; only used to check that the evaluation runs at all.
def test_nothing(data):
    """Print the evaluation score of a parser built from an empty grammar.

    Args:
        data: gold-standard chunked sentences passed straight to ``evaluate``.
    """
    # NOTE(review): newer NLTK releases appear to deprecate `evaluate` in
    # favour of `accuracy` -- confirm against the installed version.
    cp = nltk.RegexpParser("")
    print(cp.evaluate(data))
# Test the mySimpleChunker() function.
def test_mysimplechunker(data):
    """Print the evaluation score of the parser returned by mySimpleChunker().

    Args:
        data: gold-standard chunked sentences passed straight to ``evaluate``.
    """
    schunker = mySimpleChunker()
    print(schunker.evaluate(data))
datasets = [
    conll2000.chunked_sents('test.txt', chunk_types=['NP']),
    treebank_chunk.chunked_sents(),
]
# Score both chunkers on the first 50 IOB-tagged sentences of each dataset.
for dataset in datasets:
    sample = dataset[:50]  # take the slice once and reuse it for both chunkers
    test_nothing(sample)
    print('---------------------')
    test_mysimplechunker(sample)
    print()
輸出:
ChunkParse score:
IOB Accuracy: 38.6%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
---------------------
ChunkParse score:
IOB Accuracy: 48.2%%
Precision: 71.1%%
Recall: 17.2%%
F-Measure: 27.7%%
ChunkParse score:
IOB Accuracy: 45.0%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
---------------------
ChunkParse score:
IOB Accuracy: 50.7%%
Precision: 51.9%%
Recall: 8.8%%
F-Measure: 15.1%%
- 遞歸下降句法分析
遞歸先序遍歷句法分析樹
NLTK的RD分析器
import nltk
def RDParserExample(grammar, textlist):
    """Parse each text with a recursive descent parser; print and draw every tree.

    Args:
        grammar: grammar object handed to nltk.parse.RecursiveDescentParser.
        textlist: iterable of sentence strings; each is word-tokenized before parsing.
    """
    # RecursiveDescentParser: the recursive descent parser.
    parser = nltk.parse.RecursiveDescentParser(grammar)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        for tree in parser.parse(sentence):
            print(tree)
            tree.draw()
# Build a CFG object from the grammar text.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")
# Test sentences, run through the recursive descent example.
text = ["Tajmahal is in Agra", "Bangalore is the capital of Karnataka"]
RDParserExample(grammar=grammar, textlist=text)
輸出:
(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))
(S
(NP (NNP Bangalore) (VBZ is))
(VP (DT the) (NN capital) (IN of) (NNP Karnataka)))
- shift-reduce句法分析
shift-reduce句法分析器:對單行句子從左到右分析,對多行句子從上到下分析
import nltk
def SRParserExample(grammer, textlist):
    """Parse each text with a shift-reduce parser; print and draw every tree.

    Args:
        grammer: grammar object handed to nltk.parse.ShiftReduceParser
            (the parameter name's spelling is kept so existing keyword
            callers keep working).
        textlist: iterable of sentence strings; each is word-tokenized before parsing.
    """
    parser = nltk.parse.ShiftReduceParser(grammer)
    for text in textlist:
        sentence = nltk.word_tokenize(text)
        for tree in parser.parse(sentence):
            print(tree)
            tree.draw()
# Same CFG text as the recursive descent example, built into a CFG object.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NNP VBZ
VP -> IN NNP | DT NN IN NNP
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")
# Two test sentences. NOTE(review): the recorded output below shows a tree for
# the first sentence only -- presumably the shift-reduce parser finds no parse
# for the second one with this grammar; confirm by running.
text = ["Tajmahal is in Agra", "Bangalore is the capital of Karnataka"]
SRParserExample(grammar, text)
輸出:
(S (NP (NNP Tajmahal) (VBZ is)) (VP (IN in) (NNP Agra)))
- 依存句法分析和投射依存分析
import nltk
# Dependency rules.
grammar = nltk.grammar.DependencyGrammar.fromstring("""
'savings' -> 'small'
'yield' -> 'savings'
'gains' -> 'large'
'yield' -> 'gains'
""")
sentence = 'small savings yield large gains'
dp = nltk.parse.ProjectiveDependencyParser(grammar)
# Parse once and reuse the sorted result for both the list dump and the
# per-tree printing, instead of parsing the same sentence twice.
trees = sorted(dp.parse(sentence.split()))
print(trees)
for t in trees:
    print(t)
    t.draw()
輸出:
[Tree('yield', [Tree('savings', ['small']), Tree('gains', ['large'])])]
(yield (savings small) (gains large))
- 線圖句法分析
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser,BU_LC_STRATEGY
# Grammar in BNF form. Start symbol: S. Terminal symbols: the words.
grammar = CFG.fromstring("""
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 ->IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
""")
cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)
# trace=True makes the parsing process visible.
# BU_LC_STRATEGY is said to be the default strategy, so omitting it
# appears to work as well.
sentence = 'Bangalore is the capital of Karnataka'
tokens = sentence.split()
chart = cp.chart_parse(tokens)  # analyse the word list and store the result in a chart object
parses = list(chart.parses(grammar.start()))  # collect every parse tree found in the chart
print('Total Edges :', len(chart.edges()))  # number of edges in the chart
for tree in parses:  # print (and draw) every parse tree
    print(tree)
    tree.draw()
輸出:
|.Bangal. is . the .capita. of .Karnat.|
|[------] . . . . .| [0:1] 'Bangalore'
|. [------] . . . .| [1:2] 'is'
|. . [------] . . .| [2:3] 'the'
|. . . [------] . .| [3:4] 'capital'
|. . . . [------] .| [4:5] 'of'
|. . . . . [------]| [5:6] 'Karnataka'
|[------] . . . . .| [0:1] NNP -> 'Bangalore' *
|[------> . . . . .| [0:1] T1 -> NNP * VBZ
|. [------] . . . .| [1:2] VBZ -> 'is' *
|[-------------] . . . .| [0:2] T1 -> NNP VBZ *
|[-------------> . . . .| [0:2] S -> T1 * T4
|. . [------] . . .| [2:3] DT -> 'the' *
|. . [------> . . .| [2:3] T2 -> DT * NN
|. . . [------] . .| [3:4] NN -> 'capital' *
|. . [-------------] . .| [2:4] T2 -> DT NN *
|. . [-------------> . .| [2:4] T4 -> T2 * T3
|. . . . [------] .| [4:5] IN -> 'of' *
|. . . . [------> .| [4:5] T3 -> IN * NNP
|. . . . . [------]| [5:6] NNP -> 'Karnataka' *
|. . . . . [------>| [5:6] T1 -> NNP * VBZ
|. . . . [-------------]| [4:6] T3 -> IN NNP *
|. . . . [-------------]| [4:6] T4 -> T3 *
|. . [---------------------------]| [2:6] T4 -> T2 T3 *
|[=========================================]| [0:6] S -> T1 T4 *
Total Edges : 24
(S
(T1 (NNP Bangalore) (VBZ is))
(T4 (T2 (DT the) (NN capital)) (T3 (IN of) (NNP Karnataka))))