哈工大LTP基本使用-分詞、詞性標注、依存句法分析、命名實體識別、語義角色標注

本文轉載自查看原文 2021-04-25 14:08 438 自然語言處理-相關庫的使用/ 自然語言處理

代碼

import os
from pprint import pprint
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
class LtpParser:
    """Run the pyltp pipeline on a sentence and print each stage's result.

    The stages are word segmentation, part-of-speech tagging, named entity
    recognition, dependency parsing and semantic role labelling. All models
    are loaded once in ``__init__`` and reused by ``analyse``.
    """

    def __init__(self, ltp_dir="../model/ltp_data_v3.4.0/"):
        """Load the five pyltp models from ``ltp_dir``.

        Args:
            ltp_dir: Directory that contains ``cws.model``, ``pos.model``,
                ``parser.model``, ``ner.model``, ``pisrl.model`` and the
                user lexicon ``user_dict.txt``.
        """
        # The same user lexicon is given to the segmentor and the tagger.
        user_dict = os.path.join(ltp_dir, "user_dict.txt")

        self.segmentor = Segmentor()
        # load_with_lexicon loads a user-defined dictionary with the model.
        self.segmentor.load_with_lexicon(os.path.join(ltp_dir, "cws.model"), user_dict)

        self.postagger = Postagger()
        self.postagger.load_with_lexicon(os.path.join(ltp_dir, "pos.model"), user_dict)

        self.parser = Parser()
        self.parser.load(os.path.join(ltp_dir, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(ltp_dir, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(ltp_dir, 'pisrl.model'))

    def release(self):
        """Call ``release()`` on every loaded pyltp component."""
        components = (
            self.segmentor,
            self.postagger,
            self.parser,
            self.recognizer,
            self.labeller,
        )
        for component in components:
            component.release()

    def analyse(self, text):
        """Run every pipeline stage on ``text`` and print the results.

        Printed in order: the word list, the POS tag list, the named entity
        tag list, the dependency arcs (one dict per word with keys ``dep``,
        ``gov`` and ``pos``) and one line per semantic role.

        Args:
            text: The sentence to analyse.
        """
        # Word segmentation.
        segmentor_res = self.segmentor.segment(text)
        # Materialise the words once; they are indexed and sliced below.
        words = list(segmentor_res)
        print(words)

        # Part-of-speech tagging; takes the segmentation result.
        postagger_res = self.postagger.postag(segmentor_res)
        print(list(postagger_res))

        # Named entity recognition; takes the words and their POS tags.
        recognizer_res = self.recognizer.recognize(segmentor_res, postagger_res)
        print(list(recognizer_res))

        # Dependency parsing; takes the words and their POS tags.
        arcs = self.parser.parse(segmentor_res, postagger_res)
        # arc.head is 1-based; 0 means the word hangs off the virtual root.
        arcs_res = [
            {
                'dep': word,
                'gov': 'ROOT' if arc.head == 0 else words[arc.head - 1],
                'pos': arc.relation,
            }
            for word, arc in zip(words, arcs)
        ]
        pprint(arcs_res)

        # Semantic role labelling; takes the words, POS tags and arcs.
        labeller_res = self.labeller.label(segmentor_res, postagger_res, arcs)
        for role in labeller_res:
            # arg.range.end is inclusive, hence the +1 in the slice.
            print(role.index, "\t".join(
                "%s:(%d,%d)-(%s)" % (
                    arg.name,
                    arg.range.start,
                    arg.range.end,
                    "".join(words[arg.range.start:arg.range.end + 1]),
                )
                for arg in role.arguments
            ))

if __name__ == '__main__':
    # Demo: load the models and analyse a single sample sentence.
    sample_sentence = "中國是一個自由、和平的國家"
    LtpParser().analyse(sample_sentence)

結果

['中國', '是', '一個', '自由', '、', '和平', '的', '國家']
['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n']
['S-Ns', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[{'dep': '中國', 'gov': '是', 'pos': 'SBV'},
 {'dep': '是', 'gov': 'ROOT', 'pos': 'HED'},
 {'dep': '一個', 'gov': '國家', 'pos': 'ATT'},
 {'dep': '自由', 'gov': '國家', 'pos': 'ATT'},
 {'dep': '、', 'gov': '和平', 'pos': 'WP'},
 {'dep': '和平', 'gov': '自由', 'pos': 'COO'},
 {'dep': '的', 'gov': '自由', 'pos': 'RAD'},
 {'dep': '國家', 'gov': '是', 'pos': 'VOB'}]
1 A0:(0,0)-(中國)	A1:(2,7)-(一個自由、和平的國家)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 哈工大LTP語言分析：分詞、詞性標注、句法分析等學習筆記CB007:分詞、命名實體識別、詞性標注、句法分析樹 hanlp入門（含命名實體識別與詞性標注、關鍵詞提取、自動摘要、地名識別、依存句法分析、短語提取）使用哈工大LTP進行句法分析 python調用Hanlp做命名實體識別以及詞性標注中文詞性標注解釋及句法分析標注解釋使用哈工大LTP進行文本命名實體識別並保存到txt jieba分詞的詞性標注 pyhanlp 分詞與詞性標注 Jieba分詞詞性標注以及詞性說明