In the email scenario, extracting person names alone is not enough: you also need to know which role a given name plays — sender, recipient, cc recipient, and so on.
A BiLSTM-CRF, or even a BERT-BiLSTM-CRF, cannot guarantee accurate recognition when the training corpus is insufficient. In that situation you often need a fallback, and sometimes the most direct, simple approach, absent anything better, is worth adopting in practice until a new technique matures enough to replace it.
The brute-force method adopted here is based on dependency parsing of natural language: find the sentence's dependency structure, analyze subject-verb-object (SVO), and use the predicate as the main anchor for semantic recognition. Concrete recognition usually hinges on the specific predicate, which labels different entities. For example, in 回复傅强 ("reply to 傅强"), 傅强 is the recipient; in 抄送魏剑龙 ("cc 魏剑龙"), 魏剑龙 is the cc recipient. By recognizing predicate-centered SVO structures and treating each SVO structure as a unit, each unit maps to one meaning. In other words, an isolated recognized entity carries only a part-of-speech tag, but when an entity of the same part of speech appears in different structural units it takes on a semantics, yielding a semantically rich entity.
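The core idea fits in a few lines. Here is a minimal sketch of mapping a predicate-centered SVO triple to an email entity role; the PREDICATE_ROLES table and the label mail_to_nr are illustrative assumptions (mail_cc_nr is the label used in the pipeline config discussed later), not part of the test code below:

# Minimal sketch: map a predicate-centered SVO triple to an email entity role.
PREDICATE_ROLES = {
    "回复": "mail_to_nr",   # reply -> recipient (label name assumed)
    "抄送": "mail_cc_nr",   # cc -> cc recipient
}

def label_entity(triple):
    """triple = (subject, predicate, object); returns (entity, role) or None."""
    subj, pred, obj = triple
    role = PREDICATE_ROLES.get(pred)
    return (obj, role) if role else None

print(label_entity(("我", "抄送", "魏剑龙")))  # -> ('魏剑龙', 'mail_cc_nr')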
The triples themselves come from the following test code, built on pyltp:
#!/usr/bin/env python
# coding=utf-8
# Set your own model path
MODELDIR="D:\\BaiduNetdiskDownload\\ltp_data_v3.4.0"
import sys
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
print("正在加载LTP模型... ...")
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
#labeller = SementicRoleLabeller()
#labeller.load(os.path.join(MODELDIR, "srl/"))
print("加载模型完毕。")
in_file_name = "input.txt"
out_file_name = "output.txt"
begin_line = 1
end_line = 0
if len(sys.argv) > 1:
    in_file_name = sys.argv[1]
if len(sys.argv) > 2:
    out_file_name = sys.argv[2]
if len(sys.argv) > 3:
    begin_line = int(sys.argv[3])
if len(sys.argv) > 4:
    end_line = int(sys.argv[4])
def extraction_start(in_file_name, out_file_name, begin_line, end_line):
    """
    Driver for fact-triple extraction.
    Args:
        in_file_name: name of the input file
        out_file_name: name of the output file
        begin_line: first line of the input file to process
        end_line: last line to process (0 means read to the end)
    """
    # Open as UTF-8 text (not 'rb') so the string comparisons below work under Python 3.
    in_file = open(in_file_name, 'r', encoding='utf-8')
    out_file = open(out_file_name, 'a', encoding='utf-8')
    line_index = 1
    sentence_number = 0
    text_line = in_file.readline()
    while text_line:
        if line_index < begin_line:
            text_line = in_file.readline()
            line_index += 1
            continue
        if end_line != 0 and line_index > end_line:
            break
        sentence = text_line.strip()
        # Skip blank lines and overly long sentences
        if sentence == "" or len(sentence) > 1000:
            text_line = in_file.readline()
            line_index += 1
            continue
        try:
            fact_triple_extract(sentence, out_file)
            out_file.flush()
        except Exception as e:
            print(e)
        sentence_number += 1
        if sentence_number % 50 == 0:
            print("%d done" % (sentence_number))
        text_line = in_file.readline()
        line_index += 1
    in_file.close()
    out_file.close()
def fact_triple_extract(sentence, out_file):
    """
    Extract fact triples from the given sentence.
    Args:
        sentence: the sentence to process
    """
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)  # NE tags (kept from the original pipeline)
    arcs = parser.parse(words, postags)
    child_dict_list = build_parse_child_dict(words, postags, arcs)
    for index in range(len(postags)):
        # Extract fact triples centered on the predicate
        if postags[index] == 'v':
            child_dict = child_dict_list[index]
            # Subject-verb-object
            if 'SBV' in child_dict and 'VOB' in child_dict:
                for vob in child_dict['VOB']:
                    e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, vob)
                    out_file.write("SVO relation\t(%s, %s, %s)\n" % (e1, r, e2))
                    out_file.flush()
            elif 'VOB' in child_dict:
                # No explicit subject: fall back to "我" (the mail's author)
                for vob in child_dict['VOB']:
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, vob)
                    out_file.write("SVO relation\t(%s, %s, %s)\n" % ("我", r, e2))
                    out_file.flush()
            # Postposed attributive, verb-object relation
            if arcs[index].relation == 'ATT':
                if 'VOB' in child_dict:
                    for vob in child_dict['VOB']:
                        e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                        r = words[index]
                        e2 = complete_e(words, postags, child_dict_list, vob)
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            out_file.write("postposed-attributive VO relation\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
            # Subject-verb-complement containing a prepositional object
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    for pob in child_dict_list[cmp_index]['POB']:
                        e2 = complete_e(words, postags, child_dict_list, pob)
                        out_file.write("SVC with prepositional object\t(%s, %s, %s)\n" % (e1, r, e2))
                        out_file.flush()
            elif 'CMP' in child_dict:
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    for pob in child_dict_list[cmp_index]['POB']:
                        e2 = complete_e(words, postags, child_dict_list, pob)
                        out_file.write("SVC with prepositional object\t(%s, %s, %s)\n" % ("我", r, e2))
                        out_file.flush()
def build_parse_child_dict(words, postags, arcs):
    """
    For each word in the sentence, build a dict of its dependency children.
    Args:
        words: word segmentation list
        postags: part-of-speech list
        arcs: dependency-parse list
    """
    child_dict_list = []
    for index in range(len(words)):
        child_dict = dict()
        for arc_index in range(len(arcs)):
            # arc.head is 1-based, so head == index + 1 means the arc's
            # child word hangs off the current word
            if arcs[arc_index].head == index + 1:
                if arcs[arc_index].relation in child_dict:
                    child_dict[arcs[arc_index].relation].append(arc_index)
                else:
                    child_dict[arcs[arc_index].relation] = [arc_index]
        child_dict_list.append(child_dict)
    return child_dict_list
def complete_e(words, postags, child_dict_list, word_index):
    """
    Complete a partially recognized entity by recursively pulling in its
    attributive (ATT) children and, for verbs, its SBV/VOB children.
    """
    child_dict = child_dict_list[word_index]
    prefix = ''
    if 'ATT' in child_dict:
        for i in range(len(child_dict['ATT'])):
            prefix += complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
    postfix = ''
    if postags[word_index] == 'v':
        if 'VOB' in child_dict:
            postfix += complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
        if 'SBV' in child_dict:
            prefix = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
    return prefix + words[word_index] + postfix

if __name__ == "__main__":
    extraction_start(in_file_name, out_file_name, begin_line, end_line)
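To try the script (a usage sketch; the script name triple_extract.py and the sample sentence are assumptions, not from the original), put one sentence per line in input.txt, e.g. 回复傅强,抄送魏剑龙, and run:

python triple_extract.py input.txt output.txt 1 0

Since that sentence has no explicit subject, the elif 'VOB' branch falls back to 我, so output.txt would be expected to contain lines like (我, 回复, 傅强) and (我, 抄送, 魏剑龙), assuming LTP segments and parses the sentence as described above.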
In a pipeline component, the behavior is driven by configured parameters: chiefly the predicate, the POS tag, and the corresponding entity label, e.g. ["抄送"], ["nr"], ["mail_cc_nr"]. The component recognizes the verb's meaning via word-vector similarity: any predicate similar to 抄送 ("cc") serves as the center for identifying an SVO structure, and the object of that structure is then tagged with the corresponding semantically rich entity label, here the cc recipient.
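Here is a minimal sketch of that similarity-based predicate matching, assuming pretrained word vectors are already loaded into a dict mapping each word to a numpy array; the 0.8 threshold and the example word 转发 are illustrative assumptions:

import numpy as np

# Illustrative component config: predicate, POS tag, target entity label
PREDICATES = ["抄送"]
POS_TAGS = ["nr"]
ENTITY_LABELS = ["mail_cc_nr"]

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def match_predicate(verb, vectors, threshold=0.8):
    """Return the entity label whose configured predicate is most similar
    to `verb` in the word-vector space, if similarity clears the threshold.
    `vectors` maps a word to its embedding (assumed pretrained)."""
    if verb not in vectors:
        return None
    best_label, best_sim = None, threshold
    for pred, label in zip(PREDICATES, ENTITY_LABELS):
        if pred in vectors:
            sim = cosine(vectors[verb], vectors[pred])
            if sim >= best_sim:
                best_label, best_sim = label, sim
    return best_label

# E.g. match_predicate("转发", vectors) might return "mail_cc_nr" if
# 转发 ("forward") embeds close enough to 抄送 in the chosen vector space.

When a verb in an extracted SVO structure matches a configured predicate this way, the structure's object (with the configured POS, here nr) is emitted under the configured entity label.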