VincentWei

A noble spirit endures between heaven and earth: to establish a heart for Heaven and Earth, to secure a destiny for the people, to carry on the lost teachings of past sages, and to open an era of peace for ten thousand generations!

Patching “Entity Extraction”

VincentWei    2019-05-05 21:36

In the email scenario, extracting person names alone is not enough: you need to know which specific entity of the person-name type a name corresponds to, i.e. whether it is the sender, a recipient, a CC recipient, and so on.

A BiLSTM-CRF, or even a BERT-BiLSTM-CRF, cannot guarantee accurate recognition when the training corpus is insufficient. In that situation you often need a remedy, and sometimes the most direct, plain, and simple approach, in the absence of anything better, is worth adopting in practice until a new technique makes a breakthrough and can replace it.

The crude stopgap adopted for now is based on natural-language dependency parsing: find the dependency structure of the sentence, analyze the subject-verb-object (SVO) pattern, and use the predicate as the main anchor for semantic recognition, because concrete recognition usually works through the specific predicate, which marks out different entities. For example, in 回复傅强 ("reply to Fu Qiang"), "傅强" is the recipient, while in 抄送魏剑龙 ("CC Wei Jianlong"), "魏剑龙" is the CC recipient. Recognizing the SVO structure around the predicate, and taking each SVO structure as a unit, maps each unit to one meaning. In other words, standalone entity recognition may only tell you the part of speech; but once an entity of the same part of speech appears in different structural units, it takes on a meaning, and thus corresponds to a semantically rich entity.
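
As a minimal standalone sketch of that claim before the full test script below (the model path is a placeholder, and segmentation of short imperatives can vary between model versions), parsing 抄送魏剑龙 with the same pyltp v3.4.0 models should yield 魏剑龙 as the VOB child of the predicate 抄送:

# Sketch: predicate-centered dependency parse of one phrase.
# MODELDIR is a placeholder; point it at your ltp_data_v3.4.0 directory.
import os
from pyltp import Segmentor, Postagger, Parser

MODELDIR = "/path/to/ltp_data_v3.4.0"

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))

words = segmentor.segment("抄送魏剑龙")
postags = postagger.postag(words)
arcs = parser.parse(words, postags)
for word, arc in zip(words, arcs):
    print(word, arc.head, arc.relation)
# With the v3.4.0 models this typically prints (head is 1-based, 0 = root):
#   抄送 0 HED
#   魏剑龙 1 VOB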

The full test script follows:

#!/usr/bin/env python
# coding=utf-8

import os
import sys

from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer

# Set your own model path
MODELDIR = "D:\\BaiduNetdiskDownload\\ltp_data_v3.4.0"

print("正在加载LTP模型... ...")

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))

#labeller = SementicRoleLabeller()
#labeller.load(os.path.join(MODELDIR, "srl/"))

print("加载模型完毕。")

in_file_name = "input.txt"
out_file_name = "output.txt"
begin_line = 1
end_line = 0  # 0 means read to the end of the file

if len(sys.argv) > 1:
    in_file_name = sys.argv[1]

if len(sys.argv) > 2:
    out_file_name = sys.argv[2]

if len(sys.argv) > 3:
    begin_line = int(sys.argv[3])

if len(sys.argv) > 4:
    end_line = int(sys.argv[4])

def extraction_start(in_file_name, out_file_name, begin_line, end_line):
    """
    Top-level driver for fact-triple extraction.
    Args:
        in_file_name: name of the input file
        out_file_name: name of the output file
        begin_line: first line of the file to process
        end_line: last line to process (0 means read to the end)
    """
    # Open in text mode: the str comparisons and pyltp calls below expect str, not bytes
    in_file = open(in_file_name, 'r', encoding='utf-8')
    out_file = open(out_file_name, 'a', encoding='utf-8')
    
    line_index = 1
    sentence_number = 0
    text_line = in_file.readline()
    while text_line:
        if line_index < begin_line:
            text_line = in_file.readline()
            line_index += 1
            continue
        if end_line != 0 and line_index > end_line:
            break
        sentence = text_line.strip()
        if sentence == "" or len(sentence) > 1000:
            text_line = in_file.readline()
            line_index += 1
            continue
        try:
            fact_triple_extract(sentence, out_file)
            out_file.flush()
        except Exception as e:
            print(e)
        sentence_number += 1
        if sentence_number % 50 == 0:
            print("%d done" % (sentence_number))
        text_line = in_file.readline()
        line_index += 1
    in_file.close()
    out_file.close()

def fact_triple_extract(sentence, out_file):
    """
    Extract fact triples from a single sentence.
    Args:
        sentence: the sentence to process
        out_file: file object the extracted triples are written to
    """
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)  # NER tags (not used further in this demo)
    arcs = parser.parse(words, postags)

    child_dict_list = build_parse_child_dict(words, postags, arcs)
    for index in range(len(postags)):
        # Extract fact triples centered on the predicate (verb)
        if postags[index] == 'v':
            child_dict = child_dict_list[index]
            # Subject-verb-object (SVO)
            if 'SBV' in child_dict and 'VOB' in child_dict:
                for vob in child_dict['VOB']:
                    e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, vob)
                    out_file.write("SVO\t(%s, %s, %s)\n" % (e1, r, e2))
                    out_file.flush()
            elif 'VOB' in child_dict:
                for vob in child_dict['VOB']:
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, vob)
                    # No explicit subject: default to "我" ("I"), typical of imperative mail commands
                    out_file.write("SVO\t(%s, %s, %s)\n" % ("我", r, e2))
                    out_file.flush()
                    out_file.flush()
            # Postposed attributive in a verb-object relation
            if arcs[index].relation == 'ATT':
                if 'VOB' in child_dict:
                    for vob in child_dict['VOB']:
                        e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                        r = words[index]
                        e2 = complete_e(words, postags, child_dict_list, vob)
                        temp_string = r + e2
                        # Strip the predicate+object prefix if the head entity repeats it
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            out_file.write("ATT-VOB\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
            # Subject-verb-complement where the complement carries a prepositional object (POB)
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    for pob in child_dict_list[cmp_index]['POB']:
                        e2 = complete_e(words, postags, child_dict_list, pob)
                        out_file.write("SBV-CMP-POB\t(%s, %s, %s)\n" % (e1, r, e2))
                        out_file.flush()
            elif 'CMP' in child_dict:
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    for pob in child_dict_list[cmp_index]['POB']:
                        e2 = complete_e(words, postags, child_dict_list, pob)
                        # No explicit subject: default to "我" ("I")
                        out_file.write("SBV-CMP-POB\t(%s, %s, %s)\n" % ("我", r, e2))
                        out_file.flush()


def build_parse_child_dict(words, postags, arcs):
    """
    For each word in the sentence, build a dict that maps a dependency
    relation to the indices of that word's child nodes.
    Args:
        words: word segmentation list
        postags: POS tag list
        arcs: dependency arc list
    """
    child_dict_list = []
    for index in range(len(words)):
        child_dict = dict()
        for arc_index in range(len(arcs)):
            # arc.head is 1-based (0 is the virtual root), so head == index + 1
            # means the word at arc_index is a child of the word at index
            if arcs[arc_index].head == index + 1:
                child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
        child_dict_list.append(child_dict)
    return child_dict_list

def complete_e(words, postags, child_dict_list, word_index):
    """
    Complete a partially recognized entity by recursively prepending its
    attributive (ATT) children and, for verbs, attaching subject/object children.
    """
    child_dict = child_dict_list[word_index]
    prefix = ''
    if 'ATT' in child_dict:
        for i in range(len(child_dict['ATT'])):
            prefix += complete_e(words, postags, child_dict_list, child_dict['ATT'][i])

    postfix = ''
    if postags[word_index] == 'v':
        if 'VOB' in child_dict:
            postfix += complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
        if 'SBV' in child_dict:
            prefix = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix

    return prefix + words[word_index] + postfix

if __name__ == "__main__":
    extraction_start(in_file_name, out_file_name, begin_line, end_line)
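
The script reads the input file one sentence per line and appends the extracted triples to the output file. A quick smoke test, assuming it is saved as extract_triples.py (the file name is mine, not from the original):

# Invocation: input file, output file, first line, last line (0 = read to EOF)
#   python extract_triples.py input.txt output.txt 1 0
#
# For an input line 回复傅强 ("reply to Fu Qiang"), the subject-less SVO branch
# should append a tab-separated line roughly like:
#   SVO    (我, 回复, 傅强)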

In the pipeline component this is driven by configuration parameters, mainly the predicate, the POS tag, and the corresponding entity, e.g. ["抄送"], ["nr"], ["mail_cc_nr"]. The verb's meaning is recognized through word-vector similarity comparison: any predicate similar to 抄送 ("CC") is taken as the center, the SVO structure around it is recognized, and the object of that structure is then mapped to the corresponding semantically rich entity, in this case the CC recipient.
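
A minimal sketch of how such a component might work, not the actual pipeline implementation: the (predicate, POS, entity) triple is configuration, and "word-vector similarity" is reduced here to cosine similarity over a toy embedding table. CONFIG, EMBEDDINGS, label_object, and the threshold are all hypothetical placeholders.

import numpy as np

# Hypothetical component configuration: anchor predicates, the POS tag the
# object must carry, and the semantically rich entity label to emit.
CONFIG = {"predicates": ["抄送"], "obj_pos": ["nr"], "entity": "mail_cc_nr"}

# Toy word vectors standing in for real pretrained embeddings (values invented).
EMBEDDINGS = {
    "抄送": np.array([0.9, 0.1, 0.2]),  # "CC"
    "转发": np.array([0.8, 0.2, 0.3]),  # "forward": close to "CC" in this toy space
    "吃饭": np.array([0.1, 0.9, 0.1]),  # "eat": unrelated
}

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def label_object(verb, obj, obj_pos, threshold=0.9):
    """Return CONFIG["entity"] if the verb is similar enough to an anchor
    predicate and the object's POS tag matches; otherwise None."""
    if obj_pos not in CONFIG["obj_pos"]:
        return None
    vec = EMBEDDINGS.get(verb)
    if vec is None:
        return None
    if any(cosine(vec, EMBEDDINGS[a]) >= threshold for a in CONFIG["predicates"]):
        return CONFIG["entity"]
    return None

# Given an SVO triple (转发, 魏剑龙) from the parser: 转发 is close to 抄送
# in the toy space, so the object is labelled as the CC recipient.
print(label_object("转发", "魏剑龙", "nr"))  # -> mail_cc_nr
print(label_object("吃饭", "魏剑龙", "nr"))  # -> None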

