In the email scenario, extracting person names alone is not enough: you also need to know which role a given name plays — sender, recipient, cc recipient, and so on.
A BiLSTM-CRF, or even a BERT-BiLSTM-CRF, cannot guarantee accurate recognition when the training corpus is insufficient. In that situation you often need a fallback, and sometimes the most direct, simple approach, absent anything better, is worth adopting in practice until a new technique matures enough to replace it.
The brute-force method adopted here is based on dependency parsing of natural language: find the sentence's dependency structure, analyze subject-verb-object (SVO), and use the predicate as the main anchor for semantic recognition. Concrete recognition usually hinges on the specific predicate, which labels different entities. For example, in 回复傅强 ("reply to 傅强"), 傅强 is the recipient; in 抄送魏剑龙 ("cc 魏剑龙"), 魏剑龙 is the cc recipient. By recognizing predicate-centered SVO structures and treating each SVO structure as a unit, each unit maps to one meaning. In other words, an isolated recognized entity carries only a part-of-speech tag, but when an entity of the same part of speech appears in different structural units it takes on a semantics, yielding a semantically rich entity.
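The core idea fits in a few lines. Here is a minimal sketch of mapping a predicate-centered SVO triple to an email entity role; the PREDICATE_ROLES table and the label mail_to_nr are illustrative assumptions (mail_cc_nr is the label used in the pipeline config discussed later), not part of the test code below:

# Minimal sketch: map a predicate-centered SVO triple to an email entity role.
PREDICATE_ROLES = {
    "回复": "mail_to_nr",   # reply -> recipient (label name assumed)
    "抄送": "mail_cc_nr",   # cc -> cc recipient
}

def label_entity(triple):
    """triple = (subject, predicate, object); returns (entity, role) or None."""
    subj, pred, obj = triple
    role = PREDICATE_ROLES.get(pred)
    return (obj, role) if role else None

print(label_entity(("我", "抄送", "魏剑龙")))  # -> ('魏剑龙', 'mail_cc_nr')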
The triples themselves come from the following test code, built on pyltp:
#!/usr/bin/env python
# coding=utf-8
# Set your own model path
MODELDIR="D:\\BaiduNetdiskDownload\\ltp_data_v3.4.0"
import sys
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
print("正在加载LTP模型... ...")
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
#labeller = SementicRoleLabeller()
#labeller.load(os.path.join(MODELDIR, "srl/"))
print("加载模型完毕。")
in_file_name = "input.txt"
out_file_name = "output.txt"
begin_line = 1
end_line = 0
if len(sys.argv) > 1:
    in_file_name = sys.argv[1]
if len(sys.argv) > 2:
    out_file_name = sys.argv[2]
if len(sys.argv) > 3:
    begin_line = int(sys.argv[3])
if len(sys.argv) > 4:
    end_line = int(sys.argv[4])
def extraction_start(in_file_name, out_file_name, begin_line, end_line):
    """
    Driver for fact-triple extraction.
    Args:
        in_file_name: name of the input file
        out_file_name: name of the output file
        begin_line: first line of the input file to process
        end_line: last line to process (0 means read to the end)
    """
    # Open as UTF-8 text (not 'rb') so the string comparisons below work under Python 3.
    in_file = open(in_file_name, 'r', encoding='utf-8')
    out_file = open(out_file_name, 'a', encoding='utf-8')
    line_index = 1
    sentence_number = 0
    text_line = in_file.readline()
    while text_line:
        if line_index < begin_line:
            text_line = in_file.readline()
            line_index += 1
            continue
        if end_line != 0 and line_index > end_line:
            break
        sentence = text_line.strip()
        # Skip blank lines and overly long sentences
        if sentence == "" or len(sentence) > 1000:
            text_line = in_file.readline()
            line_index += 1
            continue
        try:
            fact_triple_extract(sentence, out_file)
            out_file.flush()
        except Exception as e:
            print(e)
        sentence_number += 1
        if sentence_number % 50 == 0:
            print("%d done" % (sentence_number))
        text_line = in_file.readline()
        line_index += 1
    in_file.close()
    out_file.close()
def fact_triple_extract(sentence, out_file):
    """
    Extract fact triples from the given sentence.
    Args:
        sentence: the sentence to process
    """
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    netags = recognizer.recognize(words, postags)  # NE tags (kept from the original pipeline)
    arcs = parser.parse(words, postags)
    child_dict_list = build_parse_child_dict(words, postags, arcs)
    for index in range(len(postags)):
        # Extract fact triples centered on the predicate
        if postags[index] == 'v':
            child_dict = child_dict_list[index]
            # Subject-verb-object
            if 'SBV' in child_dict and 'VOB' in child_dict:
                for vob in child_dict['VOB']:
                    e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, vob)
                    out_file.write("SVO relation\t(%s, %s, %s)\n" % (e1, r, e2))
                    out_file.flush()
            elif 'VOB' in child_dict:
                # No explicit subject: fall back to "我" (the mail's author)
                for vob in child_dict['VOB']:
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, vob)
                    out_file.write("SVO relation\t(%s, %s, %s)\n" % ("我", r, e2))
                    out_file.flush()
            # Postposed attributive, verb-object relation
            if arcs[index].relation == 'ATT':
                if 'VOB' in child_dict:
                    for vob in child_dict['VOB']:
                        e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                        r = words[index]
                        e2 = complete_e(words, postags, child_dict_list, vob)
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            out_file.write("postposed-attributive VO relation\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
            # Subject-verb-complement containing a prepositional object
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    for pob in child_dict_list[cmp_index]['POB']:
                        e2 = complete_e(words, postags, child_dict_list, pob)
                        out_file.write("SVC with prepositional object\t(%s, %s, %s)\n" % (e1, r, e2))
                        out_file.flush()
            elif 'CMP' in child_dict:
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    for pob in child_dict_list[cmp_index]['POB']:
                        e2 = complete_e(words, postags, child_dict_list, pob)
                        out_file.write("SVC with prepositional object\t(%s, %s, %s)\n" % ("我", r, e2))
                        out_file.flush()
def build_parse_child_dict(words, postags, arcs):
    """
    For each word in the sentence, build a dict of its dependency children.
    Args:
        words: word segmentation list
        postags: part-of-speech list
        arcs: dependency-parse list
    """
    child_dict_list = []
    for index in range(len(words)):
        child_dict = dict()
        for arc_index in range(len(arcs)):
            # arc.head is 1-based, so head == index + 1 means the arc's
            # child word hangs off the current word
            if arcs[arc_index].head == index + 1:
                if arcs[arc_index].relation in child_dict:
                    child_dict[arcs[arc_index].relation].append(arc_index)
                else:
                    child_dict[arcs[arc_index].relation] = [arc_index]
        child_dict_list.append(child_dict)
    return child_dict_list
def complete_e(words, postags, child_dict_list, word_index):
    """
    Complete a partially recognized entity by recursively pulling in its
    attributive (ATT) children and, for verbs, its SBV/VOB children.
    """
    child_dict = child_dict_list[word_index]
    prefix = ''
    if 'ATT' in child_dict:
        for i in range(len(child_dict['ATT'])):
            prefix += complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
    postfix = ''
    if postags[word_index] == 'v':
        if 'VOB' in child_dict:
            postfix += complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
        if 'SBV' in child_dict:
            prefix = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
    return prefix + words[word_index] + postfix

if __name__ == "__main__":
    extraction_start(in_file_name, out_file_name, begin_line, end_line)
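To try the script (a usage sketch; the script name triple_extract.py and the sample sentence are assumptions, not from the original), put one sentence per line in input.txt, e.g. 回复傅强,抄送魏剑龙, and run:

python triple_extract.py input.txt output.txt 1 0

Since that sentence has no explicit subject, the elif 'VOB' branch falls back to 我, so output.txt would be expected to contain lines like (我, 回复, 傅强) and (我, 抄送, 魏剑龙), assuming LTP segments and parses the sentence as described above.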
In a pipeline component, the behavior is driven by configured parameters: chiefly the predicate, the POS tag, and the corresponding entity label, e.g. ["抄送"], ["nr"], ["mail_cc_nr"]. The component recognizes the verb's meaning via word-vector similarity: any predicate similar to 抄送 ("cc") serves as the center for identifying an SVO structure, and the object of that structure is then tagged with the corresponding semantically rich entity label, here the cc recipient.
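Here is a minimal sketch of that similarity-based predicate matching, assuming pretrained word vectors are already loaded into a dict mapping each word to a numpy array; the 0.8 threshold and the example word 转发 are illustrative assumptions:

import numpy as np

# Illustrative component config: predicate, POS tag, target entity label
PREDICATES = ["抄送"]
POS_TAGS = ["nr"]
ENTITY_LABELS = ["mail_cc_nr"]

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def match_predicate(verb, vectors, threshold=0.8):
    """Return the entity label whose configured predicate is most similar
    to `verb` in the word-vector space, if similarity clears the threshold.
    `vectors` maps a word to its embedding (assumed pretrained)."""
    if verb not in vectors:
        return None
    best_label, best_sim = None, threshold
    for pred, label in zip(PREDICATES, ENTITY_LABELS):
        if pred in vectors:
            sim = cosine(vectors[verb], vectors[pred])
            if sim >= best_sim:
                best_label, best_sim = label, sim
    return best_label

# E.g. match_predicate("转发", vectors) might return "mail_cc_nr" if
# 转发 ("forward") embeds close enough to 抄送 in the chosen vector space.

When a verb in an extracted SVO structure matches a configured predicate this way, the structure's object (with the configured POS, here nr) is emitted under the configured entity label.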