# CRF-based Chinese address element parsing (基于CRF模型的中文地址要素解析)
import codecs
import os

import jieba
import pycrfsuite
def word2features(sent, i):
    """Build the CRF feature list for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token tuples. Element 0 of each tuple is the word
            and element 1 is its part-of-speech tag; any further elements
            (such as a gold label) are ignored.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: features of the token itself, followed by
        features of the previous token (or the marker ``'BOS'`` when ``i`` is
        the first position), followed by features of the next token (or the
        marker ``'EOS'`` when ``i`` is the last position).
    """
    word = sent[i][0]
    postag = sent[i][1]

    # Features describing the current token.
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
    ]

    # Left context, or a begin-of-sentence marker.
    if i > 0:
        prev_word = sent[i - 1][0]
        prev_postag = sent[i - 1][1]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
            '-1:postag=' + prev_postag,
        ])
    else:
        features.append('BOS')

    # Right context, or an end-of-sentence marker.
    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        next_postag = sent[i + 1][1]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
            '+1:postag=' + next_postag,
        ])
    else:
        features.append('EOS')

    return features
def extract_features(sentence):
    """Return one feature list per token of ``sentence``.

    Args:
        sentence: Sequence of token tuples as accepted by ``word2features``.

    Returns:
        A list with the same length as ``sentence``; element ``i`` is
        ``word2features(sentence, i)``.
    """
    return [word2features(sentence, i) for i, _ in enumerate(sentence)]
def get_labels(sentence):
    """Return the label of every token in ``sentence``.

    Args:
        sentence: Iterable of ``(word, postag, label)`` triples.

    Returns:
        A list holding the third element of each triple, in order.
    """
    return [label for _word, _postag, label in sentence]
def read_data(file_path):
    """Read a whitespace-separated, blank-line-delimited tagged corpus.

    Every non-blank line must hold exactly three whitespace-separated fields:
    word, part-of-speech tag and label. One or more blank lines end the
    current sentence.

    Args:
        file_path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of sentences, each a list of ``(word, postag, label)`` tuples.
        Empty sentences are never emitted.

    Raises:
        ValueError: If a non-blank line does not contain exactly three fields.
    """
    data = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            fields = line.split()
            if len(fields) != 3:
                raise ValueError(
                    '%s:%d: expected 3 fields (word, postag, label), got %d'
                    % (file_path, line_no, len(fields))
                )
            word, postag, label = fields
            sentence.append((word, postag, label))
    # The file may end without a trailing blank line.
    if sentence:
        data.append(sentence)
    return data
# Load the training corpus.
train_data = read_data('train.conll')

# Derive the feature sequence and the label sequence of every training sentence.
X_train = [extract_features(sentence) for sentence in train_data]
y_train = [get_labels(sentence) for sentence in train_data]

# Train the CRF model and write it to 'address.crfsuite'.
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 100,
    'feature.possible_transitions': True,
})
trainer.train('address.crfsuite')

# Load the model that was just trained; parse_address uses this tagger.
tagger = pycrfsuite.Tagger()
tagger.open('address.crfsuite')
# Maps the entity part of a predicted label (the text after 'B-', 'I-', ...)
# to the address element it contributes to.
# NOTE(review): 'roadno' and 'houseno' both feed the house-number element, so
# whichever begins later overwrites the other — confirm this is intended.
_LABEL_TO_ELEMENT = {
    'prov': 'province',
    'city': 'city',
    'district': 'district',
    'devzone': 'devzone',
    'town': 'street',
    'roadno': 'houseno',
    'houseno': 'houseno',
    'cellno': 'cellno',
    'floorno': 'floorno',
    'community': 'community',
}


def parse_address(address):
    """Segment ``address`` with jieba and tag it with the module-level CRF ``tagger``.

    The whole word sequence is tagged in one call so that the neighbouring-word
    features produced by ``word2features`` are actually used. Labels are
    interpreted by their prefix: ``B-``/``S-`` start an element (replacing any
    earlier value) and ``I-``/``E-`` append to it. Labels whose entity part is
    not listed in ``_LABEL_TO_ELEMENT`` only appear in the returned tag string.

    Args:
        address: The address text to parse.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, tags)``. The first nine items are strings
        (empty when the element was not found); ``tags`` is the predicted
        labels of all words joined by single spaces.
    """
    words = list(jieba.cut(address))

    # NOTE(review): the training features include the POS tags read from the
    # corpus, but no POS tag is computed here, so an empty tag is supplied.
    sent = [(word, '') for word in words]

    if sent:
        # tagger.tag expects a sequence of items, one feature list per token.
        xseq = [word2features(sent, i) for i, _ in enumerate(sent)]
        tag_list = list(tagger.tag(xseq))
    else:
        tag_list = []

    elements = dict.fromkeys(_LABEL_TO_ELEMENT.values(), '')
    for word, label in zip(words, tag_list):
        prefix, _, entity = label.partition('-')
        element = _LABEL_TO_ELEMENT.get(entity)
        if element is None:
            continue
        if prefix in ('B', 'S'):
            elements[element] = word
        elif prefix in ('I', 'E'):
            elements[element] += word

    return (
        elements['province'],
        elements['city'],
        elements['district'],
        elements['street'],
        elements['community'],
        elements['devzone'],
        elements['houseno'],
        elements['cellno'],
        elements['floorno'],
        ' '.join(tag_list),
    )
# Read the addresses from '1.txt', tag each one and write
# "<id><sep><address><sep><tags>" records to the output file.
# NOTE(review): the separator and the record terminator were empty/blank in
# the copy this file was recovered from (split('') always raises ValueError).
# '\x01' and '\n' are assumed here — confirm against the input data format.
FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('对对对队_addr_parsing_crf.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the unpack below.
            continue
        # Split on the first separator only; the rest is the address text.
        id_, info = line.split(FIELD_SEP, 1)
        # Only the tag string (last item of the result) is written out.
        tags = parse_address(info)[-1]
        f_out.write(id_ + FIELD_SEP + info + FIELD_SEP + tags + '\n')
# Source: https://www.cveoy.top/t/topic/joLT — copyright belongs to the author; please do not repost or scrape. (原文地址; 著作权归作者所有。请勿转载和采集!)