中文地址要素解析 - 基于CRF模型的地址信息提取
import jieba
import pycrfsuite
# 定义特征函数
def word2features(sent, i):
    """Build the CRF feature list for the token at position ``i``.

    Args:
        sent: Sequence of ``(word, label)`` pairs. Only the word (element 0)
            of each pair is read here.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: a bias term, the word itself and whether
        it is all digits, then the same two features for the previous and
        the next word (prefixed ``-1:`` / ``+1:``). At the sentence
        boundaries the missing neighbour is replaced by ``'BOS'`` or
        ``'EOS'``.
    """
    word = sent[i][0]
    # The feature strings are handed verbatim to the CRF tagger, so their
    # spelling is deliberately left unchanged.
    features = [
        'bias',
        'word=' + word,
        'word.isdigit=%s' % word.isdigit(),
    ]

    if i > 0:
        prev_word = sent[i - 1][0]
        features.extend([
            '-1:word=' + prev_word,
            '-1:word.isdigit=%s' % prev_word.isdigit(),
        ])
    else:
        # First token: mark beginning of sequence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.extend([
            '+1:word=' + next_word,
            '+1:word.isdigit=%s' % next_word.isdigit(),
        ])
    else:
        # Last token: mark end of sequence.
        features.append('EOS')

    return features
# 定义标签序列
def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags
# 定义特征序列
def sent2features(sent):
    """Return one feature list per token of ``sent``, in token order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows
# 解析地址
def parse_address(address):
    """Split an address string into its components with the CRF model.

    The address is tokenised with jieba, each token is turned into a feature
    list by ``sent2features``, and the model stored in
    ``addr_parser.crfsuite`` predicts one label per token. Tokens whose
    label starts with ``B`` or ``I`` are concatenated, in order, into the
    component selected by the label's suffix; all other tokens are ignored.

    Args:
        address: The raw address text.

    Returns:
        A 9-tuple of strings ``(province, city, district, street, community,
        devzone, houseno, cellno, floorno)``. Components that were not found
        are empty strings.

    Errors raised while opening the model file or tagging propagate to the
    caller.
    """
    field_order = (
        'province', 'city', 'district', 'street', 'community',
        'devzone', 'houseno', 'cellno', 'floorno',
    )
    # Label suffix -> output component, checked in this order.
    # NOTE(review): 'roadno' is folded into houseno and 'town' into street,
    # exactly as the original branches did; confirm this is intended.
    suffix_to_field = (
        ('prov', 'province'),
        ('city', 'city'),
        ('district', 'district'),
        ('town', 'street'),
        ('devzone', 'devzone'),
        ('roadno', 'houseno'),
        ('houseno', 'houseno'),
        ('cellno', 'cellno'),
        ('floorno', 'floorno'),
        ('community', 'community'),
    )

    # Tokenise. jieba.cut returns a one-shot generator, so materialise it:
    # iterating it once to build the labels and then zipping it again left
    # the sentence empty and no component was ever extracted.
    words = list(jieba.cut(address))
    if not words:
        # Nothing to tag; skip loading the model.
        return ('',) * len(field_order)

    # Pair every token with a placeholder label; only the token is used by
    # the feature functions.
    sent = [(word, 'O') for word in words]
    features = sent2features(sent)

    # Load the model and predict the label sequence, releasing the model
    # afterwards even if tagging fails.
    tagger = pycrfsuite.Tagger()
    tagger.open('addr_parser.crfsuite')
    try:
        pred_labels = tagger.tag(features)
    finally:
        tagger.close()

    # Collect the tokens of each component according to the predicted labels.
    parts = {field: [] for field in field_order}
    for word, label in zip(words, pred_labels):
        # Begin (B) and inside (I) labels are treated identically.
        if not label.startswith(('B', 'I')):
            continue
        for suffix, field in suffix_to_field:
            if label.endswith(suffix):
                parts[field].append(word)
                break

    return tuple(''.join(parts[field]) for field in field_order)
# 读取文件中的地址信息
# Batch driver: read "<id><sep><address>" records from 1.txt, parse each
# address and write "<id><sep><nine components joined by sep>" lines to the
# result file.
#
# The separator literals in this file were empty (str.split('') raises
# "ValueError: empty separator") and the record terminator was a blank,
# which would have put every record on a single line.
# NOTE(review): '\x01' is an assumed delimiter -- confirm it against the
# actual layout of 1.txt and adjust if the data uses another separator.
_FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('地址要素解析结果.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the first separator only so the address text stays whole;
        # a record without a separator raises ValueError, as before.
        id_, info = line.split(_FIELD_SEP, 1)
        # Parse the address into its nine components.
        components = parse_address(info)
        # Write one record per line.
        f_out.write(_FIELD_SEP.join((id_,) + tuple(components)) + '\n')
原文地址: https://www.cveoy.top/t/topic/joLS 著作权归作者所有。请勿转载和采集!