# CRF-based Chinese address element parsing (基于CRF的中文地址要素解析)
import jieba
import sklearn_crfsuite
from sklearn_crfsuite import metrics
# Words that name a municipality when they appear without the trailing '市'.
_MUNICIPALITIES = ('北京', '上海', '天津', '重庆')


def _position_tags(word, label):
    """Return one tag per character of *word*: B- first, E- last, I- between.

    A single-character word yields only ``['B-<label>']`` because the
    "first character" test takes priority over the "last character" test.
    """
    last = len(word) - 1
    tags = []
    for index in range(len(word)):
        if index == 0:
            prefix = 'B-'
        elif index == last:
            prefix = 'E-'
        else:
            prefix = 'I-'
        tags.append(prefix + label)
    return tags


def _classify(word):
    """Map a segmented word to ``(tag_label, field_updates)``.

    ``field_updates`` is a dict of address-component names to the value that
    should be stored for them (empty when the word is only tagged).
    The order of the checks is significant: e.g. '自治区' must be tested
    before the generic '区' suffix.
    """
    # Province / autonomous region.
    if word.endswith(('省', '自治区')):
        return 'prov', {'province': word}
    # Municipality given without '市': it is both the province and the city.
    if word in _MUNICIPALITIES:
        full_name = word + '市'
        return 'city', {'province': full_name, 'city': full_name}
    # Ordinary city.
    if word.endswith('市'):
        return 'city', {'city': word}
    # NOTE(review): these words are tagged 'district' but stored in the
    # 'community' field; kept as-is to preserve existing output -- confirm.
    # ('社区' is covered by the '区' suffix.)
    if word.endswith(('区', '庄', '塘', '演')):
        return 'district', {'community': word}
    # Village / county.
    if word.endswith(('村', '县')):
        return 'district', {'district': word}
    # Development zone ("...城").
    if word.endswith('城'):
        return 'devzone', {'devzone': word}
    # Street / road / lane.
    if word.endswith(('街', '路', '巷', '道')):
        return 'town', {'street': word}
    # Purely numeric word: tagged as a road number, not stored in any field.
    if word.isdigit():
        return 'roadno', {}
    # Building number.
    if word.endswith(('栋', '幢', '座')):
        return 'houseno', {'houseno': word}
    # Unit.
    if word.endswith('单元'):
        return 'cellno', {'cellno': word}
    # Floor.
    if word.endswith('楼'):
        return 'floorno', {'floorno': word}
    # Anything else is treated as a point of interest.
    return 'poi', {}


def parse_address(address):
    """Split *address* with jieba and label each word using suffix rules.

    Each word produced by ``jieba.cut`` is classified by its suffix (or by
    being a known municipality name / all digits). For every character of the
    word a ``B-``/``I-``/``E-`` tag with the matching label is appended to the
    tag list, and the word is recorded in the corresponding address component.
    When several words match the same component, the last one wins.

    Args:
        address: The address string to parse.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, tag_list)``. The first nine items are
        strings (``''`` when no word matched) and ``tag_list`` is a list with
        one tag per character of every segmented word.
    """
    fields = {
        'province': '',
        'city': '',
        'district': '',
        'street': '',
        'community': '',
        'devzone': '',
        'houseno': '',
        'cellno': '',
        'floorno': '',
    }
    tag_list = []

    for word in jieba.cut(address):
        label, updates = _classify(word)
        tag_list.extend(_position_tags(word, label))
        fields.update(updates)

    return (
        fields['province'],
        fields['city'],
        fields['district'],
        fields['street'],
        fields['community'],
        fields['devzone'],
        fields['houseno'],
        fields['cellno'],
        fields['floorno'],
        tag_list,
    )
# Read the training data.
# Each non-blank line is expected to hold "<char> <tag>".
# NOTE(review): blank lines are assumed to separate sentences (the usual
# CoNLL layout) -- confirm against train.conll.
train_data = []  # list of sentences; each sentence is a list of (char, tag)
with open('train.conll', 'r', encoding='utf-8') as f:
    current_sentence = []
    for line_no, raw_line in enumerate(f, start=1):
        stripped = raw_line.strip()
        if not stripped:
            # A blank line closes the sentence collected so far.
            if current_sentence:
                train_data.append(current_sentence)
                current_sentence = []
            continue
        parts = stripped.split()
        if len(parts) != 2:
            raise ValueError(
                f"train.conll line {line_no}: expected '<char> <tag>', "
                f"got {stripped!r}"
            )
        char, tag = parts
        current_sentence.append((char, tag))
    # Flush the last sentence when the file does not end with a blank line.
    if current_sentence:
        train_data.append(current_sentence)

# Convert to the format the CRF expects: one list of per-character feature
# dicts and one parallel list of tags for every sentence.
X_train = [[{'char': char} for char, tag in sentence] for sentence in train_data]
y_train = [[tag for char, tag in sentence] for sentence in train_data]

# Train the CRF model.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
# Read the test data, predict a tag per character and write the results.
# NOTE(review): the separator literal was empty in the original
# (str.split('') raises ValueError). '\x01' is assumed here because a control
# character appears to have been lost from the source -- confirm against
# the actual format of 1.txt.
FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('output.txt', 'w', encoding='utf-8') as f_out:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Nothing to predict for an empty line.
            continue
        # Split only on the first separator so the address text stays whole.
        id_, sep, info = line.partition(FIELD_SEP)
        if not sep:
            raise ValueError(
                f"1.txt line {line_no}: missing field separator in {line!r}"
            )
        # One feature dict per character, matching the training features.
        X_test = [{'char': char} for char in info]
        # Predict the tag sequence for this address.
        y_pred = crf.predict_single(X_test)
        # Output line: id, original text and space-joined tags.
        f_out.write(FIELD_SEP.join((id_, info, ' '.join(y_pred))) + '\n')
# Source (原文地址): https://www.cveoy.top/t/topic/joLU -- copyright belongs to the author; do not repost or scrape (著作权归作者所有。请勿转载和采集!)