# Jieba-based address parsing: identifies address elements such as province, city, district, street and house number.
import jieba
# Load the trained lexicon that maps a word to its address tag.
# Each useful line of 'addr_tag.txt' holds exactly two whitespace-separated
# fields: "<word> <tag>" (for example a tag such as 'B-prov' or 'B-city').
tag_dict = {}
with open('addr_tag.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        # Blank lines (e.g. a trailing newline at end of file) and lines that
        # do not have exactly two fields cannot be unpacked into word/tag;
        # skip them instead of aborting the whole load with a ValueError.
        if len(parts) != 2:
            continue
        word, tag = parts
        tag_dict[word] = tag
# Words that are treated as direct-controlled municipalities: they fill both
# the province and the city element (with the '市' suffix appended).
_MUNICIPALITIES = ('北京', '上海', '天津', '重庆')

# Lexicon tag -> label used in the emitted B-/I-/E- tags.
# Administrative levels are only recognised through their 'B-' lexicon tag,
# while the number-like elements and POIs accept any of B-/I-/E-.
_TAG_TO_LABEL = {
    'B-city': 'city',
    'B-district': 'district',
    'B-town': 'town',
    'B-community': 'community',
    'B-devzone': 'devzone',
    'B-roadno': 'roadno', 'I-roadno': 'roadno', 'E-roadno': 'roadno',
    'B-houseno': 'houseno', 'I-houseno': 'houseno', 'E-houseno': 'houseno',
    'B-cellno': 'cellno', 'I-cellno': 'cellno', 'E-cellno': 'cellno',
    'B-floorno': 'floorno', 'I-floorno': 'floorno', 'E-floorno': 'floorno',
    'B-poi': 'poi', 'I-poi': 'poi', 'E-poi': 'poi',
}

# Label -> name of the address element that is overwritten with the word.
# 'roadno' (accumulated) and 'poi' (not stored) are handled separately.
_LABEL_TO_FIELD = {
    'city': 'city',
    'district': 'district',
    'town': 'street',
    'community': 'community',
    'devzone': 'devzone',
    'houseno': 'houseno',
    'cellno': 'cellno',
    'floorno': 'floorno',
}


def _bie_tags(word, label):
    """Return one tag per character of ``word``: B- first, E- last, I- between."""
    last = len(word) - 1
    tags = []
    for idx in range(len(word)):
        # Decide by position, not by character value, so that a word with a
        # repeated character (e.g. '南京南') is still tagged B, I, E.
        if idx == 0:
            prefix = 'B'
        elif idx == last:
            prefix = 'E'
        else:
            prefix = 'I'
        tags.append(prefix + '-' + label)
    return tags


def parse_address(address):
    """Segment an address with jieba and extract its elements and tags.

    Every word produced by ``jieba.cut`` is looked up in the module-level
    ``tag_dict``. Recognised words fill the matching address element and
    contribute one B-/I-/E- tag per character; unrecognised words contribute
    one 'O' tag per character, so the tag sequence stays aligned with the
    characters of the segmented words.

    Args:
        address: The raw address string.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, tags)``. The first nine items are strings
        ('' when the element was not found); ``tags`` is the space-joined
        tag sequence.
    """
    fields = dict.fromkeys(
        ('province', 'city', 'district', 'street', 'community',
         'devzone', 'houseno', 'cellno', 'floorno'),
        '',
    )
    tag_list = []

    for word in jieba.cut(address):
        tag = tag_dict.get(word)

        if tag == 'B-prov':
            label = 'prov'
            fields['province'] = word
        elif word in _MUNICIPALITIES:
            # A municipality is its own province and city.
            label = 'city'
            fields['province'] = fields['city'] = word + '市'
        else:
            label = _TAG_TO_LABEL.get(tag)
            if label is None:
                # Unknown word: one 'O' per character keeps tags aligned.
                tag_list.extend(['O'] * len(word))
                continue
            if label == 'roadno':
                # Road numbers are accumulated into houseno; a later
                # 'houseno' word overwrites the accumulated value.
                # NOTE(review): confirm that sharing houseno is intended.
                fields['houseno'] += word
            elif label != 'poi':
                # POI words are tagged but not stored in any element.
                fields[_LABEL_TO_FIELD[label]] = word

        tag_list.extend(_bie_tags(word, label))

    return (
        fields['province'],
        fields['city'],
        fields['district'],
        fields['street'],
        fields['community'],
        fields['devzone'],
        fields['houseno'],
        fields['cellno'],
        fields['floorno'],
        ' '.join(tag_list),
    )
# Read "<id><sep><address>" records from '1.txt', tag every address with
# parse_address() and write "<id><sep><address><sep><tags>" records, one per
# line, to the result file.
#
# NOTE(review): the separator literal was empty in the original code, which
# makes str.split raise "ValueError: empty separator" on the first record.
# '\x01' is assumed here as the lost control character, and '\n' as the
# record terminator (the original literal was a single space) — confirm both
# against the real input file and the expected submission format.
FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('对对对队_addr_parsing.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        record = line.strip()
        if not record:
            # Ignore blank lines such as a trailing newline at end of file.
            continue

        # Split on the first separator only so the address text is kept whole.
        id_, sep, info = record.partition(FIELD_SEP)
        if not sep:
            raise ValueError(
                'record has no field separator: {!r}'.format(record)
            )

        # Only the tag sequence (last item of the result) is written out.
        tags = parse_address(info)[-1]
        f_out.write(FIELD_SEP.join((id_, info, tags)) + '\n')
# Source: https://www.cveoy.top/t/topic/joLA — copyright belongs to the author; do not repost or scrape.