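# Summary (truncated in the source): import jieba; def parse_address(address): segment the address with jieba.cut(address); initialise the address elements province, city, district, street and road_no to ''; then, for each word in the segmentation result, determine whether it is a ...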
import jieba
# Direct-administered municipalities: each is treated as both province and city.
_MUNICIPALITIES = frozenset({'北京', '上海', '天津', '重庆'})


def _print_bie_tags(word, label):
    """Print one ``<char> <tag>`` line for every character of *word*.

    The first character is tagged ``B-<label>``, the last ``E-<label>`` and
    the ones in between ``I-<label>``. A one-character word gets ``B-`` only.
    """
    last = len(word) - 1
    for index, char in enumerate(word):
        # Decide by position, not by comparing character values: a repeated
        # character (e.g. every digit of "111") must not be tagged "B" again.
        if index == 0:
            prefix = 'B'
        elif index == last:
            prefix = 'E'
        else:
            prefix = 'I'
        print(char, prefix + '-' + label)


def parse_address(address, tokenizer=None):
    """Segment *address* and classify each word as an address element.

    For every word, one ``<char> <tag>`` line per character is printed to
    stdout. The tag labels are ``prov``, ``city``, ``district``, ``town``
    (street-like words), ``roadno`` (all-digit words) and ``poi`` (anything
    else).

    Args:
        address: The address string to analyse.
        tokenizer: Optional callable that takes the address and returns an
            iterable of words. Defaults to ``jieba.cut``.

    Returns:
        A ``(province, city, district, street)`` tuple of strings. Each item
        is the last word classified as that element, or ``''`` if none was.
    """
    if tokenizer is None:
        tokenizer = jieba.cut

    province = city = district = street = ''

    for word in tokenizer(address):
        # Normalise "北京市" to "北京" so both spellings of a municipality are
        # handled by the same branch and both fill in the province.
        municipality = word[:-1] if word.endswith('市') else word

        if word.endswith(('省', '自治区')):
            _print_bie_tags(word, 'prov')
            province = word
        elif municipality in _MUNICIPALITIES:
            _print_bie_tags(word, 'city')
            province = municipality + '市'
            city = municipality + '市'
        elif word.endswith('市'):
            _print_bie_tags(word, 'city')
            city = word
        elif word.endswith(('区', '县')):
            _print_bie_tags(word, 'district')
            district = word
        elif word.endswith(('街', '路', '巷', '道')):
            _print_bie_tags(word, 'town')
            street = word
        elif word.isdigit():
            # House numbers are only tagged; they are not part of the result.
            _print_bie_tags(word, 'roadno')
        else:
            _print_bie_tags(word, 'poi')

    return province, city, district, street
# Read the address information from the file
# NOTE(review): the separator literal is empty in the source, and
# str.split('') raises ValueError. A non-printable delimiter was presumably
# lost when the code was copied; '\x01' is an assumption - confirm against
# the actual layout of 1.txt.
FIELD_SEPARATOR = '\x01'

# Convert every input line of 1.txt into per-character rows in dev.conll.
with open('1.txt', 'r', encoding='utf-8') as f, \
        open('dev.conll', 'w', encoding='utf-8') as fw:
    for line in f:
        # The address is the second separator-delimited field of the line.
        fields = line.strip().split(FIELD_SEPARATOR)
        if len(fields) < 2:
            # Blank or malformed line: there is no second field to read.
            continue
        info = fields[1]

        # parse_address prints the per-character tags to stdout; the returned
        # elements are not used when writing the file below.
        province, city, district, street = parse_address(info)

        # One "<char> O" row per character; a space becomes an empty row.
        for char in info:
            if char == ' ':
                fw.write('\n')
            else:
                fw.write(char + ' O\n')
        # An empty row closes the record for this address.
        fw.write('\n')
        # NOTE(review): the source continues with a print(' ' call that is cut
        # off mid-statement; its argument cannot be recovered, so it is omitted.
原文地址: https://www.cveoy.top/t/topic/fyZy 著作权归作者所有。请勿转载和采集!