import jieba

# 读取训练好的标签

# Build the word -> tag lookup table used by the address parser.
# Each non-blank line of addr_tag.txt must contain exactly two
# whitespace-separated fields: the word and its tag.
tag_dict = {}
with open('addr_tag.txt', 'r', encoding='utf-8') as f:
    for lineno, line in enumerate(f, start=1):
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # entry; skipping them avoids an opaque unpacking error.
            continue
        if len(fields) != 2:
            # Keep failing loudly on malformed input, but say where it is.
            raise ValueError(
                "addr_tag.txt line %d: expected 'word tag', got %r"
                % (lineno, line)
            )
        word, tag = fields
        tag_dict[word] = tag

# Direct-administered municipalities: the bare name is both province and city.
_MUNICIPALITIES = ('北京', '上海', '天津', '重庆')

# Dictionary tag -> label used in the emitted per-character tags.
# Administrative levels are only recognised through their 'B-' tag.
_TAG_TO_LABEL = {
    'B-prov': 'prov',
    'B-city': 'city',
    'B-district': 'district',
    'B-town': 'town',
    'B-community': 'community',
    'B-devzone': 'devzone',
}
# Number-like elements and POIs are recognised through any of B-/I-/E-.
_TAG_TO_LABEL.update(
    (prefix + '-' + label, label)
    for label in ('roadno', 'houseno', 'cellno', 'floorno', 'poi')
    for prefix in 'BIE'
)

# Label -> name of the address element that the matched word overwrites.
# 'roadno' (appended to houseno) and 'poi' (tag only) are handled separately.
_LABEL_TO_ELEMENT = {
    'prov': 'province',
    'city': 'city',
    'district': 'district',
    'town': 'street',
    'community': 'community',
    'devzone': 'devzone',
    'houseno': 'houseno',
    'cellno': 'cellno',
    'floorno': 'floorno',
}


def _char_tags(word, label):
    """Return one B-/I-/E- tag per character of *word*, chosen by position."""
    last = len(word) - 1
    tags = []
    for index in range(len(word)):
        # Decide by index, not by character value: comparing characters
        # mis-tags words that repeat their first or last character.
        if index == 0:
            # A single-character word is tagged 'B-', as before.
            # NOTE(review): a BIES scheme would use 'S-' here; confirm
            # against the expected output format.
            prefix = 'B-'
        elif index == last:
            prefix = 'E-'
        else:
            prefix = 'I-'
        tags.append(prefix + label)
    return tags


def parse_address(address):
    """Split *address* into address elements and a per-character tag string.

    The address is segmented with ``jieba.cut``; each resulting word is looked
    up in the module-level ``tag_dict`` to decide which element it belongs to.

    Args:
        address: The raw address text.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, tags)``. Elements that were not found are
        empty strings. ``tags`` is a space-separated string with one tag per
        character of the segmented words: ``B-x`` / ``I-x`` / ``E-x`` for
        recognised words and ``O`` for every character of unrecognised ones.
    """
    elements = dict.fromkeys(
        ('province', 'city', 'district', 'street', 'community',
         'devzone', 'houseno', 'cellno', 'floorno'),
        '',
    )
    tag_list = []

    for word in jieba.cut(address):
        tag = tag_dict.get(word)  # single lookup per word

        if tag != 'B-prov' and word in _MUNICIPALITIES:
            # An explicit province tag wins; otherwise a municipality name
            # fills both province and city and is tagged as a city.
            elements['province'] = elements['city'] = word + '市'
            label = 'city'
        else:
            label = _TAG_TO_LABEL.get(tag)
            if label is None:
                # One 'O' per character keeps the tag sequence aligned with
                # the characters, like every other branch.
                tag_list.extend(['O'] * len(word))
                continue
            if label == 'roadno':
                # Road numbers accumulate into houseno; a later 'houseno'
                # word overwrites the accumulated value.
                elements['houseno'] += word
            elif label in _LABEL_TO_ELEMENT:
                elements[_LABEL_TO_ELEMENT[label]] = word
            # 'poi' words only contribute tags.

        tag_list.extend(_char_tags(word, label))

    return (
        elements['province'],
        elements['city'],
        elements['district'],
        elements['street'],
        elements['community'],
        elements['devzone'],
        elements['houseno'],
        elements['cellno'],
        elements['floorno'],
        ' '.join(tag_list),
    )

# 读取文件中的地址信息

# Tag every address in 1.txt and write one "id<SEP>address<SEP>tags" record
# per line to the result file.
#
# NOTE(review): in the pasted source the field separator and the record
# terminator appeared as '' and ' ', and str.split('') always raises
# "ValueError: empty separator". The literals look like control characters
# that were stripped from the listing. '\x01' and '\n' are assumed here;
# confirm against the real data files and adjust the two constants if needed.
_FIELD_SEP = '\x01'
_RECORD_END = '\n'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('对对对队_addr_parsing.txt', 'w', encoding='utf-8') as f_out:
    for lineno, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # Ignore blank lines instead of failing on them.
            continue
        id_, sep, info = record.partition(_FIELD_SEP)
        if not sep:
            raise ValueError(
                '1.txt line %d: field separator %r not found in %r'
                % (lineno, _FIELD_SEP, record)
            )
        # Only the tag string (last item of the returned tuple) is written.
        tags = parse_address(info)[-1]
        f_out.write(id_ + _FIELD_SEP + info + _FIELD_SEP + tags + _RECORD_END)

# 基于 Jieba 的地址解析算法:识别省市区街道门牌号等要素

# 原文地址: https://www.cveoy.top/t/topic/joLA 著作权归作者所有。请勿转载和采集!

# 免费AI点我,无需注册和登录