import jieba
import pycrfsuite

# Define the feature function.

def word2features(sent, i):
    """Build the CRF feature list for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of items whose first element is the word
            (``sent[k][0]``); any further elements are not read here.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: a bias term, the word and its
        ``isdigit`` flag, the same two features for the previous and the
        next word when they exist, and a ``'BOS'`` / ``'EOS'`` marker when
        the token is the first / last one in the sequence.
    """
    # NOTE(review): presumably these strings must match the features used
    # when the CRF model was trained -- keep them byte-identical.
    word = sent[i][0]
    features = [
        'bias',
        'word=' + word,
        'word.isdigit=%s' % word.isdigit(),
    ]

    if i > 0:
        prev_word = sent[i - 1][0]
        features.extend([
            '-1:word=' + prev_word,
            '-1:word.isdigit=%s' % prev_word.isdigit(),
        ])
    else:
        # First token: mark the beginning of the sequence.
        features.append('BOS')

    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        features.extend([
            '+1:word=' + next_word,
            '+1:word.isdigit=%s' % next_word.isdigit(),
        ])
    else:
        # Last token: mark the end of the sequence.
        features.append('EOS')

    return features

# Define the label sequence.

def sent2labels(sent):
    """Return the labels of a sentence given as ``(token, label)`` pairs.

    Args:
        sent: Iterable of two-element items; the second element of each
            item is the label.

    Returns:
        A list with one label per item, in the original order.
    """
    collected = []
    for _token, tag in sent:
        collected.append(tag)
    return collected

# Define the feature sequence.

def sent2features(sent):
    """Return the feature list of every position in ``sent``.

    Args:
        sent: Sequence accepted by ``word2features``; its length decides
            how many positions are processed.

    Returns:
        A list holding ``word2features(sent, k)`` for each index ``k`` from
        0 to ``len(sent) - 1``, in order.
    """
    per_token = []
    positions = range(len(sent))
    for position in positions:
        per_token.append(word2features(sent, position))
    return per_token

# Parse an address.

# Maps the suffix of a predicted tag to the address element the tagged word
# is appended to.  The order and the targets mirror the original if/elif
# chain: "town" feeds the street element and "roadno" feeds the house-number
# element.
_TAG_SUFFIX_TO_ELEMENT = (
    ('prov', 'province'),
    ('city', 'city'),
    ('district', 'district'),
    ('town', 'street'),
    ('devzone', 'devzone'),
    ('roadno', 'houseno'),
    ('houseno', 'houseno'),
    ('cellno', 'cellno'),
    ('floorno', 'floorno'),
    ('community', 'community'),
)

# Order in which parse_address returns the address elements.
_ELEMENT_ORDER = (
    'province',
    'city',
    'district',
    'street',
    'community',
    'devzone',
    'houseno',
    'cellno',
    'floorno',
)


def parse_address(address, model_path='addr_parser.crfsuite'):
    """Split ``address`` into address elements with a CRF tagger.

    The address is segmented with ``jieba.cut``, every word is turned into
    features with ``sent2features``, and the tags predicted by the
    ``pycrfsuite`` model decide which element each word is appended to.
    Only tags starting with ``'B'`` or ``'I'`` contribute; words with any
    other tag are dropped.

    Args:
        address: The address text to parse.
        model_path: Path of the CRF model file opened by the tagger.

    Returns:
        A tuple of nine strings:
        ``(province, city, district, street, community, devzone, houseno,
        cellno, floorno)``.  Elements with no matching words are ``''``.
    """
    # jieba.cut returns a generator.  Materialise it once: iterating it to
    # build the labels and then zipping it again would leave the second
    # pass empty, so no word would ever reach the tagger.
    words = list(jieba.cut(address))
    if not words:
        # Nothing to tag; skip loading the model.
        return ('',) * len(_ELEMENT_ORDER)

    # The 'O' labels are placeholders so each item has the (word, label)
    # shape; word2features only reads the word.
    sent = [(word, 'O') for word in words]
    features = sent2features(sent)

    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    try:
        pred_labels = tagger.tag(features)
    finally:
        # Release the model even if tagging fails.
        tagger.close()

    parts = {element: [] for element in _ELEMENT_ORDER}
    for (word, _placeholder), tag in zip(sent, pred_labels):
        if not tag.startswith(('B', 'I')):
            continue
        for suffix, element in _TAG_SUFFIX_TO_ELEMENT:
            if tag.endswith(suffix):
                parts[element].append(word)
                break

    return tuple(''.join(parts[element]) for element in _ELEMENT_ORDER)

# Read the address records from the input file.

# Batch driver: read "<id><sep><address>" records from 1.txt, parse each
# address, and write "<id><sep><element>..." records to the result file.
#
# NOTE(review): the separator literal was the empty string, which makes
# str.split raise "ValueError: empty separator" and makes the output fields
# run together.  Presumably a non-printable delimiter was lost when the file
# was copied; '\x01' is assumed here -- confirm against the data file.
# NOTE(review): each record was terminated with ' '; this looks like a
# newline that was flattened to a space, so '\n' is used -- confirm.
FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('地址要素解析结果.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        record = line.strip()
        if not record:
            # Skip blank lines instead of failing on the unpacking below.
            continue
        # Split once so a separator inside the address text stays in it.
        id_, info = record.split(FIELD_SEP, 1)
        elements = parse_address(info)
        f_out.write(FIELD_SEP.join((id_, *elements)) + '\n')

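# Chinese address element parsing: CRF-based extraction of address information
#
# Define the feature function
#
# Define the label sequence
#
# Define the feature sequence
#
# Parse an address
#
# Read the address records from the input file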