import codecs
import os

import jieba
import pycrfsuite

# 定义特征函数

def word2features(sent, i):
    """Build the CRF feature list for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of tokens. Each token is indexable, with the word at
            index 0 and its POS tag at index 1; any further fields are not
            read by this function.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: the token's own features, followed by the
        previous token's features (or ``'BOS'`` when ``i`` is the first
        position), followed by the next token's features (or ``'EOS'`` when
        ``i`` is the last position).
    """
    word = sent[i][0]
    postag = sent[i][1]

    # Features of the current token.
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
    ]

    # Left context, or a beginning-of-sentence marker.
    if i > 0:
        prev_word = sent[i - 1][0]
        prev_postag = sent[i - 1][1]
        features.extend([
            '-1:word.lower=' + prev_word.lower(),
            '-1:word.istitle=%s' % prev_word.istitle(),
            '-1:word.isupper=%s' % prev_word.isupper(),
            '-1:postag=' + prev_postag,
        ])
    else:
        features.append('BOS')

    # Right context, or an end-of-sentence marker.
    if i < len(sent) - 1:
        next_word = sent[i + 1][0]
        next_postag = sent[i + 1][1]
        features.extend([
            '+1:word.lower=' + next_word.lower(),
            '+1:word.istitle=%s' % next_word.istitle(),
            '+1:word.isupper=%s' % next_word.isupper(),
            '+1:postag=' + next_postag,
        ])
    else:
        features.append('EOS')

    return features

# 定义特征函数，提取每个词的特征

def extract_features(sentence):
    """Return one feature list per token of ``sentence``.

    Args:
        sentence: Sequence of tokens in the form accepted by
            ``word2features`` (word at index 0, POS tag at index 1).

    Returns:
        A list with ``len(sentence)`` entries; entry ``i`` is
        ``word2features(sentence, i)``. An empty sentence yields ``[]``.
    """
    return [word2features(sentence, i) for i in range(len(sentence))]

# 定义标签函数，提取每个词的标签

def get_labels(sentence):
    """Return the label of every token in ``sentence``.

    Args:
        sentence: Iterable of ``(word, postag, label)`` triples.

    Returns:
        A list holding the third element of each triple, in order.

    Raises:
        ValueError: If a token does not unpack into exactly three fields.
    """
    return [label for _word, _postag, label in sentence]

# 定义读取数据函数

def read_data(file_path):
    """Read a UTF-8, CoNLL-style file into a list of sentences.

    Every non-blank line must hold three whitespace-separated columns:
    word, POS tag and label. Blank lines separate sentences; consecutive
    blank lines do not produce empty sentences.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of sentences, each a list of ``(word, postag, label)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three columns.
        OSError: If the file cannot be opened.
    """
    data = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line closes the current sentence, if there is one.
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            fields = line.split()
            if len(fields) != 3:
                # Same exception type as a failed tuple unpacking, but with
                # enough context to locate the offending line.
                raise ValueError(
                    '%s:%d: expected 3 whitespace-separated columns '
                    '(word, postag, label), got %d'
                    % (file_path, line_no, len(fields))
                )
            sentence.append(tuple(fields))
    # The file may end without a trailing blank line.
    if sentence:
        data.append(sentence)
    return data

# 读取训练数据

train_data = read_data('train.conll')

# Extract the features and labels of the training data.
X_train = [extract_features(sentence) for sentence in train_data]
y_train = [get_labels(sentence) for sentence in train_data]

# Train the CRF model: register every (features, labels) sequence pair,
# configure the trainer, then write the model to 'address.crfsuite'.
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 1.0,
    'c2': 1e-3,
    'max_iterations': 100,
    'feature.possible_transitions': True,
})
trainer.train('address.crfsuite')

# Load the trained CRF model for tagging.
tagger = pycrfsuite.Tagger()
tagger.open('address.crfsuite')

# 定义解析地址函数

# Maps the entity type of a predicted label (the part after the "B-"/"I-"/
# "E-" prefix) to the address component it fills. 'roadno' and 'houseno'
# both feed ``houseno`` and 'town' feeds ``street``, as in the original
# if/elif chain.
_LABEL_TYPE_TO_FIELD = {
    'prov': 'province',
    'city': 'city',
    'district': 'district',
    'devzone': 'devzone',
    'town': 'street',
    'roadno': 'houseno',
    'houseno': 'houseno',
    'cellno': 'cellno',
    'floorno': 'floorno',
    'community': 'community',
}

# Order in which the address components appear in parse_address's result.
_ADDRESS_FIELDS = (
    'province', 'city', 'district', 'street', 'community',
    'devzone', 'houseno', 'cellno', 'floorno',
)


def _collect_address_fields(words, labels):
    """Fold per-word labels into a dict of address component strings."""
    fields = dict.fromkeys(_ADDRESS_FIELDS, '')
    for word, label in zip(words, labels):
        prefix, _, entity = label.partition('-')
        field = _LABEL_TYPE_TO_FIELD.get(entity)
        if field is None:
            # Labels of other entity types (and labels without a type) do
            # not contribute to any component.
            continue
        if prefix in ('B', 'S'):
            # Start of an entity: (re)start the component with this word.
            # NOTE(review): 'S-' (single-word entity) is accepted in case the
            # model's tag set uses it; confirm against the training labels.
            fields[field] = word
        elif prefix in ('I', 'E'):
            # Continuation or end of an entity: extend the component.
            fields[field] += word
    return fields


def parse_address(address):
    """Segment ``address`` with jieba, tag it with the CRF and collect parts.

    The whole word sequence is tagged in a single ``tagger.tag`` call, which
    takes one feature list per token, so each word's features include its
    neighbours. Tagging words one at a time would mark every word as both
    the beginning and the end of a sentence.

    Args:
        address: Address string to parse.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, tags)``. The first nine entries are the
        extracted component strings (``''`` when absent); ``tags`` is the
        predicted labels joined by single spaces, one label per jieba word.
    """
    words = list(jieba.cut(address))
    if not words:
        # Nothing to tag: every component and the tag string are empty.
        return ('',) * (len(_ADDRESS_FIELDS) + 1)

    # jieba.cut yields words only, so the POS slot is left empty.
    # NOTE(review): the training data carries POS tags while inference passes
    # '', so the 'postag=' features differ between training and prediction.
    sent = [(word, '') for word in words]
    labels = list(tagger.tag(extract_features(sent)))

    fields = _collect_address_fields(words, labels)
    return tuple(fields[name] for name in _ADDRESS_FIELDS) + (' '.join(labels),)

# 读取文件中的地址信息

# Separator between the fields of an input/output record.
# NOTE(review): the separator literal was lost in this copy of the script
# (the code read ``split('')``, which raises "ValueError: empty separator");
# '\x01' is assumed here -- confirm against the real data files.
FIELD_SEP = '\x01'

with open('1.txt', 'r', encoding='utf-8') as f, \
        open('对对对队_addr_parsing_crf.txt', 'w', encoding='utf-8') as f_out:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the unpacking below.
            continue
        # Each record holds an id and the address text.
        id_, info = line.split(FIELD_SEP)
        # Only the tag string (last element of the result) is written out.
        tags = parse_address(info)[-1]
        # NOTE(review): the record terminator was a single space in this copy,
        # which would put all records on one line; '\n' is assumed.
        f_out.write(id_ + FIELD_SEP + info + FIELD_SEP + tags + '\n')

# 基于CRF模型的中文地址要素解析