import codecs
import os
import sys

import jieba
import sklearn_crfsuite
from sklearn_crfsuite import metrics
# NOTE: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer releases this line has to become `import joblib`.
from sklearn.externals import joblib

# 加载自定义词典 (Load the custom dictionary)

# Register the entries of 'dict.txt' with jieba so they are used during
# segmentation. This runs at import time, and the relative path is resolved
# against the current working directory.
jieba.load_userdict('dict.txt')

# 读取训练数据 (Read the training data)

def read_data(file_path):
    """Read a CoNLL-style file into a list of tagged sentences.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``word tag``. Blank lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    data = []
    sentence = []
    with codecs.open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Sentence boundary. Leading or repeated blank lines must not
                # produce empty sentences in the result.
                if sentence:
                    data.append(sentence)
                    sentence = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    '{}:{}: expected "word tag", got {!r}'.format(
                        file_path, line_no, line.strip()))
            sentence.append((fields[0], fields[1]))
    # The file may end without a trailing blank line.
    if sentence:
        data.append(sentence)
    return data

# 特征提取函数 (Feature extraction function)

def feature_extractor(sentence, i):
    """Build the feature dict for the token at position ``i``.

    Args:
        sentence: Indexable sequence of ``(word, tag)`` pairs; only the word
            (element 0 of each pair) is read.
        i: Index of the token to describe.

    Returns:
        A dict with a constant ``bias`` plus the text, length and ``str.is*``
        flags of the current, previous and next word. A neighbour that does
        not exist (before the first / after the last token) is reported as an
        empty word with length 0 and all flags False.
    """
    last_index = len(sentence) - 1
    current = sentence[i][0]
    # '' stands in for a missing neighbour: len('') is 0 and every str.is*()
    # predicate is False on the empty string, which is exactly the set of
    # boundary defaults this function reports.
    before = sentence[i - 1][0] if i != 0 else ''
    after = sentence[i + 1][0] if i != last_index else ''

    return {
        'bias': 1.0,
        # Current token.
        'word': current,
        'word_len': len(current),
        'word_isdigit': current.isdigit(),
        'word_isalpha': current.isalpha(),
        'word_isalnum': current.isalnum(),
        'word_isupper': current.isupper(),
        'word_istitle': current.istitle(),
        'word_islower': current.islower(),
        # Previous token.
        'prev_word': before,
        'prev_word_len': len(before),
        'prev_word_isdigit': before.isdigit(),
        'prev_word_isalpha': before.isalpha(),
        'prev_word_isalnum': before.isalnum(),
        'prev_word_isupper': before.isupper(),
        'prev_word_istitle': before.istitle(),
        'prev_word_islower': before.islower(),
        # Next token.
        'next_word': after,
        'next_word_len': len(after),
        'next_word_isdigit': after.isdigit(),
        'next_word_isalpha': after.isalpha(),
        'next_word_isalnum': after.isalnum(),
        'next_word_isupper': after.isupper(),
        'next_word_istitle': after.istitle(),
        'next_word_islower': after.islower(),
    }

# 特征提取函数，针对整个句子 (Feature extraction function for a whole sentence)

def sentence_feature_extractor(sentence):
    """Return one feature dict per token of ``sentence``, in token order.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs.

    Returns:
        A list with ``feature_extractor(sentence, i)`` for every index ``i``.
    """
    per_token = []
    for position in range(len(sentence)):
        per_token.append(feature_extractor(sentence, position))
    return per_token

# 训练模型 (Train the model)

def train_model(train_data, model_path):
    """Fit a CRF on ``train_data`` and persist it to ``model_path``.

    Args:
        train_data: List of sentences, each a list of ``(word, tag)`` pairs.
        model_path: Destination passed to ``joblib.dump`` for the fitted model.
    """
    # Inputs: one list of feature dicts per sentence.
    features = list(map(sentence_feature_extractor, train_data))
    # Targets: the tag column of every sentence.
    labels = [[tag for _word, tag in sentence] for sentence in train_data]

    settings = dict(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model = sklearn_crfsuite.CRF(**settings)
    model.fit(features, labels)
    joblib.dump(model, model_path)

# 预测标签 (Predict tags)

def predict_tags(sentence, model_path): crf = joblib.load(model_path) X_test = [sentence_feature_extractor(sentence)] y_pred = crf.predict(X_test)[0] return y_pred

# 解析地址 (Parse an address)

# Order of the values in the tuple returned by parse_address.
_ADDRESS_FIELDS = (
    'province', 'city', 'district', 'street', 'community',
    'devzone', 'houseno', 'cellno', 'floorno',
)

# (tag prefix, field) pairs; a tag is matched with str.startswith.
# NOTE(review): 'B-roadno' is stored in the same field as 'B-houseno', as in
# the original code, so a later match overwrites an earlier one. Confirm that
# this is intended.
_TAG_PREFIX_TO_FIELD = (
    ('B-prov', 'province'),
    ('B-city', 'city'),
    ('B-district', 'district'),
    ('B-town', 'street'),
    ('B-devzone', 'devzone'),
    ('B-community', 'community'),
    ('B-roadno', 'houseno'),
    ('B-houseno', 'houseno'),
    ('B-cellno', 'cellno'),
    ('B-floorno', 'floorno'),
)


def parse_address(address, model_path):
    """Split ``address`` into its address elements using the CRF model.

    The address is segmented with jieba, each word is tagged by the model at
    ``model_path``, and every word whose predicted tag starts with one of the
    known ``B-`` prefixes is stored in the matching field. If several words
    map to the same field, the last one wins.

    Args:
        address: The address string to parse.
        model_path: Path of the trained model, passed to ``predict_tags``.

    Returns:
        A 9-tuple of strings ``(province, city, district, street, community,
        devzone, houseno, cellno, floorno)``; elements not found are ``''``.
    """
    fields = dict.fromkeys(_ADDRESS_FIELDS, '')

    # jieba.cut returns a generator; materialise it so the words can be
    # paired with the predicted tags below.
    words = list(jieba.cut(address))
    if words:
        # The tag slot is only a placeholder: features are built from words.
        sentence = [(word, '') for word in words]
        tags = predict_tags(sentence, model_path)
        # Use the predicted tags. The placeholder tags in `sentence` are
        # always '' and would never match any prefix.
        for word, tag in zip(words, tags):
            for prefix, field in _TAG_PREFIX_TO_FIELD:
                if tag.startswith(prefix):
                    fields[field] = word
                    break

    return tuple(fields[name] for name in _ADDRESS_FIELDS)

# NOTE(review): the original separator literal was lost (the code read
# split('') and joined with '', and str.split('') raises ValueError).
# '\x01' is an assumption. Confirm it against the actual format of '1.txt'.
_FIELD_SEPARATOR = '\x01'


def main():
    """Train the model, then parse every address listed in '1.txt'.

    Each non-blank input line is ``id<sep>address``. One output line per
    input line is written to 'addr_parsing_result.txt', holding the id, the
    address and the nine parsed address elements joined by the separator.

    Raises:
        ValueError: If a non-blank input line contains no separator.
    """
    model_path = 'addr_crf_model.pkl'

    # Train the model.
    train_data = read_data('train.conll')
    train_model(train_data, model_path)

    # Parse the addresses.
    with open('1.txt', 'r', encoding='utf-8') as f, \
            open('addr_parsing_result.txt', 'w', encoding='utf-8') as f_out:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            id_, found, address = line.partition(_FIELD_SEPARATOR)
            if not found:
                raise ValueError(
                    "1.txt:{}: missing field separator in {!r}".format(line_no, line))
            elements = parse_address(address, model_path)
            # NOTE(review): the original record terminator was ' '; a newline
            # is assumed here so that each result sits on its own line.
            f_out.write(_FIELD_SEPARATOR.join((id_, address) + tuple(elements)) + '\n')


if __name__ == '__main__':
    main()

# 基于CRF模型的地址解析算法实现 (Implementation of a CRF-based address parsing algorithm)