import jieba from sklearn_crfsuite import CRF from sklearn_crfsuite.metrics import flat_classification_report from sklearn.model_selection import train_test_split

# 读取训练数据

# Load the whole CoNLL training file into memory, one string per physical
# line (line endings are kept, exactly as readlines() would return them).
with open('train.conll', 'r', encoding='utf-8') as f:
    lines = list(f)

# 初始化训练集和标签集

# Two independent accumulators: the parsed tokens and their labels.
train_data, train_labels = [], []

# 遍历每一行数据

# Group the "token label" rows into sentences; a blank line ends a sentence.
# Each entry appended to train_data is one sentence (a list of tokens) and the
# matching entry of train_labels is that sentence's list of labels, so both
# lists stay aligned sequence-by-sequence for the split and for CRF training.
#
# Tokens are kept exactly as they appear in the file: re-segmenting a token
# with jieba could change the token count and break the one-to-one
# token/label alignment.
# NOTE(review): parse_address() tokenises its input with jieba, so the
# granularity at inference time may differ from the training file -- confirm.
sentence_tokens = []
sentence_labels = []
for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line:
        # Blank line: close the current sentence. Consecutive blank lines
        # must not produce empty sentences.
        if sentence_tokens:
            train_data.append(sentence_tokens)
            train_labels.append(sentence_labels)
            sentence_tokens, sentence_labels = [], []
        continue

    # split() without an argument accepts tabs and repeated spaces as well.
    parts = line.split()
    if len(parts) < 2:
        raise ValueError(
            'train.conll line {}: expected "<token> <label>", got {!r}'.format(
                line_number, line
            )
        )
    word, label = parts[0], parts[1]
    sentence_tokens.append(word)
    sentence_labels.append(label)

# The file may end without a trailing blank line; keep the last sentence.
if sentence_tokens:
    train_data.append(sentence_tokens)
    train_labels.append(sentence_labels)

# 划分训练集和测试集

# Hold out 20% of the samples for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# 定义特征函数，用于将词转换为特征

# Affixes tested with str.startswith / str.endswith on every token.
# NOTE(review): these look like pieces of the tag names (B/M/E/S, prov, city,
# ...) rather than affixes of address words -- confirm they are intended.
_AFFIX_PREFIXES = ('B', 'M', 'E', 'S')
_AFFIX_SUFFIXES = (
    'prov', 'city', 'district', 'town', 'roadno', 'houseno',
    'cellno', 'floorno', 'poi', 'devzone', 'community',
)


def _token_features(token, position=''):
    """Return the per-token features, with every key prefixed by *position*."""
    return {
        position + 'word': token,
        position + 'word_len': len(token),
        position + 'word_isdigit': token.isdigit(),
        position + 'word_isalpha': token.isalpha(),
        position + 'word_isalnum': token.isalnum(),
        # Nested dicts keyed by affix: a bare list of booleans has no string
        # keys, so it cannot be converted into CRFsuite attributes.
        position + 'word_prefix': {
            prefix: token.startswith(prefix) for prefix in _AFFIX_PREFIXES
        },
        position + 'word_suffix': {
            suffix: token.endswith(suffix) for suffix in _AFFIX_SUFFIXES
        },
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at index *i* of *sent*.

    The features describe the token itself plus its left ('-1:' keys) and
    right ('+1:' keys) neighbours. A missing neighbour is signalled with the
    'BOS' (first token) or 'EOS' (last token) flag instead.

    Args:
        sent: Sequence of string tokens making up one sentence.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to str, number, bool, or (for the affix
        features) a dict of ``{affix: bool}``.

    Raises:
        IndexError: If *i* is outside *sent*.
    """
    features = {'bias': 1.0}
    features.update(_token_features(sent[i]))

    if i > 0:
        features.update(_token_features(sent[i - 1], '-1:'))
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        features.update(_token_features(sent[i + 1], '+1:'))
    else:
        features['EOS'] = True

    return features

# 定义特征函数，将整个句子转换为特征序列

def sent2features(sent):
    """Return the list of word2features() dicts for every position in *sent*."""
    feature_sequence = []
    for position in range(len(sent)):
        feature_sequence.append(word2features(sent, position))
    return feature_sequence

# 将训练集和测试集转换为特征序列

# Replace every sentence in both splits by its sequence of feature dicts.
X_train = list(map(sent2features, X_train))
X_test = list(map(sent2features, X_test))

# 训练CRF模型

# Configure the CRF tagger (L-BFGS training, c1/c2 regularisation
# coefficients, capped at 100 iterations) and fit it on the training split.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# 预测测试集

# Tag the held-out feature sequences with the fitted model; y_pred is
# compared against y_test in the classification report.
y_pred = crf.predict(X_test)

# 输出分类报告

# Build the classification report for the held-out split and print it.
report = flat_classification_report(y_test, y_pred)
print(report)

# 使用训练好的模型解析地址

# Municipality names that are reported as both province and city.
_MUNICIPALITIES = ('北京', '上海', '天津', '重庆')


def parse_address(address):
    """Split a raw address string into its components with the trained CRF.

    The address is segmented with jieba, tagged by the module-level ``crf``
    model, and the tagged tokens are stitched back together per component.
    Only the ``B-``, ``I-`` and ``E-`` tag positions are interpreted; any
    other label is ignored.

    Args:
        address: The address text to parse.

    Returns:
        A 10-tuple ``(province, city, district, street, community, devzone,
        houseno, cellno, floorno, poi_tags)``. Components that were not
        recognised are empty strings. ``poi_tags`` is the space-joined
        sequence of POI labels that were accepted (the labels, not the POI
        text).
    """
    words = list(jieba.cut(address))
    labels = crf.predict([sent2features(words)])[0]

    province = city = district = street = ''
    houseno = cellno = floorno = devzone = community = ''
    tags = []
    # True right after the municipality branch appended '市' to the city, so
    # that a following '市' continuation token is not appended a second time.
    city_suffix_added = False

    for word, label in zip(words, labels):
        # Province. 'I-prov' is handled like 'E-prov' so that the middle
        # tokens of a multi-token province are not lost.
        if label == 'B-prov':
            province = word
        elif label in ('I-prov', 'E-prov'):
            province += word

        # City. A municipality is both the province and the city.
        # NOTE(review): city, district and town are only accepted when their
        # parent component was already found, so an address without a
        # province yields no city/district/street -- confirm this is wanted.
        elif label == 'B-city' and word in _MUNICIPALITIES:
            province = word + '市'
            city = word + '市'
            city_suffix_added = True
        elif label == 'B-city' and province:
            city = word
            city_suffix_added = False
        elif label in ('I-city', 'E-city') and city:
            if not (city_suffix_added and word == '市'):
                city += word
            city_suffix_added = False

        # District.
        elif label == 'B-district' and city:
            district = word
        elif label in ('I-district', 'E-district') and district:
            district += word

        # Town, reported as the street.
        elif label == 'B-town' and district:
            street = word
        elif label in ('I-town', 'E-town') and street:
            street += word

        # Road number and house number.
        # NOTE(review): both write to ``houseno`` (the result tuple has no
        # separate road-number slot), so a house number replaces a road
        # number seen earlier -- confirm.
        elif label in ('B-roadno', 'B-houseno'):
            houseno = word
        elif label in ('I-roadno', 'E-roadno', 'I-houseno', 'E-houseno') and houseno:
            houseno += word

        # Cell (unit) number.
        elif label == 'B-cellno':
            cellno = word
        elif label in ('I-cellno', 'E-cellno') and cellno:
            cellno += word

        # Floor number.
        elif label == 'B-floorno':
            floorno = word
        elif label in ('I-floorno', 'E-floorno') and floorno:
            floorno += word

        # POI: only the tag sequence is recorded. A continuation tag is
        # accepted while the span is still open (previous tag is B- or I-),
        # so spans longer than two tokens are recorded completely.
        elif label == 'B-poi':
            tags.append('B-poi')
        elif label in ('I-poi', 'E-poi') and tags and tags[-1] in ('B-poi', 'I-poi'):
            tags.append(label)

        # Community.
        elif label == 'B-community':
            community = word
        elif label in ('I-community', 'E-community') and community:
            community += word

        # Development zone.
        elif label == 'B-devzone':
            devzone = word
        elif label in ('I-devzone', 'E-devzone') and devzone:
            devzone += word

    return (
        province, city, district, street, community, devzone,
        houseno, cellno, floorno, ' '.join(tags),
    )

# 基于CRF模型的地址解析算法