# 地址解析模型:使用决策树进行地址要素识别
import codecs
import os
import pickle
import sys

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# 定义函数,读取数据文件
def read_data_file(data_file, sep='\x01'):
    """Read a delimited data file into parallel lists of items and labels.

    Every non-blank line is stripped and split on ``sep``. The second field
    is collected as the data item and the first field as its label; lines
    with fewer than two fields are skipped.

    NOTE(review): the original code called ``str.split('')``, which always
    raises ``ValueError: empty separator``, so the real delimiter appears to
    have been lost. The default used here (the 0x01 control character) is an
    assumption -- confirm it against the actual data files, or pass ``sep``
    explicitly.

    Args:
        data_file: Path of the UTF-8 encoded file to read.
        sep: Field delimiter used on each line.

    Returns:
        A ``(data, labels)`` tuple of two equally long lists of strings.
    """
    data = []
    labels = []
    with codecs.open(data_file, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                continue
            parts = record.split(sep)
            if len(parts) < 2:
                # Not enough fields to form an (item, label) pair.
                continue
            labels.append(parts[0])
            data.append(parts[1])
    return data, labels
# 定义函数,读取训练数据文件
def read_train_file(train_file):
    """Read a training file made of blank-line separated sentences.

    Every non-blank line is stripped and split on a single space. The first
    field is taken as the label and the second field as the token; lines with
    fewer than two fields are skipped. A blank line ends the current
    sentence.

    Unlike the original implementation, leading or consecutive blank lines no
    longer produce empty sentences in the result.

    Args:
        train_file: Path of the UTF-8 encoded training file.

    Returns:
        A ``(data, labels)`` tuple. ``data`` is a list of sentences, each a
        list of tokens; ``labels`` has the same shape and holds the label of
        each token.
    """
    data = []
    labels = []
    sent_data = []
    sent_labels = []
    with codecs.open(train_file, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if record:
                parts = record.split(' ')
                if len(parts) < 2:
                    continue
                sent_labels.append(parts[0])
                sent_data.append(parts[1])
            elif sent_data:
                # Blank line: close the sentence collected so far.
                data.append(sent_data)
                labels.append(sent_labels)
                sent_data = []
                sent_labels = []
    # The file may end without a trailing blank line.
    if sent_data:
        data.append(sent_data)
        labels.append(sent_labels)
    return data, labels
# 定义函数,将数据转换为特征向量
def data2features(data):
    """Build one feature dict per element of every sequence in ``data``.

    Each feature dict holds the element itself (``'word'``) plus its left and
    right neighbours inside the same sequence (``'prev_word'`` and
    ``'next_word'``); a missing neighbour at either end is represented by the
    empty string.

    Args:
        data: Iterable of indexable sequences (lists of tokens, or plain
            strings, which are then processed character by character).

    Returns:
        A flat list of feature dicts, in sequence order and then element
        order.
    """
    features = []
    for sent in data:
        last = len(sent) - 1
        for i, word in enumerate(sent):
            features.append({
                'word': word,
                'prev_word': sent[i - 1] if i > 0 else '',
                'next_word': sent[i + 1] if i < last else '',
            })
    return features
# 定义函数,训练模型
def train(train_file, model_file):
    """Train a decision-tree tagger and pickle it to ``model_file``.

    The training sentences are read with ``read_train_file``, turned into
    per-token feature dicts with ``data2features``, vectorized with a
    ``DictVectorizer`` and fitted with a ``DecisionTreeClassifier``. The
    fitted ``(label_encoder, dict_vectorizer, clf)`` tuple is pickled.

    Args:
        train_file: Path of the training file.
        model_file: Path the pickled model tuple is written to.

    Raises:
        ValueError: If the training file yields no labelled tokens.
    """
    # Load the training sentences and their per-token labels.
    data, labels = read_train_file(train_file)

    # One feature dict per token, in sentence order.
    features = data2features(data)

    # Flatten the per-sentence labels so they line up one-to-one with the
    # flat feature list.
    flat_labels = [label for sent_labels in labels for label in sent_labels]
    if not flat_labels:
        raise ValueError(
            'no labelled tokens found in {}'.format(train_file))

    # Encode the string labels as integers.
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(flat_labels)

    # Turn the feature dicts into a sparse matrix.
    dict_vectorizer = DictVectorizer()
    features_sparse = dict_vectorizer.fit_transform(features)

    # Fit the decision tree.
    clf = DecisionTreeClassifier()
    clf.fit(features_sparse, labels_encoded)

    # Persist everything needed at prediction time as a single tuple.
    with open(model_file, 'wb') as f:
        pickle.dump((label_encoder, dict_vectorizer, clf), f)
# 定义函数,使用模型进行预测
def predict(data_file, model_file, result_file, sep='\x01'):
    """Label every record of ``data_file`` and write the result file.

    Records are read with ``read_data_file`` and turned into features with
    ``data2features``, which emits one feature dict per element of each
    record. The model therefore returns one label per element, not one per
    record. Each output line holds the 1-based record number, the record and
    the labels of that record's elements.

    The original code wrote ``labels[i]`` for record ``i``, which is only
    aligned when every record has exactly one element. Labels are now sliced
    per record, which gives the same output for one-element records.

    NOTE(review): the original format string had no delimiter between the
    three fields and was broken across two source lines. The 0x01 default
    for ``sep`` and the single space used to join a record's labels are
    assumptions -- confirm against the expected result format.

    Args:
        data_file: Path of the file with the records to label.
        model_file: Path of the pickled ``(label_encoder, dict_vectorizer,
            clf)`` tuple.
        result_file: Path of the UTF-8 result file to write.
        sep: Delimiter written between the fields of each output line.
    """
    # Load the records; the first-field values are not needed here.
    data, _ = read_data_file(data_file)

    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_file, 'rb') as f:
        label_encoder, dict_vectorizer, clf = pickle.load(f)

    features = data2features(data)
    if features:
        features_sparse = dict_vectorizer.transform(features)
        labels = label_encoder.inverse_transform(clf.predict(features_sparse))
    else:
        # Nothing to predict; still produce an (empty) result file.
        labels = []

    with codecs.open(result_file, 'w', encoding='utf-8') as f:
        offset = 0
        for index, record in enumerate(data, start=1):
            # Labels belonging to the elements of this record.
            record_labels = labels[offset:offset + len(record)]
            offset += len(record)
            f.write('{}{}{}{}{}\n'.format(
                index, sep, record, sep,
                ' '.join(str(label) for label in record_labels)))
# 定义函数,评估模型
def evaluate(data_file, model_file):
    """Print a classification report for the model on ``data_file``.

    The records and their first-field labels are read with
    ``read_data_file``. The records are labelled with the pickled model and
    the predictions are compared with those labels using
    ``classification_report``.

    Args:
        data_file: Path of the labelled evaluation file.
        model_file: Path of the pickled ``(label_encoder, dict_vectorizer,
            clf)`` tuple.

    Raises:
        ValueError: If the number of predictions differs from the number of
            labels read from the file.
    """
    # Load the records together with their reference labels.
    data, labels = read_data_file(data_file)

    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_file, 'rb') as f:
        label_encoder, dict_vectorizer, clf = pickle.load(f)

    # Vectorize the records and predict their labels.
    features = data2features(data)
    features_sparse = dict_vectorizer.transform(features)
    labels_predicted = label_encoder.inverse_transform(
        clf.predict(features_sparse))

    # data2features emits one feature dict per element of each record, so
    # the counts only match when every record has exactly one element.
    if len(labels) != len(labels_predicted):
        raise ValueError(
            'got {} predictions for {} reference labels; each record must '
            'contain exactly one element'.format(
                len(labels_predicted), len(labels)))

    print(classification_report(labels, labels_predicted))
# 定义函数,解析地址要素
def _merge_address_elements(tokens, labels):
    """Merge per-token labels into ``(text, element_type)`` tuples.

    Labels are either ``'O'`` or ``'<tag>-<element_type>'``. ``B`` opens a
    span, ``I`` extends the open span (or opens one if none is open), and
    ``E`` extends and closes it (or emits the token alone if none is open).
    ``'O'`` tokens and tokens with any other tag are emitted on their own
    after closing the open span.
    """
    results = []
    current = None  # [text, element_type] of the span being built, if any
    for token, label in zip(tokens, labels):
        if label == 'O':
            if current:
                results.append((current[0], current[1]))
                current = None
            results.append((token, 'O'))
            continue
        # partition keeps any further dashes inside the element type and
        # does not fail on labels that contain no dash at all.
        tag, _, kind = label.partition('-')
        if tag == 'B':
            if current:
                results.append((current[0], current[1]))
            current = [token, kind]
        elif tag == 'I':
            if current:
                current[0] += token
            else:
                current = [token, kind]
        elif tag == 'E':
            if current:
                current[0] += token
                results.append((current[0], current[1]))
                current = None
            else:
                results.append((token, kind))
        else:
            if current:
                results.append((current[0], current[1]))
                current = None
            # The original dropped the token in this branch; keep it so no
            # input text is lost from the parse result.
            results.append((token, kind))
    # Emit a span that is still open at the end of the input.
    if current:
        results.append((current[0], current[1]))
    return results


def parse_address(data_file, model_file, result_file, sep='\x01'):
    """Label ``data_file``, write the result file and print merged elements.

    Records are read with ``read_data_file`` and turned into features with
    ``data2features``, which emits one feature dict per element of each
    record, so the model returns one label per element. The labels are
    written to ``result_file`` (record number, record, labels of that
    record) and merged into ``(text, element_type)`` tuples that are printed
    to standard output.

    The original code paired ``data[i]`` with ``labels[i]``, which is only
    aligned when every record has exactly one element. Elements and labels
    are now paired one-to-one, which gives the same result for one-element
    records.

    NOTE(review): the original format string had no delimiter between the
    three fields and was broken across two source lines. The 0x01 default
    for ``sep`` and the single space used to join a record's labels are
    assumptions -- confirm against the expected result format.

    Args:
        data_file: Path of the file with the records to parse.
        model_file: Path of the pickled ``(label_encoder, dict_vectorizer,
            clf)`` tuple.
        result_file: Path of the UTF-8 result file to write.
        sep: Delimiter written between the fields of each output line.
    """
    # Load the records; the first-field values are not needed here.
    data, _ = read_data_file(data_file)

    # NOTE: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_file, 'rb') as f:
        label_encoder, dict_vectorizer, clf = pickle.load(f)

    features = data2features(data)
    if features:
        features_sparse = dict_vectorizer.transform(features)
        labels = label_encoder.inverse_transform(clf.predict(features_sparse))
    else:
        # Nothing to predict; still produce an (empty) result file.
        labels = []

    # Flatten the records in the same order data2features walks them, so
    # that every element sits next to its own label.
    tokens = [token for record in data for token in record]
    results = _merge_address_elements(tokens, labels)

    # Write the per-record labels to the result file.
    with codecs.open(result_file, 'w', encoding='utf-8') as f:
        offset = 0
        for index, record in enumerate(data, start=1):
            record_labels = labels[offset:offset + len(record)]
            offset += len(record)
            f.write('{}{}{}{}{}\n'.format(
                index, sep, record, sep,
                ' '.join(str(label) for label in record_labels)))

    # Echo the merged address elements to the console.
    for result in results:
        print(result)
# 解析地址要素
# Script entry: label the records in '3.txt' with the pickled model and write
# the result file (the merged elements are printed by parse_address itself).
parse_address('3.txt', 'addr_parsing_model.pkl', '对对对队_addr_parsing_runid.txt')
# 原文地址: https://www.cveoy.top/t/topic/joLv 著作权归作者所有。请勿转载和采集!