基于BiLSTM-CRF的序列标注模型实现
基于BiLSTM-CRF的序列标注模型实现
本代码使用基于BiLSTM-CRF的序列标注算法,利用keras和sklearn库实现,并生成了针对测试集的分类报告,包括精确率、召回率和F1值等指标。
import os
import sys
import codecs
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from keras.callbacks import ModelCheckpoint
# 定义数据读取函数
def read_data(file_path):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Every non-blank line must contain exactly two whitespace-separated
    fields, ``token tag``.  Blank (or whitespace-only) lines separate
    sentences.  A final sentence that is not followed by a blank line is
    still returned.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, labels)`` tuple.  ``data[i]`` is the list of tokens of
        sentence ``i`` and ``labels[i]`` is the list of its tags, in file
        order.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    data = []
    labels = []
    sentence = []
    sentence_labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            fields = line.split()
            if not fields:
                # Separator line: splitting on whitespace also recognises
                # "\r\n" and space-only lines, not just a bare "\n".
                if sentence:
                    data.append(sentence)
                    labels.append(sentence_labels)
                    sentence = []
                    sentence_labels = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    '{}:{}: expected "token tag", got {!r}'.format(
                        file_path, line_no, line))
            word, label = fields
            sentence.append(word)
            sentence_labels.append(label)
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)
        labels.append(sentence_labels)
    return data, labels
# 定义特征提取函数
def _pad_ids(ids, max_len):
    """Return ``ids`` truncated or right-padded with 0 to ``max_len`` items."""
    return ids[:max_len] + [0] * (max_len - len(ids))


def feature_extract(data, word2id, label2id, max_len, labels=None):
    """Encode sentences and their tag sequences as padded id matrices.

    Args:
        data: List of sentences, each a list of tokens.
        word2id: Mapping from token to integer id.  Tokens missing from the
            mapping are encoded as 0, the same value used for padding.
        label2id: Mapping from tag to integer id.
        max_len: Number of columns of the returned matrices.  Shorter
            sequences are right-padded with 0; longer ones are truncated so
            that the result is always rectangular.
        labels: List of tag sequences parallel to ``data``.  When omitted,
            the module-level name ``labels`` is used, which is what callers
            written against the four-argument signature rely on.

    Returns:
        A ``(X, y)`` tuple of integer arrays, both of shape
        ``(len(data), max_len)``.

    Raises:
        ValueError: If no tag sequences are available, or if their number
            differs from the number of sentences.
        KeyError: If a tag is not present in ``label2id``.
    """
    if labels is None:
        # Backward compatibility: the four-argument form reads the tag
        # sequences from a module-level variable named `labels`.
        try:
            labels = globals()['labels']
        except KeyError:
            raise ValueError(
                'feature_extract() needs the tag sequences: pass `labels`')
    if len(labels) != len(data):
        raise ValueError(
            'got {} sentences but {} tag sequences'.format(
                len(data), len(labels)))

    word_rows = [
        _pad_ids([word2id.get(word, 0) for word in sentence], max_len)
        for sentence in data
    ]
    label_rows = [
        _pad_ids([label2id[label] for label in sentence_labels], max_len)
        for sentence_labels in labels
    ]
    # The explicit reshape keeps the result 2-D even when `data` is empty.
    X = np.array(word_rows, dtype=int).reshape(len(word_rows), max_len)
    y = np.array(label_rows, dtype=int).reshape(len(label_rows), max_len)
    return X, y
# 定义模型结构
def build_model(input_dim, output_dim, input_length, embedding_dim=128, lstm_units=64):
    """Build and compile the BiLSTM-CRF sequence tagger.

    The network is: Embedding -> Bidirectional LSTM -> per-timestep Dense
    (ReLU) -> CRF.  The model is compiled with the Adam optimizer, the CRF
    layer's own loss function and accuracy metric, and its summary is
    printed before it is returned.

    Args:
        input_dim: Size of the embedding vocabulary.
        output_dim: Number of CRF output units (tag classes).
        input_length: Number of timesteps of every input sequence.
        embedding_dim: Dimension of the token embeddings.
        lstm_units: Units of each LSTM direction and of the Dense layer.

    Returns:
        The compiled Keras ``Model``.
    """
    # `inputs` rather than `input` so the builtin is not shadowed.
    inputs = Input(shape=(input_length,))
    hidden = Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_length=input_length,
    )(inputs)
    hidden = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(hidden)
    hidden = TimeDistributed(Dense(units=lstm_units, activation='relu'))(hidden)

    # keras_contrib's CRF takes its size as the first argument (`units`);
    # it has no `output_dim` keyword, so the value is passed positionally.
    crf = CRF(output_dim, sparse_target=True)
    outputs = crf(hidden)

    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()
    return model
# 定义训练函数
def _as_sparse_targets(y):
    """Return ``y`` with a trailing length-1 axis added when it is 2-D."""
    y = np.asarray(y)
    return np.expand_dims(y, -1) if y.ndim == 2 else y


def train(X_train, y_train, X_dev, y_dev, word2id, label2id, max_len, batch_size=32, epochs=10):
    """Build a BiLSTM-CRF model, fit it and save its weights.

    Args:
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training tag ids.  A 2-D array gets a trailing axis of
            length 1 before fitting.
        X_dev: Validation inputs.
        y_dev: Validation tag ids, treated like ``y_train``.
        word2id: Token vocabulary; only its size is used (plus one for id 0).
        label2id: Tag vocabulary; only its size is used (plus one for id 0).
        max_len: Sequence length the model is built for.
        batch_size: Mini-batch size for ``model.fit``.
        epochs: Number of training epochs.

    Returns:
        The fitted model.  The best checkpoint is written to ``model.h5``
        and the final weights to ``model_weights.h5``.
    """
    # +1 because id 0 is not a key of either vocabulary in this script.
    input_dim = len(word2id) + 1
    output_dim = len(label2id) + 1
    model = build_model(input_dim, output_dim, max_len)

    # build_model() creates the CRF with sparse_target=True, which expects
    # integer targets shaped (batch, timesteps, 1).
    y_train = _as_sparse_targets(y_train)
    y_dev = _as_sparse_targets(y_dev)

    # NOTE(review): 'val_acc' must match the name Keras logs for the CRF
    # accuracy metric; confirm against the installed Keras/keras_contrib.
    checkpoint = ModelCheckpoint(
        'model.h5',
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        mode='max',
    )
    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_dev, y_dev),
        callbacks=[checkpoint],
    )
    save_load_utils.save_all_weights(model, 'model_weights.h5')
    return model
# 定义预测函数
def predict(model, data, word2id, label2id, max_len, pad_label='O'):
    """Tag every sentence in ``data`` with the trained model.

    Args:
        model: Object whose ``predict(X)`` returns per-timestep class scores
            with the classes on the last axis.
        data: List of sentences, each a list of tokens.
        word2id: Mapping from token to integer id; unknown tokens become 0.
        label2id: Mapping from tag to integer id; inverted here to decode.
        max_len: Sequence length the model expects.  Shorter sentences are
            right-padded with 0 and longer ones are truncated.
        pad_label: Tag emitted for a predicted id that is not a value of
            ``label2id`` (such as the padding id 0) and for tokens beyond
            ``max_len``.

    Returns:
        A list with one tag list per sentence, each exactly as long as the
        sentence itself (padding positions are not returned).
    """
    if not data:
        return []

    # Invert the mapping once instead of searching it for every token.
    id2label = {idx: label for label, idx in label2id.items()}

    X = np.array([
        [word2id.get(word, 0) for word in sentence[:max_len]]
        + [0] * (max_len - len(sentence))
        for sentence in data
    ])
    tag_ids = np.argmax(model.predict(X), axis=-1)

    y_pred = []
    for sentence, row in zip(data, tag_ids):
        tags = [id2label.get(int(idx), pad_label) for idx in row[:len(sentence)]]
        # Tokens cut off by truncation still get a tag so output stays aligned.
        tags.extend([pad_label] * (len(sentence) - len(tags)))
        y_pred.append(tags)
    return y_pred
if __name__ == '__main__':
    # Load the annotated corpora: one "token tag" pair per line, with blank
    # lines between sentences.
    train_data, train_labels = read_data('dev.conll')
    test_data, test_labels = read_data('1.txt')

    # Build the vocabularies.  Ids start at 1 because 0 is used for padding
    # and unknown tokens.  Sorting makes the ids identical from run to run,
    # which matters because the trained weights are written to disk.
    words = {word for sentence in train_data + test_data for word in sentence}
    label_set = {label for sentence in train_labels + test_labels for label in sentence}
    word2id = {word: i + 1 for i, word in enumerate(sorted(words))}
    label2id = {label: i + 1 for i, label in enumerate(sorted(label_set))}

    # Encode both splits to a common length.
    max_len = max(len(sentence) for sentence in train_data + test_data)
    # feature_extract() is called without the tag sequences here, so it reads
    # them from the module-level name `labels`; bind that name to the split
    # being encoded before each call.
    labels = train_labels
    X_train, y_train = feature_extract(train_data, word2id, label2id, max_len)
    labels = test_labels
    X_test, y_test = feature_extract(test_data, word2id, label2id, max_len)

    # Train, using the second corpus as validation data.
    model = train(X_train, y_train, X_test, y_test, word2id, label2id, max_len)

    # Predict and write one line per token, with a blank line after every
    # sentence.  zip() stops at the shorter sequence, so tags predicted for
    # padding positions are ignored.
    y_pred = predict(model, test_data, word2id, label2id, max_len)
    with codecs.open('对对对队_addr_parsing_runid.txt', 'w', encoding='utf-8') as f:
        for i, (sentence, tags) in enumerate(zip(test_data, y_pred)):
            for word, tag in zip(sentence, tags):
                f.write('{}{}{}\n'.format(i + 1, word, tag))
            f.write('\n')

    # Token-level precision / recall / F1 on the second corpus.
    tag_pairs = [
        (gold, tag)
        for gold_tags, tags in zip(test_labels, y_pred)
        for gold, tag in zip(gold_tags, tags)
    ]
    if tag_pairs:
        print(classification_report(
            [gold for gold, _ in tag_pairs],
            [tag for _, tag in tag_pairs],
        ))
算法解释
本代码使用了基于BiLSTM-CRF的序列标注算法。
- BiLSTM (双向长短期记忆网络) 用于学习句子中每个词的上下文信息,并生成词的表示向量。
- CRF (条件随机场) 用于在BiLSTM输出的基础上进行序列标注,并考虑词之间的依赖关系。
代码解释
- 数据读取: read_data()函数读取训练集和测试集数据,并将数据转换为词序列和标签序列。
- 特征提取: feature_extract()函数将词和标签映射到数字ID,并生成模型输入。
- 模型构建: build_model()函数定义BiLSTM-CRF模型结构,包括词嵌入层、双向LSTM层、时间分布层和CRF层。
- 模型训练: train()函数使用训练集训练模型,并保存最佳模型。
- 模型预测: predict()函数使用训练好的模型对测试集进行预测。
- 结果输出: 代码将预测结果输出到文件中。
生成的报告内容
代码导入了 sklearn.metrics.classification_report,可将测试集的真实标签与预测标签传入该函数,生成针对测试集的分类报告,包含以下指标:
- 精确率 (precision): 正确预测为正类的样本数占预测为正类的样本数的比例。
- 召回率 (recall): 正确预测为正类的样本数占实际正类样本数的比例。
- F1值 (F1-score): 精确率和召回率的调和平均数,用于衡量模型的综合性能。
通过分析这些指标,可以评估模型的性能并进行进一步优化。
总结
本代码实现了一个基于BiLSTM-CRF的序列标注模型,并通过分类报告评估了模型的性能。该模型可以应用于各种序列标注任务,例如命名实体识别、词性标注等。
原文地址: https://www.cveoy.top/t/topic/fVHd 著作权归作者所有。请勿转载和采集!