基于BiLSTM-CRF的中文命名实体识别

import os
import sys
import codecs
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from keras.callbacks import ModelCheckpoint


# 定义数据读取函数
def read_data(file_path):
    """Read a CoNLL-style file holding one ``token label`` pair per line.

    Sentences are separated by blank lines. A trailing sentence that is not
    followed by a blank line is still returned.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A ``(data, labels)`` tuple of parallel lists: ``data[i]`` is the list
        of tokens of sentence ``i`` and ``labels[i]`` the list of its tags.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            whitespace-separated fields.
    """
    data = []
    labels = []
    sentence = []
    sentence_labels = []
    with codecs.open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            fields = line.split()
            if not fields:
                # codecs.open does no newline translation, so a separator may
                # be '\n', '\r\n' or whitespace only; all of them end a sentence.
                if sentence:
                    data.append(sentence)
                    labels.append(sentence_labels)
                    sentence = []
                    sentence_labels = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    '{}:{}: expected "token label", got {!r}'.format(
                        file_path, line_no, line))
            word, label = fields
            sentence.append(word)
            sentence_labels.append(label)
    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)
        labels.append(sentence_labels)
    return data, labels


# 定义特征提取函数
def feature_extract(data, word2id, label2id, max_len, labels=None):
    """Convert tokenised sentences and their tags into padded id matrices.

    Args:
        data: List of sentences, each a list of tokens.
        word2id: Mapping from token to integer id. Tokens missing from the
            mapping are encoded as 0.
        label2id: Mapping from tag to integer id.
        max_len: Number of columns of the returned matrices. Shorter
            sentences are right-padded with 0, longer ones are truncated.
        labels: List of tag sequences parallel to ``data``. When omitted, the
            module-level name ``labels`` is used, which is how callers that
            pass only four arguments supply the tags.

    Returns:
        A tuple ``(X, y)`` of integer arrays, both of shape
        ``(len(data), max_len)``.

    Raises:
        ValueError: If no tag sequences are available, or their number does
            not match the number of sentences.
        KeyError: If a tag is missing from ``label2id``.
    """
    if labels is None:
        # Legacy call style: the tags come from the module namespace.
        labels = globals().get('labels')
        if labels is None:
            raise ValueError(
                'no tag sequences given: pass labels=... to feature_extract')
    labels = list(labels)
    if len(labels) != len(data):
        raise ValueError('got {} tag sequences for {} sentences'.format(
            len(labels), len(data)))

    # Pre-sized matrices keep the result rectangular even for empty input or
    # sentences longer than max_len.
    X = np.zeros((len(data), max_len), dtype=int)
    y = np.zeros((len(data), max_len), dtype=int)
    for row, (sentence, tags) in enumerate(zip(data, labels)):
        word_ids = [word2id.get(word, 0) for word in sentence[:max_len]]
        tag_ids = [label2id[tag] for tag in tags[:max_len]]
        X[row, :len(word_ids)] = word_ids
        y[row, :len(tag_ids)] = tag_ids
    return X, y


# 定义模型结构
def build_model(input_dim, output_dim, input_length, embedding_dim=128, lstm_units=64):
    """Build and compile the Embedding -> BiLSTM -> Dense -> CRF tagger.

    Args:
        input_dim: Size of the embedding vocabulary.
        output_dim: Number of output units of the CRF layer (tag ids).
        input_length: Fixed length of the input id sequences.
        embedding_dim: Size of each embedding vector.
        lstm_units: Units of each LSTM direction and of the per-timestep
            Dense layer.

    Returns:
        The compiled Keras model. Its summary is printed as a side effect.
    """
    token_ids = Input(shape=(input_length,))
    embedded = Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_length=input_length,
    )(token_ids)
    encoded = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedded)
    hidden = TimeDistributed(Dense(units=lstm_units, activation='relu'))(encoded)
    # The CRF constructor's first parameter is `units`, not `output_dim`;
    # passing it positionally avoids the unknown-keyword error.
    crf = CRF(output_dim, sparse_target=True)
    tags = crf(hidden)
    model = Model(token_ids, tags)
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()
    return model


# 定义训练函数
def train(X_train, y_train, X_dev, y_dev, word2id, label2id, max_len, batch_size=32, epochs=10):
    """Build the tagger, fit it and save its weights.

    The best model (by validation accuracy) is checkpointed to ``model.h5``
    and the final weights are written to ``model_weights.h5``.

    Args:
        X_train: Training token-id matrix.
        y_train: Training tag ids, either ``(n, max_len)`` or
            ``(n, max_len, 1)``.
        X_dev: Validation token-id matrix.
        y_dev: Validation tag ids, same layouts as ``y_train``.
        word2id: Token-to-id mapping; only its size is used.
        label2id: Tag-to-id mapping; only its size is used.
        max_len: Sequence length the model is built for.
        batch_size: Mini-batch size passed to ``fit``.
        epochs: Number of training epochs.

    Returns:
        The trained model.
    """
    # One extra slot on top of the mapping sizes, so that id 0 fits as well
    # (presumably the padding id; the mappings themselves are built by the caller).
    input_dim = len(word2id) + 1
    output_dim = len(label2id) + 1
    model = build_model(input_dim, output_dim, max_len)

    # A CRF layer created with sparse_target=True takes integer targets with
    # a trailing singleton axis, so 2-D tag matrices are expanded here.
    y_train = np.asarray(y_train)
    if y_train.ndim == 2:
        y_train = np.expand_dims(y_train, -1)
    y_dev = np.asarray(y_dev)
    if y_dev.ndim == 2:
        y_dev = np.expand_dims(y_dev, -1)

    # Keras logs a function metric under the function's name; for the default
    # CRF settings that is `crf_viterbi_accuracy`. Monitoring `val_acc` would
    # never match a logged key, so no checkpoint would be saved.
    checkpoint = ModelCheckpoint(
        'model.h5',
        monitor='val_crf_viterbi_accuracy',
        verbose=1,
        save_best_only=True,
        mode='max',
    )
    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_dev, y_dev),
        callbacks=[checkpoint],
    )
    save_load_utils.save_all_weights(model, 'model_weights.h5')
    return model


# 定义预测函数
def predict(model, data, word2id, label2id, max_len, pad_label='O'):
    """Tag sentences with a trained model and return tag names.

    Args:
        model: Object exposing ``predict(X)`` that returns per-position class
            scores of shape ``(n, max_len, n_classes)``.
        data: List of sentences, each a list of tokens.
        word2id: Token-to-id mapping; unknown tokens are encoded as 0.
        label2id: Tag-to-id mapping used to decode the predictions.
        max_len: Input length of the model. Shorter sentences are
            right-padded with 0, longer ones are truncated.
        pad_label: Tag returned for predicted ids that are not in
            ``label2id`` (for example id 0 on padded positions).

    Returns:
        A list with one list of ``max_len`` tag names per sentence.
    """
    if not data:
        return []

    # Invert the mapping once instead of searching it for every token.
    # setdefault keeps the first tag if two tags share an id.
    id2label = {}
    for label, idx in label2id.items():
        id2label.setdefault(idx, label)

    X = np.zeros((len(data), max_len), dtype=int)
    for row, sentence in enumerate(data):
        word_ids = [word2id.get(word, 0) for word in sentence[:max_len]]
        X[row, :len(word_ids)] = word_ids

    pred_ids = np.argmax(model.predict(X), axis=-1)
    return [
        [id2label.get(int(pred), pad_label) for pred in sentence]
        for sentence in pred_ids
    ]


if __name__ == '__main__':
    # Load the training split and the split used for validation/prediction.
    train_data, train_labels = read_data('dev.conll')
    test_data, test_labels = read_data('1.txt')

    # Build the vocabularies. Ids start at 1 because 0 is used for padding
    # and unknown words. Sorting makes the ids reproducible across runs
    # (set iteration order is not), and the tag table covers both splits so
    # encoding the second split cannot hit an unseen tag.
    words = sorted({word for sentence in train_data + test_data for word in sentence})
    label_set = sorted({label for sentence in train_labels + test_labels for label in sentence})
    word2id = {word: i + 1 for i, word in enumerate(words)}
    label2id = {label: i + 1 for i, label in enumerate(label_set)}

    # Feature extraction.
    max_len = max(len(sentence) for sentence in train_data + test_data)
    # feature_extract reads the tag sequences from the module-level name
    # `labels` here, so bind it to the split being encoded before each call.
    labels = train_labels
    X_train, y_train = feature_extract(train_data, word2id, label2id, max_len)
    labels = test_labels
    X_test, y_test = feature_extract(test_data, word2id, label2id, max_len)

    # Train the model, validating on the second split.
    model = train(X_train, y_train, X_test, y_test, word2id, label2id, max_len)

    # Predict and write one line per token, with a blank line after each sentence.
    y_pred = predict(model, test_data, word2id, label2id, max_len)
    with codecs.open('对对对队_addr_parsing_runid.txt', 'w', encoding='utf-8') as f:
        for i, sentence in enumerate(test_data):
            for j, word in enumerate(sentence):
                # NOTE(review): the three fields are written with no separator
                # between them; confirm this is the expected output format.
                f.write('{}{}{}\n'.format(i+1, word, y_pred[i][j]))
            f.write('\n')

基于BiLSTM-CRF的中文命名实体识别代码解析

本代码实现了一个基于 BiLSTM-CRF 模型的中文命名实体识别系统。

1. 数据读取

函数 read_data(file_path) 用于读取数据，将每个句子和对应的标签分别存储在列表中。

2. 特征提取

函数 feature_extract(data, word2id, label2id, max_len) 将句子和标签转换成模型需要的特征格式，具体操作包括：

将词语转换成对应的词表id。
对句子进行padding，使其长度一致。
将标签转换成对应的标签表id。

3. 模型构建

函数 build_model(input_dim, output_dim, input_length, embedding_dim, lstm_units) 构建了 BiLSTM-CRF 模型，模型结构如下：

Embedding层: 将词语id转换成对应的词向量。
BiLSTM层: 使用双向LSTM提取上下文信息。
Dense层: 对BiLSTM的输出逐时间步做一次带ReLU激活的全连接变换（输出维度为 lstm_units）；到标签空间的映射由后面的CRF层完成。
CRF层: 添加CRF层，学习标签之间的依赖关系。

4. 模型训练

函数 train(X_train, y_train, X_dev, y_dev, word2id, label2id, max_len, batch_size, epochs) 使用训练集训练模型，并在验证集上进行验证。

5. 模型预测

函数 predict(model, data, word2id, label2id, max_len) 使用训练好的模型对新数据进行预测，并将预测结果转换成标签名字的形式。

总结

本代码提供了一个完整的中文命名实体识别系统，使用 BiLSTM-CRF 模型可以有效地识别文本中的实体。你可以根据自己的数据和需求对代码进行修改和优化。