TensorFlow Text Classification with Fully Connected Neural Network

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalMaxPooling1D, Embedding, LSTM, SpatialDropout1D,Flatten
from tensorflow.keras.models import Model, Sequential
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from data_load_preprocess import dataloader
from keras import backend as K

def plot_confusion_matrix(y_test, y_pred, binary=1, savename='Decisontree'):
    """Plot, save and display the confusion matrix for labels vs. predictions.

    Args:
        y_test: Ground-truth labels; forwarded to ``confusion_matrix`` as its
            first argument.
        y_pred: Predicted labels; forwarded as the second argument.
        binary: Retained for backward compatibility only. Earlier, any value
            other than 1 selected a hard-coded 8-class axis, which failed for
            every other class count. The axes are now always derived from the
            matrix, so this flag no longer affects the result.
        savename: Base name of the PNG file written to the current directory
            (``./<savename>.png``).

    Returns:
        None. The figure is written to disk and then shown.
    """
    cm = confusion_matrix(y_test, y_pred)
    n_rows, n_cols = cm.shape
    # Number the axes 0..n-1 from the matrix itself so that 2, 8 or any other
    # number of classes produces a valid frame.
    df_cm = pd.DataFrame(cm, index=range(n_rows), columns=range(n_cols))
    # Draw the annotated heatmap.
    sns.set(font_scale=1.4)  # axis label size
    sns.heatmap(df_cm, annot=True, fmt='.0f', cmap="YlGnBu", annot_kws={"size": 10})  # cell font size
    # Save before show(): the figure is written to disk first, then displayed.
    plt.savefig(f'./{savename}.png')
    plt.show()
# Determine the output class from the predicted probability values (see convert() below)

# Load the dataset, then hold out 20% of the rows for validation.
# random_state is fixed so the split is reproducible between runs.
df = dataloader()
traindata, valdata, trainlabel, vallabel = train_test_split(
    df['preprocessed'].values,
    df['labels'].values,
    test_size=0.2,
    random_state=42,
)

from sklearn.feature_extraction.text import TfidfVectorizer
# Convert the raw text into TF-IDF feature matrices.
tfidf = TfidfVectorizer()
# Fit the vectorizer on the training texts only and vectorize them ...
tfidf_train = tfidf.fit_transform(traindata)
# ... then vectorize the validation texts with that already-fitted vectorizer.
tfidf_val = tfidf.transform(valdata)


def convert(model, testdata):
    """Turn the model's predicted probabilities into hard 0/1 labels.

    Args:
        model: Any object exposing ``predict(features)``.
        testdata: Feature matrix. An object with a ``toarray()`` method
            (such as a sparse matrix) is densified first; anything else is
            handed to ``predict`` unchanged.

    Returns:
        list[int]: One entry per prediction: 1 where the prediction is
        strictly greater than 0.5, otherwise 0.
    """
    # Only densify when needed; previously a dense input raised
    # AttributeError because .toarray() was called unconditionally.
    features = testdata.toarray() if hasattr(testdata, "toarray") else testdata
    probabilities = model.predict(features)
    return [1 if p > 0.5 else 0 for p in probabilities]

# Define the custom F-1 Score calculation function for neural network
def recall_m(y_true, y_pred):
    """Recall: rounded true positives divided by rounded actual positives.

    Both inputs are clipped to [0, 1] and rounded before counting, so the
    predictions are effectively thresholded at one half.

    Args:
        y_true: Ground-truth labels (tensor).
        y_pred: Predicted probabilities (tensor).

    Returns:
        Scalar tensor holding the recall for the values passed in.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the division defined when there are no positive labels.
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    Both inputs are clipped to [0, 1] and rounded before counting, so the
    predictions are effectively thresholded at one half.

    Args:
        y_true: Ground-truth labels (tensor).
        y_pred: Predicted probabilities (tensor).

    Returns:
        Scalar tensor holding the precision for the values passed in.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    # epsilon keeps the division defined when nothing is predicted positive.
    return K.sum(hit_mask) / (K.sum(flagged_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    NOTE(review): when used as a Keras metric this is presumably evaluated
    per batch and averaged, which is not the same as the F1 of the whole
    dataset. Confirm before reporting the number.

    Args:
        y_true: Ground-truth labels (tensor).
        y_pred: Predicted probabilities (tensor).

    Returns:
        Scalar tensor holding the F1 score for the values passed in.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon guards against 0/0 when precision and recall are both zero.
    half_f1 = (p * r) / (p + r + K.epsilon())
    return 2 * half_f1

# Define the learning-rate scheduler and the training callbacks; early stopping is based on validation accuracy
def scheduler(epoch, lr):
    """Learning-rate schedule: hold for the first five epochs, then decay.

    Args:
        epoch: Index of the epoch about to run (presumably zero-based, as
            supplied by ``LearningRateScheduler`` - confirm).
        lr: Learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 5``; afterwards the Python float
        ``lr * exp(-0.1)``, i.e. roughly a 9.5% reduction per epoch.
    """
    import math  # local import: the file's import block does not provide `math`

    if epoch < 5:
        return lr
    # Return a plain float rather than a tf.Tensor: the schedule function is
    # documented to return a float, and newer Keras releases reject anything else.
    return float(lr * math.exp(-0.1))
# Training callbacks: early stopping watches 'val_accuracy' with a patience
# of 3, and the learning rate is recomputed through `scheduler` (logged
# because verbose=1).
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3),
    tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1),
]


# Fully connected classifier: four ReLU hidden layers that narrow from 128 to
# 16 units, followed by a single sigmoid unit that outputs a probability.
# The input width equals the number of TF-IDF features.
model_tfidf = Sequential([
    Dense(128, input_shape=(tfidf_train.shape[1],), activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_tfidf.summary()


# Compile: Adam optimizer, binary cross-entropy loss, and accuracy plus the
# custom F1 function as metrics.
model_tfidf.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', f1_m],
)

# Train on the densified TF-IDF matrix and validate on the held-out split
# after every epoch; the returned history feeds the plots further down.
history_tfidf = model_tfidf.fit(
    tfidf_train.toarray(),
    trainlabel,
    epochs=5,
    batch_size=8,
    validation_data=(tfidf_val.toarray(), vallabel),
    callbacks=callbacks,
)


# Evaluate on the validation split. The three values unpack as the loss
# followed by the two metrics given to compile() (accuracy, f1_m).
loss, accuracy, f1 = model_tfidf.evaluate(tfidf_val.toarray(), vallabel, verbose=0)
print('Val loss:', loss)
print('Val accuracy:', accuracy)
print('Val f1:', f1)

print('Val confusion matrix')
# True labels go first and predictions second, matching the
# (y_test, y_pred) signature of plot_confusion_matrix. The arguments used to
# be swapped, which transposed the matrix (rows showed predictions).
plot_confusion_matrix(vallabel, convert(model_tfidf, tfidf_val), binary=1, savename='TensorFlowClassifier')

# Accuracy curves: training vs. validation, taken from the fit() history.
plt.plot(history_tfidf.history['accuracy'], label='accuracy')
plt.plot(history_tfidf.history['val_accuracy'], label='val_accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='lower right')
# Write the figure to disk before displaying it.
plt.savefig('./Full_connect_Accuracy.png')
plt.show()

# Loss curves: same layout; the legend entries follow the plotting order.
plt.plot(history_tfidf.history['loss'])
plt.plot(history_tfidf.history['val_loss'])
plt.title('Loss vs. epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Training', 'Validation'], loc='upper right')
plt.savefig('./Full_connect_Loss.png')
plt.show()

代码讲解
内容：本代码是一个使用TensorFlow实现的文本分类器，采用了全连接神经网络模型。主要包含以下部分：

1. 数据预处理部分：从数据集中读取数据，按8:2的比例划分训练集和验证集，再使用TF-IDF将文本数据转化为数值化的特征数据。

2. 模型构建部分：使用Sequential()函数构建全连接神经网络模型，其中包括4个隐藏层，每个隐藏层的神经元个数分别为128、64、32、16，均采用ReLU激活函数；输出层为1个神经元，采用sigmoid激活函数。

3. 模型训练部分：编译模型，使用二元交叉熵作为损失函数，使用Adam优化器进行模型训练，同时监控模型在验证集上的准确率和F1-score，并设置EarlyStopping和LearningRateScheduler回调函数。

4. 模型评估部分：在验证集上评估模型性能，计算损失、准确率、F1-score，并绘制混淆矩阵和学习曲线图。

5. 自定义函数部分：包括绘制混淆矩阵函数plot_confusion_matrix()和自定义F1-score计算函数recall_m()、precision_m()和f1_m()。

6. 其他部分：导入必要的库和模块，如TensorFlow、sklearn、pandas、matplotlib等。

该代码实现了一个简单的文本分类器，可以用于对文本数据进行情感分析、主题分类等任务。同时，该代码也可以作为初学者学习TensorFlow的参考。