import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Concatenate, Dense, Dropout, Embedding, Input
from tensorflow.keras.metrics import AUC
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
Load data
data = pd.read_csv('data.csv')
Preprocess data
data['Title'] = data['Title'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x)) data['Description'] = data['Description'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x)) data['Title'] = data['Title'].apply(lambda x: x.lower()) data['Description'] = data['Description'].apply(lambda x: x.lower())
Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
Preprocess input features
title_encoder = LabelEncoder() description_encoder = LabelEncoder() title_encoder.fit(train_data['Title']) description_encoder.fit(train_data['Description']) train_data['Title'] = title_encoder.transform(train_data['Title']) train_data['Description'] = description_encoder.transform(train_data['Description']) test_data['Title'] = title_encoder.transform(test_data['Title']) test_data['Description'] = description_encoder.transform(test_data['Description'])
Scale input features
scaler = MinMaxScaler() train_data[['Title', 'Description']] = scaler.fit_transform(train_data[['Title', 'Description']]) test_data[['Title', 'Description']] = scaler.transform(test_data[['Title', 'Description']])
Preprocess target variable
train_data['Target'] = train_data['Target'].apply(lambda x: 1 if x == 'Yes' else 0) test_data['Target'] = test_data['Target'].apply(lambda x: 1 if x == 'Yes' else 0)
Define the model
input_title = Input(shape=(1,)) input_description = Input(shape=(1,))
embedding_dim = 32 vocab_size_title = len(title_encoder.classes_) vocab_size_description = len(description_encoder.classes_)
embedding_title = Embedding(vocab_size_title, embedding_dim)(input_title) embedding_description = Embedding(vocab_size_description, embedding_dim)(input_description)
flatten_title = layers.Flatten()(embedding_title) flatten_description = layers.Flatten()(embedding_description)
concat = Concatenate()([flatten_title, flatten_description]) dropout = Dropout(0.2)(concat) dense1 = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(dropout) output = Dense(1, activation='sigmoid')(dense1)
model = keras.Model(inputs=[input_title, input_description], outputs=output)
Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss=binary_crossentropy, metrics=[AUC()])
Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
Train the model
history = model.fit( x=[train_data['Title'], train_data['Description']], y=train_data['Target'], batch_size=32, epochs=50, validation_split=0.2, callbacks=[early_stopping] )
Evaluate the model
loss, auc = model.evaluate([test_data['Title'], test_data['Description']], test_data['Target']) print(f'Test loss: {loss}') print(f'Test AUC: {auc}'
# Source: https://www.cveoy.top/t/topic/hYF3 — all rights reserved by the
# original author; do not repost or scrape.