作为一名大三的大数据专业学生,你现在有一个基于 Python 的软文浏览量预测机器学习项目。我需要你查看以下代码,帮我把代码重新排列并加上中文注释。import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestR…
导入必要的库
# Third-party dependencies: data handling, plotting, and the scikit-learn
# estimators / utilities used by the script.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
读取数据
# Load the article dataset from the CSV file and preview its first ten rows.
df_ads = pd.read_csv('易速鲜花微信软文.csv')
print(df_ads.head(n=10))
数据清洗
# Report how many NaN values each column holds, then discard every row that
# contains at least one NaN and print the cleaned frame.
nan_counts = df_ads.isna().sum()  # per-column NaN count
print(nan_counts)
df_ads = df_ads.dropna(axis=0)  # drop rows with any NaN
print(df_ads)
数据分析
# Draw views against likes as red dots to inspect their relationship.
likes = df_ads['点赞数']
views = df_ads['浏览量']
plt.plot(likes, views, 'r.', label='Training data')
plt.xlabel('点赞数')
plt.ylabel('浏览量')
plt.legend()
plt.show()
# Box plot of views grouped by heat index.
data = df_ads[['浏览量', '热度指数']]  # views and heat index only
# sns.boxplot returns the matplotlib Axes it drew on (kept under the name `fig`).
fig = sns.boxplot(data=data, x='热度指数', y="浏览量")
fig.axis(ymin=0, ymax=800000)  # fix the y-axis range
plt.show()
特征工程
# Single-feature setup: likes as the only predictor, views as the target.
feature_cols = ['点赞数']
X = df_ads[feature_cols]  # 2-D feature frame
y = df_ads['浏览量']  # 1-D target series
数据拆分
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = splits
随机森林模型构建、预测和评估
# Fit a random-forest regressor on the training split and print its test-set
# mean squared error and R^2 score.
# The forest is seeded: an unseeded RandomForestRegressor draws different
# bootstrap samples on every run, so the printed metrics would not be
# reproducible even though the train/test split is fixed.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('随机森林模型评估:')
print('均方误差:%.2f' % mean_squared_error(y_test, y_pred))
print('R2得分:%.2f' % model.score(X_test, y_test))
决策树模型构建、预测和评估
# Fit a decision-tree regressor on the training split and print its test-set
# mean squared error and R^2 score.
model = DecisionTreeRegressor().fit(X_train, y_train)  # fit() returns the estimator
y_pred = model.predict(X_test)
tree_mse = mean_squared_error(y_test, y_pred)
tree_r2 = model.score(X_test, y_test)
print('决策树模型评估:')
print('均方误差:%.2f' % tree_mse)
print('R2得分:%.2f' % tree_r2)
线性回归模型构建、预测和评估
# Fit an ordinary least-squares linear regression on the training split and
# print its test-set mean squared error and R^2 score.
model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
y_pred = model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred)
linear_r2 = model.score(X_test, y_test)
print('线性回归模型评估:')
print('均方误差:%.2f' % linear_mse)
print('R2得分:%.2f' % linear_r2)
支持向量机模型构建、预测和评估
# Fit an RBF-kernel support vector regressor, collect its test predictions
# next to the true values, and print the R^2 score on both splits.
# NOTE(review): the features are passed in unscaled; SVR is sensitive to
# feature scale, so standardising X first may be worth trying - confirm.
model_svr = SVR(kernel='rbf', C=1e3, gamma=0.1).fit(X_train, y_train)
y_pred_svr = model_svr.predict(X_test)

# Test features alongside the true and predicted view counts.
df_ads_pred_svr = X_test.copy()
df_ads_pred_svr['浏览量真值'] = y_test
df_ads_pred_svr['浏览量预测值'] = y_pred_svr

svr_test_score = model_svr.score(X_test, y_test)
svr_train_score = model_svr.score(X_train, y_train)
print("支持向量机预测集评分:", svr_test_score)
print("支持向量机训练集评分:", svr_train_score)
神经网络模型构建、预测和评估
# Fit a three-hidden-layer MLP regressor, collect its test predictions next
# to the true values, and print the R^2 score on both splits.
# NOTE(review): the features are passed in unscaled; MLPs are sensitive to
# feature scale, so standardising X first may be worth trying - confirm.
model_mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50, 10),
    max_iter=1000,
    alpha=0.001,
    solver='adam',
    verbose=0,
    random_state=21,
).fit(X_train, y_train)
y_pred_mlp = model_mlp.predict(X_test)

# Test features alongside the true and predicted view counts.
df_ads_pred_mlp = X_test.copy()
df_ads_pred_mlp['浏览量真值'] = y_test
df_ads_pred_mlp['浏览量预测值'] = y_pred_mlp

mlp_test_score = model_mlp.score(X_test, y_test)
mlp_train_score = model_mlp.score(X_train, y_train)
print("神经网络预测集评分:", mlp_test_score)
print("神经网络训练集评分:", mlp_train_score)
散点图
# Scatter plot of views against likes.
likes = df_ads['点赞数']
views = df_ads['浏览量']
plt.scatter(likes, views, label='Training data')
plt.xlabel('点赞数')
plt.ylabel('浏览量')
plt.legend()
plt.show()
直方图
# Histogram of the view counts with a kernel-density curve overlaid.
sns.histplot(data=df_ads, x='浏览量', kde=True)
plt.xlabel('浏览量')
plt.ylabel('频数')
plt.show()
热力图
# Annotated heat map of the pairwise correlations between numeric columns.
# Only numeric columns are correlated: recent pandas versions raise when
# DataFrame.corr() meets a non-numeric column, and the pie-chart steps later
# in this script add categorical columns to df_ads, so re-running this step
# afterwards would otherwise fail.
numeric_cols = df_ads.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
饼图1
# Pie chart of how the articles are distributed across like-count ranges.
labels = ['0-1000', '1000-3000', '3000-5000', '5000-10000', '>10000']
# include_lowest=True puts a value of exactly 0 into the first bin; with the
# default right-closed bins it would fall outside (0, 1000], become NaN and
# silently disappear from the chart.
df_ads['点赞数分布'] = pd.cut(
    df_ads['点赞数'],
    bins=[0, 1000, 3000, 5000, 10000, np.inf],
    labels=labels,
    include_lowest=True,
)
# observed=False keeps every labelled range in the result (count 0 if empty)
# and states the grouping behaviour explicitly for categorical keys.
df_pie = (
    df_ads.groupby('点赞数分布', observed=False)
    .size()
    .reset_index(name='counts')
)
fig1, ax1 = plt.subplots()
ax1.pie(df_pie['counts'], labels=df_pie['点赞数分布'], autopct='%1.1f%%')
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('点赞数分布')
plt.show()
饼图2
# Pie chart of how the articles are distributed across heat-index ranges.
labels = ['0-20', '20-40', '40-60', '60-80', '>80']
# include_lowest=True puts a value of exactly 0 into the first bin; with the
# default right-closed bins it would fall outside (0, 20], become NaN and
# silently disappear from the chart.
df_ads['热度指数分布'] = pd.cut(
    df_ads['热度指数'],
    bins=[0, 20, 40, 60, 80, np.inf],
    labels=labels,
    include_lowest=True,
)
# observed=False keeps every labelled range in the result (count 0 if empty)
# and states the grouping behaviour explicitly for categorical keys.
df_pie = (
    df_ads.groupby('热度指数分布', observed=False)
    .size()
    .reset_index(name='counts')
)
fig2, ax2 = plt.subplots()
ax2.pie(df_pie['counts'], labels=df_pie['热度指数分布'], autopct='%1.1f%%')
ax2.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('热度指数分布')
plt.show()
添加代码,分析点赞数、热度指数、浏览量之间的相关性
# Pairwise scatter/histogram grid for likes, heat index and views.
pair_cols = ['点赞数', '热度指数', '浏览量']
pair_frame = df_ads[pair_cols]
sns.pairplot(pair_frame)
plt.show()
特征工程,将热度指数也加入到特征中
# Two-feature setup: likes and heat index as predictors, views as the target.
# NOTE(review): the visible code does not re-run train_test_split after this
# reassignment, so X_train / X_test still hold the one-feature split until a
# new split is made.
feature_cols = ['点赞数', '热度指数']
X = df_ads[feature_cols]
y = df_ads['浏览量']
模型构建
添加代码,使用其他模型构建预测模型,如岭回归、Lasso回归
原文地址: http://www.cveoy.top/t/topic/hfFl 著作权归作者所有。请勿转载和采集!