你是一名大三的大数据专业学生,现在有一个基于 Python 的软文浏览量预测机器学习项目。我需要你阅读以下代码,帮我把代码重新排列并加上中文注释,然后把所有代码完整地输出给我。import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.ensemble import ra…
# Import the required libraries (one statement per line, sorted).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
# Load the dataset from the CSV file and preview its first 10 rows.
df_ads = pd.read_csv('易速鲜花微信软文.csv')
print(df_ads.head(10))
# Data cleaning: report the number of NaN values per column, then drop
# every row that contains at least one NaN.
nan_counts = df_ads.isna().sum()  # computed once and reused for the report
print(nan_counts)
df_ads = df_ads.dropna()
print(df_ads)
# Exploratory analysis.

# Likes ('点赞数') against views ('浏览量'), drawn as red dots.
plt.plot(df_ads['点赞数'], df_ads['浏览量'], 'r.', label='Training data')
plt.xlabel('点赞数')
plt.ylabel('浏览量')
plt.legend()
plt.show()

# Box plot of views grouped by heat index ('热度指数'), drawn with seaborn.
data = pd.concat([df_ads['浏览量'], df_ads['热度指数']], axis=1)
fig = sns.boxplot(x='热度指数', y="浏览量", data=data)
fig.axis(ymin=0, ymax=800000)  # fix the y-axis range
plt.show()
# Feature engineering: likes is the only feature, views is the target.
X = df_ads[['点赞数']]
y = df_ads['浏览量']

# Split the data: hold out 20% of the rows as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Random forest: build and fit the model.
# random_state is pinned so the reported metrics are reproducible between
# runs (the train/test split above uses a fixed seed as well).
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate: mean squared error and R2 on the test set.
print('随机森林模型评估:')
print('均方误差:%.2f' % mean_squared_error(y_test, y_pred))
print('R2得分:%.2f' % model.score(X_test, y_test))
# Decision tree: build and fit the model.
# random_state is pinned so the reported metrics are reproducible between runs.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate: mean squared error and R2 on the test set.
print('决策树模型评估:')
print('均方误差:%.2f' % mean_squared_error(y_test, y_pred))
print('R2得分:%.2f' % model.score(X_test, y_test))
# Linear regression: build and fit the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = model.predict(X_test)

# Evaluate: mean squared error and R2 on the test set.
print('线性回归模型评估:')
print('均方误差:%.2f' % mean_squared_error(y_test, y_pred))
print('R2得分:%.2f' % model.score(X_test, y_test))
# Support vector regression with an RBF kernel.
model_svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
model_svr.fit(X_train, y_train)
y_pred_svr = model_svr.predict(X_test)

# Put the test features next to the true and predicted view counts.
df_ads_pred_svr = X_test.copy()
df_ads_pred_svr['浏览量真值'] = y_test
df_ads_pred_svr['浏览量预测值'] = y_pred_svr
# A bare expression shows nothing outside a notebook, so print the table.
print(df_ads_pred_svr)

# Scores on the test set and on the training set.
print("支持向量机预测集评分:", model_svr.score(X_test, y_test))
print("支持向量机训练集评分:", model_svr.score(X_train, y_train))
# Multi-layer perceptron regressor with three hidden layers (100, 50, 10).
model_mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50, 10),
    max_iter=1000,
    alpha=0.001,
    solver='adam',
    verbose=0,
    random_state=21,
)
model_mlp.fit(X_train, y_train)
y_pred_mlp = model_mlp.predict(X_test)

# Put the test features next to the true and predicted view counts.
df_ads_pred_mlp = X_test.copy()
df_ads_pred_mlp['浏览量真值'] = y_test
df_ads_pred_mlp['浏览量预测值'] = y_pred_mlp
# A bare expression shows nothing outside a notebook, so print the table.
print(df_ads_pred_mlp)

# Scores on the test set and on the training set.
print("神经网络预测集评分:", model_mlp.score(X_test, y_test))
print("神经网络训练集评分:", model_mlp.score(X_train, y_train))
# Scatter plot of likes against views.
plt.scatter(df_ads['点赞数'], df_ads['浏览量'], label='Training data')
plt.xlabel('点赞数')
plt.ylabel('浏览量')
plt.legend()
plt.show()
# Histogram of views with a kernel density estimate overlaid.
sns.histplot(df_ads['浏览量'], kde=True)
plt.xlabel('浏览量')
plt.ylabel('频数')
plt.show()
# Heat map of the pairwise correlations between columns.
# numeric_only=True restricts the computation to numeric columns; without it,
# pandas >= 2.0 raises as soon as the frame holds a non-numeric column.
corr_matrix = df_ads.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True)
plt.show()
# Pie chart 1: share of articles in each likes bracket.
labels = ['0-1000', '1000-3000', '3000-5000', '5000-10000', '>10000']
# include_lowest=True makes the first bin left-inclusive, so rows with exactly
# 0 likes land in '0-1000' instead of becoming NaN and vanishing from the chart.
df_ads['点赞数分布'] = pd.cut(
    df_ads['点赞数'],
    bins=[0, 1000, 3000, 5000, 10000, np.inf],
    labels=labels,
    include_lowest=True,
)
# observed=False is spelled out so every bracket is kept (even empty ones),
# independent of the pandas default for categorical groupers.
df_pie = df_ads.groupby('点赞数分布', observed=False).size().reset_index(name='counts')

fig1, ax1 = plt.subplots()
ax1.pie(df_pie['counts'], labels=df_pie['点赞数分布'], autopct='%1.1f%%')
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title('点赞数分布')
plt.show()
# Pie chart 2: share of articles in each heat-index bracket.
labels = ['0-20', '20-40', '40-60', '60-80', '>80']
# include_lowest=True makes the first bin left-inclusive, so rows with a heat
# index of exactly 0 land in '0-20' instead of becoming NaN.
df_ads['热度指数分布'] = pd.cut(
    df_ads['热度指数'],
    bins=[0, 20, 40, 60, 80, np.inf],
    labels=labels,
    include_lowest=True,
)
# observed=False is spelled out so every bracket is kept (even empty ones),
# independent of the pandas default for categorical groupers.
df_pie = df_ads.groupby('热度指数分布', observed=False).size().reset_index(name='counts')

fig2, ax2 = plt.subplots()
ax2.pie(df_pie['counts'], labels=df_pie['热度指数分布'], autopct='%1.1f%%')
ax2.axis('equal')  # equal aspect ratio keeps the pie circular
plt.title('热度指数分布')
plt.show()
# Pairwise relationships between likes, heat index and views.
sns.pairplot(df_ads[['点赞数', '热度指数', '浏览量']])
plt.show()
# Feature engineering, second pass: add the heat index as a second feature.
X = df_ads[['点赞数', '热度指数']]
y = df_ads['浏览量']

# Split the data: hold out 20% of the rows as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model building
# TODO: add code here that trains other models on the two-feature split.
原文地址: https://www.cveoy.top/t/topic/hfHa 著作权归作者所有。请勿转载和采集!