三、评论分析

  1. 统计每年的评论数
import pandas as pd
import matplotlib.pyplot as plt

# Load the review dataset and parse the timestamp column.
reviews = pd.read_csv('reviews.csv')
reviews['date'] = pd.to_datetime(reviews['date'])

# Derive the review year and count how many reviews fall in each year.
reviews['year'] = reviews['date'].dt.year
comments_per_year = reviews['year'].value_counts().sort_index()

# Visualize the yearly totals as a bar chart.
plt.bar(comments_per_year.index, comments_per_year.values)
plt.xlabel('Year')
plt.ylabel('Number of Comments')
plt.title('Number of Comments per Year')
plt.show()
  2. 统计有用(helpful)、有趣(funny)及酷(cool)的评论及数量
# Total up the helpful / funny / cool vote columns.
# NOTE(review): these are sums of vote counts, not counts of reviews that
# received at least one vote — confirm which one is wanted.
vote_columns = ('helpful', 'funny', 'cool')
helpful_count, funny_count, cool_count = (reviews[c].sum() for c in vote_columns)

# Report the three totals.
print("Number of Helpful Comments:", helpful_count)
print("Number of Funny Comments:", funny_count)
print("Number of Cool Comments:", cool_count)
  3. 每年全部评论用户排行榜
# Count comments per (year, user) pair, then order each year's users
# from most to least active.
user_ranking = (
    reviews.groupby(['year', 'user_id'])
    .size()
    .reset_index(name='comment_count')
    .sort_values(['year', 'comment_count'], ascending=[True, False])
)

# Show the five most active reviewers for every year present in the data.
for yr in reviews['year'].unique():
    print("Year:", yr)
    print(user_ranking[user_ranking['year'] == yr].head(5))
    print()
  4. 从评论中提取最常见的 Top20 词语
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Fetch the English stop-word list (no-op if already downloaded).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Tokenize all review text into lowercase words, dropping stop words.
all_text = ' '.join(reviews['text']).lower()
filtered = (w for w in all_text.split() if w not in stop_words)

# Tally word frequencies and report the 20 most common words.
word_counts = Counter(filtered)
for word, count in word_counts.most_common(20):
    print(word, count)
  5. 提取全部评论,通过词性过滤,并完成词云分析(WordCloud)
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Fetch the English stop-word list (no-op if already downloaded).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Lowercase tokens from every review, keeping only alphabetic,
# non-stop-word tokens (this also drops punctuation-bearing tokens).
tokens = ' '.join(reviews['text']).lower().split()
kept = [t for t in tokens if t.isalpha() and t not in stop_words]

# Build the word cloud from the filtered text and render it.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(kept))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
  6. 计算单词的关系图(譬如 chinesesteak 等单词)
import networkx as nx
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Fetch the English stop-word list (no-op if already downloaded).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Tokenize all reviews into lowercase, alphabetic, non-stop words.
words = ' '.join(reviews['text']).lower().split()
words = [word for word in words if word not in stop_words and word.isalpha()]

# Build a co-occurrence graph: an edge links two words that appear next
# to each other in the text, weighted by how often that happens.
# (The previous version linked EVERY distinct pair of words in the whole
# corpus — O(n^2) in corpus size and it connected words that never occur
# together, so the "relationship" carried no information.)
G = nx.Graph()
G.add_nodes_from(words)
for a, b in zip(words, words[1:]):
    if a != b:
        if G.has_edge(a, b):
            G[a][b]['weight'] += 1
        else:
            G.add_edge(a, b, weight=1)

# Inspect the neighbours of one probe word, strongest link first.
# Guard against the word being absent (G[word] would raise KeyError).
word = 'chinesesteak'
if word in G:
    related_words = [(w, G[word][w]['weight']) for w in G[word]]
    related_words = sorted(related_words, key=lambda x: x[1], reverse=True)
    for word, weight in related_words:
        print(word, weight)
else:
    print(word, "not found in the reviews")

# Draw the whole graph with a force-directed layout.
plt.figure(figsize=(10, 5))
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_size=100, node_color='lightblue', edge_color='gray')
plt.show()

四、评分分析

  1. 统计评分的分布情况(1-5分)
import pandas as pd
import matplotlib.pyplot as plt

# Load the reviews afresh for the rating analysis.
reviews = pd.read_csv('reviews.csv')

# Tally how many reviews received each star rating (1-5).
rating_counts = reviews.groupby('stars').size()

# Bar chart of the star-rating distribution.
plt.bar(rating_counts.index, rating_counts.values)
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Distribution of Ratings')
plt.show()
  2. 统计每周各天(周一~周日)的评分次数
# Re-parse the timestamp column (the frame was just reloaded from CSV).
reviews['date'] = pd.to_datetime(reviews['date'])

# Bucket reviews by day of week: 0 = Monday ... 6 = Sunday.
weekdays = reviews['date'].dt.dayofweek
rating_per_weekday = weekdays.value_counts().sort_index()

# Bar chart of review counts per weekday.
plt.bar(rating_per_weekday.index, rating_per_weekday.values)
plt.xlabel('Weekday')
plt.ylabel('Count')
plt.title('Number of Ratings per Weekday')
plt.show()
  3. 统计拥有次数最多的 5 分评价的商家
# Rank businesses by how many 5-star reviews each received; keep the top 5.
five_star_ids = reviews.loc[reviews['stars'] == 5, 'business_id']
top_businesses = five_star_ids.value_counts().head(5)

# Show the ranking.
print(top_businesses)

五、打卡分析

  1. 统计每年的打卡次数
import pandas as pd
import matplotlib.pyplot as plt

# Load the check-in log and parse its timestamps.
checkins = pd.read_csv('checkins.csv')
checkins['date'] = pd.to_datetime(checkins['date'])

# Derive the calendar year and count check-ins per year.
checkins['year'] = checkins['date'].dt.year
checkins_per_year = checkins['year'].value_counts().sort_index()

# Bar chart of the yearly check-in totals.
plt.bar(checkins_per_year.index, checkins_per_year.values)
plt.xlabel('Year')
plt.ylabel('Number of Checkins')
plt.title('Number of Checkins per Year')
plt.show()
  2. 统计 24 小时每小时打卡次数
# Bucket check-ins by hour of day (0-23).
hours = checkins['date'].dt.hour
checkins_per_hour = hours.value_counts().sort_index()

# Bar chart of the hourly check-in counts.
plt.bar(checkins_per_hour.index, checkins_per_hour.values)
plt.xlabel('Hour')
plt.ylabel('Number of Checkins')
plt.title('Number of Checkins per Hour')
plt.show()
  3. 统计最喜欢打卡的城市
# Five cities with the most check-ins.
# NOTE(review): assumes the check-in file carries a 'city' column — confirm
# against the actual dataset schema.
top_cities = checkins['city'].value_counts().head(5)

# Show the ranking.
print(top_cities)
  4. 全部商家的打卡排行榜
# Rank every business by total check-ins, busiest first.
business_ranking = checkins['business_id'].value_counts()

# Show the ten busiest businesses.
top10 = business_ranking.head(10)
print(top10)

六、综合分析

  1. 每个城市最好(评分次数、评分、打卡数)的五家商家
import pandas as pd

# Load all three datasets this analysis joins together.
# (The original omitted checkins here even though it used it below.)
businesses = pd.read_csv('businesses.csv')
reviews = pd.read_csv('reviews.csv')
checkins = pd.read_csv('checkins.csv')

# Per-business aggregates, each indexed by business_id.
review_counts = reviews.groupby('business_id').size()
avg_ratings = reviews.groupby('business_id')['stars'].mean()
checkin_counts = checkins.groupby('business_id').size()

# For each city, pick its five best businesses by review count,
# then average rating, then check-in count.
top_businesses = []
for city in businesses['city'].unique():
    # .copy() so the column assignments below hit a real frame instead of
    # a view of `businesses` (avoids SettingWithCopyWarning / lost writes).
    city_businesses = businesses[businesses['city'] == city].copy()
    # map() aligns by business_id value; directly assigning a groupby
    # result (as the original did) aligns on the row index instead and
    # fills the columns with NaN.
    city_businesses['review_count'] = city_businesses['business_id'].map(review_counts).fillna(0)
    city_businesses['average_rating'] = city_businesses['business_id'].map(avg_ratings)
    city_businesses['checkin_count'] = city_businesses['business_id'].map(checkin_counts).fillna(0)
    top_businesses.append(city_businesses.nlargest(5, ['review_count', 'average_rating', 'checkin_count']))

# Print each city's shortlist. Use a fresh loop variable so we don't
# shadow the `businesses` frame as the original code did.
for city, city_top in zip(businesses['city'].unique(), top_businesses):
    print("City:", city)
    print(city_top)
    print()

以上代码为示例代码,请根据你的具体数据和需求进行相应的修改和调整。

三、评论分析:1. 统计每年的评论数;2. 统计有用(helpful)、有趣(funny)及酷(cool)的评论及数量;3. 每年全部评论用户排行榜;4. 从评论中提取最常见的 Top20 词语;5. 提取全部评论,通过词性过滤,并完成词云分析(WordCloud);6. 计算单词的关系图(譬如 chinesesteak 等单词)。四、评分分析:1. 统计评分的分布情况(1-5分);2. 统计每周各天(周一~周日)的评分次数;3. 统计拥有次数最多的 5 分评价的商家。五、打卡分析:1. 统计每年的打卡次数。

原文地址: https://www.cveoy.top/t/topic/hLQR 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录