抖音热榜数据爬取与分析:词云生成和视频搜索
import requests from bs4 import BeautifulSoup import openpyxl import jieba from wordcloud import WordCloud import matplotlib.pyplot as plt
创建 Excel 文件
# Start from an empty workbook: openpyxl creates one worksheet by default,
# so remove it and leave only the sheets that are added afterwards.
wb = openpyxl.Workbook()
default_sheet = wb.active
wb.remove(default_sheet)
爬取热榜、娱乐榜、社会榜、挑战榜的数据并存储到 Excel 文件中
# Billboard display name -> endpoint URL. The four endpoints differ only in
# the value of the ``type`` query parameter, so build them from one template.
urls = {
    board_name: f'https://www.douyin.com/aweme/v1/hotsearch/billboard/?type={board_type}'
    for board_name, board_type in (
        ('热榜', 'hot'),
        ('娱乐榜', 'entertainment'),
        ('社会榜', 'society'),
        ('挑战榜', 'challenge'),
    )
}
# Fetch every billboard page and store its entries in a worksheet of its own.
for name, url in urls.items():
    # Fetch the data. requests applies no timeout unless one is given, so
    # bound the wait instead of risking a request that hangs forever.
    response = requests.get(url, timeout=10)
    if response.ok:
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.select('.list-item')
    else:
        # Do not parse an HTTP error page as if it were billboard data; the
        # sheet is still created below so later steps find every billboard.
        print(f'获取{name}失败,HTTP状态码:{response.status_code}')
        items = []

    # Write to the workbook: a header row, then one row per entry.
    sheet = wb.create_sheet(name)
    sheet.append(['标题', '热度', '链接'])
    for item in items:
        # select_one() returns None when nothing matches the selector.
        title_tag = item.select_one('.title')
        if title_tag is None:
            # The title is what the later steps read from column 1, so an
            # entry without one is skipped rather than stored as a blank row.
            continue
        hot_tag = item.select_one('.hot-value')
        link_tag = item.select_one('.link')
        hot = hot_tag.text if hot_tag is not None else None
        link = link_tag.get('href') if link_tag is not None else None
        sheet.append([title_tag.text, hot, link])
保存 Excel 文件
# Persist the workbook to disk. The file name literally reads
# "student ID + name" and is reused when the workbook is loaded again.
filename = '学号+姓名.xlsx'
wb.save(filename=filename)
生成词云图
# Reload the saved workbook and collect the title (first column) of every
# data row on every sheet.
wb = openpyxl.load_workbook(filename)
words = []
for sheet in wb:
    if sheet.title == 'Sheet1':  # exclude a default-named worksheet
        continue
    for row in sheet.iter_rows(min_row=2):  # row 1 is the header row
        title = row[0].value  # only the title column is used
        # Blank cells load as None, which would make str.join() raise
        # TypeError below, so they are left out.
        if title is not None:
            words.append(str(title))

# Segment the titles with jieba and re-join the tokens with spaces, which is
# the whitespace-separated form WordCloud tokenises.
text = ' '.join(jieba.cut(' '.join(words)))
if text.strip():
    wordcloud = WordCloud(width=800, height=600, font_path='msyh.ttc')
    wordcloud.generate(text)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
else:
    # WordCloud.generate() raises ValueError when it is given no words, so
    # report the situation instead of crashing when nothing was scraped.
    print('没有可用于生成词云的标题')
模拟搜索
# Ask for a keyword and list every billboard (sheet) that has at least one
# title containing it.
keyword = input('请输入要搜索的视频名称:')
categories = []
for sheet in wb:
    if sheet.title == 'Sheet1':  # exclude a default-named worksheet
        continue
    for row in sheet.iter_rows(min_row=2):  # row 1 is the header row
        title = row[0].value
        # Blank cells load as None; `in` on None would raise TypeError.
        if title is not None and keyword in str(title):
            categories.append(sheet.title)
            break  # one match is enough to list this billboard

if categories:
    print('该视频对应的分类有:')
    for i, category in enumerate(categories, start=1):
        print(f'{i}. {category}')

    # Validate the selection: non-numeric input would make int() raise, and
    # 0 or a negative number would silently index from the end of the list.
    try:
        choice = int(input('请选择分类:'))
    except ValueError:
        choice = None

    if choice is None or not 1 <= choice <= len(categories):
        print('无效的选择')
    else:
        sheet = wb[categories[choice - 1]]
        print('该分类前10条视频的标题和链接为:')
        # Rows 2-11 are the first ten data rows below the header.
        for row in sheet.iter_rows(min_row=2, max_row=11):
            title = row[0].value
            link = row[2].value
            print(f'标题:{title} 链接:{link} ')
else:
    print('没有找到相关视频')
原文地址: https://www.cveoy.top/t/topic/oKIf 著作权归作者所有。请勿转载和采集!