Python 爬取 B 站视频弹幕并生成词云

本程序使用 Python 编写，用于爬取 B 站视频的弹幕并生成词云图。程序的设计目标包含以下功能（示例代码只给出整体框架，具体实现以 TODO 标出）：

登录 B 站账号获取 cookie
获取视频的 cid（弹幕对应的视频编号）
根据 cid 获取弹幕数据
解析弹幕数据，保存到 Excel 表中
对弹幕文本进行分词和词频统计，生成词云图

注意：由于爬取 B 站视频弹幕需要登录，且为了保护用户隐私，本程序不提供登录功能的代码实现。以下代码仅提供爬取弹幕的整体框架和基本思路：登录、获取 cid、获取并解析弹幕、保存到 Excel、分词统计以及生成词云图等功能（代码中标注 TODO 的部分）均需用户自行实现。

代码示例

import requests
from bs4 import BeautifulSoup
import lxml
import sqlite3
import jieba
from wordcloud import WordCloud
import openpyxl
import tkinter as tk
from tkinter import filedialog

class BilibiliDanmuCrawler:
    """Skeleton of a crawler for Bilibili video danmu (bullet comments).

    The constructor only prepares URLs, an HTTP session and empty result
    slots.  Every pipeline step (login, cid lookup, download, parsing,
    Excel export, word counting, word-cloud rendering) is a stub that
    currently does nothing and returns ``None``; each one is marked TODO
    and is meant to be filled in by the user.
    """

    def __init__(self):
        """Initialise endpoint URLs, the shared session and result holders."""
        self.login_url = 'https://passport.bilibili.com/login'
        self.video_url = 'https://www.bilibili.com/video/'
        # URL prefix ending in ``oid=``; presumably the cid is appended to
        # it when requesting danmu -- TODO confirm once get_danmu exists.
        self.danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so?oid='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        # One session so the User-Agent header is sent with every request
        # made through it.
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.cookies = None
        self.cid = None
        self.danmu_list = []
        # Must NOT be called ``word_count``: an instance attribute of that
        # name would shadow the word_count() method below and make
        # ``crawler.word_count()`` fail with "'NoneType' object is not
        # callable".
        self.word_freq = None
        self.word_cloud = None
        self.excel_file = None

    def login(self, username, password):
        """Log in to Bilibili and obtain cookies (not implemented).

        Args:
            username: Account name entered by the user.
            password: Account password entered by the user.
        """
        # TODO: log in to the Bilibili account and store the cookie.
        pass

    def get_cid(self, video_url):
        """Resolve the cid for a video link (not implemented).

        Args:
            video_url: Link of the video whose danmu should be fetched.
        """
        # TODO: derive the cid from the video link.
        pass

    def get_danmu(self):
        """Download the danmu data for the current cid (not implemented)."""
        # TODO: fetch the danmu data using the cid.
        pass

    def parse_danmu(self):
        """Parse downloaded danmu data into a list (not implemented)."""
        # TODO: parse the danmu data and store it in a list.
        pass

    def save_to_excel(self, file_path):
        """Write the danmu data to an Excel workbook (not implemented).

        Args:
            file_path: Destination path of the workbook.
        """
        # TODO: save the danmu data to an Excel sheet.
        pass

    def word_count(self):
        """Tokenise the danmu text and count word frequencies (not implemented)."""
        # TODO: segment the danmu text and compute word frequencies.
        pass

    def generate_word_cloud(self):
        """Render a word-cloud image (not implemented)."""
        # TODO: generate the word-cloud image.
        pass

class Application(tk.Frame):
    """Tkinter front end that drives a BilibiliDanmuCrawler.

    The window collects a video link, account credentials and an Excel
    save path, and offers three actions that are meant to be run in
    order: crawl danmu, count words, generate the word cloud.  After each
    action the three action buttons are enabled/disabled to point at the
    next step.
    """

    def __init__(self, master=None):
        """Create the frame inside *master* and build its widgets.

        Args:
            master: Parent widget; ``None`` lets tkinter use its default
                root window.
        """
        super().__init__(master)
        # tk.Frame.__init__ has already stored the effective parent in
        # self.master (the default root when master is None).  Reassigning
        # self.master = master here would replace it with None in that
        # case and break the quit button's ``self.master.destroy``.
        self.pack()
        self.create_widgets()

    def create_widgets(self):
        """Create and pack every label, entry and button of the window."""
        self.video_url_label = tk.Label(self, text='视频链接：')
        self.video_url_label.pack(side='top')
        self.video_url_entry = tk.Entry(self)
        self.video_url_entry.pack(side='top')

        self.username_label = tk.Label(self, text='用户名：')
        self.username_label.pack(side='top')
        self.username_entry = tk.Entry(self)
        self.username_entry.pack(side='top')

        self.password_label = tk.Label(self, text='密码：')
        self.password_label.pack(side='top')
        # show='*' masks the typed password.
        self.password_entry = tk.Entry(self, show='*')
        self.password_entry.pack(side='top')

        self.save_path_label = tk.Label(self, text='保存路径：')
        self.save_path_label.pack(side='top')
        # The StringVar lets select_save_path() fill the entry and lets
        # the action handlers read whatever the user typed or picked.
        self.save_path_var = tk.StringVar()
        self.save_path_entry = tk.Entry(self, textvariable=self.save_path_var)
        self.save_path_entry.pack(side='top')
        self.save_path_button = tk.Button(self, text='选择', command=self.select_save_path)
        self.save_path_button.pack(side='top')

        self.crawl_button = tk.Button(self, text='爬取弹幕', command=self.crawl_danmu)
        self.crawl_button.pack(side='top')

        self.word_count_button = tk.Button(self, text='分词统计', command=self.word_count)
        self.word_count_button.pack(side='top')

        self.word_cloud_button = tk.Button(self, text='生成词云图', command=self.generate_word_cloud)
        self.word_cloud_button.pack(side='top')

        self.quit_button = tk.Button(self, text='退出', command=self.master.destroy)
        self.quit_button.pack(side='bottom')

    def select_save_path(self):
        """Ask for an ``.xlsx`` save location and put it in the path field."""
        file_path = filedialog.asksaveasfilename(defaultextension='.xlsx')
        # A cancelled dialog yields an empty value; keep the previously
        # chosen path instead of wiping it.
        if file_path:
            self.save_path_var.set(file_path)

    def _require_save_path(self):
        """Return the save path, or ``None`` after warning if it is blank."""
        save_path = self.save_path_var.get()
        if not save_path.strip():
            # Imported locally so this class only needs the names the
            # file already imports at module level.
            from tkinter import messagebox
            messagebox.showwarning('提示', '请先选择保存路径')
            return None
        return save_path

    def crawl_danmu(self):
        """Run login, cid lookup, download, parsing and Excel export."""
        save_path = self._require_save_path()
        if save_path is None:
            # Nothing to write to; do not start the crawl.
            return

        video_url = self.video_url_entry.get()
        username = self.username_entry.get()
        password = self.password_entry.get()

        crawler = BilibiliDanmuCrawler()
        crawler.login(username, password)
        crawler.get_cid(video_url)
        crawler.get_danmu()
        crawler.parse_danmu()
        crawler.save_to_excel(save_path)

        # Next step in the workflow: word counting.
        self.crawl_button.config(state='disabled')
        self.word_count_button.config(state='normal')
        self.word_cloud_button.config(state='disabled')

    def word_count(self):
        """Load the saved workbook, run the word count and save it back."""
        save_path = self._require_save_path()
        if save_path is None:
            return

        crawler = BilibiliDanmuCrawler()
        crawler.excel_file = openpyxl.load_workbook(save_path)
        crawler.word_count()
        crawler.excel_file.save(save_path)

        # Next step in the workflow: word-cloud generation.
        self.crawl_button.config(state='disabled')
        self.word_count_button.config(state='disabled')
        self.word_cloud_button.config(state='normal')

    def generate_word_cloud(self):
        """Load the saved workbook, generate the word cloud and save it back."""
        save_path = self._require_save_path()
        if save_path is None:
            return

        crawler = BilibiliDanmuCrawler()
        crawler.excel_file = openpyxl.load_workbook(save_path)
        crawler.generate_word_cloud()
        crawler.excel_file.save(save_path)

        # Workflow finished: allow a new crawl or a re-count.
        self.crawl_button.config(state='normal')
        self.word_count_button.config(state='normal')
        self.word_cloud_button.config(state='disabled')

# Launch the GUI only when this file is executed as a script, so that
# importing the module (for reuse or testing) does not open a window and
# block inside the Tk event loop.
if __name__ == '__main__':
    root = tk.Tk()
    app = Application(master=root)
    app.mainloop()

代码说明

BilibiliDanmuCrawler 类
- 定义了用于爬取 B 站弹幕的类 BilibiliDanmuCrawler，包含以下方法：
  - login(username, password): 登录 B 站获取 cookie，需要用户自行实现
  - get_cid(video_url): 根据视频链接获取 cid，需要用户自行实现
  - get_danmu(): 根据 cid 获取弹幕数据，需要用户自行实现
  - parse_danmu(): 解析弹幕数据，保存到列表中，需要用户自行实现
  - save_to_excel(file_path): 将弹幕数据保存到 Excel 表中，需要用户自行实现
  - word_count(): 对弹幕文本进行分词和词频统计，需要用户自行实现
  - generate_word_cloud(): 生成词云图，需要用户自行实现
Application 类
- 定义了用于创建窗体应用程序的类 Application，包含以下方法：
  - create_widgets(): 创建窗体上的各种控件，包括视频链接输入框、用户名输入框、密码输入框、保存路径选择框、爬取弹幕按钮、分词统计按钮、生成词云图按钮、退出按钮
  - select_save_path(): 选择保存路径
  - crawl_danmu(): 爬取弹幕并保存到 Excel 表格
  - word_count(): 进行分词统计
  - generate_word_cloud(): 生成词云图

注意：

代码中 TODO 部分需要用户自行实现
由于 B 站网站结构可能会发生变化，程序可能需要进行调整才能正常运行
使用本程序爬取 B 站视频弹幕时，请遵守 B 站网站的规则和协议，不要进行恶意爬取或其他违反网站规则的行为