import sqlite3
import sys
from collections import Counter

import jieba
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QPixmap
from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QPushButton, QTextEdit, QFileDialog
from wordcloud import WordCloud

class MainWindow(QMainWindow):
    """Main window of the Tencent Video danmu (bullet comment) scraper.

    The window holds a URL input (``textEdit1``), a result box
    (``textEdit2``) and four buttons that scrape danmu, export them to an
    Excel workbook, render a word cloud and clear the results. The result
    box doubles as the data store: export and word-cloud generation read
    the danmu back from it, one danmu per line.
    """

    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the GUI thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Build the window and all of its child widgets."""
        super().__init__()
        self.initUI()

    def initUI(self):
        """Create, position and wire up every child widget."""
        self.setWindowTitle('斗罗大陆第一集弹幕爬取')
        self.setGeometry(100, 100, 800, 600)

        self.label1 = QLabel(self)
        self.label1.setText('视频网址:')
        self.label1.move(50, 50)

        self.textEdit1 = QTextEdit(self)
        self.textEdit1.setGeometry(150, 50, 500, 30)

        self.label2 = QLabel(self)
        self.label2.setText('弹幕爬取结果:')
        self.label2.move(50, 100)

        self.textEdit2 = QTextEdit(self)
        self.textEdit2.setGeometry(50, 150, 700, 300)

        # The word-cloud label is created once, up front, so that
        # clear_result() can always reference it. It is created BEFORE the
        # buttons so the buttons are stacked above it: its rectangle
        # overlaps the button row, and a label created later would cover
        # the buttons and swallow their clicks.
        # NOTE(review): the 700x100 label is smaller than the 600x400
        # image, so only part of the cloud is visible - confirm the
        # intended layout.
        self.label3 = QLabel(self)
        self.label3.setAlignment(Qt.AlignCenter)
        self.label3.setGeometry(50, 470, 700, 100)
        self.label3.hide()

        self.button1 = QPushButton(self)
        self.button1.setText('爬取弹幕')
        self.button1.setGeometry(50, 500, 100, 30)
        self.button1.clicked.connect(self.crawl_danmu)

        self.button2 = QPushButton(self)
        self.button2.setText('保存到Excel')
        self.button2.setGeometry(200, 500, 100, 30)
        self.button2.clicked.connect(self.save_to_excel)

        self.button3 = QPushButton(self)
        self.button3.setText('生成词云图')
        self.button3.setGeometry(350, 500, 100, 30)
        self.button3.clicked.connect(self.generate_wordcloud)

        self.button4 = QPushButton(self)
        self.button4.setText('清空结果')
        self.button4.setGeometry(500, 500, 100, 30)
        self.button4.clicked.connect(self.clear_result)

    def _show_status(self, message):
        """Show a transient message in the status bar.

        Used instead of writing into ``textEdit2`` so that status text
        never overwrites the scraped danmu held there.
        """
        self.statusBar().showMessage(message, 5000)

    def _current_danmu(self):
        """Return the non-blank lines currently held in the result box."""
        lines = self.textEdit2.toPlainText().split('\n')
        return [line for line in lines if line.strip()]

    def _fetch_danmu(self, url):
        """Download the danmu texts for the video page at ``url``.

        Returns:
            A list of danmu content strings.

        Raises:
            requests.RequestException: on network failure, timeout or a
                non-success HTTP status.
            ValueError: if the page or the danmu response does not have
                the expected structure.
        """
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'lxml')

        # find() returns None when the div is absent; check before
        # reading attributes instead of failing with a TypeError.
        wrapper = soup.find('div', {'class': 'player_wrapper'})
        if wrapper is None:
            raise ValueError('页面中未找到播放器信息')
        vid = wrapper.get('data-vid')
        cid = wrapper.get('data-cid')
        if not vid or not cid:
            raise ValueError('页面中缺少视频编号')

        danmu_url = (
            'https://mfm.video.qq.com/danmu?otype=json&timestamp=1619874895.695'
            f'&target_id={vid}&session_key=&appver=3.2.19.333&sdkver=2.7.3.19'
            f'&platform=2&sdtfrom=&vid={vid}&cid={cid}'
        )
        reply = requests.get(danmu_url, timeout=self.REQUEST_TIMEOUT)
        reply.raise_for_status()
        try:
            payload = reply.json()
        except ValueError as exc:
            # Re-raise as a plain ValueError so the caller reports it as a
            # parse problem rather than a network problem.
            raise ValueError('弹幕接口返回的不是合法的JSON') from exc

        comments = payload.get('comments') if isinstance(payload, dict) else None
        if not isinstance(comments, list):
            raise ValueError('弹幕接口返回中没有comments字段')
        return [
            item['content']
            for item in comments
            if isinstance(item, dict) and isinstance(item.get('content'), str)
        ]

    def crawl_danmu(self):
        """Scrape the danmu for the entered URL into the result box.

        The URL must start with ``https://v.qq.com/x/cover``. Validation,
        network and parse errors are reported in the result box instead
        of raising inside the Qt slot.
        """
        # Strip whitespace so a pasted URL with a trailing newline passes.
        url = self.textEdit1.toPlainText().strip()
        if not url.startswith('https://v.qq.com/x/cover'):
            self.textEdit2.setText('请输入正确的视频网址!')
            return
        try:
            danmu_list = self._fetch_danmu(url)
        except requests.RequestException as exc:
            self.textEdit2.setText(f'网络请求失败: {exc}')
            return
        except ValueError as exc:
            self.textEdit2.setText(f'无法解析弹幕数据: {exc}')
            return
        self.textEdit2.setText(''.join(f'{text}\n' for text in danmu_list))

    def save_to_excel(self):
        """Write the danmu in the result box to an .xlsx file.

        Each non-blank line goes into column A of its own row. The outcome
        is reported in the status bar so the danmu stay in the result box
        and remain available for the word cloud.
        """
        danmu_list = self._current_danmu()
        if not danmu_list:
            self._show_status('没有可保存的弹幕')
            return
        file_path, _ = QFileDialog.getSaveFileName(self, '保存文件', './', 'Excel Files (*.xlsx)')
        if not file_path:
            return
        wb = Workbook()
        ws = wb.active
        for row, danmu in enumerate(danmu_list, start=1):
            ws.cell(row=row, column=1, value=danmu)
        try:
            wb.save(file_path)
        except OSError as exc:
            # e.g. the target file is open elsewhere or the path is
            # read-only.
            self._show_status(f'保存失败: {exc}')
            return
        self._show_status('保存成功!')

    def generate_wordcloud(self):
        """Render a word cloud of the danmu and display it in ``label3``.

        The text is segmented with jieba, segments of one character or
        less are dropped, and the remaining word frequencies are drawn to
        ``wordcloud.png`` using the ``msyh.ttc`` font. Does nothing when
        the result box is empty; failures are reported in the status bar.
        """
        danmu_list = self._current_danmu()
        if not danmu_list:
            return
        word_freq = Counter(
            seg for seg in jieba.cut(' '.join(danmu_list)) if len(seg.strip()) > 1
        )
        if not word_freq:
            # WordCloud raises ValueError when given no words at all.
            self._show_status('没有可用于生成词云的词语')
            return
        try:
            wc = WordCloud(background_color='white', font_path='msyh.ttc', width=600, height=400, max_words=50)
            wc.generate_from_frequencies(word_freq)
            wc.to_file('wordcloud.png')
        except (OSError, ValueError) as exc:
            # OSError covers a missing font file or an unwritable image.
            self._show_status(f'词云图生成失败: {exc}')
            return
        # Reuse the single label instead of creating a new one per click.
        self.label3.setPixmap(QPixmap('wordcloud.png'))
        self.label3.show()

    def clear_result(self):
        """Clear the result box and remove the displayed word cloud."""
        self.textEdit2.clear()
        self.label3.clear()
        self.label3.hide()

if __name__ == '__main__':
    # A QApplication must exist before any widget is constructed.
    qt_app = QApplication(sys.argv)

    window = MainWindow()
    window.show()

    # Run the Qt event loop and use its return value as the exit status.
    exit_code = qt_app.exec_()
    sys.exit(exit_code)

程序功能:

  1. 爬取弹幕: 输入腾讯视频《斗罗大陆》第一集网址,点击'爬取弹幕'按钮,即可获取弹幕内容并显示在结果框中。
  2. 保存到Excel: 点击'保存到Excel'按钮,可将爬取的弹幕数据保存到Excel表格中,方便后续分析。
  3. 生成词云图: 点击'生成词云图'按钮,程序将根据弹幕内容生成词云图,直观展示弹幕关键词汇及其频率。
  4. 清空结果: 点击'清空结果'按钮,可清除当前结果框内容和词云图,方便进行新的操作。

使用说明:

  1. 确保已安装Python环境以及所需的第三方库:requests, beautifulsoup4, lxml, jieba, wordcloud, openpyxl, PyQt5(sqlite3 为 Python 标准库,无需另行安装)。
  2. 运行代码,在弹出的窗口中输入目标视频网址。
  3. 点击相应按钮即可执行对应功能。

注意:

  • 本程序仅供学习交流使用,请勿用于商业用途。
  • 爬取过程中请遵循网站robots协议,避免对目标网站造成压力。
  • 词云图生成需要字体文件'msyh.ttc',请确保该文件已放置在程序运行目录下。

原文地址: https://www.cveoy.top/t/topic/f1du 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录