由于题目中未明确指定需要爬取哪个小说网站的内容,因此以下代码仅提供一个大致的思路,需要根据实际情况进行修改和完善。

import requests
from bs4 import BeautifulSoup
import lxml
import sqlite3
import jieba
from wordcloud import WordCloud
import openpyxl
import tkinter as tk
from tkinter import filedialog

class NovelCrawler:
    """Scrape one novel page, persist it to SQLite and Excel, segment the
    text with jieba and build a word-cloud image.

    NOTE(review): the CSS selectors in get_novel_info (h1 / div.author /
    div.content) are placeholders — they must be adapted to the actual
    target site, as the surrounding text itself states.
    """

    def __init__(self):
        self.url = ""                 # URL of the novel page to scrape
        self.title = ""               # novel title
        self.author = ""              # novel author
        self.content = ""             # novel body text
        self.word_list = []           # segmentation result (list of tokens)
        self.word_freq = {}           # token -> occurrence count
        self.wordcloud = None         # generated WordCloud object, or None
        self.db_name = "novel.db"     # SQLite database file name
        self.excel_name = "novel.xlsx"  # Excel workbook file name
        self.conn = None              # sqlite3 connection
        self.cursor = None            # sqlite3 cursor
        self.wb = None                # openpyxl workbook
        self.ws = None                # active worksheet
        self.init_db()                # create/open the database
        self.init_excel()             # create the workbook + header row

    def init_db(self):
        """Open the SQLite database and create the `novel` table if absent."""
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS novel
                               (title TEXT, author TEXT, content TEXT)''')
        self.conn.commit()

    def init_excel(self):
        """Create an in-memory workbook with a header row.

        The file is only written to disk by save_to_excel().
        """
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.append(["标题", "作者", "正文"])

    def load_url(self):
        """Read the target URL from stdin (console prompt, not a Tk dialog)."""
        self.url = input("请输入小说网站链接:")

    def get_novel_info(self):
        """Fetch the page and extract title/author/content.

        Raises:
            requests.RequestException: on network failure or HTTP error.
        """
        # Fix: a timeout plus raise_for_status() so a dead site can neither
        # hang the GUI forever nor be silently parsed as an error page.
        response = requests.get(self.url, timeout=10)
        response.raise_for_status()
        response.encoding = "utf-8"  # assumes the site serves UTF-8 — TODO confirm
        soup = BeautifulSoup(response.text, "lxml")
        title_tag = soup.find("h1")
        author_tag = soup.find("div", class_="author")
        content_tag = soup.find("div", class_="content")
        # Fix: find() returns None when a selector does not match; the
        # original then crashed with AttributeError on `.text`.
        self.title = title_tag.text.strip() if title_tag else ""
        self.author = author_tag.text.strip() if author_tag else ""
        self.content = content_tag.text.strip() if content_tag else ""

    def save_to_db(self):
        """Insert the current title/author/content as one row."""
        self.cursor.execute("INSERT INTO novel VALUES (?, ?, ?)",
                            (self.title, self.author, self.content))
        self.conn.commit()

    def save_to_excel(self):
        """Append the current novel as one row and write the workbook to disk."""
        self.ws.append([self.title, self.author, self.content])
        self.wb.save(self.excel_name)

    def segment_words(self):
        """Segment the content with jieba and count tokens longer than 1 char.

        Single-character tokens (mostly particles/punctuation) are skipped.
        """
        # Fix: jieba.cut returns a one-shot generator; the original stored
        # it in self.word_list and exhausted it in the loop below, leaving
        # word_list unusable afterwards. Materialize it as a list.
        self.word_list = list(jieba.cut(self.content))
        self.word_freq = {}
        for word in self.word_list:
            if len(word) > 1:
                self.word_freq[word] = self.word_freq.get(word, 0) + 1

    def generate_wordcloud(self):
        """Build the word cloud from the frequency table.

        Raises:
            ValueError: if segment_words() has not produced any tokens yet
                (clearer than the opaque error WordCloud would raise itself).
        """
        if not self.word_freq:
            raise ValueError("词频为空,请先执行分词 (segment_words)")
        # NOTE(review): rendering Chinese glyphs normally requires passing a
        # font_path to WordCloud; without one the cloud may show boxes —
        # confirm against the deployment environment.
        self.wordcloud = WordCloud(width=800, height=600, background_color="white").generate_from_frequencies(self.word_freq)

    def save_wordcloud(self):
        """Ask for a target path and write the word-cloud PNG.

        Silently does nothing if no cloud has been generated yet (the
        original crashed with AttributeError in that case) or if the user
        cancels the dialog.
        """
        if self.wordcloud is None:
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".png")
        if file_path:
            self.wordcloud.to_file(file_path)

    def close(self):
        """Release the SQLite connection (the original leaked it)."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
            self.cursor = None

class NovelCrawlerGUI:
    """Thin Tk front-end: one button per NovelCrawler operation.

    Constructing the object builds the window and immediately enters the
    Tk main loop, so the constructor blocks until the window is closed.
    """

    def __init__(self):
        self.crawler = NovelCrawler()
        self.window = tk.Tk()
        self.window.title("小说爬取")
        self.window.geometry("400x300")

        # One packed button per crawler action; attributes kept for any
        # external code that references them.
        self.load_url_button = self._add_button("加载链接", self.load_url)
        self.get_info_button = self._add_button("获取信息", self.get_novel_info)
        self.save_to_db_button = self._add_button("保存到数据库", self.save_to_db)
        self.save_to_excel_button = self._add_button("保存到Excel", self.save_to_excel)
        self.segment_words_button = self._add_button("分词", self.segment_words)
        self.generate_wordcloud_button = self._add_button("生成词云图", self.generate_wordcloud)
        self.save_wordcloud_button = self._add_button("保存词云图", self.save_wordcloud)

        self.window.mainloop()

    def _add_button(self, text, command):
        """Create, pack (pady=10, as before) and return one command button."""
        button = tk.Button(self.window, text=text, command=command)
        button.pack(pady=10)
        return button

    # --- button handlers: each simply forwards to the crawler instance ---

    def load_url(self):
        self.crawler.load_url()

    def get_novel_info(self):
        self.crawler.get_novel_info()

    def save_to_db(self):
        self.crawler.save_to_db()

    def save_to_excel(self):
        self.crawler.save_to_excel()

    def segment_words(self):
        self.crawler.segment_words()

    def generate_wordcloud(self):
        self.crawler.generate_wordcloud()

    def save_wordcloud(self):
        self.crawler.save_wordcloud()

if __name__ == "__main__":
    # Launching the GUI blocks here: the constructor runs Tk's mainloop
    # until the window is closed.
    app = NovelCrawlerGUI()

以上代码中,NovelCrawler类定义了小说爬取的具体实现,包括从网站获取信息、保存到数据库和Excel表格、分词和生成词云图等操作;NovelCrawlerGUI类则定义了一个简单的窗口应用程序,通过按钮调用NovelCrawler类的方法实现各种功能。其中,filedialog模块用于选择保存词云图的文件路径。需要注意的是,由于各种操作可能需要一些时间,因此建议在按钮点击后显示一个进度条或提示信息,以便用户等待和确认操作是否成功。

编写 Python 代码实现以下功能——小说爬取。主要任务:设计一个窗体应用系统,具有以下功能:1. 加载需要用到的各种第三方库,如 requests、BeautifulSoup4、lxml、sqlite3、jieba、WordCloud、openpyxl 等;2. 将信息保存到 Excel 表中,并显示处理后的信息。

原文地址: https://www.cveoy.top/t/topic/hnCf 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录