修改后的代码如下:

import logging
import os

from scrapy import Request
from scrapy import cmdline
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.spiders import Spider

from ..items import BookItem

class bookSpider(Spider):
    """Crawl the qidian.com "finished books" listing: name, intro and cover image."""

    name = 'books'

    def start_requests(self):
        # Single entry point: the finished-books listing page.
        url = "https://www.qidian.com/finish/"
        yield Request(url)

    def parse(self, response):
        """Yield one BookItem per <li> in the listing, then follow pagination."""
        for selector in response.css("ul.all-img-list.cf > li"):
            item = BookItem()
            item['name'] = selector.css("div h2 a::text").get()
            # NOTE(review): the 'price' field actually carries the intro text.
            intro = selector.css("div p.intro::text").get()
            # Guard against a missing intro node (.get() returns None).
            item['price'] = intro.strip() if intro else ''
            img_url = selector.css("div .book-img-box img::attr(src)").get()
            if img_url:
                # Convert the (possibly protocol-relative) image URL to absolute.
                item['image_urls'] = [response.urljoin(img_url)]
            yield item

        # Follow the "next page" link, if present, re-entering this callback.
        next_url = response.css('.lbf-pagination-next::attr(href)').get()
        if next_url:
            yield response.follow(next_url, self.parse)

class BookPipeline:
    """Append each item's name and intro to a CSV-style text file."""

    def process_item(self, item, spider):
        """Write one "name,price" line per item; emit the header row on first write.

        Returns the item unchanged so later pipelines still receive it.
        """
        with open('书籍信息.txt', 'a', encoding='utf-8') as f:
            if not f.tell():  # file is empty -> write the header row first
                f.write('书籍名,信息' + '\n')
            f.write(item['name'] + "," + item['price'] + '\n')
        return item

class SaveImagePipeline(ImagesPipeline):
    """Download cover images and store them (plus thumbnails) by original file name."""

    def get_media_requests(self, item, info):
        # One download request per collected image URL; the item rides along in meta.
        for image_url in item.get('image_urls', []):
            yield Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        """Drop items whose cover image failed to download."""
        item = super().item_completed(results, item, info)
        if 'image_urls' in item:
            # results is a list of (success, info_or_failure) pairs; guard against
            # an empty list (image_urls present but empty) to avoid IndexError.
            if not results or not results[0][0]:
                raise DropItem("图片未下载")
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        # Store each image under its original URL basename.
        # (*item* keyword added for newer Scrapy versions; unused here.)
        return os.path.basename(request.url)

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        # Thumbnails grouped per size label, named like the full-size image.
        return f'thumbs/{thumb_id}/{self.file_path(request)}'

# Module-level logger for the image pipeline.
# Fixed typo in the logger name: "SavelmagePipeline" (lowercase L) -> "SaveImagePipeline".
logger = logging.getLogger("SaveImagePipeline")

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Identify the crawler as a desktop Chrome browser.
# Fixed: the original string had a stray ")" after "Safari/537.36".
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36"
)

# Obey robots.txt rules

# Respect the target site's robots.txt directives when crawling.
ROBOTSTXT_OBEY = True

# Image pipeline storage settings.
IMAGES_STORE = 'D:/pyth项目/book/img'  # download directory for full-size images
# Generate two thumbnail sizes (stored under thumbs/<label>/).
IMAGES_THUMBS = {
    'small': (10, 10),
    'big': (50, 50),
}
# Skip images smaller than 5x5 px (filters out tiny placeholders).
IMAGES_MIN_WIDTH = 5
IMAGES_MIN_HEIGHT = 5

# Configure item pipelines

# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# Lower number = runs earlier: write the text record first, then download images.
ITEM_PIPELINES = {
    "book.pipelines.BookPipeline": 300,
    "book.pipelines.SaveImagePipeline": 400,
}

# Launch the spider programmatically (equivalent to running `scrapy crawl books`).
# Fixed: the original line was missing the closing parenthesis, and `cmdline`
# must be imported via `from scrapy import cmdline` at the top of the file.
cmdline.execute("scrapy crawl books".split())

bookSpider 类原始代码片段（问题中截断的摘录）: `from scrapy import Request`、`from scrapy.spiders import Spider`、`from ..items import BookItem`、`import requests`、`import scrapy`；`class bookSpider(Spider):`、`name = 'books'`、`def start_requests(self):`、`url = …`

原文地址: https://www.cveoy.top/t/topic/fbaa 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录