Scrapy爬取起点中文网书籍信息并下载封面图片
实验目的
本实验的目的是通过使用Scrapy框架,爬取起点中文网的书籍信息,并将其存储到文本文件中,同时下载书籍的封面图片并保存到本地。
代码实现
bookSpider类代码
from scrapy import Request
from scrapy.spiders import Spider
from ..items import BookItem
import requests
import scrapy
class bookSpider(Spider):
    """Crawl qidian.com's finished-books listing: book name, intro text and cover URL.

    The cover is NOT downloaded here; `parse` only records an absolute
    `img_url` on the item so that `SaveImagePipeline` (an ImagesPipeline
    subclass) can download it asynchronously.  The original version fetched
    the image synchronously with `requests.get`, which blocks the Twisted
    reactor, and stored the bytes in an undeclared 'img_data' field — a
    guaranteed KeyError on a scrapy.Item.
    """
    name = 'books'

    def start_requests(self):
        # Single entry point: the "finished books" listing page.
        yield Request('https://www.qidian.com/finish/')

    def parse(self, response):
        """Yield one BookItem per book card, then follow pagination."""
        # Each <li> under the book list holds one book card.
        for selector in response.css('ul.all-img-list.cf > li'):
            item = BookItem()
            item['name'] = selector.css('div h2 a::text').get()
            # NOTE(review): this selector extracts the intro/blurb text even
            # though the item field is called 'price'.
            intro = selector.css('div p.intro::text').get()
            # Guard against a missing intro; the original `.get().strip()`
            # raised AttributeError on None.
            item['price'] = intro.strip() if intro else ''
            img_url = selector.css('div .book-img-box img::attr(src)').get()
            if img_url:
                # Resolve a relative/protocol-relative src to an absolute URL;
                # SaveImagePipeline reads 'img_url' and downloads the image.
                item['img_url'] = response.urljoin(img_url)
            yield item
        next_url = response.css('.lbf-pagination-next::attr(href)').get()  # next-page URL
        if next_url:
            yield response.follow(next_url, self.parse)  # follow pagination and re-enter parse

    def parse_image(self, response):
        # Legacy callback: never registered via Request(callback=...), so this
        # is dead code kept only for interface compatibility.
        item = response.meta['item']
        if 'img_url' in item:
            # NOTE(review): 'image_name' must be declared on BookItem or this
            # assignment raises KeyError — confirm before wiring this callback up.
            item['image_name'] = response.url.split('/')[-1]
        return item
items类代码
import scrapy
class BookItem(scrapy.Item):
    """Container for one book scraped from qidian.com."""
    # Book title.
    name = scrapy.Field()
    # NOTE(review): despite the name, the spider stores the intro/blurb text here.
    price = scrapy.Field()
    # Absolute URL of the cover image (consumed by SaveImagePipeline).
    img_url = scrapy.Field()
    # Declared so the spider's optional assignments no longer raise KeyError on
    # an undeclared field: raw downloaded image bytes, and the basename used
    # when saving the file.
    img_data = scrapy.Field()
    image_name = scrapy.Field()
pipelines类代码
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline # 下载图片的管道
from scrapy import Request
from scrapy.exceptions import DropItem # 异常
import logging
import os
class BookPipeline:
    """Append each book's name and intro to 书籍信息.txt in the working directory."""

    def process_item(self, item, spider):
        """Write one CSV-ish line per item; emit a header row on a fresh file."""
        with open('书籍信息.txt', 'a', encoding='utf-8') as out:
            if out.tell() == 0:
                # File is empty (just created): write the header first.
                out.write('书籍名,信息' + '\n')
            out.write(item['name'] + ',' + item['price'] + '\n')
        return item
logger = logging.getLogger('SavelmagePipeline') # 图片管道,继承于lmagesPipeline
class SaveImagePipeline(ImagesPipeline):
    """Download the cover referenced by item['img_url'] via Scrapy's ImagesPipeline.

    get_media_requests schedules the download, item_completed validates the
    result (dropping items whose image failed), and file_path chooses the
    on-disk filename under IMAGES_STORE.
    """

    def get_media_requests(self, item, info):
        # Only schedule a download when the spider actually found a cover URL.
        img_url = item.get('img_url')
        if img_url:
            yield Request(img_url)

    def item_completed(self, results, item, info):
        # `results` is a list of (success, info_or_failure) tuples, one per request.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # The original single-quoted f-string nested single quotes
            # (f'...{item['img_url']}') — a SyntaxError before Python 3.12.
            # .get() also avoids a KeyError when 'img_url' was never set.
            logger.warning(f"图片未下载: {item.get('img_url')}")
            raise DropItem('图片未下载')
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save under the image URL's own basename instead of the default
        # hashed path.  `item` keyword added for Scrapy >= 2.4 compatibility;
        # older callers passing only positional args still work.
        return os.path.basename(request.url)
setting类代码
# Use the modern (2.7+) request fingerprinting implementation.
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
# Run Twisted on top of the asyncio event loop.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
# Export feeds as UTF-8 so Chinese text is written verbatim, not escaped.
FEED_EXPORT_ENCODING = 'utf-8'
BOT_NAME = 'book'
SPIDER_MODULES = ['book.spiders']
NEWSPIDER_MODULE = 'book.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Directory where the images pipeline stores downloaded covers.
IMAGES_STORE = 'D:/pyth项目/book/img'
# Also generate two thumbnail variants of every downloaded image.
IMAGES_THUMBS = {
'small': (10, 10),
'big': (50, 50)
}
# Discard images smaller than 5x5 pixels (filters out tracking pixels etc.).
IMAGES_MIN_WIDTH = 5
IMAGES_MIN_HEIGHT = 5
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower number runs first: text export (300) before image download (400).
ITEM_PIPELINES = {
'book.pipelines.BookPipeline': 300,
'book.pipelines.SaveImagePipeline': 400
}
start类代码
from scrapy import cmdline

# Launch the "books" spider exactly as if `scrapy crawl books` had been
# typed on the command line.
cmdline.execute(['scrapy', 'crawl', 'books'])
实验结果
运行代码后,程序会爬取起点中文网已完结书籍信息,并将书籍信息存储到'书籍信息.txt'文件中,同时下载封面图片并保存到'D:/pyth项目/book/img'目录下。
总结
本实验成功使用Scrapy框架爬取了起点中文网的书籍信息,并下载了封面图片,验证了Scrapy框架在数据抓取方面的强大功能。
原文地址: https://www.cveoy.top/t/topic/buRV 著作权归作者所有。请勿转载和采集!