由于京东网站的反爬虫机制较为严格,需要使用一些反反爬虫的技巧,如使用代理IP、设置User-Agent等。以下是一个简单的示例代码,仅供参考。

import scrapy
from scrapy import Request
import json


class JDPhoneSpider(scrapy.Spider):
    """Crawl JD.com phone listing pages and export the matching phones as JSON.

    Flow: ``parse`` handles the start page and schedules the other listing
    pages, ``parse_phone_list`` filters products by brand and schedules their
    detail pages, ``parse_phone_detail`` attaches the spec table and emits the
    item, and ``closed`` writes the de-duplicated items to ``phones.json``.
    """

    name = 'jdphone'
    allowed_domains = ['jd.com']
    start_urls = [
        'https://list.jd.com/list.html?cat=9987,653,655'
    ]
    brands = ['Apple', '华为', '小米']  # the three brands to keep

    def __init__(self, *args, **kwargs):
        """Initialise the spider and the per-run buffer of scraped phones."""
        super().__init__(*args, **kwargs)
        # Scrapy's stats collector only keeps counters, not the items
        # themselves, so the spider has to remember what it yielded in order
        # to export it in ``closed``.
        self._phones = []

    def parse(self, response):
        """Handle the start listing page and schedule every paginated page.

        Yields the detail-page requests for the start page itself, followed by
        one request per pagination link found on it.
        """
        # The start URL is a listing page too; without this its products
        # would never be extracted.
        yield from self.parse_phone_list(response)

        for link in response.css('.p-num a::attr(href)').extract():
            link = link.strip()
            # Placeholder anchors such as "javascript:;" are not fetchable.
            if not link or link.lower().startswith('javascript:'):
                continue
            # Hrefs may be relative or protocol-relative; Request needs an
            # absolute URL with a scheme.
            yield Request(response.urljoin(link), callback=self.parse_phone_list)

    def parse_phone_list(self, response):
        """Extract the phones of the wanted brands from one listing page.

        Yields one detail-page request per matching product. The listing-level
        fields travel to ``parse_phone_detail`` via ``meta['phone']``.
        """
        for phone in response.css('.gl-warp .gl-item'):
            # The brand is taken to be the first whitespace-separated token of
            # the link text; a missing or blank text node means no brand.
            title_text = phone.css('.p-name a::text').extract_first()
            tokens = title_text.split() if title_text else []
            if not tokens or tokens[0] not in self.brands:
                continue

            detail_link = phone.css('.p-name a::attr(href)').extract_first()
            if not detail_link:
                self.logger.debug('Skipping product without a detail link on %s', response.url)
                continue

            phone_dict = {
                'name': phone.css('.p-name a::attr(title)').extract_first(),
                'price': phone.css('.p-price strong::text').extract_first(),
                'comment_count': phone.css('.p-commit strong::text').extract_first(),
                # True only when one of the icon labels is exactly '自营'.
                'is_self_run': '自营' in phone.css('.p-icons i::text').extract(),
            }
            yield Request(
                response.urljoin(detail_link),
                callback=self.parse_phone_detail,
                meta={'phone': phone_dict},
            )

    def parse_phone_detail(self, response):
        """Attach the spec table of a detail page to its phone record.

        Yields the completed phone dict (listing fields plus ``params``) and
        remembers it for the export performed in ``closed``.
        """
        phone_dict = response.meta['phone']
        params = {}
        # NOTE(review): '.dt' / '.dd' select by CSS class; if the page uses
        # <dt>/<dd> elements instead, the selectors should be 'dt::text' and
        # 'dd::text' - confirm against the live markup.
        for param in response.css('.Ptable-item'):
            key = param.css('.dt::text').extract_first()
            if key is None:
                continue  # a row without a label cannot be stored
            value = param.css('.dd::text').extract_first()
            params[key.strip()] = (value or '').strip()
        phone_dict['params'] = params
        self._phones.append(phone_dict)
        yield phone_dict

    def closed(self, reason):
        """Write the de-duplicated phones to ``phones.json`` when the spider stops.

        Records that are identical in every field are written only once; the
        order of first appearance is preserved. ``reason`` is not used.
        """
        seen = set()
        distinct_phones = []
        for phone in self._phones:
            # Dicts are unhashable, so a canonical JSON string serves as the
            # identity key; this keeps de-duplication linear.
            key = json.dumps(phone, sort_keys=True, ensure_ascii=False)
            if key in seen:
                continue
            seen.add(key)
            distinct_phones.append(phone)

        with open('phones.json', 'w', encoding='utf-8') as f:
            json.dump(distinct_phones, f, ensure_ascii=False)

在运行前需要安装Scrapy及相关依赖(scrapy-useragents、scrapy-proxy-pool),并在settings.py中添加以下配置:

# Scrapy project settings for the JD phone spider.

# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False
# Delay between consecutive requests, in seconds.
DOWNLOAD_DELAY = 0.5
# Fallback User-Agent header (a desktop Chrome string).
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
# Downloader middleware chain: a value of None disables a middleware, an
# integer sets its position in the chain.
# NOTE(review): the priorities used for the retry and proxy-pool middlewares
# differ from the commonly documented ones - confirm the intended ordering.
DOWNLOADER_MIDDLEWARES = {
    # Built-in User-Agent middleware is switched off ...
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # ... and replaced by the one that reads the USER_AGENTS list below.
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 900,
    'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 100,
    'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 200,
}
# Pool of User-Agent strings for the scrapy_useragents middleware. Every entry
# identifies itself as a search-engine or social-network crawler.
USER_AGENTS = [
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
    'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
    'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
    'Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)',
    'Mozilla/5.0 (compatible; SogouSpider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07)',
    'Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)',
    'Mozilla/5.0 (compatible; Facebot/1.0; +http://www.facebook.com/facebot)',
]
# Proxy-pool options.
# NOTE(review): only PROXY_POOL_ENABLED is known to be read by
# scrapy-proxy-pool; verify that the three names below are recognised by the
# installed version, otherwise they are silently ignored.
PROXY_POOL_ENABLED = True
PROXY_POOL_RETRY_TIMES = 3
PROXY_POOL_RANDOM_USER_AGENT = True
PROXY_POOL_HTTP_CODES = [200, 301, 302, 404, 500, 502, 503, 504]
京东热卖手机信息抓取:运用Scrapy爬取指定品牌手机数据

原文地址: https://www.cveoy.top/t/topic/ol6s 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录