以下是示例代码:

import json

import scrapy

class JDHotSaleSpider(scrapy.Spider):
    """Collect phone listings for a few brands from JD.com search pages.

    Flow: ``parse`` schedules one search request per brand,
    ``parse_phone_list`` extracts the summary fields of every listed
    product and requests its detail page, ``parse_phone_detail`` adds the
    spec table and emits the item, and ``closed`` writes the de-duplicated
    items to ``phones.json``.
    """

    name = 'jd_hotsale'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Emitted items are buffered here so closed() can post-process
        # them; the stats collector only keeps a count of scraped items,
        # not the items themselves.
        self._phones = []

    def _record(self, phone_info):
        """Remember *phone_info* for the final JSON dump and return it."""
        self._phones.append(phone_info)
        return phone_info

    def parse(self, response):
        """Schedule one search-results request per brand."""
        # 1. Fetch phone listings for the selected brands.
        brands = ['Apple', 'Samsung', 'Huawei']
        for brand in brands:
            url = f'https://search.jd.com/Search?keyword={brand}&enc=utf-8&wq={brand}'
            yield scrapy.Request(url, meta={'brand': brand}, callback=self.parse_phone_list)

    def parse_phone_list(self, response):
        """Extract summary fields for each listed product and follow it.

        Yields a detail-page request per product carrying the summary dict
        in ``meta['phone_info']``; a product without a SKU is emitted
        directly with an empty ``attrs`` dict.
        """
        brand = response.meta['brand']
        phones = response.xpath('//li[@class="gl-item"]')
        for phone in phones:
            name = phone.xpath('.//div[@class="p-name"]/a/em/text()').get()
            price = phone.xpath('.//div[@class="p-price"]/strong/i/text()').get()
            if not price:
                # Fall back to the attribute when the price text node is absent.
                price = phone.xpath('.//div[@class="p-price"]/strong/@data-price').get()
            comment_count = phone.xpath('.//div[@class="p-commit"]/strong/a/text()').get()
            # A non-empty match on the "self-support" icon marks the item
            # as sold by JD itself.
            is_self_support = bool(
                phone.xpath('.//div[@class="p-icons"]/i[contains(@class, "self-support")]/text()')
            )
            phone_info = {
                'brand': brand,
                'name': name,
                'price': price,
                'comment_count': comment_count,
                'is_self_support': is_self_support,
            }

            sku = phone.xpath('./div[@class="gl-i-wrap j-sku-item"]/@data-sku').get()
            if not sku:
                # NOTE(review): the SKU may live on the <li> itself rather
                # than on the wrapper div - confirm against the live markup.
                sku = phone.xpath('./@data-sku').get()
            if not sku:
                # Without a SKU there is no detail page to request; keep the
                # summary instead of crashing on an invalid request URL.
                self.logger.warning('No SKU found for %r (%s); skipping detail page', name, brand)
                phone_info['attrs'] = {}
                yield self._record(phone_info)
                continue

            # scrapy.Request needs an absolute URL, not a bare SKU id.
            # NOTE(review): assumes product pages live at
            # https://item.jd.com/<sku>.html - confirm.
            detail_url = f'https://item.jd.com/{sku}.html'
            yield scrapy.Request(
                detail_url,
                meta={'phone_info': phone_info},
                callback=self.parse_phone_detail,
            )

    def parse_phone_detail(self, response):
        """Attach the spec table to the summary dict and emit the item."""
        phone_info = response.meta['phone_info']
        attrs = response.xpath('//div[@class="Ptable"]/div[@class="Ptable-item"]')
        attrs_dict = {}
        for attr in attrs:
            key = attr.xpath('./div[@class="Ptable-item-tit"]/text()').get()
            value = attr.xpath('./div[@class="Ptable-item-con"]/text()').get()
            attrs_dict[key] = value
        phone_info['attrs'] = attrs_dict
        yield self._record(phone_info)

    def closed(self, reason):
        """De-duplicate the collected items and write them to ``phones.json``.

        Scrapy calls this hook when the spider finishes; *reason* is the
        close reason and is not used here.
        """
        # 2. De-duplicate: keep the first item for each (brand, name) pair.
        seen = set()
        phones = []
        for item in self._phones:
            key = (item['brand'], item['name'])
            if key in seen:
                continue
            seen.add(key)
            phones.append(item)

        # 3. Save the detailed phone data as a JSON file.
        with open('phones.json', 'w', encoding='utf-8') as f:
            json.dump(phones, f, ensure_ascii=False)
Python爬虫实战:使用Scrapy抓取京东热卖手机信息

原文地址: https://www.cveoy.top/t/topic/ol5X 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录