这段代码中并没有生成json文件的代码,也没有指定json文件的保存路径。如果需要生成json文件,可以在爬虫关闭时通过读取爬虫统计信息中的数据来生成json文件,并将生成的json文件保存到指定的路径。具体实现可以参考以下代码示例:

import scrapy
import re
import json
import xlwt

class JDPhoneSpider(scrapy.Spider):
    """Crawl JD.com search results for Huawei / Xiaomi / Apple phones.

    Scraped items are accumulated (deduplicated per brand/model) in the
    crawler stats, then dumped to ``phones.xls`` and ``phones.json`` when
    the spider closes.
    """

    name = 'jdphone'
    allowed_domains = ['jd.com']
    start_urls = [
        'https://search.jd.com/Search?keyword=华为手机&enc=utf-8&wq=华为手机',
        'https://search.jd.com/Search?keyword=小米手机&enc=utf-8&wq=小米手机',
        'https://search.jd.com/Search?keyword=苹果手机&enc=utf-8&wq=苹果手机'
    ]
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Brands tracked by this spider; drives stats keys and export grouping.
    BRANDS = ('华为', '小米', '苹果')
    # Spec keys exported to excel, in column order starting at column 5.
    _PARAM_COLUMNS = ('颜色', '屏幕尺寸', 'CPU型号', 'RAM', 'ROM')

    def parse(self, response):
        """Follow every product link on a search-results page, then paginate."""
        phone_links = response.css('div#J_goodsList li.gl-item div.p-name a::attr(href)').extract()
        for link in phone_links:
            # JD product hrefs are often protocol-relative ('//item.jd.com/...');
            # urljoin() resolves them against the current page.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_phone)

        next_page = response.css('a.fp-next::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_phone(self, response):
        """Extract name, price, comment count, self-operated flag and spec table.

        Yields one dict item per product page.
        """
        # extract_first() returns None when a selector misses — guard before strip()
        name = (response.css('div.sku-name::text').extract_first() or '').strip()
        price = response.css('div.summary-price-wrap span.p-price strong::text').extract_first()
        comment_count = response.css('div.comment-count a::text').extract_first()
        is_self_operated = '是' if response.css('div.self-operated').extract_first() else '否'

        # Spec table: <li><span class="p-name">key</span><span class="p-value">value</span>
        params = {}
        for row in response.css('ul.parameter2 li'):
            key = (row.css('span.p-name::text').extract_first() or '').strip()
            value = (row.css('span.p-value::text').extract_first() or '').strip()
            if key:
                params[key] = value

        yield {
            'name': name,
            'price': price,
            'comment_count': comment_count,
            'is_self_operated': is_self_operated,
            'params': params,
        }

    def closed(self, reason):
        """On spider close, dump the collected data to phones.xls and phones.json.

        :param reason: close reason supplied by Scrapy (unused).
        """
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('phones')
        headers = ('品牌', '型号', '价格', '评价数量', '是否自营') + self._PARAM_COLUMNS
        for col, title in enumerate(headers):
            sheet.write(0, col, title)

        row = 0
        for brand in self.BRANDS:
            phones = self.get_phones_by_brand(brand)
            seen_models = set()
            for phone in phones:
                # One row per model, first occurrence wins (deterministic,
                # unlike iterating a set of model names).
                if phone['name'] in seen_models:
                    continue
                seen_models.add(phone['name'])
                row += 1
                sheet.write(row, 0, brand)
                sheet.write(row, 1, phone['name'])
                sheet.write(row, 2, phone['price'])
                sheet.write(row, 3, phone['comment_count'])
                sheet.write(row, 4, phone['is_self_operated'])
                for col, key in enumerate(self._PARAM_COLUMNS, start=5):
                    if key in phone['params']:
                        sheet.write(row, col, phone['params'][key])
        book.save('phones.xls')

        data = {brand: {'phones': self.get_phones_by_brand(brand)}
                for brand in self.BRANDS}
        with open('phones.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    @staticmethod
    def _stats_key(brand):
        # One stats entry per brand. The original stored everything under the
        # single key 'jdphone', so each set_value() overwrote other brands.
        return 'jdphone/%s' % brand

    def get_phones_by_brand(self, brand):
        """Return the deduplicated phone list stored in crawler stats for *brand*.

        Bug fix: ``stats.get_stats()`` returns a dict, so the original
        ``for item in ...: item[0]`` iterated keys and indexed characters of
        the key string. We read the per-brand entry directly instead.
        """
        raw = self.crawler.stats.get_value(self._stats_key(brand))
        return json.loads(raw)['phones'] if raw else []

    def process_item(self, item, spider):
        """Deduplicate *item* per brand/model inside the crawler stats.

        NOTE(review): Scrapy only invokes ``process_item`` on pipeline classes
        registered in ``ITEM_PIPELINES``; on a Spider subclass it is never
        called automatically — move this to a real pipeline for production.
        """
        # Derive the brand from the item itself; the original regex against
        # spider.start_urls[0] always yielded the first brand in the list.
        brand = next((b for b in self.BRANDS if b in item['name']), None)
        if brand is None:
            return item
        phones = self.get_phones_by_brand(brand)
        if all(phone['name'] != item['name'] for phone in phones):
            phones.append(item)
            self.crawler.stats.set_value(
                self._stats_key(brand),
                json.dumps({'name': brand, 'phones': phones}, ensure_ascii=False))
        return item
```

原文地址: https://www.cveoy.top/t/topic/fGnk 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录