Python爬虫实战:使用Scrapy抓取京东热卖手机信息
以下是示例代码:
import json
from urllib.parse import urlencode

import scrapy
class JDHotSaleSpider(scrapy.Spider):
    """Crawl JD search results for a set of phone brands and save them as JSON.

    Flow:
        1. ``parse`` issues one search request per brand.
        2. ``parse_phone_list`` reads the summary fields of every search
           result and follows it to the product detail page.
        3. ``parse_phone_detail`` adds the specification table to the record.
        4. ``closed`` de-duplicates the collected records by (brand, name)
           and writes them to ``output_path``.
    """

    name = 'jd_hotsale'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com/']

    # Brands to search for; override on a subclass to crawl other brands.
    brands = ('Apple', 'Samsung', 'Huawei')
    # File the de-duplicated records are written to when the spider closes.
    output_path = 'phones.json'
    # NOTE(review): assumes product pages live at item.jd.com/<sku>.html —
    # confirm against the live site.
    detail_url_template = 'https://item.jd.com/{sku}.html'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Every record yielded by parse_phone_detail is also kept here so
        # that closed() has the actual items to de-duplicate and save.
        self._phones = []

    def parse(self, response):
        """Yield one search request per configured brand."""
        # 1. Fetch phone listings for the specified brands.
        for brand in self.brands:
            # urlencode keeps the URL valid for brands containing spaces
            # or non-ASCII characters.
            query = urlencode({'keyword': brand, 'enc': 'utf-8', 'wq': brand})
            yield scrapy.Request(
                f'https://search.jd.com/Search?{query}',
                meta={'brand': brand},
                callback=self.parse_phone_list,
            )

    def parse_phone_list(self, response):
        """Extract summary fields per search result and request its detail page.

        The partially filled record travels to ``parse_phone_detail`` in
        ``meta['phone_info']``.
        """
        brand = response.meta['brand']
        for phone in response.xpath('//li[@class="gl-item"]'):
            # The title can be split over several text nodes when part of it
            # is wrapped in an inline tag, so join all of them.
            name_parts = phone.xpath('.//div[@class="p-name"]/a/em//text()').getall()
            name = ''.join(name_parts).strip() or None

            price = phone.xpath('.//div[@class="p-price"]/strong/i/text()').get()
            if not price:
                # Fall back to the attribute when the price text is absent.
                price = phone.xpath('.//div[@class="p-price"]/strong/@data-price').get()

            comment_count = phone.xpath('.//div[@class="p-commit"]/strong/a/text()').get()
            # True when the result carries a "self-support" icon with text.
            is_self_support = bool(
                phone.xpath('.//div[@class="p-icons"]/i[contains(@class, "self-support")]/text()')
            )

            phone_info = {
                'brand': brand,
                'name': name,
                'price': price,
                'comment_count': comment_count,
                'is_self_support': is_self_support,
            }

            detail_url = self._detail_url(phone, response)
            if detail_url is None:
                self.logger.warning('No detail URL for %r (%s); skipping', name, brand)
                continue
            yield scrapy.Request(
                detail_url,
                meta={'phone_info': phone_info},
                callback=self.parse_phone_detail,
            )

    def _detail_url(self, phone, response):
        """Return an absolute detail-page URL for a search result, or None."""
        # A bare SKU is not a URL: scrapy.Request rejects URLs without a
        # scheme, so build the full address from the SKU.
        sku = phone.xpath('./div[@class="gl-i-wrap j-sku-item"]/@data-sku').get()
        if sku and sku.strip():
            return self.detail_url_template.format(sku=sku.strip())
        # Otherwise use the link around the product title; urljoin resolves
        # relative and protocol-relative hrefs against the page URL.
        href = phone.xpath('.//div[@class="p-name"]/a/@href').get()
        if href:
            return response.urljoin(href)
        return None

    def parse_phone_detail(self, response):
        """Attach the specification table to the record and yield it."""
        phone_info = response.meta['phone_info']
        attrs_dict = {}
        for attr in response.xpath('//div[@class="Ptable"]/div[@class="Ptable-item"]'):
            key = attr.xpath('./div[@class="Ptable-item-tit"]/text()').get()
            value = attr.xpath('./div[@class="Ptable-item-con"]/text()').get()
            if key is None:
                # A row without a title cannot be stored under a useful key.
                continue
            attrs_dict[key.strip()] = value.strip() if value is not None else None
        phone_info['attrs'] = attrs_dict
        self._phones.append(phone_info)
        yield phone_info

    @staticmethod
    def _dedupe(items):
        """Return items with the first occurrence of each (brand, name) kept."""
        seen = set()
        unique = []
        for item in items:
            key = (item['brand'], item['name'])
            if key in seen:
                continue
            seen.add(key)
            unique.append(item)
        return unique

    def closed(self, reason):
        """De-duplicate the collected records and write them to JSON.

        Args:
            reason: Close reason passed by Scrapy; not used here.
        """
        # 2. De-duplicate: drop phones with the same brand and the same model.
        #    The stats collector only holds an item *count*, so the records
        #    come from the list filled in parse_phone_detail.
        phones = self._dedupe(self._phones)
        # 3. Save the phones' detailed specifications as a JSON file.
        with open(self.output_path, 'w', encoding='utf-8') as f:
            json.dump(phones, f, ensure_ascii=False)
原文地址: https://www.cveoy.top/t/topic/ol5X 著作权归作者所有。请勿转载和采集!