```python
import scrapy
import re
import json
import xlwt

class JDPhoneSpider(scrapy.Spider):
    name = 'jdphone'
    allowed_domains = ['jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=华为手机&enc=utf-8&wq=华为手机']
```
这段代码中并没有生成json文件的代码,也没有指定json文件的保存路径。如果需要生成json文件,可以在爬虫关闭时通过读取爬虫统计信息中的数据来生成json文件,并将生成的json文件保存到指定的路径。具体实现可以参考以下代码示例:
import scrapy
import re
import json
import xlwt
class JDPhoneSpider(scrapy.Spider):
    """Crawl JD.com search results for Huawei, Xiaomi and Apple phones.

    For each search-result page it follows every product link, extracts the
    phone's name, price, comment count, self-operated flag and spec
    parameters, and yields them as a plain dict item.  De-duplicated items
    are accumulated in the crawler stats (one JSON blob per brand); when the
    spider closes, the data is exported both to ``phones.xls`` and
    ``phones.json``.

    NOTE(review): ``process_item`` is an item-pipeline hook — it is only
    invoked if this class is also registered under ITEM_PIPELINES; Scrapy
    does not call it on a spider automatically.
    """

    name = 'jdphone'
    allowed_domains = ['jd.com']
    start_urls = [
        'https://search.jd.com/Search?keyword=华为手机&enc=utf-8&wq=华为手机',
        'https://search.jd.com/Search?keyword=小米手机&enc=utf-8&wq=小米手机',
        'https://search.jd.com/Search?keyword=苹果手机&enc=utf-8&wq=苹果手机',
    ]
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }

    # Brands tracked by this spider (order matters for the Excel export).
    BRANDS = ('华为', '小米', '苹果')
    # Spec-parameter key -> Excel column index.
    PARAM_COLUMNS = {'颜色': 5, '屏幕尺寸': 6, 'CPU型号': 7, 'RAM': 8, 'ROM': 9}

    def _stats_key(self, brand):
        """Per-brand stats key.

        The original code stored every brand under the single key
        ``'jdphone'``, so each ``set_value`` silently overwrote the previous
        brand's data; a per-brand key fixes that.
        """
        return 'jdphone/' + brand

    def parse(self, response):
        """Parse a search-result page: follow product links and pagination."""
        # All product links inside the result grid.
        phone_links = response.css('div#J_goodsList li.gl-item div.p-name a::attr(href)').extract()
        for link in phone_links:
            # Product hrefs may be protocol-relative; urljoin normalizes them.
            yield scrapy.Request(response.urljoin(link), callback=self.parse_phone)
        # Follow the "next page" link, if present.
        next_page = response.css('a.fp-next::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request('https://search.jd.com' + next_page, callback=self.parse)

    def parse_phone(self, response):
        """Extract one product page into a flat item dict.

        Yields a dict with keys: name, price, comment_count,
        is_self_operated, params.
        """
        # `or ''` guards against extract_first() returning None, which made
        # the original `.strip()` raise AttributeError on unexpected markup.
        name = (response.css('div.sku-name::text').extract_first() or '').strip()
        price = response.css('div.summary-price-wrap span.p-price strong::text').extract_first()
        comment_count = response.css('div.comment-count a::text').extract_first()
        is_self_operated = '是' if response.css('div.self-operated').extract_first() else '否'

        # Spec parameters ("颜色", "屏幕尺寸", ...); skip malformed rows
        # instead of crashing on a missing span.
        params = {}
        for row in response.css('ul.parameter2 li'):
            key = row.css('span.p-name::text').extract_first()
            value = row.css('span.p-value::text').extract_first()
            if key is not None and value is not None:
                params[key.strip()] = value.strip()

        yield {
            'name': name,
            'price': price,
            'comment_count': comment_count,
            'is_self_operated': is_self_operated,
            'params': params,
        }

    def closed(self, reason):
        """On spider close, export accumulated data to Excel and JSON.

        :param reason: close reason supplied by Scrapy (unused).
        """
        # --- Excel export -------------------------------------------------
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('phones')
        headers = ('品牌', '型号', '价格', '评价数量', '是否自营',
                   '颜色', '屏幕尺寸', 'CPU型号', 'RAM', 'ROM')
        for col, title in enumerate(headers):
            sheet.write(0, col, title)

        row = 0
        for brand in self.BRANDS:
            phones = self.get_phones_by_brand(brand)
            # One Excel row per distinct model name.
            for model in set(phone['name'] for phone in phones):
                phone = next(p for p in phones if p['name'] == model)
                row += 1
                sheet.write(row, 0, brand)
                sheet.write(row, 1, phone['name'])
                sheet.write(row, 2, phone['price'])
                sheet.write(row, 3, phone['comment_count'])
                sheet.write(row, 4, phone['is_self_operated'])
                for key, value in phone['params'].items():
                    col = self.PARAM_COLUMNS.get(key)
                    if col is not None:
                        sheet.write(row, col, value)
        book.save('phones.xls')

        # --- JSON export --------------------------------------------------
        data = {brand: {'phones': self.get_phones_by_brand(brand)}
                for brand in self.BRANDS}
        with open('phones.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)

    def get_phones_by_brand(self, brand):
        """Return every phone dict accumulated in crawler stats for *brand*."""
        phones = []
        # get_stats() returns a dict; the original iterated it directly and
        # indexed item[0]/item[1], which are the first two CHARACTERS of the
        # key string, not (key, value) — iterate .items() instead.
        for key, value in self.crawler.stats.get_stats().items():
            if key.startswith('jdphone'):
                data = json.loads(value)
                if data['name'] == brand:
                    phones.extend(data['phones'])
        return phones

    def process_item(self, item, spider):
        """De-duplicate an item by model name and store it in crawler stats.

        :param item: the item dict yielded by :meth:`parse_phone`.
        :param spider: the spider that produced the item.
        :returns: the (possibly already-seen) item, pipeline-style.
        """
        # Derive the brand from the item itself.  The original matched
        # spider.start_urls[0], attributing EVERY item to the first brand.
        brand = next((b for b in self.BRANDS if b in item['name']), None)
        if brand is None:
            # Unknown brand: pass the item through without accumulating it.
            return item

        key = self._stats_key(brand)
        stored = self.crawler.stats.get_value(key)
        if stored:
            data = json.loads(stored)
            phones = data['phones']
            # The original shadowed the `item` parameter with its loop
            # variable, clobbering the item being processed.
            if not any(p['name'] == item['name'] for p in phones):
                phones.append(item)
                self.crawler.stats.set_value(
                    key, json.dumps({'name': brand, 'phones': phones}))
        else:
            self.crawler.stats.set_value(
                key, json.dumps({'name': brand, 'phones': [item]}))
        return item
```
原文地址: https://www.cveoy.top/t/topic/fGnk 著作权归作者所有。请勿转载和采集!