# 京东手机信息爬取脚本 - 华为、小米、苹果手机数据采集 - 常规

"""Scrapy spider that collects Huawei, Xiaomi and Apple phone data from JD search.

Run it with Scrapy (for example ``scrapy runspider <this file>``). When the
spider closes, the de-duplicated results are written to ``phones.xls`` and
``phones.json`` in the current working directory.
"""
import collections
import json
import re
from urllib.parse import parse_qs, urlparse

import scrapy
import xlwt

# Brands that are exported, in output order.
BRANDS = ('华为', '小米', '苹果')

# Spreadsheet header labels; the tuple index is the column index.
SHEET_HEADERS = (
    '品牌', '型号', '价格', '评价数量', '是否自营',
    '颜色', '屏幕尺寸', 'CPU型号', 'RAM', 'ROM',
)

# Product-parameter name -> spreadsheet column that receives its value.
PARAM_COLUMNS = {
    '颜色': 5,
    '屏幕尺寸': 6,
    'CPU型号': 7,
    'RAM': 8,
    'ROM': 9,
}

# Extracts the brand from a search keyword such as "华为手机".
KEYWORD_BRAND_RE = re.compile(r'(\w+?)手机')

# Splits a "name：value" parameter entry on the first full-width or ASCII colon.
PARAM_SPLIT_RE = re.compile(r'\s*[：:]\s*')


class JDPhoneSpider(scrapy.Spider):
    """Crawl JD search results for each brand and export the phones found.

    The brand is derived from the ``keyword`` query parameter of the search
    URL and travels with every follow-up request in ``Request.meta``, so each
    scraped phone is filed under the brand whose search produced it.
    Collected items are kept in memory on the spider and de-duplicated by
    ``(brand, name)``.
    """

    name = 'jdphone'
    allowed_domains = ['jd.com']
    start_urls = [
        'https://search.jd.com/Search?keyword=华为手机&enc=utf-8&wq=华为手机',
        'https://search.jd.com/Search?keyword=小米手机&enc=utf-8&wq=小米手机',
        'https://search.jd.com/Search?keyword=苹果手机&enc=utf-8&wq=苹果手机'
    ]
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # brand -> list of unique phone items, in the order they were scraped.
        self._phones_by_brand = collections.defaultdict(list)
        # (brand, name) pairs already stored; used for de-duplication.
        self._seen_models = set()

    def parse(self, response):
        """Yield one request per product link plus one for the next page.

        Args:
            response: A JD search-result page.
        """
        # If the search URL was redirected, the first entry of
        # ``redirect_urls`` is the URL that still carries the keyword.
        origin_url = response.meta.get('redirect_urls', [response.url])[0]
        brand = response.meta.get('brand') or self._brand_from_url(origin_url)

        # Links to every phone on the search-result page.
        phone_links = response.css(
            'div#J_goodsList li.gl-item div.p-name a::attr(href)'
        ).extract()
        for link in phone_links:
            # urljoin resolves relative and scheme-less hrefs, which
            # scrapy.Request would otherwise reject.
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_phone,
                meta={'brand': brand},
            )

        # Link to the next result page, if any.
        next_page = response.css('a.fp-next::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse,
                meta={'brand': brand},
            )

    def parse_phone(self, response):
        """Extract one phone item from a product page and record it.

        Pages without a product name are skipped with a warning instead of
        raising ``AttributeError`` on a missing node.

        Args:
            response: A JD product detail page.

        Yields:
            dict with keys ``brand``, ``name``, ``price``, ``comment_count``,
            ``is_self_operated`` and ``params``.
        """
        # Join all text nodes so a leading whitespace-only node does not
        # hide the real name.
        name = ''.join(response.css('div.sku-name::text').extract()).strip()
        if not name:
            self.logger.warning('No product name found on %s; skipping', response.url)
            return

        # NOTE(review): price and comment count may be filled in by JavaScript
        # on the live site, in which case these stay None - confirm.
        price = response.css(
            'div.summary-price-wrap span.p-price strong::text'
        ).extract_first()
        comment_count = response.css('div.comment-count a::text').extract_first()
        is_self_operated = '是' if response.css('div.self-operated').extract_first() else '否'

        item = {
            'brand': response.meta.get('brand'),
            'name': name,
            'price': price,
            'comment_count': comment_count,
            'is_self_operated': is_self_operated,
            'params': self._extract_params(response),
        }
        # Scrapy only calls process_item on item-pipeline components, never
        # on the spider, so the de-duplication step is invoked explicitly.
        yield self.process_item(item, self)

    def closed(self, reason):
        """Write the collected phones to ``phones.xls`` and ``phones.json``.

        Args:
            reason: Close reason supplied by Scrapy (unused).
        """
        self._write_excel('phones.xls')
        self._write_json('phones.json')

    def get_phones_by_brand(self, brand):
        """Return a copy of the unique phone items recorded for *brand*.

        Args:
            brand: Brand name, e.g. one of ``BRANDS``.

        Returns:
            list of item dicts in scrape order; empty when nothing was
            recorded for the brand.
        """
        return list(self._phones_by_brand.get(brand, ()))

    def process_item(self, item, spider):
        """Record *item* unless a phone with the same brand and name exists.

        Args:
            item: Phone dict with at least a ``name`` key; ``brand`` is
                optional and defaults to ``None``.
            spider: Unused; kept so the signature stays the usual
                ``process_item(item, spider)`` shape.

        Returns:
            The item that was passed in, whether or not it was a duplicate.
        """
        brand = item.get('brand')
        key = (brand, item['name'])
        # The crawler is only attached when the spider runs inside Scrapy.
        stats = getattr(getattr(self, 'crawler', None), 'stats', None)

        if key in self._seen_models:
            if stats is not None:
                stats.inc_value('jdphone/duplicate_phones')
            return item

        self._seen_models.add(key)
        self._phones_by_brand[brand].append(item)
        if stats is not None:
            stats.inc_value('jdphone/unique_phones')
        return item

    def _brand_from_url(self, url):
        """Return the brand encoded in a search URL's ``keyword`` parameter.

        ``"华为手机"`` yields ``"华为"``. A keyword that does not end in
        ``手机`` is returned unchanged, and ``None`` is returned when the URL
        has no ``keyword`` parameter.
        """
        keywords = parse_qs(urlparse(url).query).get('keyword')
        if not keywords:
            return None
        match = KEYWORD_BRAND_RE.match(keywords[0])
        return match.group(1) if match else keywords[0]

    def _extract_params(self, response):
        """Return the product parameters of a detail page as a dict."""
        params = {}
        for entry in response.css('ul.parameter2 li'):
            key = entry.css('span.p-name::text').extract_first()
            value = entry.css('span.p-value::text').extract_first()
            if key is None or value is None:
                # NOTE(review): fallback for entries rendered as plain
                # "name：value" text - confirm against the live page markup.
                text = entry.xpath('string(.)').extract_first() or ''
                parts = PARAM_SPLIT_RE.split(text.strip(), maxsplit=1)
                if len(parts) != 2:
                    continue
                key, value = parts
            params[key.strip()] = value.strip()
        return params

    def _write_excel(self, path):
        """Write one spreadsheet row per unique phone, grouped by brand."""
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('phones')
        for col, label in enumerate(SHEET_HEADERS):
            sheet.write(0, col, label)

        row = 0
        for brand in BRANDS:
            for phone in self.get_phones_by_brand(brand):
                row += 1
                sheet.write(row, 0, brand)
                sheet.write(row, 1, phone['name'])
                sheet.write(row, 2, phone['price'])
                sheet.write(row, 3, phone['comment_count'])
                sheet.write(row, 4, phone['is_self_operated'])
                for key, col in PARAM_COLUMNS.items():
                    if key in phone['params']:
                        sheet.write(row, col, phone['params'][key])
        book.save(path)

    def _write_json(self, path):
        """Write ``{brand: {"phones": [...]}}`` for every exported brand."""
        data = {
            brand: {'phones': self.get_phones_by_brand(brand)}
            for brand in BRANDS
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False)


# Run results: the crawl has to access the JD website, so it takes some time
# and the run is fairly long; the exact duration depends on network conditions
# and machine performance.
#
# When the crawl finishes, the data is saved automatically as an Excel file and
# a JSON file, named phones.xls and phones.json, in the current directory. The
# console also prints crawl statistics, such as the number of phones crawled
# and the number remaining after de-duplication.