京东热卖手机信息抓取:运用Scrapy爬取指定品牌手机数据
由于京东网站的反爬虫机制较为严格,需要使用一些反反爬虫的技巧,如使用代理IP、设置User-Agent等。以下是一段简单的示例代码,仅供参考;代码中的CSS选择器依赖京东当前的页面结构,页面改版后需要相应调整。
import scrapy
from scrapy import Request
import json
class JDPhoneSpider(scrapy.Spider):
    """Scrape JD.com phone listings for a fixed set of brands.

    Flow: ``parse`` handles the start page and follows its pagination links,
    ``parse_phone_list`` extracts the summary fields of each listed phone and
    follows its product link, ``parse_phone_detail`` attaches the spec table,
    and ``closed`` writes the de-duplicated results to ``phones.json``.

    Attributes:
        brands: Brand names to keep; a listing is kept only when the first
            whitespace-separated token of its link text is in this list.
        collected_phones: Every item yielded so far, kept so that ``closed``
            can de-duplicate and export them.
    """

    name = 'jdphone'
    allowed_domains = ['jd.com']
    start_urls = [
        'https://list.jd.com/list.html?cat=9987,653,655'
    ]
    brands = ['Apple', '华为', '小米']  # the three brands to keep

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Scrapy does not hand yielded items back to the spider, so keep our
        # own copy for the export step in ``closed``.
        self.collected_phones = []

    def parse(self, response):
        """Parse the start page and schedule every paginated list page.

        Yields:
            Items/requests produced from the start page's own listing,
            followed by one request per pagination link.
        """
        # The start page is itself a list page; do not skip its products.
        yield from self.parse_phone_list(response)

        # ``response.follow`` resolves relative and protocol-relative hrefs
        # against the current page URL; a bare ``Request`` would reject them.
        for link in response.css('.p-num a::attr(href)').extract():
            yield response.follow(link, callback=self.parse_phone_list)

    def parse_phone_list(self, response):
        """Extract summary data for each listed phone of a wanted brand.

        Yields:
            A request for the product detail page carrying the summary dict
            in ``meta['phone']``, or the summary dict itself (with empty
            ``params``) when the listing has no product link.
        """
        for phone in response.css('.gl-warp .gl-item'):
            # The brand is taken to be the first token of the link text.
            # Missing or whitespace-only text means no brand can be derived.
            title_text = phone.css('.p-name a::text').extract_first()
            tokens = (title_text or '').split()
            if not tokens or tokens[0] not in self.brands:
                continue

            phone_dict = {
                'name': phone.css('.p-name a::attr(title)').extract_first(),
                'price': phone.css('.p-price strong::text').extract_first(),
                'comment_count': phone.css('.p-commit strong::text').extract_first(),
                'is_self_run': '自营' in phone.css('.p-icons i::text').extract(),
            }

            detail_link = phone.css('.p-name a::attr(href)').extract_first()
            if not detail_link:
                # No detail page to visit: keep the summary rather than lose it.
                phone_dict['params'] = {}
                self.collected_phones.append(phone_dict)
                yield phone_dict
                continue

            yield response.follow(
                detail_link,
                callback=self.parse_phone_detail,
                meta={'phone': phone_dict},
            )

    def parse_phone_detail(self, response):
        """Attach the spec table of a product page to its summary dict.

        Yields:
            The summary dict from ``meta['phone']`` with an added ``params``
            mapping of spec name to spec value.
        """
        phone_dict = response.meta['phone']
        params = {}
        # NOTE(review): '.dt' / '.dd' are class selectors; if the page marks
        # the spec table up with <dt>/<dd> elements these match nothing and
        # should be 'dt::text' / 'dd::text' — confirm against the live page.
        for param in response.css('.Ptable-item'):
            label = param.css('.dt::text').extract_first()
            value = param.css('.dd::text').extract_first()
            if label is None or value is None:
                continue  # incomplete row; nothing usable to record
            params[label.strip()] = value.strip()

        phone_dict['params'] = params
        self.collected_phones.append(phone_dict)
        yield phone_dict

    def closed(self, reason):
        """De-duplicate the collected phones and write them to ``phones.json``.

        Called by Scrapy when the spider finishes. Two items are duplicates
        when all of their fields are equal; the first occurrence is kept and
        the original order is preserved.

        Args:
            reason: Why the spider was closed (unused).
        """
        distinct_phones = []
        seen = set()
        for phone in self.collected_phones:
            # Dicts are unhashable; a canonical JSON dump serves as a
            # hashable fingerprint for whole-item equality.
            fingerprint = json.dumps(phone, sort_keys=True, ensure_ascii=False)
            if fingerprint in seen:
                continue
            seen.add(fingerprint)
            distinct_phones.append(phone)

        with open('phones.json', 'w', encoding='utf-8') as f:
            json.dump(distinct_phones, f, ensure_ascii=False)
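在运行前需要安装Scrapy以及下面配置中用到的两个扩展 scrapy-useragents 和 scrapy-proxy-pool(例如:pip install scrapy scrapy-useragents scrapy-proxy-pool),并在settings.py中添加以下配置: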
# Scrapy project settings used by the JD phone spider.

# Do not fetch or honour robots.txt.
ROBOTSTXT_OBEY = False
# Delay between consecutive requests, in seconds.
DOWNLOAD_DELAY = 0.5
# Single desktop-browser User-Agent string.
# NOTE(review): the built-in UserAgentMiddleware is disabled below, so whether
# this value is still used depends on the replacement middleware — confirm.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
# Downloader middleware table: the stock User-Agent middleware is switched off
# (None) and the scrapy_useragents / scrapy_proxy_pool middlewares are enabled.
# NOTE(review): the priorities for RetryMiddleware (900) and the proxy-pool
# middlewares (100/200) differ from what I recall of the Scrapy default and the
# scrapy-proxy-pool README — verify the intended ordering.
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 900,
'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 100,
'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 200,
}
# Pool of User-Agent strings, presumably rotated by UserAgentsMiddleware.
# NOTE(review): every entry identifies as a search-engine/crawler bot rather
# than a browser; confirm that impersonating these bots is intended.
USER_AGENTS = [
'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
'Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)',
'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
'Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)',
'Mozilla/5.0 (compatible; SogouSpider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07)',
'Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)',
'Mozilla/5.0 (compatible; Facebot/1.0; +http://www.facebook.com/facebot)',
]
# Proxy-pool options, presumably read by the scrapy_proxy_pool middlewares.
# NOTE(review): only PROXY_POOL_ENABLED is a setting name I can place in the
# scrapy-proxy-pool documentation; check that the other three are recognised,
# otherwise they are silently ignored.
PROXY_POOL_ENABLED = True
PROXY_POOL_RETRY_TIMES = 3
PROXY_POOL_RANDOM_USER_AGENT = True
PROXY_POOL_HTTP_CODES = [200, 301, 302, 404, 500, 502, 503, 504]
原文地址: https://www.cveoy.top/t/topic/ol6s 著作权归作者所有。请勿转载和采集!