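"""JD.com best-selling phone scraper: crawl and analyze Apple, Huawei and Xiaomi listings.

Original title: 京东热卖手机信息爬取：苹果、华为、小米品牌数据抓取与分析
"""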

import re

import pandas as pd
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector

class JdSpider(scrapy.Spider):
    """Crawl JD.com listing pages and yield one record per matching phone.

    The spider walks each listing page (following the "next page" link),
    keeps the products whose title mentions one of the brands in
    ``BRAND_KEYWORDS``, and then visits each product's detail page to
    collect its specification table.
    """

    name = "jd"

    # Canonical brand label -> lowercase substrings that identify the brand
    # in a product title. Matching is a heuristic: a title that merely
    # mentions a brand (e.g. an accessory) is also accepted.
    BRAND_KEYWORDS = {
        '苹果': ('苹果', 'apple', 'iphone'),
        '华为': ('华为', 'huawei'),
        '小米': ('小米', 'xiaomi'),
    }

    @classmethod
    def _detect_brand(cls, title):
        """Return the brand label whose keyword occurs in *title*, or None."""
        lowered = title.lower()
        for brand, keywords in cls.BRAND_KEYWORDS.items():
            if any(keyword in lowered for keyword in keywords):
                return brand
        return None

    def start_requests(self):
        """Yield the initial requests for the three listing pages."""
        urls = [
            "https://list.jd.com/list.html?cat=9987,653,655",
            "https://list.jd.com/list.html?cat=9987,653,659",
            "https://list.jd.com/list.html?cat=9987,653,652",
        ]
        for url in urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse one listing page.

        Yields a detail-page request for every product of a tracked brand
        (the listing-level fields travel along in ``meta``), followed by a
        request for the next listing page when one is linked.
        """
        for product in response.xpath('//*[@id="plist"]/ul/li'):
            title = product.xpath('./div[@class="p-name"]/a/em/text()').extract_first()
            detail_href = product.xpath('./div[@class="p-name"]/a/@href').extract_first()
            if not title or not detail_href:
                # Without a title or a link there is nothing to classify or follow.
                continue

            title = title.strip()
            # The brand is inferred from the title; comparing the whole
            # title for equality with a brand name would never match.
            brand = self._detect_brand(title)
            if brand is None:
                continue

            price = product.xpath('./div[@class="p-price"]/strong/i/text()').extract_first()
            comment_num = product.xpath('./div[@class="p-commit"]/strong/a/text()').extract_first()
            self_run_label = product.xpath(
                './div[@class="p-icons"]/i[@class="goods-icons-self"]/text()'
            ).extract_first()
            is_self_run = "是" if self_run_label == "自营" else "否"

            yield Request(
                # urljoin resolves relative and protocol-relative hrefs,
                # which Request would otherwise reject for lacking a scheme.
                url=response.urljoin(detail_href),
                callback=self.parse_detail,
                meta={
                    'name': title,
                    'brand': brand,
                    'price': price,
                    'comment_num': comment_num,
                    'is_self_run': is_self_run,
                },
            )

        next_href = response.xpath(
            '//*[@id="J_bottomPage"]/span[@class="p-num"]/a[@class="pn-next"]/@href'
        ).extract_first()
        if next_href:
            yield Request(url=response.urljoin(next_href), callback=self.parse)

    def parse_detail(self, response):
        """Parse a product detail page and yield the final record.

        The record combines the listing-level fields passed through
        ``response.meta`` with one entry per specification group, keyed by
        the group heading and holding the list of non-empty value strings.
        """
        meta = response.meta
        record = {
            '名称': meta['name'],
            '品牌': meta['brand'],
            '价格': meta['price'],
            '评价数量': meta['comment_num'],
            '是否自营': meta['is_self_run'],
        }

        for group in response.xpath('//div[@class="Ptable"]/div[@class="Ptable-item"]'):
            # default='' keeps .strip() safe when the heading node is absent.
            heading = group.xpath('./h3/text()').extract_first(default='').strip()
            if not heading:
                continue
            values = []
            for entry in group.xpath('./dl/dd/ul/li'):
                text = entry.xpath('string(.)').extract_first(default='').strip()
                if text:
                    values.append(text)
            record[heading] = values

        yield record

def remove_duplicates(items):
    """Drop records that repeat an earlier brand/name pair.

    Args:
        items: Record dicts, each expected to carry the '品牌' (brand) and
            '名称' (name) keys.

    Returns:
        A list of record dicts in their original order, keeping only the
        first record for each brand/name pair. Empty input yields ``[]``.
    """
    frame = pd.DataFrame(items)
    if frame.empty:
        # A frame built from no records has no columns, so asking
        # drop_duplicates for the key columns would raise KeyError.
        return []
    return frame.drop_duplicates(subset=['品牌', '名称']).to_dict('records')

def main():
    """Run the JD spider and export the de-duplicated results to Excel.

    The crawl writes its items to ``jd.csv``; that file is then loaded,
    de-duplicated with ``remove_duplicates`` and saved as ``jd.xlsx``.
    If the crawl produced no rows, a message is printed and no Excel file
    is written.
    """
    import os

    from scrapy.crawler import CrawlerProcess

    # Scrapy's local-file feed storage appends to an existing file, so a
    # leftover jd.csv from a previous run would mix stale rows (and a second
    # header line) into this run's data.
    try:
        os.remove('jd.csv')
    except FileNotFoundError:
        pass  # Nothing to clean up on the first run.

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'jd.csv',
        'LOG_LEVEL': 'ERROR'
    })

    process.crawl(JdSpider)
    process.start()  # Blocks until the crawl has finished.

    try:
        scraped = pd.read_csv('jd.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        scraped = pd.DataFrame()

    if scraped.empty:
        # No file, an empty file, or a header-only file: nothing to export.
        print('No products were scraped; jd.xlsx was not written.')
        return

    items = remove_duplicates(scraped.to_dict('records'))
    pd.DataFrame(items).to_excel('jd.xlsx', index=False)

# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()