Python 爬取 Boss 直聘职位信息并保存到 JSON 和 Excel 文件

该代码实现了从 Boss 直聘网站爬取职位信息,并将其保存到 JSON 和 Excel 文件的功能。

代码功能

  1. 获取城市编码: 通过输入城市名,获取对应的城市编码。使用了第三方接口获取城市编码的数据。
  2. 验证 Cookie: 验证输入的 Cookie 是否有效。
  3. 获取职位列表: 通过输入查询职位和城市,获取对应的职位列表。最多遍历三页数据。
  4. 保存为 JSON 文件: 将职位列表保存为 JSON 文件,文件名格式为'boss_当前日期.json',保存在桌面上。
  5. 将 JSON 数据保存到 Excel 文件: 将 JSON 文件中的数据保存到 Excel 文件中。Excel 文件名格式为'boss_当前日期.xlsx',保存在桌面上。

代码示例

import os
import re
import platform
import requests
import json
import urllib.request
from openpyxl import Workbook
from datetime import datetime


class GetDataTools:
    """Helper utilities: city-code lookup and cookie sanity checking."""

    @staticmethod
    def get_citycode(input_city):
        """Return the Boss Zhipin city code for *input_city*, or None.

        Downloads a third-party JSON mapping of city names to codes and
        scans every city group for an exact name match.
        """
        url = ('https://ghproxy.com/https://raw.githubusercontent.com/'
               'hoochanlon/scripts/main/d-json/bosszhipin_citycode.json')
        # Use a context manager so the HTTP connection is closed
        # deterministically instead of being leaked.
        with urllib.request.urlopen(url) as response:
            city_json = json.loads(response.read().decode('utf-8'))

        for group in city_json['zpData']['cityGroup']:
            for city in group['cityList']:
                if city['name'] == input_city:
                    return city['code']
        return None

    @staticmethod
    def validate_cookie(input_cookie):
        """Return True if every '; '-separated item has a 'key=value' shape.

        Purely syntactic: it does not check that the cookie is accepted by
        the server, only that each item contains an '=' separator.
        """
        for item in input_cookie.split('; '):
            # Mirrors the original split('=', 1): an item with no '='
            # (including an empty item) makes the whole cookie invalid.
            if '=' not in item.strip():
                return False
        return True


class ZhiPin:
    """Interactive scraper for Boss Zhipin job listings."""

    @staticmethod
    def get_job_list():
        """Prompt for keyword, city and cookie, then fetch job pages.

        Returns a list of raw per-page JSON responses (up to three pages).
        """
        input_keywords = input('请输入查询职位:')
        input_city = input('请输入城市:')
        city_code = GetDataTools.get_citycode(input_city)
        input_cookie = input('请复制Cookie:')

        job_data_list = []

        # Boss Zhipin only serves the first three result pages to this
        # endpoint; requesting more makes the API error out.
        for page in range(1, 4):
            url = 'https://www.zhipin.com/wapi/zpgeek/search/joblist.json'
            # Pass the query string via params= so requests URL-encodes it;
            # raw interpolation breaks for keywords containing '&', spaces
            # or other reserved characters.
            params = {
                'query': input_keywords,
                'city': city_code,
                'experience': 105,
                'salary': 405,
                'page': page,
                'pageSize': 30,
            }
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36',
                'Cookie': input_cookie,
            }
            # A timeout keeps the script from hanging forever on a stalled
            # connection.
            data = requests.get(url, params=params, headers=headers,
                                timeout=30).json()
            job_data_list.append(data)

        return job_data_list

    @staticmethod
    def save_to_json(job_data_list):
        """Dump *job_data_list* to 'boss_<YYYY-MM-DD>.json' on the desktop.

        Returns the full path of the written file.
        """
        save_json = os.path.join(
            os.path.expanduser('~'), 'Desktop',
            'boss_{}.json'.format(datetime.now().strftime('%Y-%m-%d'))
        )

        with open(save_json, 'w', encoding='utf-8') as f:
            json.dump(job_data_list, f, ensure_ascii=False, indent=4)

        return save_json


class DataToExcel:
    """Exports the scraped job JSON into an Excel worksheet."""

    @staticmethod
    def save_to_excel(workbook, zhipin_json):
        """Read the JSON file at *zhipin_json* and fill a new worksheet.

        Adds a '招聘信息清单' sheet to *workbook* with one header row and
        one row per job. The caller is responsible for saving *workbook*.
        """
        # Load into a new name instead of shadowing the path parameter.
        with open(zhipin_json, 'r', encoding='utf-8') as f:
            pages = json.load(f)

        worksheet = workbook.create_sheet(title='招聘信息清单')

        titles = [
            '职位', '技能', '学历', '工资', '公司', '人数规模', '地址'
        ]
        worksheet.append(titles)

        # Flatten the per-page API responses into one list of job dicts;
        # pages missing the expected structure are skipped.
        jobs = [
            job
            for page in pages
            if 'zpData' in page and 'jobList' in page['zpData']
            for job in page['zpData']['jobList']
        ]

        for job in jobs:
            # NOTE: the original used single quotes for the dict keys inside
            # a single-quoted f-string, which is a SyntaxError before
            # Python 3.12 — double quotes keep the same output everywhere.
            address = (f"{job['cityName']} {job['areaDistrict']} "
                       f"{job['businessDistrict']}")
            worksheet.append([
                job['jobName'],
                '; '.join(job['skills']),
                job['jobDegree'],
                job['salaryDesc'],
                job['brandName'],
                job['brandScaleName'],
                address,
            ])


if __name__ == '__main__':
    # Scrape interactively, persist the raw JSON, then export to Excel.
    job_data_list = ZhiPin.get_job_list()
    zhipin_json = ZhiPin.save_to_json(job_data_list)

    workbook = Workbook()
    # Drop the default empty sheet so the export sheet is the only one.
    workbook.remove(workbook.active)

    DataToExcel.save_to_excel(workbook, zhipin_json)

    # BUG FIX: the original passed the literal string 'input_city' to
    # strftime, producing 'boss_input_city.xlsx'. Use the date format so
    # the workbook name matches the JSON file ('boss_YYYY-MM-DD.xlsx').
    save_xlsx = os.path.join(
        os.path.expanduser('~'), 'Desktop',
        'boss_{}.xlsx'.format(datetime.now().strftime('%Y-%m-%d'))
    )

    workbook.save(save_xlsx)

使用方法

  1. 运行代码。
  2. 输入查询职位。
  3. 输入城市。
  4. 复制 Cookie 并粘贴到提示框中。

程序会自动获取职位列表并保存到 JSON 和 Excel 文件中。

注意

  1. 由于 Boss 直聘的限制,最多只能遍历前三页数据。
  2. 确保代码运行环境中安装了必要的库。
  3. Cookie 需要在登录 Boss 直聘网站后获取,有效期有限,请注意及时更新。

总结

该代码利用 Python 爬取 Boss 直聘网站的职位信息,并将其保存到 JSON 和 Excel 文件中,方便用户进行数据分析和管理。

Python 爬取 Boss 直聘职位信息并保存到 JSON 和 Excel 文件

原文地址: https://www.cveoy.top/t/topic/bE5i 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录