Python 爬取 Boss 直聘职位信息并保存到 JSON 和 Excel 文件
该代码实现了从 Boss 直聘网站爬取职位信息,并将其保存到 JSON 和 Excel 文件的功能。
代码功能
- 获取城市编码: 通过输入城市名,获取对应的城市编码。使用了第三方接口获取城市编码的数据。
- 验证 Cookie: 验证输入的 Cookie 是否有效。
- 获取职位列表: 通过输入查询职位和城市,获取对应的职位列表。最多遍历三页数据。
- 保存为 JSON 文件: 将职位列表保存为 JSON 文件,文件名格式为'boss_当前日期.json',保存在桌面上。
- 将 JSON 数据保存到 Excel 文件: 将 JSON 文件中的数据保存到 Excel 文件中。Excel 文件名格式为'boss_当前日期.xlsx',保存在桌面上。
代码示例
import os
import re
import platform
import requests
import json
import urllib.request
from openpyxl import Workbook
from datetime import datetime
class GetDataTools:
    """Helper utilities: Boss Zhipin city-code lookup and cookie sanity checks."""

    @staticmethod
    def get_citycode(input_city):
        """Return the Boss Zhipin city code matching *input_city*, or None.

        The code table is fetched from a mirrored GitHub JSON file on every
        call, so this requires network access.
        """
        response = urllib.request.urlopen('https://ghproxy.com/https://raw.githubusercontent.com/hoochanlon/scripts/main/d-json/bosszhipin_citycode.json')
        city_json = json.loads(response.read().decode('utf-8'))
        # Scan every city group; return on the first exact name match.
        for group in city_json['zpData']['cityGroup']:
            for city in group['cityList']:
                if city['name'] == input_city:
                    return city['code']
        return None

    @staticmethod
    def validate_cookie(input_cookie):
        """Return True when every '; '-separated item parses as 'key=value'."""
        try:
            # dict() raises ValueError for any item that did not split into
            # a (key, value) pair — i.e. an item with no '=' in it.
            dict(item.strip().split('=', 1) for item in input_cookie.split('; '))
        except ValueError:
            return False
        return True
class ZhiPin:
    """Scrape Boss Zhipin job listings and persist the raw responses."""

    @staticmethod
    def get_job_list():
        """Prompt for a keyword, city and cookie, then fetch the first three
        pages of search results from the Boss Zhipin job-list API.

        Returns:
            list: one raw response dict (parsed JSON) per page fetched.
        """
        input_keywords = input('请输入查询职位:')
        input_city = input('请输入城市:')
        city_code = GetDataTools.get_citycode(input_city)
        if city_code is None:
            # An unknown city would put the literal string 'None' into the
            # request; warn the user so an empty result set is explainable.
            print('未找到匹配的城市编码,请重新输入城市名。')
        input_cookie = input('请复制Cookie:')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36',
            'Cookie': input_cookie,
        }
        job_data_list = []
        # Boss Zhipin only serves the first three pages to scripted clients;
        # requesting further pages errors out, so stop at page 3.
        for page in range(1, 4):
            # Let requests build the query string so keywords containing
            # spaces / '&' / '#' are URL-encoded correctly (the original
            # f-string URL sent them raw).
            params = {
                'query': input_keywords,
                'city': city_code,
                'experience': 105,  # hard-coded filter kept from the original URL
                'salary': 405,      # hard-coded filter kept from the original URL
                'page': page,
                'pageSize': 30,
            }
            data = requests.get(
                'https://www.zhipin.com/wapi/zpgeek/search/joblist.json',
                params=params,
                headers=headers,
                timeout=10,  # avoid hanging forever on a stalled connection
            ).json()
            job_data_list.append(data)
        return job_data_list

    @staticmethod
    def save_to_json(job_data_list):
        """Write *job_data_list* to ~/Desktop/boss_<YYYY-MM-DD>.json.

        Returns:
            str: path of the file that was written.
        """
        save_json = os.path.join(
            os.path.expanduser('~'), 'Desktop',
            'boss_{}.json'.format(datetime.now().strftime('%Y-%m-%d'))
        )
        with open(save_json, 'w', encoding='utf-8') as f:
            json.dump(job_data_list, f, ensure_ascii=False, indent=4)
        return save_json
class DataToExcel:
    """Flatten saved Boss Zhipin JSON pages into an Excel worksheet."""

    # Column headers of the generated sheet, in output order.
    _TITLES = ['职位', '技能', '学历', '工资', '公司', '人数规模', '地址']

    @staticmethod
    def save_to_excel(workbook, zhipin_json):
        """Append a '招聘信息清单' sheet to *workbook* with one row per job.

        Args:
            workbook: an openpyxl Workbook to add the sheet to.
            zhipin_json: path to the JSON file written by ZhiPin.save_to_json
                (a list of raw per-page API responses).
        """
        # Do not shadow the path parameter with the parsed data (the
        # original reused the name 'zhipin_json' for both).
        with open(zhipin_json, 'r', encoding='utf-8') as f:
            pages = json.load(f)
        worksheet = workbook.create_sheet(title='招聘信息清单')
        worksheet.append(DataToExcel._TITLES)
        for page in pages:
            # Skip malformed pages (e.g. an error response with no job list).
            if 'zpData' not in page or 'jobList' not in page['zpData']:
                continue
            for job in page['zpData']['jobList']:
                worksheet.append([
                    job.get('jobName'),
                    '; '.join(job.get('skills', [])),
                    job.get('jobDegree'),
                    job.get('salaryDesc'),
                    job.get('brandName'),
                    job.get('brandScaleName'),
                    # BUG FIX: the original nested single quotes inside a
                    # single-quoted f-string (f'{job['cityName']} ...'),
                    # which is a SyntaxError on Python < 3.12.
                    '{} {} {}'.format(
                        job.get('cityName', ''),
                        job.get('areaDistrict', ''),
                        job.get('businessDistrict', ''),
                    ),
                ])
if __name__ == '__main__':
    # Scrape, persist the raw JSON, then convert that JSON to a spreadsheet.
    job_data_list = ZhiPin.get_job_list()
    zhipin_json = ZhiPin.save_to_json(job_data_list)
    workbook = Workbook()
    # Drop the default empty sheet; save_to_excel creates its own.
    workbook.remove(workbook.active)
    DataToExcel.save_to_excel(workbook, zhipin_json)
    # BUG FIX: the original called strftime('input_city'), which formats the
    # literal string 'input_city' and names the file 'boss_input_city.xlsx'.
    # Use the current date so the .xlsx matches the .json filename scheme.
    save_xlsx = os.path.join(
        os.path.expanduser('~'), 'Desktop',
        'boss_{}.xlsx'.format(datetime.now().strftime('%Y-%m-%d'))
    )
    workbook.save(save_xlsx)
使用方法
- 运行代码。
- 输入查询职位。
- 输入城市。
- 复制 Cookie 并粘贴到提示框中。
程序会自动获取职位列表并保存到 JSON 和 Excel 文件中。
注意
- 由于 Boss 直聘的限制,最多只能遍历前三页数据。
- 确保代码运行环境中安装了必要的库。
- Cookie 需要在登录 Boss 直聘网站后获取,有效期有限,请注意及时更新。
总结
该代码利用 Python 爬取 Boss 直聘网站的职位信息,并将其保存到 JSON 和 Excel 文件中,方便用户进行数据分析和管理。
原文地址: https://www.cveoy.top/t/topic/bE5i 著作权归作者所有。请勿转载和采集!