Python 爬取拉勾网职位信息并保存到 Excel 表格
以下是 Python 代码,使用了 requests、beautifulsoup4、openpyxl 等库(代码中 BeautifulSoup 使用 lxml 解析器,因此还需要安装 lxml):
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Request headers: identify the client with a desktop Chrome User-Agent string.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/58.0.3029.110 Safari/537.36'
)
headers = {'User-Agent': _USER_AGENT}
# Fetch the page source
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200. ``None``
        when the status code is anything else or when the request itself
        fails (the error is printed in that case).
    """
    try:
        # Without an explicit timeout a stalled server would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(e)
        return None

    if response.status_code == 200:
        return response.text
    return None
def _node_text(node):
    """Return the stripped text of a parsed node, or '' when the node is missing."""
    if node is None:
        return ''
    return node.get_text().strip()


def _fetch_job_description(detail_url):
    """Download a job detail page and return its description text ('' on failure)."""
    if not detail_url:
        return ''
    detail_html = get_html(detail_url)
    if not detail_html:
        # get_html returns None when the download fails; keep the row anyway.
        return ''
    detail_soup = BeautifulSoup(detail_html, 'lxml')
    return _node_text(detail_soup.find('div', class_='job-detail'))


# Parse the listing page and collect the job information
def parse_html(html):
    """Extract one row of job information per list item of the listing page.

    Args:
        html: Source of the listing page, or ``None``/empty when the
            download failed.

    Returns:
        A list of rows ``[job_name, company_name, location, salary,
        publish_time, job_description]``. Fields whose markup is missing are
        returned as empty strings. The list is empty when *html* is empty or
        does not contain the ``ul.item_con_list`` container.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('ul', class_='item_con_list')
    if container is None:
        return []

    result = []
    for job in container.find_all('li'):
        title = job.find('h3')
        company = job.find('div', class_='company')
        link = title.find('a') if title is not None else None
        # .get() avoids a KeyError when the anchor carries no href attribute.
        job_detail_url = link.get('href') if link is not None else None

        result.append([
            _node_text(title),
            _node_text(company.find('a') if company is not None else None),
            _node_text(job.find('div', class_='location')),
            _node_text(job.find('span', class_='money')),
            _node_text(job.find('span', class_='format-time')),
            _fetch_job_description(job_detail_url),
        ])
    return result
# Save the collected information to an Excel workbook
def save_to_excel(result, filename='job_info.xlsx'):
    """Write the scraped rows to an Excel workbook and print a confirmation.

    Args:
        result: Iterable of rows; each row is appended as-is below the
            header row, so its values should follow the column order of
            the header row.
        filename: Path of the workbook to write. Defaults to
            ``job_info.xlsx``.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = 'job_info'

    # Named column_titles so the module-level ``headers`` (the HTTP request
    # headers) is not shadowed inside this function.
    column_titles = ['职位名称', '公司名称', '工作地点', '薪资', '发布时间', '技能要求']
    ws.append(column_titles)
    for row in result:
        ws.append(row)

    wb.save(filename)
    print('保存成功!')
def main():
    """Fetch the Python job listing page, parse it and export the rows to Excel."""
    url = 'https://www.lagou.com/zhaopin/Python/'
    html = get_html(url)
    if html is None:
        # get_html returns None when the download fails; stop here instead of
        # handing None to the parser.
        print('页面获取失败!')
        return
    result = parse_html(html)
    save_to_excel(result)


if __name__ == '__main__':
    main()
运行后,程序会爬取拉勾网上 Python 岗位的信息,并保存到名为“job_info.xlsx”的 Excel 表格中。Excel 表格的第一行为列名,分别为“职位名称”、“公司名称”、“工作地点”、“薪资”、“发布时间”、“技能要求”。
原文地址: https://www.cveoy.top/t/topic/mUMA 著作权归作者所有。请勿转载和采集!