# Python+Selenium自动化: 从网页抓取数据并保存照片
import datetime
import os
import time

import openpyxl
import psutil
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Reuse a Chrome instance that was already started with remote debugging on
# port 9224; launch a new one only when no such process is found.
# Both paths attach through the same debugger address, so the options are
# built once up front.
options = Options()
options.add_experimental_option('debuggerAddress', '127.0.0.1:9224')
for proc in psutil.process_iter():
    try:
        if 'chrome' in proc.name() and '--remote-debugging-port=9224' in proc.cmdline():
            break
    except psutil.Error:
        # A process can exit or deny access while it is being inspected;
        # skip it instead of aborting the scan. Only psutil's own errors are
        # ignored, so Ctrl-C and real bugs still surface.
        continue
else:
    # for/else: the scan finished without finding a debugging Chrome,
    # so start one with its own user-data directory.
    os.system(r'start chrome --remote-debugging-port=9224 --user-data-dir="D:\评阅用"')
# Attach to the debugging Chrome instance, then activate each open tab in
# turn and stop at the first one whose title is the expected page title.
driver = webdriver.Chrome(options=options)
tabs = driver.window_handles
for handle in tabs:
    driver.switch_to.window(handle)
    if driver.title != '考后核验':
        continue
    print('登陆成功')
    break
# Load the workbook and work on its active worksheet.
wb = openpyxl.load_workbook(r'C:\Users\Administrator\Desktop\考后核验.xlsx')
sheet = wb.active
# Make sure the root photo folder exists. makedirs(exist_ok=True) replaces the
# separate exists()/mkdir() pair, so there is no window between the check and
# the creation, and missing parent folders are created as well.
os.makedirs(r'C:\Users\Administrator\Desktop\照片', exist_ok=True)
# Save student photos as
#   <photo root>/<timestamp from column K>/<exam room>/<id>_<name>.jpg
#
# For every non-empty timestamp in column K (rows 2 and below), every
# non-empty student ID in column A is typed into the page's search box. Table
# cells whose text equals the timestamp, and then cells whose text equals that
# student's column-H value, decide where the photo is written.
#
# NOTE(review): this script's indentation was lost before review, so the
# nesting below is reconstructed from syntax alone -- confirm it against the
# intended flow, in particular that the download and driver.back() are meant
# to run once per ID/name pair.
# NOTE(review): the timestamp loop and the student loop form a cross product
# (every student is searched once per timestamp); confirm that pairing each
# row's own K/A/H values was not what was intended.
PHOTO_ROOT = r'C:\Users\Administrator\Desktop\照片'
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# The timestamp, exam-room and student cells are all located with this XPath.
CENTERED_CELL_XPATH = (
    '//td[@class="ant-table-column-has-actions ant-table-column-has-sorters"'
    ' and @style="text-align: center;"]'
)


def _centered_cell_texts():
    """Return the text of every centered table cell currently on the page.

    Waits up to 10 seconds for the cells to be present. The texts are read
    immediately so the loops below iterate over plain strings rather than
    element handles, which the later driver.back() call could invalidate.
    """
    cells = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, CENTERED_CELL_XPATH))
    )
    return [table_cell.text for table_cell in cells]


for time_row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=11, max_col=11):
    for time_cell in time_row:
        data = time_cell.value
        if not data:
            continue
        # openpyxl yields datetime objects for date-formatted cells; convert
        # them to the text form that is parsed and compared below.
        if isinstance(data, datetime.datetime):
            data = data.strftime(TIME_FORMAT)

        # One sub-folder per timestamp (':' replaced by '-' in the name).
        folder_name = time.strftime('%Y-%m-%d %H-%M-%S', time.strptime(data, TIME_FORMAT))
        folder_path = os.path.join(PHOTO_ROOT, folder_name)
        os.makedirs(folder_path, exist_ok=True)

        # Distinct loop names: the original reused `row`/`cell` here and
        # shadowed the outer loop's variables.
        for id_row in sheet.iter_rows(min_row=2, max_row=sheet.max_row, min_col=1, max_col=1):
            for id_cell in id_row:
                student_id = id_cell.value
                if not student_id:
                    continue
                # Column H of the student's row; looked up once per student
                # instead of once per table cell.
                expected_room = sheet.cell(row=id_cell.row, column=8).value

                # Search the page for this student.
                search_input = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//input[@placeholder="姓名/考生号"]'))
                )
                search_input.clear()
                search_input.send_keys(student_id)
                time.sleep(1)  # give the page time to react to the input
                search_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//button[@class="ant-btn ant-btn-primary"]'))
                )
                driver.execute_script('arguments[0].click();', search_button)
                time.sleep(2)  # give the result table time to load

                for time_text in _centered_cell_texts():
                    if time_text != data:
                        continue
                    for exam_room_text in _centered_cell_texts():
                        # str() so that a numeric column-H value can still
                        # match the cell text; an empty cell never matches.
                        if expected_room is None or exam_room_text != str(expected_room):
                            continue
                        exam_room_folder = os.path.join(folder_path, exam_room_text)
                        os.makedirs(exam_room_folder, exist_ok=True)

                        # Cells alternate ID, name, ID, name, ...: pair each
                        # even-indexed text with the odd-indexed one after it.
                        info_texts = _centered_cell_texts()
                        for student_id_text, student_name_text in zip(info_texts[0::2], info_texts[1::2]):
                            # Keep only the digits of the ID and the letters of the name.
                            student_id_num = ''.join(filter(str.isdigit, student_id_text))
                            student_name = ''.join(filter(str.isalpha, student_name_text))
                            photo_name = student_id_num + '_' + student_name + '.jpg'
                            photo_path = os.path.join(exam_room_folder, photo_name)

                            photo_element = WebDriverWait(driver, 10).until(
                                EC.presence_of_element_located((By.XPATH, '//div[@class="img-lazy-load"]/img'))
                            )
                            photo_src = photo_element.get_attribute('src')
                            try:
                                # A timeout keeps one stalled download from
                                # hanging the whole run; the status check keeps
                                # an HTTP error page from being saved as a .jpg.
                                response = requests.get(photo_src, timeout=30)
                                response.raise_for_status()
                            except requests.RequestException as exc:
                                # Report the failed download and carry on with
                                # the remaining students.
                                print('照片下载失败:', photo_src, exc)
                            else:
                                with open(photo_path, 'wb') as f:
                                    f.write(response.content)
                            driver.back()
                            time.sleep(1)  # give the page time to load after going back
# 原文地址: http://www.cveoy.top/t/topic/fJkW 著作权归作者所有。请勿转载和采集!