SeleniumUtil and BaseSpider Class for Web Scraping with Error Handling
{
"title": "SeleniumUtil and BaseSpider Class for Web Scraping with Error Handling",
"description": "This Python code defines the SeleniumUtil and BaseSpider classes to facilitate web scraping with Selenium, including helpers for safe element retrieval, text input, clicks, login verification, and error logging. It also includes an EmailSpider subclass for email ingestion, plus the DFMProcessor and EmailIngestionProcessor classes that drive a scraping run.",
"keywords": "Selenium, web scraping, Python, SeleniumUtil, BaseSpider, EmailSpider, error handling, login verification, logging",
"content": "```python
class SeleniumUtil():
    """Convenience wrapper around a Selenium WebDriver.

    Bundles element lookup, text input, clicking and the post-login checks,
    reporting failures as SpiderException.
    """

    def __init__(self, driver):
        # self.display = settings['CHROME_DISPLAY']
        self.driver = driver

    def set_driver(self, driver):
        """Swap the WebDriver instance used by every helper on this object."""
        self.driver = driver

    def safe_get_elements_(self, by, xpath, err):
        """Return the first element matching the locator.

        :param by: locator strategy passed to ``find_elements``.
        :param xpath: locator value (not necessarily an XPath; depends on ``by``).
        :param err: error message carried by the exception on failure.
        :raises SpiderException: when no element matches; it is given the
            error message and the locator value.
        """
        matches = self.driver.find_elements(by=by, value=xpath)
        if matches:
            return matches[0]
        raise SpiderException(err, xpath)

    def get_page(self, url, sleep=1):
        """Open ``url`` in the browser, then pause for ``sleep`` seconds."""
        self.driver.get(url)
        time.sleep(sleep)

    def input_with_xpath(self, by, xpath, keys, err):
        """Type ``keys`` into the first element matching the locator.

        :raises SpiderException: when the element cannot be found.
        """
        target = self.safe_get_elements_(by, xpath, err)
        outcome = target.send_keys(keys)
        return outcome

    def click_with_xpath(self, by, xpath, err_msg, front_sleep=3):
        """Wait ``front_sleep`` seconds, then click the first matching element.

        :raises SpiderException: when the element is missing or not enabled.
        """
        time.sleep(front_sleep)
        target = self.safe_get_elements_(by, xpath, err_msg)
        if not target.is_enabled():
            raise SpiderException(err_msg, xpath)
        return target.click()

    def check_loging_with(self, by, xpath):
        """Verify that a login attempt succeeded.

        :param by: locator strategy, used both for the error elements and for
            the logout-button lookup.
        :param xpath: locator of the element(s) that show a login error.
        :raises SpiderException: when an error element has text, when the
            browser is still on the login URL, or when no logout button is found.
        """
        # Any non-empty text in the matched elements is the login failure message.
        for candidate in self.driver.find_elements(by, xpath):
            failure_text = candidate.text
            if failure_text:
                raise SpiderException(failure_text, xpath=xpath)

        # Still on the login page: the redirect after login did not happen.
        if self.driver.current_url == LOGIN_URL:
            raise SpiderException(LogErrorMsg.LOGIN_PAGE_JUMP_FAILED, xpath=xpath)

        # Redirected, but the page has not refreshed: no logout button present.
        if not self.driver.find_elements(by, BUTTON_LOGOUT_ELEMENT):
            raise SpiderException(LogErrorMsg.LOGIN_PAGE_NOT_REFRESHED, xpath=xpath)
class BaseSpider(ABC):
    """Base class for spiders: owns the driver, a SeleniumUtil helper and error logging."""

    def __init__(self, driver):
        self.driver = driver
        self.browser_ = SeleniumUtil(self.driver)
        # Scratch buffer for the fields of the record currently being written.
        self.records = list()
        # Captured once at construction: every record written by this spider
        # carries this timestamp, not the time the error occurred.
        self.now_time = datetime.datetime.now()

    def set_driver(self, driver):
        """Replace the driver on this spider and on its SeleniumUtil helper."""
        self.driver = driver
        self.browser_.set_driver(driver)

    def add_record(self, urn, class_name, operate, exception, ele_message):
        """Write one error record to the operation log.

        :param urn: urn of the item being processed.
        :param class_name: name of the class reporting the error.
        :param operate: operation that was being performed.
        :param exception: error message.
        :param ele_message: information about the element involved.
        """
        self.records.extend([self.now_time, urn, class_name, operate, exception, ele_message])
        try:
            QueryResult.insert_data(INSERT_OPERATION_LOG, self.records)
        finally:
            # Always reset the buffer: if the insert raised, the leftover fields
            # would otherwise be prepended to the next record.
            self.records.clear()

    def get_search_url(self, url):
        """Open ``url`` through the SeleniumUtil helper."""
        self.browser_.get_page(url)

    def check_login(self):
        """Log in through the login page and verify the result.

        :return: True when login succeeded; False when any step raised
            SpiderException (the failure is logged and recorded).
        """
        self.browser_.get_page(LOGIN_URL)  # open the login page
        try:
            self.browser_.input_with_xpath(By.NAME, TEXTBOX_LOGIN_ELEMENT, settings['dfm_username'],
                                           ErrorMsg.TEXT_BOX_NOT_DISPLAYED)
            self.browser_.input_with_xpath(By.NAME, TEXTBOX_PWD_ELEMENT,
                                           generate_utils.transform_str(settings['dfm_pwd']),
                                           ErrorMsg.TEXT_BOX_NOT_DISPLAYED)
            self.browser_.click_with_xpath(By.NAME, BUTTON_LOGIN_ELEMENT,
                                           LogErrorMsg.LOGIN_CLICK_BUTTON_NOT_DISPLAYED)
            self.browser_.check_loging_with(By.XPATH, CUSTOM_LOGIN_ELEMENT)
            return True
        except SpiderException as e:
            log_error(e)
            # NOTE(review): False is passed as the urn here — presumably because
            # no urn exists at login time; confirm against the log schema.
            self.add_record(False, self.__class__.__name__, Operate.LOGIN.value, e.message, e.xpath)
            return False
class EmailSpider(BaseSpider):
    """Spider that navigates to the Email Ingestion data view."""

    def __init__(self, driver):
        # Was misspelled ``__int__``, so it never ran as a constructor.
        # Delegate to the parent constructor, passing the driver through.
        super().__init__(driver)

    def get_advanced_and_ingestion(self):
        """Open the search page, then click "Advanced Search" and "Email Ingestion data".

        :raises SpiderException: propagated from the click helper when a
            link is missing or not enabled.
        """
        self.get_search_url(SEARCH_PAGE)  # search page
        self.browser_.click_with_xpath(By.LINK_TEXT, ADVANCED_SEARCH_ELEMENT,
                                       EmailErrorMsg.ADVANCED_SEARCH_NOR_DISPLAYED)  # Advanced Search button
        self.browser_.click_with_xpath(By.LINK_TEXT, EMAIL_INGESTION_DATA_ELEMENT,
                                       EmailErrorMsg.INGESTION_DATA_NOR_DISPLAYED)  # Email Ingestion data button
class DFMProcessor(BaseProcessor):
    """Base processor: opens a Chrome driver, logs in, runs ``process`` and cleans up."""

    def __init__(self):
        # Local import so this class does not depend on the module's import block.
        import threading

        BaseProcessor.__init__(self)
        self.name = 'DFMProcessor'
        # A real lock: the previous '' placeholder had no acquire()/release().
        self.lock = threading.Lock()
        # No driver exists until start() creates one; the previous code read an
        # undefined name ``driver`` here.
        self.driver = None
        self.display = settings['CHROME_DISPLAY']
        self.baseSpider = BaseSpider(self.driver)

    def process(self):
        """Hook run after a successful login; the base implementation does nothing."""
        pass

    def get_download_path(self):
        """Return the download directory handed to the Chrome driver ('' here)."""
        return ''

    def start(self):
        """Run one processing cycle under the processor lock.

        Creates a driver, logs in, calls ``process`` on success, and always
        quits the driver. Exceptions raised during the cycle are logged, not
        propagated.
        """
        # ``with`` releases the lock even if driver cleanup itself raises.
        with self.lock:
            try:
                logger.info(r" >>> 开始执行 {}".format(self.name))
                self.driver = get_chrome_driver(self.get_download_path(), display=self.display)
                self.baseSpider.set_driver(self.driver)  # hand the new driver to BaseSpider
                status = self.baseSpider.check_login()
                if status:
                    self.process()
                logger.info(r" >>> 完成执行 {}".format(self.name))
            except Exception as e:
                log_error(e)
            finally:
                # The driver is still None when get_chrome_driver() failed.
                if self.driver is not None:
                    try:
                        self.driver.quit()
                    finally:
                        # Drop the dead driver so a later start() never quits it twice.
                        self.driver = None
@singleton
class EmailIngestionProcessor(DFMProcessor):
    """Processor that drives EmailSpider to the Email Ingestion data view."""

    def __init__(self):
        DFMProcessor.__init__(self)
        self.lock = threading.Lock()
        self.name = "EmailIngestionProcessor"
        self.emailSpider = EmailSpider(self.driver)

    def get_download_path(self):
        """Return today's download directory, creating it if needed.

        The directory is ``settings['email_ingestion_path']`` joined with the
        current date formatted as ``%Y-%m-%d``.
        """
        cur_date = get_strtime('%Y-%m-%d')
        download_path = os.path.join(settings['email_ingestion_path'], cur_date)
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(download_path, exist_ok=True)
        return download_path

    def process(self):
        """Navigate to the Email Ingestion data view with the current driver."""
        # start() creates a new driver on each run, after emailSpider was built;
        # without this the spider would keep using its constructor-time driver.
        self.emailSpider.set_driver(self.driver)
        self.emailSpider.get_advanced_and_ingestion()
#
# lock = threading.Lock()
#
#
def run():
    """Obtain the EmailIngestionProcessor and run a single processing cycle."""
    # while True:
    processor = EmailIngestionProcessor()
    processor.start()
#
if __name__ == "__main__":
    # Script entry point: run one ingestion cycle when executed directly.
    run()
原文地址: https://www.cveoy.top/t/topic/qeOo 著作权归作者所有。请勿转载和采集!