SeleniumUtil, BaseSpider and Processor Classes for Web Scraping with Error Handling

{
  "title": "SeleniumUtil and BaseSpider Class for Web Scraping with Error Handling",
  "description": "This Python code defines SeleniumUtil and BaseSpider classes to facilitate web scraping, including functions for safe element retrieval, input, clicks, login verification, and error logging. It also includes an EmailSpider subclass for email ingestion.",
  "keywords": "Selenium, web scraping, Python, SeleniumUtil, BaseSpider, EmailSpider, error handling, login verification, logging",
  "content": "```python
class SeleniumUtil:
    """Thin helper around a Selenium WebDriver.

    Wraps element lookup, text input, clicking and the post-login checks,
    raising ``SpiderException`` whenever an expected element is missing or
    a login check fails.
    """

    def __init__(self, driver):
        """Store the WebDriver that every helper method operates on."""
        self.driver = driver

    def set_driver(self, driver):
        """Swap in a different WebDriver instance."""
        self.driver = driver

    def safe_get_elements_(self, by, xpath, err):
        """Return the first element matching the locator.

        Args:
            by: Locator strategy passed to ``find_elements``.
            xpath: Locator value (used with whatever strategy ``by`` names).
            err: Error message carried by the exception on failure.

        Raises:
            SpiderException: With ``err`` and ``xpath`` when nothing matches.
        """
        matches = self.driver.find_elements(by=by, value=xpath)
        if matches:
            return matches[0]
        # Report both the error message and the locator that failed.
        raise SpiderException(err, xpath)

    def get_page(self, url, sleep=1):
        """Open ``url`` and then pause for ``sleep`` seconds."""
        self.driver.get(url)
        time.sleep(sleep)

    def input_with_xpath(self, by, xpath, keys, err):
        """Send ``keys`` to the first element matching the locator.

        Raises:
            SpiderException: When no element matches.
        """
        return self.safe_get_elements_(by, xpath, err).send_keys(keys)

    def click_with_xpath(self, by, xpath, err_msg, front_sleep=3):
        """Wait ``front_sleep`` seconds, then click the first matching element.

        Raises:
            SpiderException: When no element matches or the element is not
                enabled.
        """
        time.sleep(front_sleep)
        target = self.safe_get_elements_(by, xpath, err_msg)
        if not target.is_enabled():
            raise SpiderException(err_msg, xpath)
        return target.click()

    def check_loging_with(self, by, xpath):
        """Verify that a login attempt succeeded.

        Three checks run in order; the first one that fails raises:

        1. Any element matching ``by``/``xpath`` with non-empty text is
           treated as a login error message.
        2. The browser must no longer be on ``LOGIN_URL``.
        3. An element matching ``BUTTON_LOGOUT_ELEMENT`` (looked up with the
           same ``by`` strategy) must be present.

        Raises:
            SpiderException: Describing whichever check failed.
        """
        # 1) Error text shown by the page after a failed login.
        for candidate in self.driver.find_elements(by, xpath):
            text = candidate.text
            if text:
                raise SpiderException(text, xpath=xpath)

        # 2) Still on the login page: the redirect did not happen.
        if self.driver.current_url == LOGIN_URL:
            raise SpiderException(LogErrorMsg.LOGIN_PAGE_JUMP_FAILED, xpath=xpath)

        # 3) Redirected, but the logged-in page has not rendered its logout control.
        if not self.driver.find_elements(by, BUTTON_LOGOUT_ELEMENT):
            raise SpiderException(LogErrorMsg.LOGIN_PAGE_NOT_REFRESHED, xpath=xpath)



class BaseSpider(ABC):
    """Base class for spiders that share one browser session.

    Holds the WebDriver, a ``SeleniumUtil`` wrapper around it, and the
    helpers used to log in and to persist operation-log records.
    """

    def __init__(self, driver):
        """Bind the spider to ``driver`` and prepare its helpers."""
        self.driver = driver
        self.browser_ = SeleniumUtil(self.driver)
        # Scratch buffer for the row currently being written by add_record().
        self.records = list()
        # Captured once here; every record written by this instance uses it.
        self.now_time = datetime.datetime.now()

    def set_driver(self, driver):
        """Replace the WebDriver on both the spider and its browser helper."""
        self.driver = driver
        self.browser_.set_driver(driver)

    def add_record(self, urn, class_name, operate, exception, ele_message):
        """Write one error record to the operation log.

        Args:
            urn: URN associated with the failure.
            class_name: Name of the class that reported the failure.
            operate: Operation that was being performed.
            exception: Error message.
            ele_message: Information about the element involved.
        """
        self.records.extend([self.now_time, urn, class_name, operate, exception, ele_message])
        try:
            QueryResult.insert_data(INSERT_OPERATION_LOG, self.records)
        finally:
            # Always reset the buffer, even if the insert raises, so a failed
            # write cannot leak stale values into the next record.
            self.records.clear()

    def get_search_url(self, url):
        """Open ``url`` in the browser."""
        self.browser_.get_page(url)

    def check_login(self):
        """Log in through the login page and verify the result.

        Returns:
            True when every step succeeds. False when any step raises
            ``SpiderException``; the failure is logged and stored via
            ``add_record`` (with ``False`` passed as the urn).
        """
        self.browser_.get_page(LOGIN_URL)  # open the login page
        try:
            self.browser_.input_with_xpath(By.NAME, TEXTBOX_LOGIN_ELEMENT, settings['dfm_username'],
                                           ErrorMsg.TEXT_BOX_NOT_DISPLAYED)
            self.browser_.input_with_xpath(By.NAME, TEXTBOX_PWD_ELEMENT,
                                           generate_utils.transform_str(settings['dfm_pwd']),
                                           ErrorMsg.TEXT_BOX_NOT_DISPLAYED)
            self.browser_.click_with_xpath(By.NAME, BUTTON_LOGIN_ELEMENT,
                                           LogErrorMsg.LOGIN_CLICK_BUTTON_NOT_DISPLAYED)
            self.browser_.check_loging_with(By.XPATH, CUSTOM_LOGIN_ELEMENT)
            return True

        except SpiderException as e:
            log_error(e)
            self.add_record(False, self.__class__.__name__, Operate.LOGIN.value, e.message, e.xpath)
            return False


class EmailSpider(BaseSpider):
    """Spider that navigates from the search page to the email-ingestion data."""

    def __init__(self, driver):
        """Initialise the spider through the parent constructor with ``driver``."""
        # Named __init__ (not __int__) so it actually runs as the constructor
        # instead of defining an integer-conversion hook.
        super().__init__(driver)

    def get_advanced_and_ingestion(self):
        """Open the search page, then click "Advanced Search" and "Email Ingestion data".

        Raises:
            SpiderException: Propagated from ``click_with_xpath`` when either
                link is missing or not enabled.
        """
        self.get_search_url(SEARCH_PAGE)  # search page
        self.browser_.click_with_xpath(By.LINK_TEXT, ADVANCED_SEARCH_ELEMENT,
                                       EmailErrorMsg.ADVANCED_SEARCH_NOR_DISPLAYED)  # "Advanced Search" link
        self.browser_.click_with_xpath(By.LINK_TEXT, EMAIL_INGESTION_DATA_ELEMENT,
                                       EmailErrorMsg.INGESTION_DATA_NOR_DISPLAYED)  # "Email Ingestion data" link




class DFMProcessor(BaseProcessor):
    """Base processor: start a browser, log in, then run ``process()``.

    Subclasses override ``process()`` for the actual work and
    ``get_download_path()`` to choose the browser's download directory.
    """

    def __init__(self):
        BaseProcessor.__init__(self)
        self.name = 'DFMProcessor'
        # A real lock, so start() works even if a subclass does not replace it.
        self.lock = threading.Lock()
        # No browser exists until start() creates one; the spider receives the
        # live driver through set_driver() at that point.
        self.driver = None
        self.display = settings['CHROME_DISPLAY']
        self.baseSpider = BaseSpider(self.driver)

    def process(self):
        """Hook for subclasses; does nothing in the base class."""
        pass

    def get_download_path(self):
        """Return the download directory handed to the browser ('' here)."""
        return ''

    def start(self):
        """Run one full cycle: create the browser, log in, process, clean up.

        The cycle runs while holding ``self.lock``. Any exception raised in
        the cycle is passed to ``log_error`` and not re-raised. The browser
        created for this cycle is always quit afterwards.
        """
        # Acquire outside the try block: the lock is released exactly once,
        # and only if it was actually acquired.
        with self.lock:
            driver = None
            try:
                logger.info(r" >>> 开始执行 {}".format(self.name))
                driver = get_chrome_driver(self.get_download_path(), display=self.display)
                self.driver = driver
                self.baseSpider.set_driver(driver)  # hand the live driver to the spider
                if self.baseSpider.check_login():
                    self.process()
                logger.info(r" >>> 完成执行 {}".format(self.name))
            except Exception as e:
                log_error(e)
            finally:
                # Only quit a browser that was really created in this cycle;
                # if driver creation failed there is nothing to close.
                if driver is not None:
                    driver.quit()



@singleton
class EmailIngestionProcessor(DFMProcessor):
    """Processor that opens the email-ingestion data view after login."""

    def __init__(self):
        DFMProcessor.__init__(self)
        self.lock = threading.Lock()
        self.name = "EmailIngestionProcessor"
        # Built with whatever driver exists at construction time; process()
        # re-binds it to the live browser before use.
        self.emailSpider = EmailSpider(self.driver)

    def get_download_path(self):
        """Return today's download directory, creating it if needed.

        The directory is ``settings['email_ingestion_path']`` joined with the
        current date formatted as ``%Y-%m-%d``.
        """
        cur_date = get_strtime('%Y-%m-%d')
        download_path = os.path.join(settings['email_ingestion_path'], cur_date)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(download_path, exist_ok=True)
        return download_path

    def process(self):
        """Navigate to the email-ingestion data using the current browser."""
        # start() creates a fresh driver for each run; without this re-bind
        # the spider would keep using the driver it was constructed with.
        self.emailSpider.set_driver(self.driver)
        self.emailSpider.get_advanced_and_ingestion()



# 
# lock = threading.Lock()
# 
#

def run():
    """Obtain the email-ingestion processor and run it once."""
    processor = EmailIngestionProcessor()
    processor.start()


# 
if __name__ == "__main__":
    # Script entry point: run the processor once when executed directly.
    run()