# coding: utf-8

import requests
from bs4 import BeautifulSoup
import urllib.parse as urlparse
import re

class UrlManager:
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled; each response is parsed for new links, recursively, until no new URLs arrive and the program exits
        self.old_urls = set()  # URLs already crawled that belong to this site

    def add_new_url(self, url):
        if url is None:
            return None
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return None
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        # Added by the author of this script:
        if len(self.old_urls) > 10000:  # if more than 10000 links have been crawled, clear both sets so the main loop exits
            self.new_urls = set()
            self.old_urls = set()

        return len(self.new_urls) != 0  # True if there are still new URLs to crawl; False means nothing is left and the main loop ends

    def get_new_url(self):
        new_url = self.new_urls.pop()  # take one URL from the pending set
        self.old_urls.add(new_url)  # move it to the crawled set
        return new_url
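# Illustrative usage (not part of the original script): add_new_url() deduplicates
# against both the pending and the crawled sets, so re-discovered links are ignored.
#   um = UrlManager()
#   um.add_new_url("http://example.com/")  # queued
#   um.add_new_url("http://example.com/")  # ignored: already in new_urls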

class HtmlDownLoader:
    def download(self, url):
        if url is None:
            return None
        r = requests.get(url)
        if r.status_code != 200:
            return None
        return r.text
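# Note (a suggestion, not in the original): requests.get() is called here without
# a timeout, so one slow host can stall the whole crawl; passing e.g.
# requests.get(url, timeout=10) would bound each request.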

class HtmlParser:
    def __init__(self):
        self.foreign_urls = set()  # root domains of off-site links seen while parsing

    def _get_root_domain(self, url):
        # Should a regex check go here for URLs starting with "/"? If the parsed
        # netloc is empty, the link also counts as belonging to the current domain.
        if url is None:
            return None
        try:
            url_info = urlparse.urlparse(url)
            return url_info.netloc
        except Exception:
            return None

    def _get_new_urls(self, soup, current_url):
        new_urls = set()
        links = soup.find_all("a")
        for link in links:
            new_url = link.get('href')
            if new_url is not None:
                new_url = new_url.lstrip()

            new_url_root_domain = self._get_root_domain(new_url)
            if new_url_root_domain == '':
                pass  # relative link: empty netloc, so it belongs to the current domain
            elif new_url_root_domain is not None:
                if self._get_root_domain(current_url) != new_url_root_domain:
                    self.foreign_urls.add(new_url_root_domain)
                    continue  # off-site link: record its domain and skip it
            # elif new_url_root_domain is None:
            #     pass

            new_full_url = urlparse.urljoin(current_url, new_url)
            new_urls.add(new_full_url)

        return new_urls

    def parse(self, html_content, current_url):
        if html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self._get_new_urls(soup, current_url)
        return new_urls

    def get_foreign_urls(self):
        return self.foreign_urls
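# Illustrative note (not from the original): _get_new_urls relies on urljoin to
# resolve relative hrefs (empty netloc) against the current page, e.g.:
#   urlparse.urljoin("http://example.com/a/b.html", "/c.html") -> "http://example.com/c.html"
#   urlparse.urljoin("http://example.com/a/b.html", "c.html")  -> "http://example.com/a/c.html"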

class SpiderMain:
    def __init__(self):
        self.urls = UrlManager()
        self.html_downloader = HtmlDownLoader()
        self.parser = HtmlParser()

    def craw(self, root_url, name):
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            new_url = self.urls.get_new_url()
            try:
                html_content = self.html_downloader.download(new_url)
                new_urls = self.parser.parse(html_content, new_url)
                self.urls.add_new_urls(new_urls)
                with open('%s_data.txt' % name, 'a') as f:
                    f.write(new_url + '\n')
                print("craw %s" % new_url)
            except Exception as e:
                print("Error: %s" % e)
        print(len(self.urls.old_urls), self.urls.old_urls)
        print(len(self.parser.foreign_urls), self.parser.foreign_urls)

if __name__ == "__main__":
    name = input("name:")
    with open('%s.txt' % name) as fp:
        for u in fp:
            root_url = u.strip('\r\n')
            with open('%s_data.txt' % name, 'a') as f:
                pass  # ensure the output file exists before crawling
            # root_url = u if '://' in u else 'http://' + u
            # root_url = "http://www.zzyidc.com/"
            obj_spider = SpiderMain()
            obj_spider.craw(root_url, name)
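# Usage sketch (file names here are examples, not from the original): create a
# seed file "<name>.txt" with one root URL per line, e.g. "example.txt"
# containing "http://example.com/". Run the script, enter "example" at the
# "name:" prompt, and crawled URLs are appended to "example_data.txt".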


