import re

import requests
from bs4 import BeautifulSoup

def _extract_host(link):
    """Return the host part of an absolute http(s) URL, or None if *link*
    is not an absolute http(s) URL (e.g. a relative href)."""
    hosts = re.findall(r'https?://([^/]*)', link)
    return hosts[0] if hosts else None


def get_subdomains(url):
    """Crawl the homepage at *url* and collect candidate subdomain hosts.

    Fetches the page, walks every ``<a>`` tag, and returns the set of
    hostnames from absolute links whose host contains the page's own host.

    Args:
        url: Absolute http(s) URL of the page to crawl.

    Returns:
        set[str]: Hostnames harvested from the page's links.

    Raises:
        ValueError: If *url* is not an absolute http(s) URL.
        requests.RequestException: On network failure or timeout.
    """
    domain = _extract_host(url)
    if domain is None:
        raise ValueError(f"not an absolute http(s) URL: {url!r}")
    subdomains = set()

    # Fetch the homepage; a timeout prevents hanging forever on a dead host.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or href == url:
            continue
        host = _extract_host(href)
        # Bug fix: the original indexed re.findall(...)[0] on any href that
        # merely *contained* the domain string, so a relative link such as
        # "/s?site=www.baidu.com" crashed with IndexError. Skip non-absolute
        # links instead, and compare against the extracted host.
        # NOTE(review): substring matching also accepts unrelated hosts like
        # "www.baidu.com.evil.com" — consider host.endswith("." + domain).
        if host is not None and domain in host:
            subdomains.add(host)

    return subdomains

# Demo / manual test entry point.
if __name__ == '__main__':
    url = 'https://www.baidu.com'
    subdomains = get_subdomains(url)
    print(subdomains)  # fixed: original line was missing the closing parenthesis

# Original prompt: 写一段能够爬取一个网站子域名的python爬虫
# (Write a Python crawler that scrapes a website's subdomains.)

# Source: https://www.cveoy.top/t/topic/eUJk — copyright belongs to the author;
# do not repost or scrape (原文地址, 著作权归作者所有, 请勿转载和采集).

# (Site advertisement, kept as a comment so it is out of the code path:
# 免费AI点我,无需注册和登录 — "free AI, no registration or login required")