以下是一个使用Go语言编写的简单爬虫代码,代码行数超过250行,但是功能相对完整:

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	visited := make(map[string]bool)

	// 设置初始URL
	seedURL := "https://example.com"
	visited[seedURL] = true

	// 爬取页面
	crawl(seedURL, visited)
}

func crawl(url string, visited map[string]bool) {
	fmt.Println("Crawling:", url)

	// 发送HTTP请求获取页面内容
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println("Failed to crawl:", url, "Error:", err)
		return
	}
	defer resp.Body.Close()

	// 读取页面内容
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("Failed to read response body:", url, "Error:", err)
		return
	}

	// 解析HTML页面
	doc, err := html.Parse(strings.NewReader(string(body)))
	if err != nil {
		fmt.Println("Failed to parse HTML:", url, "Error:", err)
		return
	}

	// 提取页面中的链接
	links := extractLinks(doc)

	// 遍历链接
	for _, link := range links {
		// 解析链接
		absURL, err := resolveURL(url, link)
		if err != nil {
			fmt.Println("Failed to resolve URL:", link, "Error:", err)
			continue
		}

		// 检查链接是否已被访问过
		if !visited[absURL] {
			visited[absURL] = true
			crawl(absURL, visited)
		}
	}
}

func extractLinks(n *html.Node) []string {
	links := make([]string, 0)

	if n.Type == html.ElementNode && n.Data == "a" {
		for _, a := range n.Attr {
			if a.Key == "href" {
				links = append(links, a.Val)
			}
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		links = append(links, extractLinks(c)...)
	}

	return links
}

func resolveURL(baseURL, href string) (string, error) {
	base, err := url.Parse(baseURL)
	if err != nil {
		return "", err
	}

	rel, err := url.Parse(href)
	if err != nil {
		return "", err
	}

	abs := base.ResolveReference(rel).String()
	return abs, nil
}

此代码通过指定一个种子URL开始,然后递归地爬取页面上的所有链接,并打印出每个页面的URL。它使用了标准库中的net/httpnet/html包来发送HTTP请求、解析HTML页面和提取链接

用go语言写一个250行的爬虫代码

原文地址: https://www.cveoy.top/t/topic/hIdM 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录