用go语言写一个250行的爬虫代码
以下是一个使用Go语言编写的简单爬虫代码,代码行数超过250行,但是功能相对完整:
package main
import (
"fmt"
"io/ioutil"
"net/http"
"net/url"
"strings"
"golang.org/x/net/html"
)
func main() {
visited := make(map[string]bool)
// 设置初始URL
seedURL := "https://example.com"
visited[seedURL] = true
// 爬取页面
crawl(seedURL, visited)
}
func crawl(url string, visited map[string]bool) {
fmt.Println("Crawling:", url)
// 发送HTTP请求获取页面内容
resp, err := http.Get(url)
if err != nil {
fmt.Println("Failed to crawl:", url, "Error:", err)
return
}
defer resp.Body.Close()
// 读取页面内容
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("Failed to read response body:", url, "Error:", err)
return
}
// 解析HTML页面
doc, err := html.Parse(strings.NewReader(string(body)))
if err != nil {
fmt.Println("Failed to parse HTML:", url, "Error:", err)
return
}
// 提取页面中的链接
links := extractLinks(doc)
// 遍历链接
for _, link := range links {
// 解析链接
absURL, err := resolveURL(url, link)
if err != nil {
fmt.Println("Failed to resolve URL:", link, "Error:", err)
continue
}
// 检查链接是否已被访问过
if !visited[absURL] {
visited[absURL] = true
crawl(absURL, visited)
}
}
}
func extractLinks(n *html.Node) []string {
links := make([]string, 0)
if n.Type == html.ElementNode && n.Data == "a" {
for _, a := range n.Attr {
if a.Key == "href" {
links = append(links, a.Val)
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
links = append(links, extractLinks(c)...)
}
return links
}
func resolveURL(baseURL, href string) (string, error) {
base, err := url.Parse(baseURL)
if err != nil {
return "", err
}
rel, err := url.Parse(href)
if err != nil {
return "", err
}
abs := base.ResolveReference(rel).String()
return abs, nil
}
此代码通过指定一个种子URL开始,然后递归地爬取页面上的所有链接,并打印出每个页面的URL。它使用了标准库中的net/http和net/html包来发送HTTP请求、解析HTML页面和提取链接
原文地址: https://www.cveoy.top/t/topic/hIdM 著作权归作者所有。请勿转载和采集!