Python 实现百度搜索爬虫：支持图片展示和随机延迟
以下是经过修改的完整代码：
import io
import random
import time
import tkinter as tk
import webbrowser

import requests
from bs4 import BeautifulSoup
from PIL import ImageTk, Image
def crawl_baidu(keyword):
    """Fetch one page of Baidu web-search results for *keyword*.

    A random 0.5-1.5 second pause is taken before the request so that
    repeated searches are not sent back-to-back.

    Args:
        keyword: The search phrase. It is passed as the ``wd`` query
            parameter and URL-encoded by ``requests``, so spaces, ``&``,
            ``#`` and non-ASCII text are transmitted intact.

    Returns:
        A list of ``(title, url, image_url)`` tuples, one per
        ``<div class="result">`` element that has both an ``<h3>`` heading
        and a link with an ``href``. ``image_url`` is ``None`` when the
        result has no ``<img>`` or the ``<img>`` has no ``src`` attribute.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Random delay before each request.
    time.sleep(random.uniform(0.5, 1.5))

    # Let requests build the query string instead of formatting the keyword
    # into the URL by hand; the timeout keeps a stalled connection from
    # blocking the caller forever.
    response = requests.get(
        'https://www.baidu.com/s',
        params={'wd': keyword},
        headers=headers,
        timeout=10,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []
    for result in soup.find_all('div', class_='result'):
        heading = result.find('h3')
        link = result.find('a', href=True)
        # Skip containers that lack a heading or a usable link rather than
        # failing the whole search on one malformed entry.
        if heading is None or link is None:
            continue
        image = result.find('img')
        image_url = image.get('src') if image is not None else None
        results.append((heading.get_text(), link['href'], image_url))
    return results
def open_url(url):
    """Open *url* with the ``webbrowser`` module (the system's browser)."""
    webbrowser.open(url)
def show_image(image_url):
    """Download the image at *image_url* and open it in an external viewer.

    Args:
        image_url: Absolute URL of the image to fetch.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the downloaded bytes cannot be decoded as an image.
    """
    image_response = requests.get(image_url, timeout=10)
    # Fail on HTTP errors here so an HTML error page is never handed to the
    # image decoder.
    image_response.raise_for_status()
    image = Image.open(io.BytesIO(image_response.content))
    image.show()
def _load_thumbnail(image_url, max_size=(120, 120)):
    """Return a Tk image for *image_url*, or ``None`` if it cannot be loaded.

    The image is downloaded, shrunk in place to fit within *max_size*
    (aspect ratio preserved) and wrapped in ``ImageTk.PhotoImage`` so it can
    be embedded in a Tk widget.
    """
    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))
        image.thumbnail(max_size)
        return ImageTk.PhotoImage(image)
    except (requests.RequestException, OSError):
        # A thumbnail is optional; a bad or unsupported URL must not abort
        # rendering of the remaining results.
        return None


def search():
    """Run a search for the text in ``entry`` and render it in ``result_text``.

    Each result is shown as a numbered title, a clickable link and, when a
    thumbnail could be loaded, a clickable image that opens via
    ``show_image``. Results from a previous search are removed first.
    """
    keyword = entry.get()
    search_results = crawl_baidu(keyword)

    # Clear the previous search: indices restart at 1, so stale text would
    # otherwise share the per-result tags created below.
    result_text.delete('1.0', tk.END)
    # Tk keeps only a weak link to PhotoImage objects; hold references on the
    # widget so the thumbnails are not garbage-collected and blanked out.
    result_text.thumbnails = []

    if not search_results:
        result_text.insert(tk.END, '没有搜索结果\n')
        return

    result_text.tag_configure('link', foreground='blue', underline=True)
    for index, (title, url, image_url) in enumerate(search_results, start=1):
        result_text.insert(tk.END, f'{index}. ', 'index')
        result_text.insert(tk.END, f'{title}\n', 'title')

        # 'link' carries the shared styling; the per-result tag carries the
        # click binding. Binding on one shared tag would be overwritten on
        # every iteration, making every link open the last URL.
        link_tag = f'link-{index}'
        result_text.insert(tk.END, url, ('link', link_tag))
        result_text.tag_bind(link_tag, '<Button-1>', lambda event, url=url: open_url(url))
        result_text.insert(tk.END, '\n')

        if image_url:
            thumbnail = _load_thumbnail(image_url)
            if thumbnail is not None:
                result_text.thumbnails.append(thumbnail)
                # Remember where the image lands so the click tag can be
                # attached to exactly that one embedded-image character.
                position = result_text.index('end-1c')
                result_text.image_create(position, image=thumbnail, padx=10, pady=10)
                image_tag = f'img-{index}'
                result_text.tag_add(image_tag, position)
                result_text.tag_bind(
                    image_tag,
                    '<Button-1>',
                    lambda event, image_url=image_url: show_image(image_url),
                )
                result_text.insert(tk.END, '\n')

        result_text.insert(tk.END, '\n')
# Build the UI. ``entry`` and ``result_text`` are module-level names because
# search() reads them as globals.
window = tk.Tk()
window.title('百度搜索')
window.geometry('800x600')

label = tk.Label(window, text='请输入关键词:')
label.pack()

entry = tk.Entry(window)
entry.pack()

search_button = tk.Button(window, text='搜索', command=search)
search_button.pack()

# The scrollbar is packed before the text widget so it claims the right edge
# of the window for its full height.
scrollbar = tk.Scrollbar(window)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

result_text = tk.Text(window, yscrollcommand=scrollbar.set)
# expand=True hands the remaining window space to the text widget; with
# fill alone it stays at its requested height and leaves the rest empty.
result_text.pack(fill=tk.BOTH, expand=True)
scrollbar.config(command=result_text.yview)

window.mainloop()
该代码实现了以下功能:
- 爬取百度搜索结果的标题、链接和图片。
- 在结果列表中展示图片，点击图片可以查看大图。
- 加入随机延迟功能，模拟真实用户行为，避免被网站封禁。
希望这个代码能够帮助你了解如何编写Python爬虫程序。
原文地址: http://www.cveoy.top/t/topic/rVq 著作权归作者所有。请勿转载和采集!