import requests
from lxml import etree
import pandas as pd

url = "http://search.dangdang.com/?key=%C9%F1%BE%AD%CD%F8%C2%E7&act=input"
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 Edg/114.0.1823.43"}
接下来可以使用循环来获取一页中的所有图书信息。首先,我们可以将获取每本图书的代码封装成一个函数,然后在循环中调用该函数来获取每本图书的信息。具体代码如下所示:
import requests
from lxml import etree
import pandas as pd
def get_book_info(book):
    """Extract one book's fields from a single search-result ``<li>`` element.

    Parameters
    ----------
    book : lxml element for one result item. Queried only via relative
        XPath, so any object exposing ``.xpath(path) -> list`` works.

    Returns
    -------
    dict
        Keys: 书名, 作者, 出版社, 出版日期, 价格, 评分, 评论数, 详情.
        Missing text fields default to ``""``; a missing rating is ``0.0``.
    """

    def _first(values, default=""):
        # Result pages contain items (ads, e-books) that lack some fields;
        # an unconditional ``[0]`` would raise IndexError on those.
        return values[0] if values else default

    name = _first(book.xpath('.//p[@class="name"]/a/@title'))
    author = _first(book.xpath('.//p[@class="search_book_author"]/span/a/@title'))
    publisher = _first(book.xpath(
        './/p[@class="search_book_author"]/span[@class="search_book_publish"]/a/@title'))
    publish_date = _first(book.xpath(
        './/p[@class="search_book_author"]/span[@class="search_book_publish"]/text()'))
    price = _first(book.xpath('.//span[@class="search_now_price"]/text()'))

    # The star rating is encoded as an inline width style (e.g. "width: 90%;"),
    # where 100% corresponds to 5 stars — hence the 0.05 factor.
    raw_rating = _first(book.xpath('.//span[@class="search_star_black"]/span/@style'))
    digits = "".join(filter(str.isdigit, raw_rating))
    rating = int(digits) * 0.05 if digits else 0.0  # int("") would raise ValueError

    comment_num = _first(book.xpath('.//a[@class="search_comment_num"]/text()'))
    detail = _first(book.xpath('.//p[@class="detail"]/text()'))

    return {
        "书名": name,
        "作者": author,
        "出版社": publisher,
        "出版日期": publish_date,
        "价格": price,
        "评分": rating,
        "评论数": comment_num,
        "详情": detail,
    }
url = "http://search.dangdang.com/?key=%C9%F1%BE%AD%CD%F8%C2%E7&act=input"
headers = {
"User-Agent":'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 Edg/114.0.1823.43'
}
r = requests.get(url, headers=headers)
html = r.text
html = etree.HTML(html)
books = html.xpath('//li[@ddt-pit="1" and @class="line1"]')
book_info_list = []
for book in books:
book_info = get_book_info(book)
book_info_list.append(book_info)
df = pd.DataFrame(book_info_list)
print(df)
这样,我们就可以得到一页中所有图书的信息,并将其存储在一个DataFrame中。你可以根据需要对DataFrame进行进一步的处理和分析。
原文地址: http://www.cveoy.top/t/topic/hHCc 著作权归作者所有。请勿转载和采集!