# Lianjia (Shijiazhuang) second-hand housing scraper: collects listing data and transaction info
import csv
import os
import time

import pandas as pd
import requests
from lxml import etree
# Build the URL of every result page
def Get_url(url, max_page=199):
    """Build the URL of every result page to crawl.

    Pages are addressed by appending ``pg<N>/`` to the listing root, with N
    starting at 1.

    Args:
        url: Listing root URL. It is concatenated directly with ``pg<N>/``,
            so it should already end with ``/``.
        max_page: Number of the last page to include. Defaults to 199, the
            range the previously hard-coded loop produced.

    Returns:
        A list of page URLs from ``url + 'pg1/'`` through
        ``url + 'pg<max_page>/'``; empty when ``max_page`` is below 1.
    """
    return [url + 'pg' + str(page) + '/' for page in range(1, max_page + 1)]
# Collect the detail-page URL of every listing
def Get_house_url(all_url, headers):
    """Crawl each result page and pass its listing links to Analysis_html.

    A page whose request fails is reported and skipped, so a single network
    error does not abort the rest of the crawl.

    Args:
        all_url: Iterable of result-page URLs (e.g. the list from Get_url).
        headers: HTTP headers sent with every request.
    """
    for page_no, page_url in enumerate(all_url, start=1):
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(page_url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print("第%s页请求失败: %s" % (page_no, exc))
        else:
            html = etree.HTML(r.text)
            # href of every listing entry on this result page
            url_ls = html.xpath("//ul[@class='sellListContent']/li/a/@href")
            Analysis_html(url_ls, headers)
            # Report the 1-based page number rather than the page URL.
            print("第%s页爬完了" % page_no)
        # Wait 4 seconds before requesting the next page.
        time.sleep(4)
# Extract the detail information of every listing
def Analysis_html(url_ls, headers):
    """Fetch each listing's detail page, extract its fields and save them.

    A listing is skipped (with a message) when its page cannot be fetched or
    contains no community name, instead of raising and stopping the crawl.

    Args:
        url_ls: Iterable of listing detail-page URLs.
        headers: HTTP headers sent with every request.
    """
    for house_url in url_ls:
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(house_url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print("房源请求失败,已跳过: %s (%s)" % (house_url, exc))
            continue

        html = etree.HTML(r.text)

        # Community name: split() tokenises the text on whitespace; the
        # result stays a list because Save_data reads its first item.
        name_nodes = html.xpath("//div[@class='communityName']/a/text()")
        name = name_nodes[0].split() if name_nodes else []
        if not name:
            print("未找到小区名,已跳过: %s" % house_url)
            continue

        money = html.xpath("//span[@class='total']/text()")  # price
        area = html.xpath("//span[@class='info']/a[1]/text()")  # area
        data = html.xpath("//div[@class='content']/ul/li/text()")  # basic attributes

        Save_data(name, money, area, data)
# Save the scraped information to a file
def Save_data(name, money, area, data):
    """Append one listing as a single row of raw_data.csv.

    The row is laid out as: community name, price value(s), area, then the
    basic-attribute values.

    Args:
        name: Sequence whose first item is the community name; an empty
            sequence yields an empty cell.
        money: List of price strings; extended into the row as-is.
        area: Area name, either a plain string or a list of strings (the raw
            xpath result). A list is joined into one cell.
        data: List of basic-attribute strings; extended into the row as-is.
    """
    name_cell = name[0] if name else ''
    # Wrapping the xpath result list in another list would write the list's
    # repr (e.g. "['x']") into the CSV cell, so flatten it to text first.
    area_cell = area if isinstance(area, str) else ' '.join(area)
    result = [name_cell] + list(money) + [area_cell] + list(data)

    # The with-block closes the file; no explicit close() is needed.
    with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
        csv.writer(f).writerow(result)
    print('已写入')
# Additionally collect transaction info for Lianjia second-hand housing in Shijiazhuang
def Get_transaction_info(url, headers):
    """Fetch the transactions page and append its deal-cycle text to a CSV.

    Every text node found under ``<span class="dealCycleTxt">`` is written as
    one row of transaction_info.csv (appended, UTF-8 with BOM).

    Args:
        url: URL of the transactions page.
        headers: HTTP headers sent with the request.

    Raises:
        requests.RequestException: If the request fails or does not answer
            within 10 seconds.
    """
    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(r.text)
    transaction_info = html.xpath("//span[@class='dealCycleTxt']/text()")

    # The with-block closes the file; no explicit close() is needed.
    with open(r'transaction_info.csv', 'a', encoding='utf_8_sig', newline='') as f:
        csv.writer(f).writerow(transaction_info)
    print('已写入交易情况')
# Script entry point: write the CSV header once, crawl every listing page,
# then record the transaction info.
if __name__ == '__main__':
    url = 'https://sjz.lianjia.com/ershoufang/'
    transaction_url = 'https://sjz.lianjia.com/chengjiao/'
    headers = {
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome"
                      "/72.0.3626.121 Safari/537.36"
    }
    all_url = Get_url(url)

    # Column header for raw_data.csv.
    table_label = ['小区名', '价格/万', '地区', '房屋户型', '所在楼层', '建筑面积', '户型结构',
                   '套内面积', '建筑类型', '房屋朝向', '梯户比例', '装修情况', '建筑结构', '供暖方式']

    # The file is opened in append mode, so only write the header when the
    # file is new or empty; otherwise every re-run would add another header
    # row in the middle of the data.
    if not os.path.exists('raw_data.csv') or os.path.getsize('raw_data.csv') == 0:
        with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
            wt = csv.writer(f)
            wt.writerow(table_label)

    Get_house_url(all_url, headers)
    Get_transaction_info(transaction_url, headers)
# Source: https://www.cveoy.top/t/topic/EVU (copyright belongs to the author; do not reproduce or scrape)