# 石家庄链家二手房交易数据爬取
import requests
import time
import csv
import pandas as pd
from lxml import etree
def Get_url(url, max_page=199):
    """Build the list of paginated URLs for one site section.

    Args:
        url: Base URL of the section. The page suffix is appended directly,
            so the value should already end with ``/``.
        max_page: Last page number to generate (inclusive). The default of
            199 reproduces the previously hard-coded ``range(1, 200)``.

    Returns:
        A list of strings of the form ``<url>pg<N>/`` for N = 1..max_page.
        The list is empty when ``max_page`` is less than 1.
    """
    return [url + 'pg' + str(page) + '/' for page in range(1, max_page + 1)]
def Get_house_url(all_url, headers):
    """Visit every listing page and scrape each house linked from it.

    For each page URL the page is downloaded, the detail-page links found
    under ``ul.sellListContent`` are extracted, and the links are passed to
    ``Analysis_html`` for scraping and saving.

    Args:
        all_url: Iterable of listing-page URLs (as produced by ``Get_url``).
        headers: HTTP headers dict sent with every request.

    Returns:
        None. Progress is printed after each page.
    """
    for page_no, page_url in enumerate(all_url, start=1):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(page_url, headers=headers, timeout=10)
        html = etree.HTML(r.text)
        # Double quotes inside the XPath: the original nested single quotes
        # inside a single-quoted literal, which is a syntax error.
        url_ls = html.xpath('//ul[@class="sellListContent"]/li/a/@href')
        Analysis_html(url_ls, headers)
        time.sleep(4)  # pause between listing pages
        # Report the page number; the original interpolated the URL here.
        print('第%s页爬完了' % page_no)
def Analysis_html(url_ls, headers):
    """Scrape the detail page of each house and append it to the CSV.

    Args:
        url_ls: Iterable of house detail-page URLs.
        headers: HTTP headers dict sent with every request.

    Returns:
        None. Each successfully parsed house is passed to ``Save_data``.
    """
    for house_url in url_ls:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(house_url, headers=headers, timeout=10)
        html = etree.HTML(r.text)
        # XPath literals use double quotes inside; the original nested single
        # quotes inside single-quoted strings, which is a syntax error.
        name_nodes = html.xpath('//div[@class="communityName"]/a/text()')
        # Split on whitespace; Save_data uses the first token as the name.
        name = name_nodes[0].split() if name_nodes else []
        if not name:
            # Page has no community name: skip it rather than abort the
            # whole crawl with an IndexError.
            continue
        money = html.xpath('//span[@class="total"]/text()')  # total price
        area = html.xpath('//span[@class="info"]/a[1]/text()')  # district
        data = html.xpath('//div[@class="content"]/ul/li/text()')  # basic attributes
        Save_data(name, money, area, data)
def Save_data(name, money, area, data):
    """Append one house record as a row of ``raw_data.csv``.

    The row layout is: community name, price, district, then every entry of
    ``data``. The first three fields always occupy exactly one cell each so
    the row stays aligned with the header written by the script entry point.

    Args:
        name: Sequence whose first element is the community name.
        money: Price, either a string or a sequence whose first element is
            the price.
        area: District, either a string or a sequence whose first element is
            the district.
        data: Sequence of the remaining attribute strings.

    Returns:
        None. A confirmation message is printed after the row is written.
    """
    def first_cell(value):
        # Collapse a scraped field to a single CSV cell: a plain string is
        # used as-is, a sequence contributes its first element, and an empty
        # or missing value becomes an empty cell.
        if isinstance(value, str):
            return value
        return value[0] if value else ''

    # The original used ``[area]``, which nested the scraped list inside the
    # row and produced cells such as "['...']" in the CSV.
    result = [first_cell(name), first_cell(money), first_cell(area)] + list(data)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
        wt = csv.writer(f)
        wt.writerow(result)
        print('已写入')
def Get_deal_info(url, headers):
    """Download deal (transaction) pages and append them to ``deal_info.csv``.

    Args:
        url: Either a single page URL string or an iterable of page URLs.
            The script entry point passes the list built by ``Get_url``,
            which the original single-URL implementation could not handle.
        headers: HTTP headers dict sent with every request.

    Returns:
        None. One CSV row is written per page and a confirmation message is
        printed for each row.
    """
    page_urls = [url] if isinstance(url, str) else list(url)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(r'deal_info.csv', 'a', encoding='utf_8_sig', newline='') as f:
        wt = csv.writer(f)
        for index, page_url in enumerate(page_urls):
            if index:
                # Pause between consecutive pages, matching the delay used
                # when crawling the listing pages.
                time.sleep(4)
            # A timeout keeps one stalled connection from hanging the run.
            r = requests.get(page_url, headers=headers, timeout=10)
            html = etree.HTML(r.text)
            # Double quotes inside the XPath: the original nested single
            # quotes inside a single-quoted literal, which is a syntax error.
            deal_info = html.xpath('//div[@class="content"]/ul/li/text()')
            wt.writerow(deal_info)
            print('已写入')
if __name__ == '__main__':
    # Script entry point: crawl the second-hand listings into raw_data.csv,
    # then crawl the deal pages into deal_info.csv.
    import os  # local import: only the entry point needs it

    url = 'https://sjz.lianjia.com/ershoufang/'
    deal_url = 'https://sjz.lianjia.com/chengjiao/'
    headers = {
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit/537.36(KHTML,like Gecko)Chrome'
                      '/72.0.3626.121 Safari/537.36'
    }
    all_url = Get_url(url)
    deal_info_url = Get_url(deal_url)

    table_label = ['小区名', '价格/万', '地区', '房屋户型', '所在楼层', '建筑面积', '户型结构', '套内面积', '建筑类型', '房屋朝向'
                   , '梯户比例', '装修情况', '建筑结构', '供暖方式']
    # Write the header only when the file is new or empty; the file is opened
    # in append mode, so an unconditional write would add another header row
    # in the middle of the data on every re-run.
    if not os.path.exists('raw_data.csv') or os.path.getsize('raw_data.csv') == 0:
        with open(r'raw_data.csv', 'a', encoding='utf_8_sig', newline='') as f:
            wt = csv.writer(f)
            wt.writerow(table_label)

    # Crawl only after the header file handle is closed: Save_data reopens
    # the same file, and a still-buffered header would land after the rows.
    Get_house_url(all_url, headers)

    # Get_url returns a list of page URLs, so fetch the deal pages one at a
    # time instead of handing the whole list over as if it were one URL.
    for deal_page_url in deal_info_url:
        Get_deal_info(deal_page_url, headers)
        time.sleep(4)  # same delay as between listing pages
# 以上代码中,我添加了一个名为Get_deal_info()的函数,用于获取成交情况。在if __name__ == '__main__':中,我添加了获取成交情况的url,并调用Get_deal_info()函数来获取成交情况并保存到deal_info.csv文件中。请注意:deal_info.csv以追加模式('a')打开,文件不存在时会自动创建,无需在运行代码之前手动创建。
# 原文地址: https://www.cveoy.top/t/topic/EoW 著作权归作者所有。请勿转载和采集!