import logging
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Fetch one listing page and collect its attribute list.
def get_source_info(url, timeout=10):
    """Fetch *url* and return the label/value pairs found under ``.info-list li``.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping the text of the first ``<span>`` in each list item to
        the text of the second ``<a>``/``<span>`` element in that item, or to
        ``''`` when the item has no second element.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    info_dict = {}
    for item in soup.select('.info-list li'):
        spans = item.select('span')
        if not spans:
            # Without a label span there is nothing to use as the key.
            continue
        # 'a, span' is a selector list (either tag, in document order).
        # The previous 'a|span' is CSS namespace syntax and does not mean
        # "a or span".
        candidates = item.select('a, span')
        value = candidates[1].text if len(candidates) > 1 else ''
        info_dict[spans[0].text] = value
    return info_dict
# Fetch a listing detail page: its title plus its parameter list.
def get_source_details(url, timeout=10):
    """Fetch *url* and return its title and ``.house-parameter li`` pairs.

    Args:
        url: Absolute URL of the detail page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, info_dict)`` tuple. ``title`` is the text of the first
        ``.title-bar-01`` element, or ``''`` when the page has none.
        ``info_dict`` maps the text of the first ``<span>`` of each list item
        to the text of its second ``<span>``, or to ``''`` when there is no
        second span.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    title_node = soup.select_one('.title-bar-01')
    title = title_node.text if title_node is not None else ''

    info_dict = {}
    for item in soup.select('.house-parameter li'):
        spans = item.select('span')
        if not spans:
            # Without a label span there is nothing to use as the key.
            continue
        # A label without a value is kept with an empty value rather than
        # raising IndexError, matching get_source_info's fallback.
        info_dict[spans[0].text] = spans[1].text if len(spans) > 1 else ''
    return title, info_dict
# Scrape the listing index page and enrich each entry with its own attributes.
def get_source_data(url, timeout=10):
    """Fetch the listing page at *url* and return one dict per listing.

    For every ``.house-list-wrap .house-list-item`` element the first link
    with an ``href`` is followed through ``get_source_info``; the resulting
    dict is then given ``'title'`` and ``'price'`` entries taken from the
    item's ``.house-title`` and ``.price`` elements.

    Args:
        url: Absolute URL of the listing index page.
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A list of dicts, one per listing item, in page order. ``'title'`` and
        ``'price'`` are always present (``''`` when the element is missing).

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched. Failures on individual listing links are logged and the
            listing is kept with only its title and price.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    source_data = []
    for item in soup.select('.house-list-wrap .house-list-item'):
        info_dict = {}
        link = item.select_one('a[href]')
        if link is not None:
            # hrefs may be relative; requests.get needs an absolute URL.
            detail_url = urljoin(url, link['href'])
            try:
                info_dict = get_source_info(detail_url)
            except requests.RequestException as exc:
                # One unreachable listing should not discard the whole scrape.
                logging.getLogger(__name__).warning(
                    'could not fetch listing %s: %s', detail_url, exc
                )

        title_node = item.select_one('.house-title')
        price_node = item.select_one('.price')
        # Assigned last so they override any same-named key from the listing.
        info_dict['title'] = title_node.text if title_node is not None else ''
        info_dict['price'] = price_node.text if price_node is not None else ''
        source_data.append(info_dict)
    return source_data
# Fetch the second-hand housing listings.
source_data = get_source_data('http://ershou.jilin.cn/ershoufang/')
df = pd.DataFrame(source_data)

# The scraper only sets 'title' and 'price' itself; every other column comes
# from label text on the listing page, so 'area' is not guaranteed to exist
# (and an empty scrape yields a DataFrame with no columns at all).
missing = [col for col in ('area', 'price') if col not in df.columns]
if missing:
    raise KeyError(
        f"scraped data has no column(s) {missing}; available: {list(df.columns)}"
    )

# Scraped values are page text, so extract the first number from each cell
# before plotting; cells with no number become NaN. `df` keeps the raw text.
numeric = pd.DataFrame({
    col: pd.to_numeric(
        df[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False),
        errors='coerce',
    )
    for col in ('area', 'price')
})
paired = numeric.dropna()
prices = numeric['price'].dropna()

# Data visualisation.
# NOTE(review): the Chinese titles need a matplotlib font with CJK glyphs;
# confirm one is configured on the target machine.
plt.figure(figsize=(20, 10))
plt.subplot(1, 2, 1)
plt.scatter(paired['area'], paired['price'])
plt.xlabel('Area (平方米)')
plt.ylabel('Price (万元)')
plt.title('二手房面积与价格关系散点图')
plt.subplot(1, 2, 2)
plt.hist(prices, bins=50)
plt.xlabel('Price (万元)')
plt.ylabel('Count')
plt.title('二手房价格分布直方图')
plt.show()
这段代码首先定义了函数 get_source_info：请求给定的房源链接，并把页面中 .info-list li 下的各项信息解析为键值对字典。
然后定义了函数 get_source_details，用来解析房源详情页的标题和详细参数（当前脚本中并未调用它）。
最后定义了函数 get_source_data：抓取房源列表页，对其中每条房源调用 get_source_info，并补充标题和价格，返回由字典组成的列表。
脚本随后将该列表转换为 DataFrame 格式，以便进行数据可视化分析。
代码中使用了 matplotlib.pyplot 库来绘制散点图和直方图，展示了房源面积与价格之间的关系以及房源价格的分布情况。