用Python爬取电影数据并可视化分析
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
# Fetch the page content.
def get_html(url, timeout=10, headers=None):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely.
        headers: Optional mapping of HTTP headers. When omitted, a
            browser-like ``User-Agent`` is sent.

    Returns:
        The response text, or ``None`` when the request raises a
        ``requests.RequestException`` or the status code is not 200.
    """
    if headers is None:
        # Some sites refuse the default python-requests User-Agent, so send
        # a browser-like one unless the caller supplies its own headers.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0 Safari/537.36'
            ),
        }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None

    # Make the "not OK" outcome explicit instead of falling off the end.
    if response.status_code != 200:
        return None
    return response.text
# Parse the page and extract the table rows.
def parse_data(html):
    """Extract one ``[rank, movie, score, comment]`` row per table entry.

    Rows are the ``tr`` elements whose ``id`` starts with ``item`` inside
    ``table#content_table``. Each value is the stripped text of one of the
    first four ``td`` cells; the movie name is read from the link inside
    the second cell.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields an
            empty result instead of an error.

    Returns:
        A list of four-element lists of strings. Rows missing any of the
        expected cells are skipped.
    """
    # A failed download is represented as None by the fetch step; treat it
    # (and an empty page) as "no rows" rather than crashing in the parser.
    if not html:
        return []

    soup = BeautifulSoup(html, 'lxml')
    data = []
    for item in soup.select('table#content_table tr[id^=item]'):
        cells = [
            item.select_one('td:nth-of-type(1)'),
            item.select_one('td:nth-of-type(2) a'),
            item.select_one('td:nth-of-type(3)'),
            item.select_one('td:nth-of-type(4)'),
        ]
        # select_one returns None when nothing matches; skip malformed rows
        # instead of raising AttributeError on ``None.text``.
        if any(cell is None for cell in cells):
            continue
        data.append([cell.text.strip() for cell in cells])
    return data
# Save the rows to a CSV file.
def save_to_csv(data, filename):
    """Write *data* to *filename* as CSV under a fixed four-column header.

    Args:
        data: Sequence of four-element rows, matching the columns
            rank, movie name, score and comment count.
        filename: Path of the CSV file to create or overwrite.
    """
    df = pd.DataFrame(data, columns=['排名', '电影名称', '评分', '评论数'])
    # 'utf-8-sig' prefixes the file with a BOM; presumably chosen so that
    # spreadsheet tools detect the encoding of the Chinese header row.
    df.to_csv(filename, index=False, encoding='utf-8-sig')
# Plot a histogram of the scores.
def plot_histogram(data):
    """Show a 25-bin histogram of the numeric scores found in *data*.

    Args:
        data: Sequence of rows whose third element (index 2) is the score,
            as a string or number. Rows whose score is missing, not numeric
            or not finite are ignored.
    """
    import math  # local import keeps this block independent of file-level imports

    scores = []
    for row in data:
        # ``str.isdigit`` is False for decimal strings such as "9.1", which
        # would discard those scores; attempt the conversion instead.
        try:
            score = float(row[2])
        except (IndexError, TypeError, ValueError):
            continue
        if math.isfinite(score):
            scores.append(score)

    # NOTE(review): the Chinese labels below likely need a CJK-capable font
    # configured in matplotlib to render correctly; confirm on the target
    # machine.
    plt.hist(scores, bins=25, color='blue', edgecolor='white')
    plt.xlabel('评分')
    plt.ylabel('数量')
    plt.title('评分直方图')
    plt.show()
# Entry point.
def main():
    """Fetch the chart page, export its rows to CSV and plot the scores."""
    url = 'https://movie.douban.com/chart'
    html = get_html(url)
    if html is None:
        # get_html returns None when the download fails; stop here instead
        # of handing None to the parser.
        print(f'Failed to download {url}')
        return

    data = parse_data(html)
    save_to_csv(data, 'douban_movies.csv')
    plot_histogram(data)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
这段代码实现了从豆瓣电影排行榜页面(https://movie.douban.com/chart)爬取数据的功能,并将数据保存到CSV文件,最后绘制了电影评分的直方图。代码使用了requests库获取网页内容,BeautifulSoup进行网页解析,pandas处理数据,以及matplotlib进行数据可视化。
评论已关闭