人工智能|网络爬虫——用Python爬取电影数据并可视化分析
import requests
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
# Fetch the raw HTML of a page.
def get_html(url, timeout=10, headers=None):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on a stalled
            connection.
        headers: Optional mapping of HTTP headers to send (for example a
            ``User-Agent``). ``None`` sends the library defaults.

    Returns:
        The page text when the server answers with status 200, otherwise
        ``None`` (non-200 status, timeout, or any other
        ``requests.RequestException``).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best-effort fetch: callers test for None instead of catching.
        return None
    if response.status_code != 200:
        return None
    return response.text
# Parse the page and extract the movie records.
def parse_html(html):
    """Extract movie name, rating and vote count from *html*.

    Every ``<div class="info">`` is treated as one movie. The name is read
    from its ``<h4>``, the rating from ``<span class="rating_num">`` and
    the vote count from ``<span class="rating_votes">``.

    Args:
        html: Page markup as a string. ``None`` or an empty string is
            accepted and yields an empty result.

    Returns:
        A list of dicts with the keys ``'name'``, ``'rating'`` and
        ``'votes'``; all values are strings with surrounding whitespace
        removed, and commas are removed from ``'votes'``. Entries missing
        any of the three elements are skipped.
    """
    # A failed download hands us None; there is nothing to parse then.
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    movie_data = []
    for div in soup.find_all('div', class_='info'):
        title = div.h4
        rating = div.find('span', class_='rating_num')
        votes = div.find('span', class_='rating_votes')
        # Skip incomplete entries instead of failing on ``None.text``.
        if title is None or rating is None or votes is None:
            continue
        movie_data.append({
            'name': title.text.strip(),
            'rating': rating.text.strip(),
            'votes': votes.text.replace(',', '').strip(),
        })
    return movie_data
# Persist the records to a CSV file.
def save_to_csv(data, filename):
    """Write *data* to *filename* as CSV without the DataFrame index.

    Args:
        data: Records to store; passed unchanged to ``pd.DataFrame``.
        filename: Destination passed unchanged to ``DataFrame.to_csv``.
    """
    pd.DataFrame(data).to_csv(filename, index=False)
# Analyse the distribution of movie ratings.
def analyze_ratings(dataframe):
    """Show a histogram of the ``rating`` column of *dataframe*.

    Values that cannot be interpreted as numbers (including missing
    cells) are ignored. When no valid rating remains, a short message is
    printed and no figure is shown.

    Args:
        dataframe: DataFrame containing a ``'rating'`` column.

    Raises:
        KeyError: If *dataframe* has no ``'rating'`` column.
    """
    # Coerce rather than ``astype(float)`` so that a single malformed cell
    # does not abort the whole analysis; dropping NaN keeps the histogram
    # range finite.
    ratings = pd.to_numeric(dataframe['rating'], errors='coerce').dropna()
    if ratings.empty:
        print('No valid ratings to plot.')
        return

    plt.hist(ratings, bins=20, color='green', edgecolor='white')
    plt.xlabel('Rating')
    plt.ylabel('Frequency')
    plt.title('Histogram of Movie Ratings')
    plt.show()
# Entry point.
def main(url='https://example.com/movies', filename='movies.csv'):
    """Fetch a movie page, store the extracted records and plot ratings.

    Args:
        url: Page to scrape. The default is a placeholder and must be
            replaced with the real address.
        filename: CSV file the records are written to and read back from.
    """
    html = get_html(url)
    # get_html signals any failure with None; stop before parsing it.
    if html is None:
        print('Failed to fetch', url)
        return

    movie_data = parse_html(html)
    if not movie_data:
        print('No movie data found on the page.')
        return

    save_to_csv(movie_data, filename)
    dataframe = pd.read_csv(filename)
    analyze_ratings(dataframe)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
这段代码提供了一个简化的示例，展示了如何使用Python爬取网站上的电影数据，并使用Pandas和Matplotlib进行数据分析。需要注意的是，实际应用中应遵守网站的robots.txt协议和相关法律法规，避免违反网站的使用条款。此外，应设置合适的请求头（headers）并采用适当的爬取策略（例如控制请求频率），避免因请求过于频繁而影响网站的正常服务。
评论已关闭