Python网页爬虫爬取豆瓣Top250电影数据——XPath数据解析
import requests
from lxml import etree
import csv
import time
# Request headers: spoof a desktop Chrome User-Agent so Douban serves the
# normal page instead of blocking the default python-requests agent.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Module-level accumulator: parse_data appends one dict per movie here,
# and save_data writes it out to CSV.
data_list = []
def get_data(url):
    """Fetch *url* with the browser-like headers and return the page HTML.

    Raises requests.HTTPError on a 4xx/5xx response and
    requests.Timeout if the server does not answer within 10 seconds.
    """
    # timeout prevents the crawler from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error page instead of silently parsing it.
    response.raise_for_status()
    # Trust the detected encoding so Chinese text is not mojibake'd.
    response.encoding = response.apparent_encoding
    return response.text
def parse_data(html):
    """Parse one Top250 listing page and append one record per movie to data_list.

    Each record is a dict with keys: ranking, score, name, comment.
    """
    tree = etree.HTML(html)
    # Each movie lives in <div class="item">, which contains BOTH the "pic"
    # div (ranking) and the "info" div (name/score/comments).
    # BUG FIX 1: the original selected //div[@class="info"] and then ran
    # absolute '//' XPaths per movie — in lxml an absolute path searches the
    # whole document, so every movie received the FIRST movie's values.
    # All per-movie XPaths below are relative ('./' or './/').
    # BUG FIX 2: the ranking XPath looked for ./div[@class="pic"] under the
    # "info" div, but "pic" is a sibling of "info" — it always returned []
    # and crashed with IndexError; anchoring on "item" fixes that too.
    for movie in tree.xpath('//div[@class="item"]'):
        comment = movie.xpath('.//div[@class="star"]/span[4]/text()')
        data_list.append({
            'ranking': movie.xpath('./div[@class="pic"]/em/text()')[0],
            'score': movie.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()')[0],
            'name': movie.xpath('.//div[@class="hd"]/a/span[1]/text()')[0],
            # The comment-count span is missing on some entries; default '0'.
            'comment': comment[0] if comment else '0',
        })
def save_data():
    """Dump every record collected in data_list to douban_top250.csv (UTF-8)."""
    columns = ['ranking', 'score', 'name', 'comment']
    # newline='' lets the csv module control line endings itself.
    with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        writer.writerows(data_list)
def main():
    """Crawl every page of the Douban Top250 list, then save all records to CSV."""
    # The list is paginated 25 movies per page via the ?start= query
    # parameter; the original fetched only the first page (25 movies)
    # despite the "Top250" name. start = 0, 25, ..., 225 covers all ten.
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}'
        parse_data(get_data(url))
        # Small pause between requests to stay polite and avoid rate limiting.
        time.sleep(1)
    save_data()
if __name__ == '__main__':
    main()
这段代码实现了从豆瓣Top250电影页面爬取数据的功能。首先,设置请求头,模拟浏览器访问,以避免反爬虫机制。然后定义了一个获取网页内容的函数get_data,一个解析网页内容的函数parse_data,以及一个保存数据到CSV文件的函数save_data。最后,在main函数中调用这些函数,完成数据爬取和保存的流程。
评论已关闭