import scrapy
class DoubanMovieItem(scrapy.Item):
    """Container for the fields scraped for a single movie entry."""

    # Movie title text.
    movie_name = scrapy.Field()
    # Rating text as it appears on the page.
    movie_rating = scrapy.Field()
    # Short quote/comment text shown for the movie, when present.
    movie_comment = scrapy.Field()
class DoubanMovieSpider(scrapy.Spider):
    """Spider that scrapes name, rating and quote from the Douban Top 250 list.

    Starts at ``https://movie.douban.com/top250``, yields one
    ``DoubanMovieItem`` per ``div.info`` entry on each page, and keeps
    following the "next" link for as long as one is found.
    """

    name = 'douban_movie'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    @staticmethod
    def _first_text(selector, query):
        """Return the first XPath match stripped of whitespace, or None.

        None is returned both when nothing matches and when the first
        match is an empty string.
        """
        value = selector.xpath(query).extract_first()
        return value.strip() if value else None

    def parse(self, response):
        """Parse one listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``DoubanMovieItem`` for every movie block on the page, then a
            ``scrapy.Request`` for the next page if a "next" link exists.
        """
        # Each movie's details live inside a <div class="info"> element.
        for movie_selector in response.xpath('//div[@class="info"]'):
            item = DoubanMovieItem()
            # Movie title.
            item['movie_name'] = self._first_text(
                movie_selector, './/a/span[1]/text()')
            # Movie rating.
            item['movie_rating'] = self._first_text(
                movie_selector, './/div[@class="star"]/span[2]/text()')
            # Movie quote/comment; some entries have none, giving None.
            item['movie_comment'] = self._first_text(
                movie_selector, './/p[@class="quote"]/span/text()')
            yield item

        # Follow the pagination link. extract_first must be *called*: the
        # bare attribute is a bound method, which is always truthy and is
        # not a valid URL for urljoin.
        next_page_url = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if next_page_url:
            next_page_full_url = response.urljoin(next_page_url)
            yield scrapy.Request(next_page_full_url, callback=self.parse)
这段代码实现了一个简单的 Scrapy 爬虫，用于抓取豆瓣电影 TOP250 的电影名称、评分和评论。它使用 XPath 选择器来定位页面元素，并使用 Item 对象来存储爬取的数据。此外，它还实现了简单的分页逻辑来爬取所有分页的数据。