import requests
from bs4 import BeautifulSoup


class DoubanSpider:
    def __init__(self, url):
        self.url = url
        # Browser-like User-Agent so the request is not rejected outright
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    def get_page(self, url):
        # Fetch a page and return its HTML, or None on a non-200 response
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.text
        return None

    def parse_index(self, html):
        # Parse the list markup and yield each movie's name and rating
        soup = BeautifulSoup(html, 'lxml')
        movie_list = soup.find('ol', 'grid_view')
        if movie_list:
            for movie in movie_list.find_all('li'):
                yield {
                    'name': movie.find('span', 'title').get_text().strip(),
                    'score': movie.find('span', 'rating_num').get_text()
                }

    def save_to_file(self, data):
        # Append one "name: score" line per movie to a local text file
        with open('douban.txt', 'a', encoding='utf-8') as f:
            f.write(f"{data['name']}: {data['score']}\n")

    def run(self):
        html = self.get_page(self.url)
        if html is None:
            return
        for data in self.parse_index(html):
            print(f"{data['name']}: {data['score']}")
            self.save_to_file(data)


if __name__ == '__main__':
    url = 'https://movie.douban.com/tag/#/?sort=T&range=2,10&tags=%E7%83%AD%E9%97%A8'
    DoubanSpider(url).run()
This code implements a simple Python-based Douban movie spider: it scrapes movie names and ratings from a Douban movie tag listing (here the "热门" / popular tag encoded in the URL is used as the example), prints the results to the console, and appends them to a local file. The example shows how to organize a spider's structure, including adding request headers and parsing and extracting data from the HTML.
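One caveat: the tag page at the URL above fills in its results with JavaScript, so the plain HTML returned by requests typically does not contain the ol.grid_view list that parse_index looks for; that markup appears on Douban's server-rendered list pages such as the Top 250. The sketch below reuses the same DoubanSpider class against such a page. The Top 250 URL, its "?start=N" pagination parameter, and the 25-movies-per-page size are assumptions for illustration, not part of the original example.

# Minimal sketch, assuming DoubanSpider from the code above is defined in
# the same module, and assuming the server-rendered Top 250 list page with
# its "?start=N" pagination (25 movies per page) -- not from the original.
base_url = 'https://movie.douban.com/top250'
spider = DoubanSpider(base_url)
for start in range(0, 250, 25):  # pages begin at start=0, 25, 50, ...
    html = spider.get_page(f'{base_url}?start={start}')
    if html is None:
        continue  # skip pages that could not be fetched
    for data in spider.parse_index(html):
        print(f"{data['name']}: {data['score']}")
        spider.save_to_file(data)

Because get_page, parse_index, and save_to_file are independent methods, the same class can be pointed at any page that uses this list markup simply by changing the URL passed in; for the JavaScript-driven tag page itself, a different approach (e.g. calling its JSON API or using a headless browser) would be needed.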