Scrapy中间件(代理、Cookie、请求头、UA)
以下是一个Scrapy中间件的示例,用于设置请求的代理、Cookie和请求头,以及随机更换User-Agent:
import random
from scrapy import signals
from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
class MyCustomMiddleware(object):
    """Scrapy downloader middleware that decorates outgoing requests.

    For every request it can:
      * route the download through an HTTP proxy (``PROXY_URL`` setting),
      * attach a named cookie (``COOKIE_NAME`` setting),
      * pick a random User-Agent from a pool (``USER_AGENT_LIST`` setting).

    Any of the three features is disabled simply by leaving the
    corresponding setting unset (falsy).
    """

    def __init__(self, proxy_url=None, cookie_name=None, user_agent_list=None):
        # All three are optional; a falsy value disables that feature
        # in process_request.
        self.proxy_url = proxy_url
        self.cookie_name = cookie_name
        self.user_agent_list = user_agent_list

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: build the middleware from crawler settings.

        Reads PROXY_URL, COOKIE_NAME and USER_AGENT_LIST from settings.py
        and hooks ``spider_opened`` to the matching crawler signal.
        """
        proxy_url = crawler.settings.get('PROXY_URL')
        cookie_name = crawler.settings.get('COOKIE_NAME')
        user_agent_list = crawler.settings.get('USER_AGENT_LIST')
        middleware = cls(proxy_url, cookie_name, user_agent_list)
        # Connect the spider_opened signal so the middleware can run
        # per-spider initialization.
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Mutate *request* in place before it is downloaded.

        Sets the proxy via ``request.meta['proxy']``, adds the configured
        cookie, and fills in a random User-Agent.  ``setdefault`` is used
        for the UA so an explicit per-request User-Agent is respected.
        NOTE(review): ``request.cookies`` is assumed to be a dict here;
        Scrapy also accepts a list of dicts — confirm against the spiders
        that feed this project.
        """
        if self.proxy_url:
            request.meta['proxy'] = self.proxy_url
        if self.cookie_name:
            request.cookies[self.cookie_name] = 'your_cookie_value'
        if self.user_agent_list:
            user_agent = random.choice(self.user_agent_list)
            request.headers.setdefault('User-Agent', user_agent)
        # Returning None tells Scrapy to continue processing this request
        # through the remaining middlewares and the downloader.
        return None

    def spider_opened(self, spider):
        """Signal handler invoked when a spider starts; hook for per-spider setup."""
        pass
在Scrapy的配置文件(settings.py)中,你需要启用这个中间件,并设置相应的参数:
# Enable the custom middleware; 543 is its priority in the downloader
# middleware chain (lower numbers run closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.MyCustomMiddleware': 543,
}
# Proxy server URL, read by MyCustomMiddleware.from_crawler and applied
# via request.meta['proxy'].
PROXY_URL = 'http://your.proxy.com:port'
# Name of the cookie the middleware attaches to every request.
COOKIE_NAME = 'my_cookie_name'
# Pool of User-Agent strings; one is chosen at random per request.
USER_AGENT_LIST = [
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:18.0) Gecko/20100101 Firefox/18.0',
    # ...more User-Agent strings...
]
这个中间件示例提供了如何在Scrapy爬虫中设置代理、Cookie和User-Agent的方法,并且可以随机选择User-Agent来避免被对方服务器识别。在实际应用中,你需要根据自己的需求进行相应的配置。
评论已关闭