Here is how to set a random User-Agent (UA), random cookies, and a proxy in Scrapy:
- Random UA middleware:

First, create a Scrapy downloader middleware that picks a random User-Agent for each outgoing request.
```python
# middlewares.py
import random


class RandomUserAgentMiddleware:
    def __init__(self, ua_list=None):
        self.ua_list = ua_list or []

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the UA pool from settings.py
        return cls(ua_list=crawler.settings.getlist('RANDOM_UA_LIST'))

    def process_request(self, request, spider):
        # Set a random User-Agent unless the spider already set one
        if self.ua_list:
            request.headers.setdefault('User-Agent', random.choice(self.ua_list))
```
Enable the middleware and define the UA list in the project's settings file (settings.py):
```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'your_project.middlewares.RandomUserAgentMiddleware': 400,
}

RANDOM_UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    # ... add more UA strings here
]
```
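To confirm the rotation actually works, it helps to fire a few requests at an echo service and log the header Scrapy sent. Below is a minimal sanity-check sketch; the spider name and the use of httpbin.org/headers as a test endpoint are illustrative, not part of the project, and response.json() requires Scrapy 2.2+.

```python
# spiders/ua_check.py -- hypothetical verification spider
import scrapy


class UACheckSpider(scrapy.Spider):
    name = 'ua_check'

    def start_requests(self):
        # Send several identical requests; dont_filter bypasses the dupe filter
        for _ in range(5):
            yield scrapy.Request('https://httpbin.org/headers', dont_filter=True)

    def parse(self, response):
        # httpbin echoes back the request headers; each log line should show
        # a different entry drawn from RANDOM_UA_LIST
        self.logger.info(response.json()['headers'].get('User-Agent'))
```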
- Random cookie middleware:

In the same middlewares.py, a second middleware can attach a randomly chosen cookie dict to each request.
```python
# middlewares.py (random is already imported at the top of this file)
class RandomCookieMiddleware:
    def __init__(self, cookie_list=None):
        self.cookie_list = cookie_list or []

    @classmethod
    def from_crawler(cls, crawler):
        return cls(cookie_list=crawler.settings.getlist('RANDOM_COOKIE_LIST'))

    def process_request(self, request, spider):
        # Attach a random cookie dict; Scrapy's built-in CookiesMiddleware
        # (priority 700) runs afterwards and merges it into the Cookie header
        if self.cookie_list:
            request.cookies = random.choice(self.cookie_list)
```
Configure the cookie list and enable the middleware in the settings file:
```python
# settings.py
COOKIES_ENABLED = True

RANDOM_COOKIE_LIST = [
    {'name': 'value'},
    {'name2': 'value2'},
    # ... add more cookie dicts here
]

DOWNLOADER_MIDDLEWARES = {
    'your_project.middlewares.RandomCookieMiddleware': 401,
}
```
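If each cookie set is also meant to keep its own session state, Scrapy's standard cookiejar meta key can keep the identities separate. A minimal sketch of that variant, assuming the jar index is simply the cookie's position in RANDOM_COOKIE_LIST:

```python
# Hypothetical variant of process_request: one cookie jar per identity,
# so Set-Cookie responses don't bleed between cookie sets.
def process_request(self, request, spider):
    if self.cookie_list:
        idx = random.randrange(len(self.cookie_list))
        request.cookies = self.cookie_list[idx]
        # 'cookiejar' is a built-in Scrapy meta key handled by CookiesMiddleware
        request.meta.setdefault('cookiejar', idx)
```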
- Proxy middleware:

Routing requests through a proxy works the same way, via the proxy key in request.meta.
```python
# middlewares.py
class ProxyMiddleware:
    def __init__(self, proxy_url=''):
        self.proxy_url = proxy_url

    @classmethod
    def from_crawler(cls, crawler):
        return cls(proxy_url=crawler.settings.get('PROXY_URL'))

    def process_request(self, request, spider):
        # Scrapy's HttpProxyMiddleware honors the 'proxy' key in request.meta
        if self.proxy_url:
            request.meta['proxy'] = self.proxy_url
```
Configure the proxy URL and enable the middleware in the settings file:
```python
# settings.py
PROXY_URL = 'http://user:pass@proxy.example.com:8080'

DOWNLOADER_MIDDLEWARES = {
    'your_project.middlewares.ProxyMiddleware': 410,
}
```
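A single static proxy is easy to rate-limit or ban, so rotating over a pool is a common extension. The sketch below is an assumed variant: the PROXY_LIST setting and the RandomProxyMiddleware name are not part of the original example.

```python
# middlewares.py -- hypothetical rotating variant (random imported above)
class RandomProxyMiddleware:
    def __init__(self, proxy_list=None):
        self.proxy_list = proxy_list or []

    @classmethod
    def from_crawler(cls, crawler):
        # PROXY_LIST is an assumed setting: a list of proxy URLs
        return cls(proxy_list=crawler.settings.getlist('PROXY_LIST'))

    def process_request(self, request, spider):
        # Pick a different proxy for each request
        if self.proxy_list:
            request.meta['proxy'] = random.choice(self.proxy_list)
```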