[Practical Python Skills] Leveling Up Your Crawler: From Special-Purpose Crawlers to a General Web Crawler with an AI Agent (Beginner-Friendly)
import requests
from bs4 import BeautifulSoup
import time
import random


class AICrawler:
    def __init__(self, url, max_pages=5):
        self.url = url
        self.max_pages = max_pages
        self.session = requests.Session()
        # Use a browser-like User-Agent; update() preserves requests' other default headers
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})

    def get_page(self, url):
        """Fetch a page and return its HTML, or None on any failure."""
        try:
            response = self.session.get(url, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def parse_page(self, html):
        """Extract all links that have an href attribute; add site-specific parsing logic here."""
        soup = BeautifulSoup(html, 'html.parser')
        return [item for item in soup.find_all('a') if item.get('href')]

    def crawl(self):
        # Pagination usually starts at 1, so iterate from 1 to max_pages inclusive
        for page in range(1, self.max_pages + 1):
            url = f"{self.url}/page/{page}"
            html = self.get_page(url)
            if html:
                links = self.parse_page(html)
                for link in links:
                    print(link['href'])
            # Random delay to mimic human browsing and reduce the risk of being blocked
            time.sleep(random.uniform(1, 3))


if __name__ == '__main__':
    crawler = AICrawler("https://example.com")
    crawler.crawl()
This example shows how to use the requests and BeautifulSoup libraries to build a simple page crawler, with a random delay between requests that mimics human browsing and reduces the risk of being blocked by the server. Note that the code above is still a conventional special-purpose crawler: the "AI Agent" part of the upgrade path lies in replacing the hand-written parse_page logic with a language model that can extract data from arbitrary pages, which is what turns a site-specific crawler into a general-purpose one.
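As a minimal sketch of that idea, assuming access to an LLM through OpenAI's Python SDK (the model name, the prompt wording, and the extract_with_llm helper below are illustrative assumptions, not part of the original example):

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def extract_with_llm(html, instruction):
    """Hypothetical helper: let an LLM act as the parsing 'agent'.

    Instead of hard-coding selectors for each site, hand the (truncated)
    HTML to the model with a natural-language instruction and get plain
    text back, one extracted item per line.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # illustrative model name
        messages=[
            {"role": "system",
             "content": "You extract data from HTML. Reply with one item per line, no commentary."},
            {"role": "user",
             "content": f"{instruction}\n\nHTML:\n{html[:8000]}"},  # truncate to respect context limits
        ],
    )
    return response.choices[0].message.content.splitlines()


# Usage inside the crawler: replace the hard-coded parse_page call with, e.g.
#   items = extract_with_llm(html, "List the title of every article on this page.")

In this design the crawler still handles fetching, sessions, and rate limiting, while the model replaces only the site-specific parsing step, so adapting to a new site means changing the instruction string rather than the code.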