【手写爬虫框架】:从回顾 Python 基础到爬虫原理解析
import requests
from lxml import etree
class SimpleSpider:
    """A minimal crawler framework: download a page, parse it, save results."""

    def __init__(self, start_url):
        """Store the entry URL and a browser-like User-Agent header.

        :param start_url: URL the crawl starts from.
        """
        self.start_url = start_url
        # Browser-like UA so simple bot filters don't reject the request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    def download(self, url, timeout=10):
        """Fetch *url* and return the body decoded as UTF-8.

        :param url: page to fetch.
        :param timeout: seconds before the request aborts. New parameter
            with a default, so existing call sites are unaffected; the
            original call had no timeout and could hang indefinitely.
        :raises requests.HTTPError: on a 4xx/5xx response, instead of
            silently parsing and saving an error page.
        :raises requests.Timeout: when the server does not respond in time.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail fast on HTTP errors rather than treating them as content.
        response.raise_for_status()
        return response.content.decode('utf-8')

    def parser(self, html):
        """Extract (href, text) pairs for every linked anchor in *html*.

        The original zipped two independent XPath lists ('//a/@href' and
        '//a/text()'); those lists misalign as soon as one anchor has an
        href but no text node. Extracting both values from the same
        element keeps every pair aligned.

        :param html: HTML document as a string.
        :returns: list of (href, text) tuples, one per <a> with an href.
        """
        html_tree = etree.HTML(html)
        results = []
        for anchor in html_tree.xpath('//a[@href]'):
            # string(.) concatenates all descendant text of this anchor.
            results.append((anchor.get('href'), anchor.xpath('string(.)')))
        return results

    def save(self, data):
        """Append (link, text) pairs to output.txt, one line per pair."""
        with open('output.txt', 'a', encoding='utf-8') as f:
            for link, text in data:
                f.write(f'Link: {link}, Text: {text}\n')

    def run(self):
        """Run one crawl cycle: download the start URL, parse it, save."""
        html = self.download(self.start_url)
        parsed_data = self.parser(html)
        self.save(parsed_data)
# Usage example. Guarded so importing this module does not immediately
# fire a network request — the crawl only runs when executed as a script.
if __name__ == '__main__':
    spider = SimpleSpider('https://example.com')
    spider.run()
这段代码定义了一个简单的爬虫框架,包含下载、解析和保存页面数据三个环节。这个示例很有教学意义:它展示了如何使用 requests 库发起网络请求,以及如何使用 lxml 库解析 HTML 并提取数据。这个简单的框架可以作为学习构建更复杂爬虫的起点。
评论已关闭