《Python3网络爬虫开发实战》:爬虫基础库
import requests
from bs4 import BeautifulSoup
import re
import time
import random
# Request headers that mimic a desktop Chrome browser so the target
# site serves the same markup it would give a real visitor.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
def get_html(url, timeout=10):
    """Fetch *url* and return its body text, or None on any failure.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for connect/read before giving up.
            New keyword with a default, so existing callers are unaffected.

    Returns:
        The response text on HTTP 200, otherwise None (network error,
        timeout, or any non-200 status).
    """
    try:
        # A timeout is essential: requests has no default timeout and
        # would otherwise block forever on an unresponsive server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Yield the text of every thread-title link found in *html*.

    Thread titles are anchor tags carrying the 'j_thread_title' CSS
    class; each matching anchor's text content is yielded in document
    order.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a', class_='j_thread_title'):
        yield anchor.get_text()
def save_data(data, filename):
    """Append one line of text to *filename*, encoded as UTF-8.

    Opens in append mode so repeated calls accumulate lines rather
    than overwriting earlier output.
    """
    with open(filename, mode='a', encoding='utf-8') as out:
        out.write(f"{data}\n")
def crawl_page(url, filename):
    """Fetch one listing page, print each thread title, and append it
    to *filename*.

    Does nothing (and does not sleep) when the page cannot be fetched.
    """
    content = get_html(url)
    if not content:
        return
    for title in parse_html(content):
        print(title)
        save_data(title, filename)
    # Random 1-3 s pause so the crawler looks less bot-like and is
    # less likely to be blocked by the server.
    time.sleep(random.randint(1, 3))
if __name__ == '__main__':
    # Entry point: crawl a single listing page and store its titles.
    # NOTE(review): example.com is a placeholder target — replace with
    # a real listing URL before running.
    url = 'https://www.example.com/page/number/1'
    filename = 'example.txt'
    crawl_page(url, filename)
这个示例代码展示了如何使用requests库获取网页内容,如何使用BeautifulSoup按CSS类名(class_参数)提取帖子标题,以及如何将结果保存到文件。同时,代码中还包含了异常处理和随机休眠,以防被服务器封禁。这些技巧对于开发健壮的网络爬虫至关重要。
评论已关闭