import csv

import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """Fetch a web page over HTTP and return its body text.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, requests.get() can block indefinitely.

    Returns:
        The response body as text if the server answered 200 OK,
        otherwise None (non-200 status or any request failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        # Network error, DNS failure, timeout, etc. — signal with None.
        return None
def parse_data(html):
    """Parse a product-listing page and extract title/price pairs.

    Args:
        html: Raw HTML text of the page.

    Returns:
        A list of dicts, each {'title': ..., 'price': ...}, one per
        '.product_item' element. Items missing a title or price element
        are skipped instead of raising AttributeError (select_one
        returns None when the selector does not match).
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    for item in soup.select('.product_item'):
        title_el = item.select_one('.product_item_title')
        price_el = item.select_one('.product_item_price')
        if title_el is None or price_el is None:
            continue  # malformed item: skip rather than crash the whole run
        data.append({
            'title': title_el.text.strip(),
            'price': price_el.text.strip(),
        })
    return data
def save_data(data, filename):
    """Write scraped items to *filename* as CSV, one 'title,price' row each.

    Uses the csv module so fields that themselves contain commas or
    quotes are escaped correctly — the original f-string write produced
    corrupt rows for such values. newline='' is required by the csv
    module to avoid double line endings on Windows.

    Args:
        data: Iterable of dicts with 'title' and 'price' keys.
        filename: Destination path; the file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for item in data:
            writer.writerow([item['title'], item['price']])
def main():
    """Entry point: fetch the product listing, parse it, save to CSV."""
    url = 'https://www.example.com/products'
    html = get_html(url)
    if html:
        parsed_data = parse_data(html)
        save_data(parsed_data, 'products.csv')
    else:
        # Don't fail silently — make the fetch failure visible to the user.
        print(f'Failed to fetch {url}')
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# This simple crawler example shows how to fetch web pages with Python's
# requests library, parse them with BeautifulSoup, and save the extracted
# data to a CSV file. The code favors simplicity and readability, making it
# a good introductory tutorial for beginners learning to write scrapers.