Web Scraping and Data Extraction with Python
import requests
from bs4 import BeautifulSoup

# Send an HTTP request and fetch the page content
def get_html(url):
    try:
        # A timeout keeps the script from hanging on an unresponsive server
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except requests.RequestException:
        return None

# Parse the page and extract the data
def parse_data(html):
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    # The selectors below assume each entry is a <div class="post"> containing
    # an <a class="title"> and a <span class="author">; adjust them to match
    # the markup of the page you are actually scraping.
    for post in soup.find_all('div', class_='post'):
        title_tag = post.find('a', class_='title')
        author_tag = post.find('span', class_='author')
        if title_tag and author_tag:  # skip entries missing either field
            data.append({'title': title_tag.text.strip(),
                         'author': author_tag.text.strip()})
    return data

# Print the data
def print_data(data):
    for item in data:
        print(f"Title: {item['title']}, Author: {item['author']}")

# Main function: organizes the workflow
def main(url):
    html = get_html(url)
    if html:
        parsed_data = parse_data(html)
        print_data(parsed_data)
    else:
        print("Failed to retrieve the webpage content.")

# Example usage
if __name__ == '__main__':
    url = 'https://simpleisbetterthancomplex.com/feed'
    main(url)
This code demonstrates how to fetch a page with Python's requests library and how to parse it with BeautifulSoup. The logic is split into small, commented functions for fetching, parsing, printing, and orchestration, which makes it a good starting example for learning web scraping.
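In practice you usually want to persist the scraped records rather than just print them. The sketch below writes the list of dicts returned by parse_data to a CSV file using Python's standard csv module; the save_to_csv helper and the posts.csv filename are illustrative additions, not part of the original script.

import csv

# Hypothetical helper (not in the original script): write the records
# produced by parse_data() to a CSV file.
def save_to_csv(data, filename='posts.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'author'])
        writer.writeheader()    # header row: title, author
        writer.writerows(data)  # one row per scraped post

A real-world scraper would also typically send a descriptive User-Agent header, respect the site's robots.txt, and pause between requests to avoid overloading the server.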