Python网络爬虫实践:构建实用的爬虫应用
import requests
from bs4 import BeautifulSoup
def fetch_url(url, timeout=10):
    """Fetch the HTML content of *url* via an HTTP GET request.

    Parameters
    ----------
    url : str
        The URL to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        The original code passed no timeout, so a stalled server would
        block the call indefinitely — this parameter fixes that while
        staying backward compatible.

    Returns
    -------
    str | None
        The response body on HTTP 200; None on any other status code
        or on any network-level error.
    """
    try:
        # Keep only the line that can raise inside the try block.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Best-effort contract: network failures are reported as None,
        # matching what existing callers expect.
        return None
    # Only a plain 200 counts as success; every other status yields None.
    if response.status_code == 200:
        return response.text
    return None
def parse_soup(soup):
    """Extract the text of every <h1> tag from a parsed document.

    Parameters
    ----------
    soup : BeautifulSoup
        A parsed HTML document (any object exposing ``find_all``).

    Returns
    -------
    list[str]
        The text of each <h1> element, in document order.
    """
    headings = []
    for heading in soup.find_all('h1'):
        headings.append(heading.get_text())
    return headings
def main():
    """Entry point: download the target page and print every <h1> heading."""
    url = 'https://example.com'  # Replace with the site you want to scrape.
    html_content = fetch_url(url)
    # Guard clause: bail out early when the download failed.
    if not html_content:
        print('Failed to fetch URL')
        return
    soup = BeautifulSoup(html_content, 'html.parser')
    for heading in parse_soup(soup):
        print(heading)


if __name__ == '__main__':
    main()
这段代码展示了如何使用Python的requests库来获取网页内容,以及如何使用BeautifulSoup来解析HTML并提取数据。这是一个简单的网络爬虫示例,可以根据实际需求进行功能扩展。
评论已关闭