python-爬虫-BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the raw content of a web page.
def get_html(url, timeout=10):
    """Download *url* and return the response body.

    Args:
        url: Address to fetch; anything ``urlopen`` accepts.
        timeout: Seconds to wait on blocking network operations before
            giving up. Without a timeout a stalled server would hang the
            caller indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed
        (the error is printed to stdout).
    """
    try:
        # The context manager closes the connection even if read() raises.
        with urlopen(url, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        # Deliberately broad: this is a best-effort fetch, and callers
        # rely on getting None back instead of an exception.
        print(f"An error occurred: {e}")
        return None
# Parse a page and extract information from it.
def parse_html(html):
    """Return the text of the document's ``<title>`` element.

    Args:
        html: Markup as ``str`` or ``bytes``. ``None`` or empty input is
            accepted and yields ``None``.

    Returns:
        The title text, or ``None`` when there is no markup or the
        document has no ``<title>`` element.
    """
    if not html:
        # Nothing to parse; handing None to BeautifulSoup is an error.
        return None
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title')
    return title.text if title else None
# Main routine.
def main(url='https://www.example.com'):
    """Fetch *url* and print the page title.

    Args:
        url: Page to scrape. Defaults to the placeholder address; pass
            the URL you actually want to crawl.

    Prints the title returned by ``parse_html`` (which may be ``None``),
    or ``Failed to retrieve HTML`` when ``get_html`` returns nothing.
    """
    html = get_html(url)
    if not html:
        print("Failed to retrieve HTML")
        return
    title = parse_html(html)
    print(title)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
这段代码展示了如何使用 Python 的 urllib 库和 BeautifulSoup 库来获取网页内容,并解析提取其中的标题。使用时需要将 https://www.example.com 替换为你想要爬取的具体网页地址。
评论已关闭