Python大作业——爬虫,零基础入门Python
import requests
from bs4 import BeautifulSoup
# Fetch the raw HTML of a page.
def get_html_content(url, timeout=10):
    """Download *url* and return the response body as text.

    Failures are reported through fixed message strings rather than
    exceptions, so the caller always receives a ``str``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The response text when the server answers with status 200;
        "页面无法访问" for any other status code; "请求出错" when the
        request itself raises ``requests.exceptions.RequestException``
        (timeouts included).
    """
    # Only the network call is guarded, so unrelated errors are not hidden.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return "请求出错"

    if response.status_code == 200:
        return response.text
    return "页面无法访问"
# Parse a page and pull out the post fields.
def parse_html_info(html_content):
    """Extract the title, body and author text from a post page.

    The function looks for ``<h1 class="post-title">``,
    ``<div class="post-content">`` and ``<p class="post-meta-author">``.

    Args:
        html_content: HTML markup of the page as a string.

    Returns:
        A ``(title, content, author)`` tuple. Each item is the text of the
        matching element, or ``None`` when the page has no such element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    def _text_of(tag_name, css_class):
        # find() returns None when nothing matches; calling get_text() on
        # that would raise AttributeError, so report the field as missing.
        node = soup.find(tag_name, class_=css_class)
        return node.get_text() if node is not None else None

    title = _text_of('h1', 'post-title')
    content = _text_of('div', 'post-content')
    author = _text_of('p', 'post-meta-author')
    return title, content, author
# Script entry point.
def main():
    """Fetch one post page and print its title, content and author.

    When the download fails, the failure message is printed and parsing is
    skipped.
    """
    url = 'http://example.com/some-post'  # replace with the URL of the page to scrape
    html_content = get_html_content(url)

    # get_html_content signals failure by returning one of these fixed
    # messages instead of raising; they are not HTML and must not be parsed.
    if html_content in ("页面无法访问", "请求出错"):
        print(html_content)
        return

    title, content, author = parse_html_info(html_content)
    print(f"标题: {title}")
    print(f"内容: {content}")
    print(f"作者: {author}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
这段代码展示了如何使用 Python 的 requests 库和 BeautifulSoup 库抓取一个网页的内容,并提取其中的标题、内容和作者信息。在实际应用中,你需要根据目标网页的结构修改解析代码,以确保正确提取信息。
评论已关闭