爬虫使用Beautiful Soup爬取网页信息示例代码(酷dog音乐)
import requests
from bs4 import BeautifulSoup
# Fetch the raw HTML of a page
def get_page_content(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Passed straight through to ``requests.get``.

    Returns:
        The response text when the server answers with HTTP 200.
        ``None`` for any other status code, or when the request raises
        ``requests.RequestException`` (which includes timeouts).
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # server that accepts the connection but never responds.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text
# Parse the page and extract the music metadata
def _extract_text(soup, tag, css_class):
    """Return the stripped text of the first ``tag.css_class`` match, or None."""
    node = soup.find(tag, class_=css_class)
    # find() returns None when nothing matches; calling get_text() on that
    # would raise AttributeError, so report the field as missing instead.
    if node is None:
        return None
    return node.get_text(strip=True)


def parse_and_extract_music_info(html):
    """Parse an HTML document and collect the music fields it exposes.

    Args:
        html: Markup of the page. ``None`` or an empty string is treated
            as an empty document.

    Returns:
        A dict with the keys ``'title'``, ``'author'``, ``'publish_date'``
        and ``'play_count'``; each value is the stripped text of the
        matching element, or ``None`` when the element is not present.
        The key ``'music_url'`` is added only when the page contains an
        ``<audio>`` element that carries a ``src`` attribute.
    """
    soup = BeautifulSoup(html or '', 'html.parser')
    music_info = {
        'title': _extract_text(soup, 'h1', 'title'),
        'author': _extract_text(soup, 'a', 'author'),
        'publish_date': _extract_text(soup, 'span', 'publish-time'),
        'play_count': _extract_text(soup, 'span', 'count'),
    }
    # The audio link is optional: the key is only set when a source exists.
    audio_element = soup.find('audio')
    if audio_element is not None and 'src' in audio_element.attrs:
        music_info['music_url'] = audio_element.attrs['src']
    return music_info
# Address of the page to scrape
url = 'https://www.essaydog.com/music/12345.html'
# Download the page body
html_content = get_page_content(url)
# Report a failed download; otherwise parse the page and show the result
if not html_content:
    print("页面内容获取失败")
else:
    music_info = parse_and_extract_music_info(html_content)
    print(music_info)
这段代码首先定义了 `get_page_content` 函数来获取指定 URL 的页面内容,然后定义了 `parse_and_extract_music_info` 函数来解析页面并提取音乐信息。最后,我们设置了要爬取的页面 URL,依次执行这两个函数,并打印提取到的音乐信息。
评论已关闭