Python高阶爬虫实战分析
import os

import requests
from bs4 import BeautifulSoup
def get_soup(url):
    """Fetch *url* and parse the response body as HTML.

    Args:
        url: Address of the page to download.

    Returns:
        A BeautifulSoup object for a 200 response, or None on a
        non-200 status or any network error.
    """
    try:
        # timeout keeps the script from hanging forever on a dead server
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # connection errors / timeouts: treat the same as a bad status
        return None
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    return None
def find_image_urls(soup, image_dir):
    """Download every lazy-loaded image referenced in *soup* into *image_dir*.

    Looks for ``<img class="lazy load-img">`` tags, reads each one's
    ``data-src`` URL, and saves the image under its URL basename.
    Tags without ``data-src`` and failed downloads are skipped silently.

    Args:
        soup: Parsed page (BeautifulSoup object) to scan for images.
        image_dir: Target directory; created if it does not exist.
    """
    # Original code assumed the directory already existed and crashed otherwise.
    os.makedirs(image_dir, exist_ok=True)
    for tag in soup.find_all('img', {'class': 'lazy load-img'}):
        image_url = tag.get('data-src')
        if not image_url:
            continue  # tag has no lazy-load URL; nothing to fetch
        image_name = image_url.split('/')[-1]
        try:
            response = requests.get(image_url, timeout=10)
        except requests.RequestException:
            continue  # network failure for this image only; keep going
        if response.status_code != 200:
            continue
        # Fetch *before* opening the file so a failed download does not
        # leave an empty file behind (the original opened the file first).
        with open(f'{image_dir}/{image_name}', 'wb') as f:
            f.write(response.content)
def main():
    """Entry point: fetch the listing page, then download its images."""
    base_url = 'https://www.example.com/page/'
    image_dir = 'images'

    page = get_soup(base_url)
    if page:
        find_image_urls(page, image_dir)


if __name__ == '__main__':
    main()
这个简化的代码实例展示了如何使用Python的requests和BeautifulSoup库来获取网页内容,并从中提取图片链接,最终将图片保存到本地文件夹。注意,这个例子假设网页结构和需要抓取的图片类名不变,实际应用时可能需要根据网页实际情况进行相应的调整。
评论已关闭