import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import os
import re
def download_image(image_url, directory):
    """Download a single image from *image_url* into *directory*.

    The local file name is taken from the last path segment of the URL,
    with any query string stripped so it is a valid file name.

    Args:
        image_url: Absolute URL of the image to fetch.
        directory: Existing local directory to write the file into.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
        requests.Timeout: if the request exceeds the timeout.
    """
    # Timeout prevents a single stalled download from hanging a worker forever.
    response = requests.get(image_url, timeout=30)
    # Fail fast on error statuses instead of silently writing an HTML
    # error page to disk as an "image".
    response.raise_for_status()
    # Strip any '?query=...' suffix so the name is usable on disk.
    file_name = image_url.split('/')[-1].split('?')[0]
    with open(os.path.join(directory, file_name), 'wb') as file:
        file.write(response.content)
def extract_images(html):
    """Parse *html* and return the ``src`` URLs of lazy-loaded images.

    Only ``<img>`` elements whose ``class`` attribute matches 'lazy'
    are considered. Tags without a ``src`` attribute are skipped instead
    of raising ``KeyError`` (the original behavior).

    Args:
        html: HTML document as a string.

    Returns:
        list[str]: the ``src`` values found; may be empty.
    """
    soup = BeautifulSoup(html, 'html.parser')
    lazy_imgs = soup.find_all('img', {'class': re.compile('lazy')})
    # NOTE(review): many lazy-loading setups put the real URL in
    # 'data-src' rather than 'src' — confirm against the target site.
    return [img['src'] for img in lazy_imgs if img.has_attr('src')]
def crawl_and_download(url, directory, threads=5, processes=3):
    """Fetch *url*, extract lazy-image URLs and download them concurrently.

    Args:
        url: Page to crawl for ``<img class="lazy">`` tags.
        directory: Output directory; created if it does not exist.
        threads: Worker count for the thread pool (downloads are I/O-bound).
        processes: Worker count if the process-pool variant below is enabled.

    Raises:
        requests.HTTPError: if the page request returns an error status.
    """
    response = requests.get(url, timeout=30)
    # Abort early on a failed page fetch rather than parsing an error page.
    response.raise_for_status()
    image_urls = extract_images(response.text)
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(directory, exist_ok=True)
    # Downloads are I/O-bound, so a thread pool is the appropriate tool.
    with ThreadPoolExecutor(max_workers=threads) as executor:
        executor.map(download_image, image_urls, [directory] * len(image_urls))
    # To use multiprocessing instead of the thread pool above, uncomment the
    # lines below. Note the keyword is max_workers — ProcessPoolExecutor has
    # no 'max_processes' parameter (the original commented code would raise
    # TypeError if enabled).
    # with ProcessPoolExecutor(max_workers=processes) as executor:
    #     executor.map(download_image, image_urls, [directory] * len(image_urls))
if __name__ == '__main__':
    # Script entry point: crawl the example gallery page and save every
    # lazy-loaded image it references into a local 'images' directory.
    gallery_url = 'https://www.example.com/gallery'
    images_dir = 'images'
    crawl_and_download(gallery_url, images_dir)
这段代码示例展示了如何使用多线程和多进程来下载图片。首先定义了下载单个图片的函数 download_image,以及解析 HTML 内容并提取图片链接的函数 extract_images。crawl_and_download 函数负责发送 HTTP 请求、解析页面,并调用多线程或多进程执行图片下载;代码中的注释展示了如何在多线程和多进程之间切换。