Python商业数据挖掘实战——爬取网页并将其转为Markdown
import requests
from bs4 import BeautifulSoup
import re
import os
def download_image(url, filename, timeout=10):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Absolute URL of the image to fetch.
        filename: Path of the local file to create or overwrite.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
        OSError: If ``filename`` cannot be opened for writing.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never saved as an image.
    response.raise_for_status()
    with open(filename, 'wb') as file:
        file.write(response.content)
def convert_to_markdown(html_content):
    """Convert a small subset of HTML into Markdown text.

    Three elements are translated:

    * ``<img>`` is downloaded with ``download_image`` into the current
      working directory and becomes ``![alt](filename)``. If the download
      fails, or no file name can be derived from ``src``, the image keeps
      pointing at its original ``src``. Images without ``src`` are dropped.
    * ``<h1>`` becomes a level-3 heading (``### text``).
    * ``<p>`` becomes a block quote (``> text``).

    Every other tag is discarded and only its text content is kept.

    Args:
        html_content: HTML markup to convert.

    Returns:
        The Markdown as a ``str`` ending in a single newline, or an empty
        string when the input contains no text.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Images are handled first so that an <img> nested inside a heading or a
    # paragraph is already plain Markdown text when its parent is flattened.
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            img.decompose()
            continue
        # Strip any query string / fragment so it does not leak into the
        # local file name.
        filename = os.path.basename(re.split(r'[?#]', src, maxsplit=1)[0])
        target = src
        if filename:
            try:
                download_image(src, filename)
                target = filename
            except requests.RequestException:
                # Keep the conversion going; the Markdown then references
                # the remote image instead of a local copy.
                target = src
        alt = img.get('alt') or filename
        img.replace_with(f"![{alt}]({target})\n\n")

    # Headings: <h1> is rendered as a level-3 Markdown heading.
    for title in soup.find_all('h1'):
        title.replace_with(f"### {title.get_text(strip=True)}\n\n")

    # Paragraphs: every non-empty line is prefixed so multi-line paragraphs
    # stay inside the block quote.
    for p in soup.find_all('p'):
        lines = p.get_text().strip().splitlines()
        quoted = "\n".join(f"> {line.strip()}" for line in lines if line.strip())
        p.replace_with(f"{quoted}\n\n")

    # Flatten what is left to text and collapse runs of blank lines.
    text = re.sub(r'\n{3,}', '\n\n', soup.get_text()).strip()
    return f"{text}\n" if text else ""
# Sample HTML used to demonstrate the conversion
html_content = """
<h1>标题</h1>
<p>这是一段文字。</p>
<img src="http://example.com/image.jpg" alt="示例图片">
"""

if __name__ == "__main__":
    # Convert and print only when run as a script, so importing this module
    # does not trigger network requests or write files.
    markdown_content = convert_to_markdown(html_content)
    print(markdown_content)
这个代码示例展示了如何使用Python的requests库和BeautifulSoup库来下载HTML中引用的图片,并将HTML内容转换为Markdown格式。示例中的HTML是一段写在代码里的字符串;实际使用时,可以先用requests获取网页内容,再传给convert_to_markdown函数。代码简洁明了,注重实用性,可以作为实际项目中的参考。
评论已关闭