Python Series 30: A Summary of Web Scraping Techniques
```python
# Import the required modules
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Fetch the raw HTML of a page
def get_html(url):
    try:
        # Send a browser-like User-Agent so the site does not reject the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # surface HTTP errors (403, 404, ...)
        return response.text
    except requests.exceptions.RequestException as e:
        print(e)
        return None
```
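For anything beyond a one-off fetch, a shared `requests.Session` with automatic retries tends to be more robust than standalone `requests.get` calls. A minimal sketch of that variant; the retry count, back-off factor, and status list below are illustrative values, not tuned recommendations:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Retry transient failures with exponential back-off
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    session.mount('http://', HTTPAdapter(max_retries=retry))
    # Reuse the same browser-like headers across all requests
    session.headers.update({'User-Agent': 'Mozilla/5.0'})
    return session
```

A session also reuses the underlying TCP connection across requests, which matters once a crawl spans many pages of the same host.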
```python
# Parse the page and extract the fields we need
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.find_all('div', class_='title')
    infos = soup.find_all('div', class_='info')
    rates = soup.find_all('div', class_='rate')
    # Note: this regex matches every src attribute on the page, not only covers
    cover_urls = re.findall(r'src="(.+?)"', html, re.S)
    names = [title.text.strip() for title in titles]
    details = [info.text.strip() for info in infos]
    scores = [rate.text.strip() for rate in rates]
    # zip truncates to the shortest list, so mismatched rows are dropped silently
    return list(zip(names, details, scores, cover_urls))
```
```python
# Save the extracted rows to a CSV file
def save_to_csv(data, filename):
    df = pd.DataFrame(data, columns=['Name', 'Detail', 'Score', 'Cover URL'])
    df.to_csv(filename, index=False)
```
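Since the scraped names and details are in Chinese, it can help to write the CSV with a byte-order mark so that Excel detects the encoding correctly. A variant of the function above (`save_to_csv_excel_friendly` is a hypothetical name for illustration):

```python
import pandas as pd

def save_to_csv_excel_friendly(data, filename):
    df = pd.DataFrame(data, columns=['Name', 'Detail', 'Score', 'Cover URL'])
    # 'utf-8-sig' prepends a BOM, which Excel uses to recognize UTF-8
    df.to_csv(filename, index=False, encoding='utf-8-sig')
```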
```python
# Entry point: fetch, parse, and save
def main():
    url = 'https://movie.douban.com/top250'
    html = get_html(url)
    if html is None:  # the fetch failed; nothing to parse
        return
    data = parse_html(html)
    save_to_csv(data, 'douban_movies.csv')

if __name__ == '__main__':
    main()
```
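The Top 250 list spans multiple pages; the pages appear to be addressed by a `start` query parameter in steps of 25 (an assumption about the site's URL scheme, not verified here). A hedged sketch that reuses the functions above to walk all pages, pausing between requests to stay polite:

```python
import time

def crawl_all_pages():
    all_data = []
    # Assumes 10 pages of 25 entries, addressed by ?start=0,25,...,225
    for start in range(0, 250, 25):
        html = get_html(f'https://movie.douban.com/top250?start={start}')
        if html is None:
            continue  # skip pages that failed to load
        all_data.extend(parse_html(html))
        time.sleep(1)  # pause between requests to avoid hammering the server
    save_to_csv(all_data, 'douban_movies.csv')
```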
The code above first imports the required modules: requests for HTTP requests, BeautifulSoup for HTML parsing, re for regular expressions, and pandas for data handling. It then defines a `get_html` function to fetch page content, a `parse_html` function to parse the page and extract the data, and a `save_to_csv` function to persist it. Finally, the `main` function ties these steps together to run the whole scrape.