Crawler Notes 11: Writing Scraped Web Data to CSV
import csv

import requests
from bs4 import BeautifulSoup

# Request headers that mimic a regular browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}


def get_html(url):
    """Fetch the HTML content of a page, or return None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def parse_html(html):
    """Parse the HTML and yield one record per quote."""
    soup = BeautifulSoup(html, 'lxml')
    # On quotes.toscrape.com each quote sits in a div.quote, with the text
    # in span.text and the author in small.author. The page exposes no
    # quote number, so one is generated here with enumerate().
    for number, quote in enumerate(soup.select('div.quote'), start=1):
        yield {
            'number': number,
            'quote': quote.select_one('span.text').text.strip(),
            'author': quote.select_one('small.author').text.strip(),
        }


def save_to_csv(data_list, file_name):
    """Write the records to a CSV file."""
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['number', 'quote', 'author'])
        writer.writeheader()
        for data in data_list:
            writer.writerow(data)


def main():
    url = 'http://quotes.toscrape.com/'
    html = get_html(url)
    if html is None:  # bail out if the request failed
        return
    data_list = parse_html(html)
    save_to_csv(data_list, 'quotes.csv')


if __name__ == '__main__':
    main()
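Since parse_html only needs an HTML string, its selectors can be sanity-checked offline before hitting the network. The snippet below assumes parse_html from the script above is in scope; sample_html is a hand-written stand-in that mirrors the markup of quotes.toscrape.com, not text copied from the live site:

# A hand-written fragment shaped like one quote on the real page
sample_html = '''
<div class="quote">
  <span class="text">"Quality is not an act, it is a habit."</span>
  <span>by <small class="author">Aristotle</small></span>
</div>
'''

for record in parse_html(sample_html):
    print(record)
# {'number': 1, 'quote': '"Quality is not an act, it is a habit."', 'author': 'Aristotle'}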
This code first defines the request headers used to mimic a browser visit. The get_html function then fetches the page's HTML, returning None on any request failure. parse_html parses the HTML with BeautifulSoup and yields the extracted records one at a time, so they stream straight into the CSV writer without all being held in memory. Finally, save_to_csv writes the records to a CSV file, and main chains these functions together into the complete scrape-and-save flow. A quick way to verify the result is to read quotes.csv back, as sketched below.
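This read-back check assumes quotes.csv was produced by the script above with the same field names and encoding:

import csv

# Print the first three rows of quotes.csv to verify the output;
# the encoding matches the one used by save_to_csv above.
with open('quotes.csv', newline='', encoding='utf-8') as file:
    for i, row in enumerate(csv.DictReader(file)):
        print(row['number'], row['author'], '-', row['quote'][:40])
        if i >= 2:
            break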