Practical Web Scraping in Python: A Douban Books Scraper
import requests
from bs4 import BeautifulSoup
import csv
# Request headers that make the crawler look like a regular browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
def get_page(url):
    """Send the request and return the page HTML, or None on failure."""
    try:
        # A timeout keeps a stalled connection from hanging the crawler
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None
def parse_page(html):
    """Parse the listing page and yield one dict of details per book."""
    soup = BeautifulSoup(html, 'lxml')
    book_list = soup.find('ul', class_='book-list')
    if book_list is None:  # layout changed or the request was blocked
        return
    for book in book_list.find_all('li'):
        info = book.find('div', class_='info')
        star = book.find('div', class_='star')
        yield {
            'title': info.h2.a.text.strip(),
            'author': info.find_all('a', class_='author')[-1].text.strip(),
            'rating': star.find('span', class_='rating_nums').text,
            'votes': star.find_next_sibling('a').text.strip().strip('()'),
            'thumbnail': book.find('a', class_='image_book').find('img').get('src'),
            'status': info.find('span', class_='tag').text.strip(),
            'summary': book.find('p', class_='quote').text.strip(),
        }
def save_to_csv(data):
    """Append one record to the CSV file, writing the header row if the file is new."""
    with open('douban_books.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data.keys())
        if f.tell() == 0:  # empty file: write the header once
            writer.writeheader()
        writer.writerow(data)
def main(url):
    """Drive the whole flow: fetch, parse, and persist one listing page."""
    html = get_page(url)
    if html is None:  # request failed; nothing to parse
        return
    for book in parse_page(html):
        print(book)
        save_to_csv(book)

if __name__ == '__main__':
    url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T'
    main(url)
This code implements a simple Douban books scraper. Request headers are defined first so the requests look like ordinary browser visits. get_page sends the request and returns the page HTML. parse_page parses that HTML and extracts each book's details. save_to_csv appends the extracted records to a CSV file. Finally, main ties the flow together: pass it a listing URL to start the crawl. Two optional extensions are sketched below.
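The start parameter in the URL is the listing offset, so crawling several pages just means stepping it. Here is a minimal sketch of that loop; the 20-items-per-page step size is an assumption about Douban's tag listings, not something confirmed by the code above, and the sleep is a politeness delay between requests:

import time

if __name__ == '__main__':
    # Step the offset through the first five listing pages.
    # The 20-item page size is an assumed layout detail.
    for start in range(0, 100, 20):
        page_url = f'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={start}&type=T'
        main(page_url)
        time.sleep(2)  # pause so we don't hammer the server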
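To sanity-check the output file, the saved rows can be read back with csv.DictReader; this quick check assumes the field names used in parse_page above:

import csv

with open('douban_books.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        # Print a compact summary of each saved record
        print(row['title'], row['rating'])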