Python Web Scraping in Practice: Crawling News Data (A Simple Depth Crawler)
import requests
from bs4 import BeautifulSoup
import time
import random
import csv

def get_html(url, headers):
    """Fetch a page and return its HTML text, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

def parse_html(html, keyword):
    """Parse the list page and yield detail-page URLs whose titles contain the keyword."""
    soup = BeautifulSoup(html, 'lxml')
    news_list = soup.select('div.news-box > ul > li > a')
    for news in news_list:
        title_tag = news.select_one('h3')
        if title_tag is None:
            continue
        title = title_tag.text
        if keyword.lower() in title.lower():
            url = news['href']
            print(f'Crawling news: {title}')
            yield url

def get_news_detail(url, headers):
    """Fetch a detail page and return the article body text, or None if it cannot be parsed."""
    html = get_html(url, headers)
    if html is None:
        return None
    soup = BeautifulSoup(html, 'lxml')
    content_tag = soup.select_one('div.article-content')
    return content_tag.text if content_tag else None

def save_to_csv(data, filename):
    """Append one row to the CSV file."""
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(data)

def main(keyword):
    base_url = 'https://news.sina.com.cn/china/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Cookie': 'your_cookie_here'
    }
    filename = f'{keyword}_news.csv'
    html = get_html(base_url, headers)
    if html is None:
        print('Failed to fetch the news list page.')
        return
    for url in parse_html(html, keyword):
        content = get_news_detail(url, headers)
        if content is None:
            continue
        save_to_csv([url, content], filename)
        time.sleep(random.uniform(1, 3))  # random delay to reduce the risk of an IP ban

if __name__ == '__main__':
    keyword = '科技'  # filter keyword ('科技' means "technology")
    main(keyword)
This code implements a simple depth crawler: it first fetches the HTML of the news list page, then parses out the news titles and URLs, filters for items whose titles contain the given keyword, and hands each matching detail-page URL to a separate function that fetches the article body. Once fetched, the article body is written to a CSV file. To reduce the risk of being blocked by the target site, the crawler waits for a random interval before requesting the next URL.
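As a possible refinement, the fetch step could also be made more resilient to transient network failures. The sketch below is a hypothetical variant of get_html that retries a few times with a growing random delay before giving up; the retry count, timeout, and delay bounds are illustrative assumptions rather than values taken from the original script.

import random
import time

import requests

def get_html_with_retries(url, headers, retries=3):
    """Fetch a page, retrying with a growing random delay on failure (illustrative sketch)."""
    for attempt in range(1, retries + 1):
        try:
            # timeout and retry parameters here are assumed values, not from the original code
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass
        # Back off a little longer after each failed attempt.
        time.sleep(random.uniform(1, 3) * attempt)
    return None

If adopted, this helper could stand in for the get_html calls in main and get_news_detail without changing the rest of the flow.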