Design and Implementation of a Baidu News Crawler Based on Python
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd


def get_news_data(url):
    """Fetch the raw HTML of a news page, raising on HTTP errors."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    res = requests.get(url, headers=headers)
    res.raise_for_status()
    res.encoding = 'utf-8'  # force UTF-8 so Chinese text decodes correctly
    return res.text


def parse_news_data(html):
    """Extract the news body text from the fetched HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    news_data = soup.select('#newsContent')
    news_content = news_data[0].text if news_data else 'No content available.'
    return news_content


def main():
    url = 'http://news.baidu.com/item?tab=0&id=0&type=0&tm=0'
    html = get_news_data(url)
    content = parse_news_data(html)
    print(content)


if __name__ == '__main__':
    main()
This code first defines get_news_data, a function that downloads a page's HTML, then defines parse_news_data, a function that extracts the news body from that HTML, and finally calls both in main to pull news content from Baidu News. In a real application you will need to adapt the request headers, the target URL, and the parsing logic to the actual page structure, as sketched below.
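For instance, when the goal is a list of headlines rather than a single article, the parser can be extended to walk the result items and the rows can be written out with the pandas import already present in the script. The sketch below is only an illustration: the 'div.result' and 'h3 a' selectors are assumed placeholders for whatever markup the live Baidu News page actually uses, and baidu_news.csv is an arbitrary output name.

import pandas as pd
from bs4 import BeautifulSoup


def parse_news_list(html):
    """Collect title/link pairs from a results page.

    The 'div.result' and 'h3 a' selectors are assumptions about the page
    layout; inspect the real page and adjust them before use.
    """
    soup = BeautifulSoup(html, 'html.parser')
    records = []
    for item in soup.select('div.result'):
        link = item.select_one('h3 a')
        if link is None:
            continue
        records.append({
            'title': link.get_text(strip=True),
            'url': link.get('href', '')
        })
    return records


def save_news(records, path='baidu_news.csv'):
    """Write the collected records to a CSV file with pandas."""
    pd.DataFrame(records).to_csv(path, index=False, encoding='utf-8-sig')

Passing the output of get_news_data(url) into parse_news_list and handing the result to save_news gives a simple fetch-parse-save pipeline; utf-8-sig is used so that Chinese titles open cleanly in Excel.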