Advanced Web Scraping in Practice: Crawling Weibo Page Content
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Set request headers to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Fetch the HTML of a Weibo user's profile page
def get_page_content(url):
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

# Parse the page and extract the post text plus repost/comment counts
def parse_weibo_data(html):
    soup = BeautifulSoup(html, 'lxml')
    weibo_data = []
    for container in soup.find_all('div', class_='c'):
        weibo = {}
        # Extract the post text
        text = container.find('span', class_='ctt')
        if text:
            weibo['text'] = text.text.strip()
        else:
            continue
        # Extract repost and comment counts ('转发' / '评论' are the
        # Chinese labels for "repost" / "comment" on the page)
        for info in container.find_all('span', class_='cc'):
            count = re.search(r'\d+', info.text)
            if count is None:
                continue
            if '转发' in info.text:
                weibo['retweet_count'] = count.group(0)
            elif '评论' in info.text:
                weibo['comment_count'] = count.group(0)
        weibo_data.append(weibo)
    return weibo_data

# Save the Weibo data to a CSV file
def save_weibo_data_to_csv(weibo_data, file_name):
    df = pd.DataFrame(weibo_data)
    df.to_csv(file_name + '.csv', index=False, encoding='utf-8-sig')

# Main function
def main(user_id, since_date, count):
    url = f'https://weibo.com/p/100505{user_id}/home?is_search=0&visible=0&is_all=1&since_id=0&sort=time&page={count}'
    html = get_page_content(url)
    if html:
        weibo_data = parse_weibo_data(html)
        save_weibo_data_to_csv(weibo_data, f'weibo_data_{user_id}_{since_date}_{count}')
        print(f'Weibo data saved to weibo_data_{user_id}_{since_date}_{count}.csv')
    else:
        print('Failed to fetch the page content')

# Example: user ID 1234567890, start date 2023-01-01, crawl the first page of data
main(1234567890, '2023-01-01', 1)
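To sanity-check the parsing logic without any network access, parse_weibo_data can be run on a small hand-written HTML fragment that mimics the div.c / span.ctt / span.cc structure the parser expects. The fragment below is invented purely for illustration and is not real Weibo markup:

# Invented sample fragment, for testing the parser offline
sample_html = '''
<div class="c">
  <span class="ctt">今天天气真好!</span>
  <span class="cc">转发[12]</span>
  <span class="cc">评论[3]</span>
</div>
'''

print(parse_weibo_data(sample_html))
# Expected output:
# [{'text': '今天天气真好!', 'retweet_count': '12', 'comment_count': '3'}]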
In this code example, we define a main function that takes a user ID, a start date, and a page number as parameters and saves the scraped Weibo posts to a CSV file (the since_date argument is only used in the output file name here, not for filtering). The example shows how to fetch a web page with Python, extract specific data from it, and save the results to a file. Note that Weibo pages generally require a logged-in session and are protected by anti-scraping measures, so the URL format and CSS classes above should be treated as illustrative and adjusted to the actual page structure.
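As one way to handle that login requirement, the sketch below reuses headers and parse_weibo_data from the listing above and adds a requests.Session carrying a Cookie header copied from a logged-in browser, plus a short pause between pages. The cookie placeholder, the page range, and the 2-second delay are illustrative assumptions, not values from the original code:

import time

session = requests.Session()
session.headers.update(headers)
# Hypothetical placeholder: paste the Cookie value from your own logged-in browser session
session.headers['Cookie'] = 'PASTE_YOUR_COOKIE_STRING_HERE'

def get_page_content_logged_in(url):
    # Same idea as get_page_content above, but reuses the logged-in session
    try:
        response = session.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

def fetch_pages(user_id, pages):
    # Crawl several pages and pause between requests to reduce server load
    all_posts = []
    for page in range(1, pages + 1):
        url = f'https://weibo.com/p/100505{user_id}/home?is_search=0&visible=0&is_all=1&since_id=0&sort=time&page={page}'
        html = get_page_content_logged_in(url)
        if html:
            all_posts.extend(parse_weibo_data(html))
        time.sleep(2)
    return all_posts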