import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
# Request headers sent with every page fetch; the User-Agent makes the
# request look like it comes from a desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}
def get_all_posts(url, timeout=10):
    """Download ``url`` and return the feed-item elements found in its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without a timeout a stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The ``find_all`` result for ``<div class="feed-item-root">`` tags;
        empty when the page contains none.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # Send the GET request with the module-level browser-like headers.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Parse the returned HTML with the lxml parser.
    soup = BeautifulSoup(response.text, 'lxml')
    # Each post is rendered as a div carrying the "feed-item-root" class.
    return soup.find_all('div', class_='feed-item-root')
def _require_element(post, tag_name, css_class):
    """Return the first ``tag_name`` element with ``css_class``, or raise LookupError."""
    node = post.find(tag_name, class_=css_class)
    if node is None:
        # Name the missing element so the printed diagnostic is actionable,
        # instead of a bare "'NoneType' object has no attribute 'text'".
        raise LookupError(f'missing <{tag_name} class="{css_class}"> element')
    return node


def extract_post_info(post):
    """Collect the displayed fields of a single post element.

    Args:
        post: An element exposing BeautifulSoup's ``find(name, class_=...)``
            lookup, such as one of the tags returned by ``get_all_posts``.

    Returns:
        A dict with the keys ``title``, ``url``, ``author_name``,
        ``author_url``, ``media_type``, ``read_count``, ``like_count``,
        ``comment_count`` and ``publish_time``. Text values are
        whitespace-stripped strings; the two URLs are the raw ``href``
        attribute values. Returns ``None`` (after printing a message) when
        any required element, attribute or the author text is missing.
    """
    def div_text(css_class):
        # All plain-text fields live in a <div> identified by its class.
        return _require_element(post, 'div', css_class).text.strip()

    try:
        # Look the title link up once; it carries both the title and the URL.
        title_link = _require_element(post, 'a', 'title-box')

        # The author name is the first line of the author-info text.
        author_info = div_text('author-info')
        author_name = author_info.partition('\n')[0]
        if not author_name:
            raise LookupError('author-info element has no text')

        return {
            'title': title_link.text.strip(),
            'url': title_link['href'],
            'author_name': author_name,
            'author_url': _require_element(post, 'a', 'author-name')['href'],
            'media_type': div_text('media-type'),
            'read_count': div_text('read-count'),
            'like_count': div_text('like-count'),
            'comment_count': div_text('comment-count'),
            'publish_time': div_text('publish-time'),
        }
    except (LookupError, AttributeError, TypeError) as e:
        # LookupError covers missing elements and missing href attributes
        # (KeyError); the other two cover malformed nodes. The post is
        # skipped rather than aborting the crawl.
        print(f'Error extracting post info: {e}')
        return None
# 主函数
def main(max_pages):
# 初始化帖子列表和页码
posts = []
page = 1
# 循环遍历页面
while page <= max_pages:
print(f"Crawling page {page}")
# 构造页面URL
url = f'https://www.xiaohongshu.com/discovery/trending?page={page}'
# 获取页面所有帖子
all_posts = get_all_posts(url)
# 提取每个帖子的详细信息
for post in all_posts: