import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
# Request headers: spoof a desktop Chrome User-Agent so the site serves the
# normal browser-facing HTML instead of rejecting the script as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Fetch one listing page and return its post container elements.
def get_all_posts(url):
    """Download *url* and return all ``div.feed-item-root`` post nodes.

    Parameters
    ----------
    url : str
        Full URL of one listing page to fetch.

    Returns
    -------
    list
        BeautifulSoup ``Tag`` objects, one per post; empty if none found.

    Raises
    ------
    requests.RequestException
        On network failure or timeout.
    """
    # timeout= prevents the crawler from hanging forever on a stalled
    # connection (the original call had none, which blocks indefinitely).
    response = requests.get(url, headers=headers, timeout=10)
    # Parse the HTML with the lxml backend and collect the post containers.
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.find_all('div', class_='feed-item-root')
# Extract the detail fields from a single post element.
def extract_post_info(post):
    """Pull title, URL, author, media type and engagement counts from one post node.

    Parameters
    ----------
    post : object
        An element exposing BeautifulSoup's ``find`` API (e.g. a ``Tag`` for a
        ``div.feed-item-root`` node).

    Returns
    -------
    dict | None
        Keys: ``title``, ``url``, ``author_name``, ``author_url``,
        ``media_type``, ``read_count``, ``like_count``, ``comment_count``,
        ``publish_time`` — all raw strings as scraped. ``None`` when any
        expected child element is missing or malformed (best-effort contract:
        the error is printed and the post is skipped).
    """
    try:
        # Hoist the title link: the original looked it up twice.
        title_link = post.find('a', class_='title-box')
        title = title_link.text.strip()
        post_url = title_link['href']
        # Author block: first line of the stripped text is the display name.
        # (The original's re.search('(.+)', ...).group(1) did exactly this,
        # and raised on empty input just as splitlines()[0] does.)
        author_info = post.find('div', class_='author-info').text.strip()
        author_name = author_info.splitlines()[0]
        author_url = post.find('a', class_='author-name')['href']
        media_type = post.find('div', class_='media-type').text.strip()
        read_count = post.find('div', class_='read-count').text.strip()
        like_count = post.find('div', class_='like-count').text.strip()
        comment_count = post.find('div', class_='comment-count').text.strip()
        publish_time = post.find('div', class_='publish-time').text.strip()
    except Exception as e:
        # Missing/renamed elements surface here as AttributeError/TypeError/
        # KeyError/IndexError; keep the original best-effort behavior.
        print(f'Error extracting post info: {e}')
        return None
    return {
        'title': title,
        'url': post_url,
        'author_name': author_name,
        'author_url': author_url,
        'media_type': media_type,
        'read_count': read_count,
        'like_count': like_count,
        'comment_count': comment_count,
        'publish_time': publish_time
    }
# Main entry point: crawl up to `max_pages` trending pages and collect posts.
def main(max_pages):
    # Accumulated post records; `page` is the 1-based page counter.
    posts = []
    page = 1
    # Walk listing pages until max_pages is reached.
    while page <= max_pages:
        print(f"Crawling page {page}")
        # Build this page's listing URL.
        url = f'https://www.xiaohongshu.com/discovery/trending?page={page}'
        # Fetch all post container nodes on the page.
        all_posts = get_all_posts(url)
        # Extract detail fields from each post.
        # NOTE(review): the loop body is truncated in this copy of the file —
        # presumably it appends extract_post_info(post) results to `posts` and
        # increments `page`; confirm against the full file.
        for post in all_posts:
# ("评论已关闭" / "comments are closed" — page-scrape residue, not code; the
# loop body above is truncated at this point in the file)