分类爬虫下的文章

2024-08-19




import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
 
# 爬取数据
def _parse_price(text):
    """Return the first integer found in *text*, or None when there is none.

    Thousands separators (",") are removed; any fractional part after a
    decimal point is ignored so the result stays an int.
    """
    import re  # local import: this snippet's import block does not include re

    match = re.search(r'\d[\d,]*', text)
    if match is None:
        return None
    return int(match.group().replace(',', ''))


def crawl_data(url):
    """Fetch *url* and return the integer prices found in <div class="price"> tags.

    Args:
        url: Page to download.

    Returns:
        A list of ints, one per price element that contains a number.
        Elements without any digits are skipped.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    # A timeout keeps a stalled server from hanging the script forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    prices = []
    for node in soup.find_all('div', class_='price'):
        # Do not assume the first character is a currency symbol: pull the
        # digits out explicitly so "¥1,299" or "299起" still parse.
        price = _parse_price(node.text)
        if price is not None:
            prices.append(price)
    return prices
 
# 数据可视化
def visualize_data(data):
    """Draw a 50-bin histogram of *data* with labelled axes and display it."""
    plt.hist(data, bins=50)
    # Axis labels and title are independent of each other; set them, then show.
    plt.xlabel('价格（人民币）')
    plt.ylabel('酒店数量')
    plt.title('酒店价格分布')
    plt.show()
 
# 数据分析和报告
def analyze_data(data):
    """Print the mean, median and most common value of *data*.

    Args:
        data: Iterable of numeric prices.

    For empty input all three statistics are reported as ``nan`` instead of
    raising while looking up the first mode.
    """
    # Build the Series once and reuse it for every statistic.
    series = pd.Series(data)
    if series.empty:
        mean_price = median_price = mode_price = float('nan')
    else:
        mean_price = series.mean()
        median_price = series.median()
        # mode() returns every value tied for most frequent; report the first.
        mode_price = series.mode()[0]
    print(f'平均价格: {mean_price}')
    print(f'中位数价格: {median_price}')
    print(f'最常见价格: {mode_price}')
 
# 主函数
def main():
    """Crawl the sample hotel page, print summary statistics, then plot them."""
    prices = crawl_data('https://www.example.com/hotels')
    analyze_data(prices)
    visualize_data(prices)
 
 
if __name__ == '__main__':
    main()

这个代码实例展示了如何使用Python爬取数据、进行简单的数据分析、可视化分析结果，并且提供了一个模拟的网络爬虫URL。实际应用中，你需要替换为正确的URL和相应的数据解析方法。

- 阅读更多 -

爬虫实战-Python爬取百度当天热搜内容

System

2024-08-19

所有,爬虫




import requests
from bs4 import BeautifulSoup
import re
import datetime
 
def get_baidu_hot_search(date):
    """Fetch the Baidu "buzz" page for *date* and extract search links.

    Args:
        date: Date string substituted into the ``d=`` query parameter.

    Returns:
        A list of dicts with keys ``'keyword'`` (the value of the link's
        ``wd`` query parameter) and ``'score'`` (the link's inner text, i.e.
        the second capture group of the pattern).

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
    """
    url = f"https://top.baidu.com/buzz?b=1&c=31&class=3&p=1&d={date}"

    # A timeout keeps a stalled server from hanging the script forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    response.encoding = 'UTF-8'

    soup = BeautifulSoup(response.text, 'html.parser')

    # The "?" must be escaped: unescaped it makes the preceding "s" optional
    # and the pattern can never match a literal "/s?wd=" link.
    pattern = re.compile(r'<a href="/s\?wd=(.+?)&amp;.+?>(.+?)</a>')

    return [
        {'keyword': keyword, 'score': score}
        for keyword, score in pattern.findall(str(soup))
    ]
 
# Today's date formatted as YYYYMMDD
today = datetime.date.today().strftime('%Y%m%d')
# Run the scraper at import time and print each result dict on its own line
hot_searches = get_baidu_hot_search(today)
for search in hot_searches:
    print(search)

这段代码使用了requests库来发送HTTP请求，使用BeautifulSoup库来解析HTML，使用正则表达式来提取关键词和热度分数。代码首先定义了一个函数get_baidu_hot_search，该函数接受日期作为参数，并返回该日期的百度热搜关键词列表。然后代码获取了今天的日期字符串，并调用函数输出结果。

- 阅读更多 -

用 python 爬虫抓站的一些技巧总结_爬虫技术如何从分站到总站

System

2024-08-19

所有,爬虫




import requests
from bs4 import BeautifulSoup
 
def get_sub_pages(start_url):
    """Return absolute example.com links found on the page at *start_url*.

    Args:
        start_url: Page whose <a> tags are scanned.

    Returns:
        A list of href values that start with "http" and contain
        "example.com". Empty when the response status is not 200.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # A timeout keeps a stalled server from hanging the script forever.
    response = requests.get(start_url, timeout=10)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    sub_pages = []
    for link in soup.find_all('a'):
        url = link.get('href')
        # <a> tags without an href yield None; skip them instead of crashing
        # on None.startswith.
        if url and url.startswith('http') and 'example.com' in url:
            sub_pages.append(url)
    return sub_pages
 
def get_total_site(sub_pages):
    """Download every URL in *sub_pages* and return the bodies of the 200 responses.

    Pages are requested one at a time in the given order; responses with any
    other status code are left out of the result.
    """
    # Lazy generator: each request is issued only when the list below pulls it.
    fetched = (requests.get(page_url) for page_url in sub_pages)
    # Per-page parsing logic could be added here instead of keeping raw text.
    return [reply.text for reply in fetched if reply.status_code == 200]
 
# Example start URL
start_url = 'http://www.example.com'
# Collect the sub-page links first, then download each of them
sub_pages = get_sub_pages(start_url)
total_site_data = get_total_site(sub_pages)
 
# Logic for saving the data to a file or database could be added here

这个代码示例展示了如何从一个起始URL开始，通过爬取该起始URL的网页内容，找出所有相关的子网页链接，并对每个子网页进行爬取，最终获取整个网站的数据。这个过程是爬虫技术中一个常见的应用场景，展示了如何从分散的页面爬取回来并整合成一个完整的数据集。

- 阅读更多 -

【Python】爬虫实战03：自动化抢票脚本【某麦网】

System

2024-08-19

所有,爬虫




import requests
from bs4 import BeautifulSoup
import time
import json
 
# User account configuration
# NOTE(review): credentials are hard-coded in source; they look like placeholders — confirm before any real use
USER = {
    'username': '123456789',  # user name
    'password': '87654321',   # password
    'captcha_api': 'http://api.decaptcha.com/captcha?key=YOUR_API_KEY', # captcha API endpoint
}
 
# Ticket query configuration
TICKET_INFO = {
    'train_date': '2023-04-01',  # travel date
    'from_station': 'BJP',       # departure station
    'to_station': 'BJQ',         # destination station
}
 
# Request headers for login
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Referer': 'https://kyfw.12306.cn/otn/resources/login.html',
}
 
# Create a session, used to keep session state across requests
session = requests.session()
 
def get_captcha():
    """Download the captcha image to captcha.jpg and return the text the user types.

    Returns:
        The string entered at the prompt.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status (so an error page is never saved as the image).
    """
    resp = session.get(USER['captcha_api'], timeout=10)
    # Fail before writing: otherwise an error body would be saved as captcha.jpg.
    resp.raise_for_status()
    with open('captcha.jpg', 'wb') as f:
        f.write(resp.content)
    # The captcha is entered by hand here; an OCR service could be plugged in
    # to recognise it automatically.
    return input('请输入验证码：')
 
def login():
    """Post the configured credentials to the 12306 login endpoint and print the JSON reply.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError: if the login response body is not valid JSON.
    """
    # Visit the login page first on the shared session. The response body is
    # not needed here, so it is no longer parsed (which also removes the
    # dependency on the lxml parser).
    session.get('https://kyfw.12306.cn/otn/login/init', headers=HEADERS, timeout=10)
    login_form = {
        'username': USER['username'],
        # Per the original note, the password should be the encrypted password.
        'password': USER['password'],
        'appid': 'otn',
    }
    # Send the headers declared for login; previously they were never used here.
    resp = session.post(
        'https://kyfw.12306.cn/otn/login/loginAysnSuggest',
        data=login_form,
        headers=HEADERS,
        timeout=10,
    )
    print(resp.json())
 
def query_ticket():
    """Send the configured ticket query and print the decoded JSON reply."""
    endpoint = 'https://kyfw.12306.cn/otn/leftTicket/queryZ'

    # Assemble the query fields one by one, in the same order as before.
    payload = {}
    payload['leftTicketDTO.train_date'] = TICKET_INFO['train_date']
    payload['leftTicketDTO.from_station'] = TICKET_INFO['from_station']
    payload['leftTicketDTO.to_station'] = TICKET_INFO['to_station']
    payload['purpose_codes'] = 'ADULT'

    reply = session.post(endpoint, json=payload, headers=HEADERS)
    print(reply.json())
 
def buy_ticket():
    """Placeholder for the purchase flow; currently does nothing.

    A real implementation has to follow the actual purchase steps
    (such as choosing a carriage and a seat).
    """
    return None
 
def main():
    """Run the workflow steps in order: captcha, login, ticket query, purchase."""
    # Each step takes no arguments; the captcha text returned by the first
    # step is not consumed by any later step.
    for step in (get_captcha, login, query_ticket, buy_ticket):
        step()
 
 
if __name__ == '__main__':
    main()

这个示例代码提供了一个简化的框架，来说明获取验证码、登录、查询票务信息和购票的基本流程。

- 阅读更多 -

爬虫综合案例-使用Scrapy爬取当当网的图片信息

System

2024-08-19

所有,爬虫

以下是一个使用Scrapy框架爬取当当网图片信息的示例代码：

首先，创建一个Scrapy项目：




# Create the Scrapy project skeleton, then switch into its directory
scrapy startproject dangdang_images
cd dangdang_images

然后，定义Item来存储爬取的数据：




# items.py
import scrapy
 
class DangdangImageItem(scrapy.Item):
    """Item carrying the image URLs of one product and where they were saved."""
    # Image URLs
    image_urls = scrapy.Field()
    # Paths the images were saved to after download
    image_paths = scrapy.Field()

接着，编写爬虫：




# spiders/dangdang_spider.py
import scrapy
from ..items import DangdangImageItem
 
class DangdangSpider(scrapy.Spider):
    """Spider that yields one DangdangImageItem per product on a category listing."""

    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4002135.html']

    def parse(self, response):
        """Yield an item per product with images, then follow the next-page link.

        Args:
            response: The listing page response.

        Yields:
            DangdangImageItem objects with absolute ``image_urls``, followed
            by a Request for the next page when one is found.
        """
        for product in response.css('ul.bigimg li'):
            # Resolve relative or protocol-relative src values against the
            # page URL; the image pipeline needs absolute URLs to download.
            image_urls = [
                response.urljoin(src)
                for src in product.css('img::attr(src)').extract()
                if src
            ]
            # Skip products without any image instead of emitting an empty item.
            if not image_urls:
                continue
            item = DangdangImageItem()
            item['image_urls'] = image_urls
            yield item

        # Follow the next-page link, if present, with this same callback.
        next_page = response.css('div.paging_next a::attr(href)').extract_first()
        if next_page:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

最后，设置PIL和Scrapy图片管道以下载图片：




# pipelines.py
import os
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
 
class DangdangImagesPipeline(ImagesPipeline):
    """Image pipeline that downloads item['image_urls'] and records the saved paths."""

    def get_media_requests(self, item, info=None):
        """Yield one download Request per URL in ``item['image_urls']``.

        ``info`` is accepted because Scrapy calls this hook with
        ``(item, info)``; it defaults to None so one-argument calls still work.
        """
        # Local import: this module's import block does not bring in scrapy
        # itself, so referencing scrapy.Request here would be a NameError.
        from scrapy import Request

        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads on the item.

        Raises:
            DropItem: when none of the item's images downloaded successfully.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('All images failed downloading.')
        item['image_paths'] = image_paths
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``full/<last URL segment>`` for *request*.

        ``item`` is accepted as a keyword-only argument for compatibility
        with Scrapy versions that pass it; it is not used.
        """
        # NOTE(review): two images whose URLs end in the same segment map to
        # the same file — confirm this is acceptable for the target site.
        image_guid = request.url.split('/')[-1]
        image_name = os.path.basename(image_guid)
        return 'full/{0}'.format(image_name)

在settings.py中启用和配置图片管道以及图片存储路径：




# Enable the custom image pipeline, keyed by its dotted path with order value 1
ITEM_PIPELINES = {
    'dangdang_images.pipelines.DangdangImagesPipeline': 1,
}
 
# Directory for downloaded images (placeholder value; replace with a real path)
IMAGES_STORE = 'path_to_your_images_directory'

以上代码实现了一个简单的Scrapy爬虫，用于爬取当当网产品页面的图片链接，并通过Scrapy的图片管道下载这些图片。这个案例展示了如何使用Scrapy爬取图片并保存到本地。

- 阅读更多 -

python爬虫之JS逆向——爬虫基础

System

2024-08-19

所有,爬虫




import requests
import execjs
 
# Request the page
# NOTE(review): `response` is not used anywhere below in this snippet
url = 'http://example.com/path/to/page'
response = requests.get(url)
 
# JS source holding the encryption function to call
# NOTE(review): the placeholder encrypt() body contains only comments and no
# return statement — replace it with the real function before relying on the result
js_code = """
function encrypt(data) {
    // 这里是加密函数的代码
    // ...
}
"""
 
# Compile the JS with execjs and call encrypt() with the data to encrypt
ctx = execjs.compile(js_code)
encrypted_data = ctx.call('encrypt', 'your_data_here')
 
# Send a POST request carrying the encrypted data
post_url = 'http://example.com/path/to/post/endpoint'
post_data = {
    'encryptedField': encrypted_data
}
post_response = requests.post(post_url, data=post_data)
 
# Print the response body
print(post_response.text)

这个示例展示了如何使用Python的requests库来获取网页内容，以及如何使用execjs库来执行提供的JavaScript加密函数，并将加密后的数据用于POST请求。这是进行Web爬虫开发时了解和应用JavaScript加密的一个基本例子。

- 阅读更多 -

【爬虫】案例-爬取B站视频

System

2024-08-19

所有,爬虫

以下是一个简单的Python爬虫示例，用于爬取B站视频的信息。请注意，这个例子仅用于学习目的，实际使用时应遵守相关法律法规及网站使用协议，合理使用爬虫技术，并尽量减少对网站服务器的压力。




import requests
from bs4 import BeautifulSoup
import re
 
# Request headers that mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Video page URL
url = 'https://www.bilibili.com/video/BV1v54y1q75B'


def _node_text(node, strip=True):
    """Return the text of *node* (stripped unless strip=False), or None if node is None."""
    if node is None:
        return None
    return node.text.strip() if strip else node.text


# Send the GET request; the timeout keeps a stalled server from hanging the script
response = requests.get(url, headers=headers, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Every lookup below tolerates a missing element: the field is reported
    # as None instead of raising AttributeError on `.text`.

    # Video title
    title = _node_text(soup.find('h1', class_='title'), strip=False)
    print(f'视频标题: {title}')

    # Video play URL, taken from the raw page text with a regular expression
    play_urls = re.findall(r'\"playUrl\":\"(http[^\"]+)\"', response.text)
    video_url = play_urls[0] if play_urls else None
    print(f'视频地址: {video_url}')

    # Video description
    description = _node_text(soup.find('div', class_='video-desc'))
    print(f'视频描述: {description}')

    # Comment count
    comment_count = _node_text(soup.find('div', class_='number'))
    print(f'评论数: {comment_count}')

    # Play count
    play_count = _node_text(soup.find('div', class_='view-count'))
    print(f'播放量: {play_count}')

    # Score
    score = _node_text(soup.find('div', class_='video-score'))
    print(f'评分: {score}')

    # Comment list (example only; real use needs paginated fetching)
    comments = soup.find_all('div', class_='comment')
    for comment in comments:
        content = _node_text(comment.find('span', class_='content'), strip=False)
        print(f'评论内容: {content}')
else:
    print('请求失败')

这段代码使用了requests库来发送HTTP请求，BeautifulSoup库来解析页面，以及正则表达式来提取特定的数据。请确保在运行代码前已经安装了这些库（可以使用pip install requests beautifulsoup4命令安装）。

这个例子提取了视频标题、视频描述、评论数、播放量、评分和评论内容。实际应用中可能需要处理动态加载的数据、登录状态、反爬机制等问题，并且应该遵守B站的爬虫政策。

- 阅读更多 -

Python |浅谈爬虫的由来

System

2024-08-19

所有,爬虫

网络爬虫，又称为网页爬虫，是一种按照一定规则自动抓取互联网网页信息的程序或脚本。

爬虫的由来可以追溯到早期的搜索引擎发展阶段，早期的搜索引擎要收集大量的网页信息，为了实现这个目标，就需要有能够自动获取网页的程序。这样的程序最早的应用是在1990年左右的ARPANET（早期的互联网前身）上，用于传递新闻组帖子。随着互联网的发展，爬虫在各种场景中的应用也越来越广泛，包括但不限于数据分析、商业智能、机器学习等领域。

以下是一个简单的Python爬虫示例，使用requests和BeautifulSoup库：




import requests
from bs4 import BeautifulSoup
 
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block forever.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
 
def parse_html(html):
    """Parse *html* with the built-in html.parser and return all <a> tags."""
    return BeautifulSoup(html, 'html.parser').find_all('a')
 
def print_links(links):
    """Print the 'href' value of each element in *links*, one per line."""
    # Lazy generator keeps the lookup/print interleaving of a plain loop.
    hrefs = (anchor.get('href') for anchor in links)
    for href in hrefs:
        print(href)
 
# Fetch the example page, collect its <a> tags, and print each href
url = 'https://www.example.com'
html = get_html(url)
links = parse_html(html)
print_links(links)

这个简单的爬虫示例首先定义了一个获取HTML内容的函数，然后定义了一个解析HTML并找出所有<a>标签的函数，最后定义了一个打印所有链接的函数。这个简单的爬虫可以作为爬虫编写的入门示例。

- 阅读更多 -

爬虫--抓取网易云评论信息

System

2024-08-19

所有,爬虫

爬取网易云音乐的评论信息，可以使用Python语言结合requests和json库来实现。以下是一个简单的示例代码，用于抓取网易云音乐某首歌曲的评论信息：




import requests
import json
 
# Song ID (the original author's note says this ID is the song 《青春修炼手册》)
music_id = '1404578778'

# NetEase Cloud Music comments API URL
url = f'https://music.163.com/api/v2/resource/comments/R_SO_4_{music_id}?limit=20&offset=0'

# Send the HTTP request; the timeout keeps a stalled server from hanging the script
response = requests.get(url, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    # Parse the JSON payload
    data = response.json()

    # Extract the comment list without assuming the key is present: an error
    # payload has no 'comments' key, and the list may be nested under 'data'
    # (presumably the case for some API versions — TODO confirm).
    comments = data.get('comments')
    if comments is None:
        comments = (data.get('data') or {}).get('comments') or []

    # Print each comment's content and time
    for comment in comments:
        content = comment['content']
        time = comment['time']
        print(f"评论内容：{content}, 评论时间：{time}")

else:
    print("请求失败")

请注意，由于网易云有反爬机制，实际爬取时可能需要处理cookies、headers、代理等问题，并且需要遵守网易云的使用协议，避免对其服务造成过大压力或者违反其规定。

- 阅读更多 -

【Python爬虫】Python爬取喜马拉雅，爬虫教程！

System

2024-08-19

所有,爬虫

以下是一个简单的Python爬虫示例，用于爬取喜马拉雅网站上的音频播放链接。




import requests
from bs4 import BeautifulSoup
import re
 
# 音频信息类
class AudioInfo:
    """Plain record holding an audio entry's title and URL."""

    def __init__(self, title, url):
        self.title, self.url = title, url
 
# 获取音频信息
def get_audio_info(url):
    """Return an AudioInfo for every <a class="audio-title"> on the page at *url*.

    An empty list is returned when the response status is not 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return []

    page = BeautifulSoup(response.text, 'html.parser')
    # Title comes from the anchor text, URL from its href attribute.
    return [
        AudioInfo(anchor.text, anchor['href'])
        for anchor in page.find_all('a', class_='audio-title')
    ]
 
# 正则表达式提取音频链接
def extract_audio_url(text):
    """Return every value that appears as play_url:"..." in *text*, in order."""
    return re.findall(r'play_url:"(.*?)"', text)
 
# 测试函数
def test_get_audio_info(url):
    """Print each audio entry found at *url* and the play URLs on its own page.

    Args:
        url: Listing page passed to get_audio_info.

    For every entry the title and link are printed; the linked page is then
    downloaded and each play_url found in its content is printed. Entries
    whose page does not return status 200 are skipped after the first print.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # Local import: this snippet's import block does not include urllib.
    from urllib.parse import urljoin

    for info in get_audio_info(url):
        print(info.title, info.url)
        # The play_url pattern lives in page content, not in the href itself,
        # so fetch the entry's page (resolving relative links) before searching.
        detail_response = requests.get(urljoin(url, info.url), timeout=10)
        if detail_response.status_code != 200:
            continue
        # Separate loop name so the `url` parameter is not overwritten.
        for audio_url in extract_audio_url(detail_response.text):
            print(audio_url)
 
# 主函数
def main():
    """Run the demo against the Ximalaya category listing page."""
    test_get_audio_info('https://www.ximalaya.com/category/500033/')
 
 
if __name__ == '__main__':
    main()

这段代码首先定义了一个AudioInfo类来存储音频的标题和URL。get_audio_info函数用于获取指定URL下的所有音频信息，并返回一个AudioInfo对象列表。extract_audio_url函数使用正则表达式来提取每个音频播放的实际URL。test_get_audio_info函数用于测试这些功能，并打印结果。最后，main函数定义了喜马拉雅电台分类页面的URL，并调用test_get_audio_info函数来运行爬虫。

注意：由于喜马拉雅有反爬策略，实际运行时可能需要设置合适的请求头、处理登录验证、使用代理等。此代码仅为示例，用于演示基本的爬虫逻辑。

- 阅读更多 -