标签 python 下的文章

2024-08-13




import weibo
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
 
# Set up the Weibo API client; the four string arguments look like placeholders for real credentials.
# NOTE(review): the `weibo` package this targets is not shown here — confirm that its
# APIClient constructor really accepts these four positional arguments. TODO confirm.
client = weibo.APIClient('app_key', 'app_secret', 'access_token', 'access_token_secret')
 
def get_weibo_hot_search(timespan):
    """Collect hot-search results for the offsets ``0 .. timespan - 1``.

    ``client.trends.hot(offset)`` is called once per offset. On the first call
    that raises, the exception is printed and collection stops; whatever was
    gathered up to that point is still returned.

    Args:
        timespan: number of consecutive offsets to request.

    Returns:
        A list with one entry per successful call, in request order.
    """
    collected = []
    for offset in range(timespan):
        try:
            daily = client.trends.hot(offset)
        except Exception as err:
            print(err)
            break
        else:
            collected.append(daily)
    return collected
 
def parse_weibo_hot_search(hot_search):
    """Flatten per-day hot-search results into a single list of records.

    Args:
        hot_search: iterable of days, each day being an iterable of mappings
            with the keys 'created_at', 'rank', 'keyword', 'query' and 'type'.

    Returns:
        A list of dicts with the keys 'date' (taken from 'created_at'),
        'rank', 'keyword', 'query' and 'type', in input order.

    Raises:
        KeyError: if an item lacks one of the keys read here.
    """
    return [
        {
            'date': entry['created_at'],
            'rank': entry['rank'],
            'keyword': entry['keyword'],
            'query': entry['query'],
            'type': entry['type'],
        }
        for daily_items in hot_search
        for entry in daily_items
    ]
 
def visualize_data(data):
    """Plot the number of type-'100' hot-search records per date.

    Args:
        data: list of record dicts carrying at least the keys 'date' and
            'type' (the shape produced by parse_weibo_hot_search).

    Returns:
        None. A matplotlib line chart is shown; when there is nothing to
        plot, a short message is printed instead and no figure is created.
    """
    df = pd.DataFrame(data)
    # An empty input yields a frame without a 'type' column, so the filter
    # below would raise KeyError — bail out first.
    if df.empty:
        print('没有可绘制的热搜数据')
        return

    df = df[df['type'] == '100']  # keep only records whose type is '100'
    if df.empty:
        print('没有可绘制的热搜数据')
        return

    # groupby() sorts its keys, so no explicit sort is needed; the per-date
    # sizes are computed once and reused for both axes.
    counts = df.groupby('date').size()

    plt.figure(figsize=(15, 6))
    plt.plot(counts.index, counts, color='blue', marker='o')
    plt.title('微博热搜索趋势', fontsize=16)
    plt.xlabel('日期', fontsize=14)
    plt.ylabel('搜索次数', fontsize=14)
    plt.xticks(rotation=45)
    plt.show()
 
# Entry point: fetch, parse and plot the hot-search data.
if __name__ == '__main__':
    days = 7  # request data for the past 7 days
    records = parse_weibo_hot_search(get_weibo_hot_search(days))
    visualize_data(records)

这段代码首先设置了微博客户端，然后定义了获取和解析微博热搜数据的函数。最后，定义了一个可视化数据的函数，并在主程序中依次调用这些函数，完成微博热搜数据的获取和可视化。这个例子展示了如何使用Python通过API客户端获取微博数据并进行可视化分析，对于学习微博数据分析的开发者有一定的参考价值。

- 阅读更多 -

基于Python实现爬虫+协同过滤算法的招聘信息推荐系统

System

2024-08-13

所有,爬虫




import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
 
# Module-level accumulators; get_job_info() appends to all three.
job_descriptions, job_titles, jobs = [], [], []
 
def get_job_info(url):
    """Scrape one listing page and append its jobs to the module-level lists.

    Every ``div.job-card-container`` that contains both an ``h2.title`` and a
    ``div.description`` contributes one entry to ``job_descriptions``,
    ``job_titles`` and ``jobs`` (kept in lockstep). Cards missing either
    element are skipped rather than crashing on ``None.text``.

    Args:
        url: address of the listing page to fetch.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # A timeout keeps the script from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    for card in soup.find_all('div', class_='job-card-container'):
        title_tag = card.find('h2', class_='title')
        description_tag = card.find('div', class_='description')
        if title_tag is None or description_tag is None:
            continue
        job_title = title_tag.text
        job_description = description_tag.text
        job_descriptions.append(job_description)
        job_titles.append(job_title)
        jobs.append({'title': job_title, 'description': job_description})
 
# Scrape the first results page; this fills job_descriptions, job_titles and jobs.
get_job_info('https://www.seek.com.au/jobs/in-all-jobs?page=1')

# Turn the job descriptions into TF-IDF vectors (only descriptions are vectorized, not titles).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(job_descriptions)
# Pairwise cosine similarity between all description vectors.
cosine_sim = cosine_similarity(tfidf_matrix)
 
def get_similar_jobs(job_title, cosine_sim, top_n=5):
    """Return the jobs whose descriptions are most similar to a given job's.

    Args:
        job_title: exact title to look up in the module-level ``job_titles``;
            the first match is used.
        cosine_sim: square similarity matrix whose row ``i`` holds the
            similarities of ``jobs[i]`` to every job.
        top_n: maximum number of jobs to return (default 5, the size of the
            previously hard-coded slice).

    Returns:
        Up to ``top_n`` dicts from ``jobs``, most similar first, never
        including the looked-up job itself.

    Raises:
        ValueError: if ``job_title`` is not present in ``job_titles``.
    """
    idx = job_titles.index(job_title)
    ranked = cosine_sim[idx].argsort()[::-1]
    # Exclude the query job by index rather than assuming it sorts first:
    # another job with an identical description ties at the same similarity
    # and may take position 0, which made the old [1:6] slice return the
    # query job itself.
    return [jobs[i] for i in ranked if i != idx][:top_n]
 
# Look up jobs similar to a given title. The title may not be among the
# scraped jobs, in which case the list.index() lookup inside
# get_similar_jobs raises ValueError — report that instead of crashing.
try:
    similar_jobs = get_similar_jobs('Data Scientist', cosine_sim)
except ValueError:
    print("No scraped job is titled 'Data Scientist'.")
    similar_jobs = []

# Print the titles of the similar jobs.
for job in similar_jobs:
    print(job['title'])

这个简化的代码实例展示了如何使用Python爬取招聘信息，并使用TF-IDF和cosine相似性来找出相似的工作描述。这个系统可以作为一个基础来进行工作相关性的研究和推荐。

- 阅读更多 -

python智慧交通数据分析系统时间序列预测算法爬虫出行速度预测拥堵预测大数据

System

2024-08-13

所有,爬虫




import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
from fbprophet import Prophet
 
def get_data(city, date):
    """Return one week of simulated hourly traffic counts.

    Stand-in for a real data source: the values are random and ``city`` is
    not used.

    Args:
        city: city name (ignored by this simulated implementation).
        date: start of the series, passed to ``pd.date_range``.

    Returns:
        DataFrame with 24 * 7 rows and two columns: 'time' (hourly
        timestamps starting at ``date``) and 'car_count' (random integers
        in [1000, 10000)).
    """
    hours = 24 * 7
    timestamps = pd.date_range(start=date, periods=hours, freq='H')
    simulated_counts = np.random.randint(1000, 10000, size=hours)
    return pd.DataFrame({'time': timestamps, 'car_count': simulated_counts})
 
# Fetch the (simulated) data.
city = '北京'
date = '2021-01-01'
data = get_data(city, date)

# Preprocessing.
data['hour'] = data['time'].dt.hour
# Prophet only accepts a frame with a 'ds' (timestamp) and a 'y' (value)
# column, so build the training frame before 'time' becomes the index.
train = data[['time', 'car_count']].rename(columns={'time': 'ds', 'car_count': 'y'})
data = data.set_index('time')

# Time-series forecast with Prophet.
model = Prophet(daily_seasonality=True, weekly_seasonality=True)
model.fit(train)
# The history is hourly, so extend it by 24 * 7 *hours*; the default
# frequency would append 24 * 7 days instead.
future = model.make_future_dataframe(periods=24*7, freq='H')
forecast = model.predict(future)
 
# Plot the forecast.
fig1 = model.plot(forecast)

# Save the figure to a PNG file named after the city.
fig1.savefig(f'prophet_prediction_{city}.png')

# Print the 'ds' and 'yhat' columns of the forecast.
print(forecast[['ds', 'yhat']])

这个示例展示了如何使用Prophet模型进行时间序列预测，并将预测结果保存为图片文件。注意，这里的数据是模拟的，实际应用中需要替换为实际的交通数据。

- 阅读更多 -

【爬虫实战】用python爬今日头条热榜TOP50榜单！

System

2024-08-13

所有,爬虫




import requests
from bs4 import BeautifulSoup
import pandas as pd
 
# Request headers that mimic a visit from a regular browser.
_BROWSER_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
)
headers = {'User-Agent': _BROWSER_UA}
 
def get_data(url):
    """Fetch a page and extract ranks, titles and links from it.

    Ranks come from the ``<span>`` inside every ``div.num``; titles and links
    come from the ``<a>`` inside every ``div.title``.

    Args:
        url: address of the page to fetch.

    Returns:
        A ``(rank, names, hrefs)`` tuple of three lists of strings.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # Imported locally so the snippet's import block stays untouched.
    from urllib.parse import urljoin

    # A timeout keeps the call from hanging forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    title_nodes = soup.find_all('div', class_='title')
    rank = [node.span.get_text() for node in soup.find_all('div', class_='num')]
    names = [node.a.get_text() for node in title_nodes]
    # urljoin handles root-relative, relative and already-absolute hrefs;
    # plain concatenation only produced a valid URL for the first kind.
    hrefs = [urljoin('https://www.toutiao.com', node.a.get('href')) for node in title_nodes]
    return rank, names, hrefs
 
def main(url):
    """Scrape ``url``, print the result table and save it as a CSV file.

    The table has the columns '排名', '名称' and '链接' and is written to
    '今日头条热榜.csv' (UTF-8, without the index column).
    """
    ranks, titles, links = get_data(url)
    rows = list(zip(ranks, titles, links))
    table = pd.DataFrame(rows, columns=['排名', '名称', '链接'])
    print(table)
    table.to_csv('今日头条热榜.csv', index=False, encoding='utf-8')
 
# Entry point: scrape the hot-words page when run as a script.
if __name__ == '__main__':
    main('https://www.toutiao.com/hotwords/')

这段代码首先定义了请求头，用于模拟浏览器访问网页。get_data 函数用于获取网页数据，并通过BeautifulSoup进行解析。main 函数则是程序的主要逻辑，它调用get_data函数获取数据，并将数据存储在一个DataFrame中，最后将数据保存到CSV文件中。最后，在__name__为__main__时，执行主函数，开始爬取数据。

- 阅读更多 -

使用 python 构建企业级高可用海量爬虫调度系统

System

2024-08-13

所有,爬虫

要使用Python构建一个企业级的高可用海量爬虫调度系统，可以选择使用Kubernetes和Python的第三方库如Celery来实现分布式任务调度，以及Scrapy来实现爬虫。

以下是一个基本的架构示例：

Kubernetes：负责整个系统的部署、扩缩容和服务发现。
Celery：负责分布式任务调度。
Scrapy：用于实现爬虫。

以下是一个简单的Celery配置示例：




# celery_tasks.py
from celery import Celery
 
# The same Redis database is used as message broker and as result backend.
REDIS_URL = 'redis://localhost:6379/0'

app = Celery('my_crawler', broker=REDIS_URL, backend=REDIS_URL)


@app.task
def add(x, y):
    """Demo task: return ``x + y``."""
    total = x + y
    return total

在Kubernetes中部署Celery：




# celery-deployment.yaml
# Deployment running three replicas of the Celery worker container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: celery-worker
spec:
  replicas: 3
  selector:
    matchLabels:
      component: celery-worker
  template:
    metadata:
      labels:
        component: celery-worker
    spec:
      containers:
      - name: celery-worker
        image: my_celery_worker_image
        # `command` must be a list of strings (one argv element each), not a
        # single shell-style string. -A names the module that defines the app
        # (celery_tasks.py) and is a global option, so it precedes `worker`.
        command: ["celery", "-A", "celery_tasks", "worker", "-l", "info"]

在Scrapy中创建爬虫项目，并配置为在Celery中运行：




# myspider.py
import scrapy
# celery_tasks.py is a plain module that defines `add` directly, so there is
# no `celery_tasks.tasks` sub-module to import from.
from celery_tasks import add


class MySpider(scrapy.Spider):
    """Spider that hands scraped data to a Celery task for processing."""

    name = 'myspider'

    def parse(self, response):
        """Handle one response and submit the extracted data to Celery."""
        # Placeholder for the scraped data.
        item = ...
        # Submit the scraped data to Celery for asynchronous processing.
        # NOTE(review): `add` is declared as add(x, y) in celery_tasks.py, so
        # this one-argument call does not match its signature — replace it
        # with a dedicated item-processing task before using this for real.
        add.delay(item)

这个例子展示了如何使用Celery和Scrapy来构建一个简单的分布式爬虫系统。在实际部署中，你需要考虑更多的细节，如错误处理、日志记录、监控、安全性等。此外，你还需要搭配Kubernetes的服务发现和自动扩缩容功能来保证系统的高可用性和扩展性。

- 阅读更多 -

网络爬虫——python爬取豆瓣评论

System

2024-08-13

所有,爬虫

要使用Python爬取豆瓣电影评论，你可以使用requests库获取网页内容，然后用BeautifulSoup解析网页。以下是一个简单的示例代码：




import requests
from bs4 import BeautifulSoup
import time
 
# HTTP request headers that mimic a visit from a regular browser.
_BROWSER_UA = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
)
headers = {'User-Agent': _BROWSER_UA}
 
def get_comments(url):
    """Fetch one page and print every comment found on it.

    A comment is a ``div.comment`` containing a ``div.comment-content`` (the
    text) and a ``span.comment-info`` (the author). Blocks missing either
    element are skipped rather than crashing on ``None.text``.

    Args:
        url: address of the comment page to fetch.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # A timeout keeps the call from hanging forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # stop on 4xx/5xx responses
    soup = BeautifulSoup(response.text, 'lxml')

    for comment in soup.find_all('div', class_='comment'):
        content_node = comment.find('div', class_='comment-content')
        author_node = comment.find('span', class_='comment-info')
        if content_node is None or author_node is None:
            continue
        content = content_node.text.strip()
        author = author_node.text.strip()
        print(f'评论内容：{content}\n作者：{author}\n')
 
def main(url):
    """Print the comments of pages 1 through 10 of ``url``.

    Each page address is built by appending ``&page=<n>`` to ``url``, so
    ``url`` must already contain a query string.
    NOTE(review): assumes the site paginates through a ``page`` query
    parameter — confirm against the target site.
    """
    page_numbers = range(1, 11)  # only the first 10 pages
    for page in page_numbers:
        print(f'正在抓取第{page}页评论...')
        get_comments(f'{url}&page={page}')
        # Wait 2 seconds between requests to lower the load on the server.
        time.sleep(2)
 
# Entry point: scrape the comment pages of one movie when run as a script.
if __name__ == '__main__':
    main('https://movie.douban.com/subject/1292720/comments?sort=new_score')

请注意，由于豆瓣网站可能有反爬机制，实际运行时可能需要处理登录验证、反爬机制等问题。此外，频繁的爬取数据可能会对豆瓣服务器造成压力，应遵守豆瓣的爬虫政策。

System

2024-08-13

所有,python




# 导入必要的库
import pandas as pd
 
# Build a small example DataFrame.
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [5, 4, 3, 2, 1],
    'C': [10, 20, 30, 40, 50]
})

# 1. Look at the first and last rows.
print(df.head())  # first 5 rows by default
print(df.tail(3))  # last 3 rows

# 2. Summary statistics.
print(df.describe())

# 3. Sort by a column.
sorted_df = df.sort_values(by='A')
print(sorted_df)

# 4. Select specific columns.
print(df[['A', 'B']])

# 5. Filter rows with a condition.
filtered_df = df[df['A'] > 2]
print(filtered_df)

# 6. Group and aggregate.
grouped_df = df.groupby('A').sum()
print(grouped_df)

# 7. Merge two DataFrames on column 'A'.
df1 = df.copy()
df1['D'] = [100, 200, 300, 400, 500]
merged_df = pd.merge(df, df1, on='A')
print(merged_df)

# 8. Reshape and forward-fill.
reshape_df = df.stack()
# DataFrame.pad() is deprecated; ffill() is the supported spelling of the
# same forward-fill operation.
padded_df = df.ffill()
print(reshape_df)
print(padded_df)

# 9. Export the DataFrame to a CSV file.
df.to_csv('data.csv', index=False)

# 10. Load the CSV file back into a DataFrame.
loaded_df = pd.read_csv('data.csv')
print(loaded_df)

这段代码展示了如何使用Pandas库中的常用函数来处理DataFrame数据。从查看数据、统计信息、排序和过滤，到分组聚合、合并和重塑以及导入导出操作，涵盖了数据处理的常用步骤。

- 阅读更多 -

python 取余，取整，四舍五入，向上取整，向下取整

System

2024-08-13

所有,python

在Python中，取余、取整、四舍五入、向上取整和向下取整可以使用内置的函数或运算符。

取余：使用 % 运算符




# Remainder of number divided by divisor; with `%` the result takes the sign of the divisor.
remainder = number % divisor

取整：

向下取整：使用 math.floor() 函数




import math
# Largest integer less than or equal to number.
floor = math.floor(number)

向上取整：使用 math.ceil() 函数




import math
# Smallest integer greater than or equal to number.
ceil = math.ceil(number)

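四舍五入：使用 round() 函数（注意：Python 3 的 round() 对恰好位于中间的值采用“四舍六入五成双”，例如 round(2.5) 的结果是 2；需要严格的四舍五入时，可以使用 decimal 模块的 ROUND_HALF_UP）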




# Round to the nearest integer; exact .5 ties go to the even neighbour (round(2.5) == 2).
rounded = round(number)

内置的 int() 函数也可以取整，但它只是截掉小数部分（向零取整）：对正数相当于向下取整，对负数相当于向上取整，例如 int(3.6) 的结果是 3，int(-3.6) 的结果是 -3，因此不能直接替代 math.floor() 或 math.ceil()。

例子：




import math

number = 3.6

# Remainder
remainder = number % 2  # result: 1.6 (a float, since number is a float)

# Round down
floor = math.floor(number)  # result: 3

# Round up
ceil = math.ceil(number)  # result: 4

# Round to nearest (exact .5 ties go to the even neighbour, e.g. round(2.5) == 2)
rounded = round(number)  # result: 4

# Truncate toward zero
int_val = int(number)  # result: 3

System

2024-08-13

所有,python

解释：

ModuleNotFoundError: No module named 'numpy.core._multiarray_umath' 错误表明 Python 无法找到名为 numpy.core._multiarray_umath 的模块。这通常是因为 NumPy 没有正确安装，或者已安装的 NumPy 版本有问题（例如与依赖它的已编译扩展包所要求的版本不一致）。

解决方法：

确认 NumPy 是否已安装：在命令行中运行 pip show numpy 查看 NumPy 是否已安装以及其版本信息。
如果未安装，使用 pip install numpy 安装 NumPy。
如果已安装，但问题仍然存在，可以尝试更新 NumPy 到最新版本：pip install --upgrade numpy。
确保你的 Python 环境（如虚拟环境）中安装了 NumPy，如果你使用的是虚拟环境，确保你已激活该环境。
如果问题依旧，可能是 Python 环境的问题，检查你的 PYTHONPATH 环境变量是否正确设置，或者尝试重置/修复你的 Python 环境。

- 阅读更多 -

在Python上，把多帧图像合成一个视频

System

2024-08-13

所有,python

要在Python中把多帧图片合成视频，可以使用moviepy库。首先需要安装这个库：




# Install MoviePy from PyPI.
pip install moviepy

然后使用以下代码合成视频：




# MoviePy 2.x removed the `moviepy.editor` module and exports the clip
# classes from the top-level package; fall back to the old path for 1.x.
try:
    from moviepy import ImageSequenceClip
except ImportError:
    from moviepy.editor import ImageSequenceClip

# Frames to assemble, in playback order; all images must have the same size.
image_files = ['image1.jpg', 'image2.jpg', 'image3.jpg']
# Number of images shown per second.
fps = 15

# Build the video clip from the image sequence.
clip = ImageSequenceClip(image_files, fps=fps)

# Write the video file.
output_file = 'output_video.mp4'
clip.write_videofile(output_file)

请确保所有图片的尺寸相同：ImageSequenceClip 要求序列中的每张图片大小一致，尺寸不同时会报错，需要先把图片缩放到统一尺寸。fps参数决定了视频的帧率。最后，使用write_videofile方法将视频文件输出到指定路径。

- 阅读更多 -