爬虫实训——使用Python爬取大学排名并存入数据库及构建推荐系统
由于原始代码已经提供了一个很好的实践案例，我们可以基于这个案例简化和改进代码，以解决问题中提出的需求。以下是一个简化后的代码实例，包含了存储数据到数据库和使用推荐系统的核心函数：
import requests
from bs4 import BeautifulSoup
import pymysql
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import SVD
# Open the module-level MySQL connection used by read_from_db() and save_to_db().
# The host, user, password and database values are placeholders; replace them
# with real settings before running the script.
conn = pymysql.connect(
    host='localhost',
    user='your_username',
    password='your_password',
    database='your_dbname',  # `db=` is a deprecated alias of `database=` in PyMySQL
    charset='utf8',
)
# Load the stored ranking back from the database with pandas
def read_from_db():
    """Return every row of the ``university_ranking`` table as a DataFrame.

    The query runs over the module-level ``conn`` connection.
    """
    query = 'SELECT * FROM university_ranking'
    return pd.read_sql_query(sql=query, con=conn)
# Store the scraped data in the database
def save_to_db(data):
    """Append the rows of *data* to the ``university_ranking`` table.

    ``DataFrame.to_sql`` only accepts SQLAlchemy connectables or sqlite3
    connections, so it cannot be used with the raw PyMySQL connection held in
    the module-level ``conn``. The rows are therefore written with a
    parameterized ``INSERT`` through a PyMySQL cursor instead.

    To keep the "append" behaviour of the original call, the table is created
    when it does not exist yet (integer columns as BIGINT, float columns as
    DOUBLE, everything else as TEXT).

    Args:
        data: DataFrame whose column names match the table's column names.

    Raises:
        Exception: whatever the database driver raises; the transaction is
            rolled back before the error is re-raised.
    """
    if data.empty:
        return  # nothing to insert

    quoted = []
    column_defs = []
    for name, dtype in zip(data.columns, data.dtypes):
        # Backtick-quote identifiers so unusual column names stay valid SQL.
        identifier = '`{}`'.format(str(name).replace('`', '``'))
        if pd.api.types.is_integer_dtype(dtype):
            sql_type = 'BIGINT'
        elif pd.api.types.is_float_dtype(dtype):
            sql_type = 'DOUBLE'
        else:
            sql_type = 'TEXT'
        quoted.append(identifier)
        column_defs.append('{} {}'.format(identifier, sql_type))

    create_sql = 'CREATE TABLE IF NOT EXISTS `university_ranking` ({})'.format(
        ', '.join(column_defs)
    )
    insert_sql = 'INSERT INTO `university_ranking` ({}) VALUES ({})'.format(
        ', '.join(quoted),
        ', '.join(['%s'] * len(quoted)),
    )

    # NaN cannot be sent to MySQL; turn missing values into None so they are
    # stored as NULL. Casting to object also yields plain Python scalars.
    cleaned = data.astype(object).where(data.notna(), None)
    rows = list(cleaned.itertuples(index=False, name=None))

    try:
        with conn.cursor() as cursor:
            cursor.execute(create_sql)
            cursor.executemany(insert_sql, rows)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
# Build the recommender system
def create_recommender_system(data):
    """Train a Surprise ``SVD`` model on the university scores.

    ``Dataset.load_from_df`` needs three columns in the order user, item,
    rating, and ``Reader`` needs ``rating_scale`` as a ``(min, max)`` tuple.
    The ranking data has no user dimension, so when *data* has no ``user``
    column every score is attributed to a single synthetic user with id ``0``.

    Args:
        data: DataFrame with ``university`` and ``score`` columns and an
            optional ``user`` column. Scores may be numbers or numeric
            strings; rows whose score cannot be parsed are dropped.

    Returns:
        The fitted ``SVD`` model.

    Raises:
        ValueError: if no row has a usable numeric score.
    """
    ratings = pd.DataFrame({
        'user': data['user'] if 'user' in data.columns else 0,
        'university': data['university'],
        # Scraped scores are text; coerce so unparsable cells become NaN.
        'score': pd.to_numeric(data['score'], errors='coerce'),
    })
    ratings = ratings.dropna(subset=['score'])
    if ratings.empty:
        raise ValueError('no numeric scores available to train the recommender')

    # Derive the rating scale from the data instead of assuming a fixed range.
    scale = (float(ratings['score'].min()), float(ratings['score'].max()))
    reader = Reader(rating_scale=scale)
    dataset = Dataset.load_from_df(ratings[['user', 'university', 'score']], reader)

    # Only the training part of the split is used, as in the original code.
    trainset, _ = train_test_split(dataset, test_size=0.25)
    svd = SVD()
    svd.fit(trainset)
    return svd
# Fetch the university ranking
def get_university_ranking():
    """Scrape the ranking page and return it as a DataFrame.

    Returns:
        DataFrame with two text columns: ``university`` (second cell of each
        table row) and ``score`` (third cell, with thousands separators
        removed). Scores are returned as strings, as before.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            HTTP status.
        ValueError: if the expected ranking table is not present in the page.
    """
    url = 'http://www.shanghairanking.cn/rankings/bcur/2020'
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Without a declared charset requests falls back to ISO-8859-1, which
    # garbles Chinese text; use the encoding detected from the body instead.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): the CSS class and the cell positions below are kept from
    # the original code - confirm them against the live page markup.
    table = soup.find('table', attrs={'class': 'table-stats'})
    if table is None:
        raise ValueError(f"ranking table 'table-stats' not found at {url}")

    universities = []
    scores = []
    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if len(cells) < 3:
            continue  # not a data row (e.g. spacer or nested header)
        universities.append(cells[1].get_text(strip=True))
        # Remove thousands separators
        scores.append(cells[2].get_text(strip=True).replace(',', ''))
    return pd.DataFrame({'university': universities, 'score': scores})
# Run the whole pipeline
def main():
    """Scrape the ranking, store it, reload it and print three recommendations.

    Surprise's ``SVD`` has no ``recommend`` method, so the recommendations are
    produced by predicting a rating for every stored university with
    ``predict()`` and keeping the three highest estimates.
    """
    df = get_university_ranking()
    save_to_db(df)
    df_from_db = read_from_db()
    svd = create_recommender_system(df_from_db)

    # Example use of the recommender: recommend universities for one user ID
    user_id = 0
    candidates = df_from_db['university'].dropna().unique()
    predictions = [svd.predict(user_id, university) for university in candidates]
    recommendations = sorted(predictions, key=lambda p: p.est, reverse=True)[:3]
    for recommendation in recommendations:
        # A Prediction exposes the item as `iid` and the estimated rating as `est`.
        print(f"Recommendation for user ID {user_id}: {recommendation.iid} with score: {recommendation.est}")
if __name__ == "__main
评论已关闭