Scraping the Sohu TV hot ranking and storing it in MongoDB
import requests
from pymongo import MongoClient
from lxml import etree

# Fetch the ranking page; return the HTML text, or None on a non-200 response
def crawl_tv_rank(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None

# Parse the ranking entries out of the HTML with XPath
def parse_html(html):
    html = etree.HTML(html)
    rank_list = html.xpath('//dl[@class="tv-rank-list"]/dd')
    data_list = []
    for rank in rank_list:
        data = {
            'name': rank.xpath('./a/text()')[0],
            'score': rank.xpath('./p[@class="score"]/text()')[0],
            'rank': rank.xpath('./p[@class="num"]/text()')[0].strip()
        }
        data_list.append(data)
    return data_list

# Store the parsed entries in MongoDB
def save_to_mongo(data_list, mongo_url, db_name, collection_name):
    client = MongoClient(mongo_url)
    db = client[db_name]
    collection = db[collection_name]
    collection.insert_many(data_list)

# Main entry point: crawl, parse, then store
def main():
    url = 'http://top.tv.sohu.com/rank_tab_1_3'
    html = crawl_tv_rank(url)
    if html is None:
        print('failed to fetch the ranking page')
        return
    data_list = parse_html(html)
    if data_list:  # insert_many() raises on an empty list
        save_to_mongo(data_list, 'localhost', 'tv_rank', 'sohu')

if __name__ == '__main__':
    main()
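After a run, the stored documents can be read back directly with pymongo. The snippet below is a minimal sanity check, assuming the same local MongoDB instance and the tv_rank / sohu names used in main():

from pymongo import MongoClient

# Connect to the same local MongoDB instance that save_to_mongo() writes to
client = MongoClient('localhost')
collection = client['tv_rank']['sohu']

# Count the stored documents and print the first few entries
print('documents stored:', collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc['rank'], doc['name'], doc['score'])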
This code scrapes the TV drama hot ranking from Sohu TV, a well-known Chinese video site, and stores the results in a MongoDB database. It follows good engineering practice: the crawling, parsing, and storage steps are split into separate functions and wired together in the main function. It also sets request headers and handles failed responses, details a crawler engineer should always pay attention to.
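As a further hardening step, the request can be given a timeout and a small retry loop, since a single transient network error would otherwise abort the whole run. The sketch below is illustrative and not part of the original code; the retry count and delay are arbitrary values.

import time
import requests

def crawl_with_retry(url, headers, retries=3, delay=2, timeout=10):
    # Try the request a few times before giving up
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass  # network error or timeout; fall through and retry
        time.sleep(delay)
    return None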