import redis
import requests
from lxml import etree
# Connect to the local Redis server (default port 6379, DB 0).
# Module-level singleton used by save_to_redis for deduplicated storage.
# NOTE(review): StrictRedis is a legacy alias of redis.Redis in modern
# redis-py — confirm the installed client version before renaming.
redis_conn = redis.StrictRedis(host='localhost', port=6379, db=0)
def get_page_source(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request raises, or when the server answers
    with any status other than HTTP 200.
    """
    try:
        # A timeout keeps the crawler from hanging forever on a dead host;
        # the original call had none.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        # Non-200: make the None fallthrough explicit instead of implicit.
        return None
    except requests.RequestException:
        return None
def parse_page(html):
    """Parse the news index page and yield one dict per news item.

    Each yielded dict has keys 'title' and 'href', taken from the first
    <a> element inside each list item under div.news-list.
    """
    tree = etree.HTML(html)
    news_items = tree.xpath('//div[@class="news-list"]/ul/li')
    for item in news_items:
        titles = item.xpath('.//a/@title')
        hrefs = item.xpath('.//a/@href')
        # Some <li> entries (separators, ads, links without a title
        # attribute) yield empty xpath results; the original [0] indexing
        # raised IndexError on those — skip them instead.
        if not titles or not hrefs:
            continue
        yield {
            'title': titles[0],
            'href': hrefs[0],
        }
def save_to_redis(data):
    """Add the href of every item in *data* to a Redis set.

    Using a set ('gushi_news:items') means duplicate links are stored
    only once; a pipeline batches all SADD commands into a single
    round trip to the server.
    """
    pipe = redis_conn.pipeline()
    for entry in data:
        pipe.sadd('gushi_news:items', entry['href'])
    pipe.execute()
def main():
    """Crawl the news index page, print each item, and persist the links."""
    url = 'https://www.guge.name/news/'
    html = get_page_source(url)
    if html:
        # Materialize the generator once: the original called
        # parse_page(html) twice, parsing the same document a second
        # time just to build the list passed to save_to_redis.
        items = list(parse_page(html))
        for item in items:
            print(item)
        save_to_redis(items)
# Script entry point: run the crawl only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
这段代码首先在模块级别建立了到Redis数据库的连接,然后定义了获取页面源码、解析页面源码、保存数据到Redis的三个函数。在main
函数中,它首先获取了网页的源码,然后解析出新闻标题和链接,并打印出来。最后,它将解析出的每条新闻的链接保存到Redis的集合(set)类型数据结构中,利用集合元素不重复的特性实现去重存储。