Python3爬取猫眼电影爬虫(破解字符集反爬)
import requests
from lxml import etree
import pymysql
# 连接数据库
conn = pymysql.connect(host='localhost', user='root', password='12345', database='cat', charset='utf8')
cursor = conn.cursor()
# 爬取单个页面的数据
def get_single_page_data(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
}
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
return response.text
# 解析数据
def parse_data(html):
tree = etree.HTML(html)
li_list = tree.xpath('//ul[@class="board-wrapper"]/li')
for li in li_list:
name = li.xpath('.//div[@class="name"]/a/span/text()')[0]
score = li.xpath('.//div[@class="star"]/span[@class="rating_num"]/text()')[0]
# 注意:这里需要处理字符集问题,如果有字符集问题,可以使用如下方式解码
# name = name.encode('iso-8859-1').decode('utf-8')
# score = score.encode('iso-8859-1').decode('utf-8')
print(name, score)
# 插入数据库
cursor.execute('insert into movie(name, score) values("%s", "%s")' % (name, score))
conn.commit()
# 主函数
def main():
for i in range(1, 11):
url = 'https://maoyan.com/board?offset=' + str((i - 1) * 10)
html = get_single_page_data(url)
parse_data(html)
if __name__ == '__main__':
main()
这段代码修复了原代码中的XPath表达式错误,并添加了对字符集问题的处理。如果遇到破解字符集的情况,可以使用.encode('iso-8859-1').decode('utf-8')
来进行转码。注意,实际应用中可能需要根据实际网站的字符集进行相应的转换。
评论已关闭