# Python爬虫学习教程:天猫商品数据爬虫
import requests
from lxml import etree
import csv
# 天猫商品数据爬虫
def tianmao_spider(url):
    """Fetch one Tmall search-result page and record each product's info.

    For every product list item found on the page, prints its name, link,
    image URL and price, and appends the same fields as a row to
    'tianmao_goods.csv' in the current directory.

    Args:
        url: Search-result page URL to scrape.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the request exceeds the timeout.
    """
    headers = {
        # Browser-like UA and Referer to reduce the chance of being blocked.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Referer': 'http://www.tianmao.com/',
    }
    # Timeout so a stalled connection cannot hang the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail fast on HTTP errors instead of parsing an error page as results.
    response.raise_for_status()
    # Page assumed to be GBK-encoded — TODO confirm against the live site.
    response.encoding = 'gbk'
    html = etree.HTML(response.text)
    # Locate each product <li> in the result list.
    goods_info = html.xpath('//ul[@class="product-list"]/li')
    # Open the CSV once for the whole page instead of re-opening it on
    # every iteration as the original did.
    with open('tianmao_goods.csv', 'a', newline='', encoding='gbk') as f:
        writer = csv.writer(f)
        for info in goods_info:
            names = info.xpath('./div[2]/div[1]/a/text()')
            links = info.xpath('./div[2]/div[1]/a/@href')
            imgs = info.xpath('./div[1]/a/img/@src')
            prices = info.xpath('./div[2]/div[2]/div[1]/strong/text()')
            # Skip malformed list items rather than crashing with IndexError
            # on an empty xpath result.
            if not (names and links and imgs and prices):
                continue
            name = names[0].strip()
            item_url = links[0]
            img_url = imgs[0]
            price = prices[0]
            print(f'商品名称: {name}, 商品链接: {item_url}, 商品图片: {img_url}, 商品价格: {price}')
            writer.writerow([name, item_url, img_url, price])
if __name__ == '__main__':
    # Entry point: scrape a single hard-coded Tmall search-result page.
    # NOTE(review): '%O0' in the query string is not valid percent-encoding
    # ('O' is not a hex digit) — likely a transcription error; preserved as-is.
    target_url = 'http://www.tianmao.com/search?q=%C4%EA%B3%O0&suggest=0.0.0.0&_input_charset=utf-8&suggest_type=suggest'
    tianmao_spider(target_url)
# 这段代码修复了之前提到的编码问题,并添加了对请求头的处理,以及更正了XPath表达式中的一个错误。
# 这个简易的爬虫会抓取天猫网站的商品信息,并打印到控制台,同时将信息保存到CSV文件中。