Python Web Scraping: An Autohome (汽车之家) Scraper (Complete Code)
import requests
from lxml import etree
import csv

def get_car_info(url):
    # Fetch the page and parse it into an lxml element tree
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    return html

def parse_car_info(html):
    # Car name
    name = html.xpath('//div[@class="car-intro-cont"]/h1/text()')[0]
    # Car price
    price = html.xpath('//div[@class="car-price-cont"]/p[@class="car-price-num"]/text()')[0]
    # Car image
    image = html.xpath('//div[@class="car-intro-img"]/img/@src')[0]
    # Car parameters
    parameters = html.xpath('//div[@class="car-param-cont"]/p/text()')
    return {
        'name': name,
        'price': price,
        'image': image,
        'parameters': parameters
    }

def save_car_info(car_info, file_name):
    # Write a header row followed by one row of scraped data
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'price', 'image', 'parameters'])
        writer.writerow([car_info['name'], car_info['price'], car_info['image'], ','.join(car_info['parameters'])])

def main():
    url = 'https://www.autohome.com.cn/156148/#pvareaid=201571'
    html = get_car_info(url)
    car_info = parse_car_info(html)
    save_car_info(car_info, 'car_info.csv')

if __name__ == '__main__':
    main()
This code scrapes the details of a single car listing from the Autohome site and saves them to a CSV file. It first defines functions for fetching the page, parsing it, and saving the data; main then calls them in sequence to complete the scrape-and-save flow. In practice you will need to adjust the XPath expressions to match the target site's actual markup, and any of them can come back empty when the layout changes; the sketch below shows one defensive way to handle that.
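Note that parse_car_info indexes [0] into every XPath result, so a single missing node raises an IndexError. Below is a minimal sketch of a more defensive variant; the helper name first_or_default and the 'N/A' fallback are illustrative assumptions rather than part of the original code, and the XPath expressions are the same unverified ones used above.

def first_or_default(html, xpath_expr, default='N/A'):
    # Return the first XPath match, or a fallback when the node is absent
    results = html.xpath(xpath_expr)
    return results[0].strip() if results else default

def parse_car_info_safe(html):
    # Same fields as parse_car_info, but a missing node no longer crashes
    return {
        'name': first_or_default(html, '//div[@class="car-intro-cont"]/h1/text()'),
        'price': first_or_default(html, '//div[@class="car-price-cont"]/p[@class="car-price-num"]/text()'),
        'image': first_or_default(html, '//div[@class="car-intro-img"]/img/@src'),
        # Parameters are already a plain (possibly empty) list
        'parameters': html.xpath('//div[@class="car-param-cont"]/p/text()'),
    }

Calling response.raise_for_status() right after requests.get is a similar low-cost safeguard: it surfaces HTTP errors immediately instead of handing an error page to the parser.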
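Since save_car_info opens the file in 'w' mode, each call overwrites the previous contents, which is fine for one car but not for many. The sketch below shows one way to collect several model pages into a single CSV, reusing the functions defined above; the URL list is a placeholder and assumes every page shares the layout scraped above.

def save_all(car_infos, file_name):
    # One header row, then one data row per car
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['name', 'price', 'image', 'parameters'])
        for info in car_infos:
            writer.writerow([info['name'], info['price'], info['image'],
                             ','.join(info['parameters'])])

urls = [
    'https://www.autohome.com.cn/156148/#pvareaid=201571',
    # add further model pages with the same layout here
]
save_all([parse_car_info_safe(get_car_info(u)) for u in urls], 'car_info.csv')

Writing all rows inside a single open() call keeps the header logic in one place; an append-mode alternative would need to check whether the header had already been written.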