Python 爬虫之 XPath 爬取猪八戒网招商信息(详细教学)
import requests
from lxml import etree
import csv
# Request headers used when fetching pages; the User-Agent string presents
# the client as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.163 Safari/537.36'
    ),
}
def get_html(url, timeout=10):
    """
    Fetch a page and return its HTML as text.

    :param url: URL of the page to download.
    :param timeout: Seconds to wait for the server before giving up;
        forwarded to ``requests.get``. Without it a stalled connection
        would block the crawler indefinitely.
    :return: The decoded response body.
    :raises requests.RequestException: On connection failure, timeout, or
        an HTTP 4xx/5xx status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # If the server declared no usable charset, requests may fall back to
    # ISO-8859-1, which garbles Chinese text; use the detected encoding then.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return response.text
def _first_text(node, path, default='无'):
    """Return the stripped first result of an XPath text query, or ``default`` if none."""
    matches = node.xpath(path)
    return matches[0].strip() if matches else default


def parse_html(html):
    """
    Parse a page and extract the listed company records.

    :param html: Page content as returned by the fetch step.
    :return: A list of dicts, one per company node, with the keys
        '公司名称', '公司规模', '企业类型' and '所在地区'. A field whose
        XPath matches nothing is set to '无'. Returns an empty list when
        ``html`` is empty/blank or no company nodes are found.
    """
    # Nothing to parse: return early rather than letting the parser fail.
    if not html or not html.strip():
        return []
    html_element = etree.HTML(html)
    # Defensive: bail out if the parser produced no root element.
    if html_element is None:
        return []
    # NOTE: the XPath expressions below must be adjusted to match the
    # actual structure of the target page.
    company_list = html_element.xpath('//div[@class="company-list"]/div')
    return [
        {
            '公司名称': _first_text(company, './/h3/a/text()'),
            '公司规模': _first_text(company, './/p[@class="company-scale"]/text()'),
            '企业类型': _first_text(company, './/p[@class="company-type"]/a[1]/text()'),
            '所在地区': _first_text(company, './/p[@class="company-area"]/a/text()'),
        }
        for company in company_list
    ]
def save_data(data_list, filename='recruit_info.csv'):
    """
    Write the extracted records to a CSV file.

    The header row is taken from the keys of the first record. When
    ``data_list`` is empty nothing is written and the file is not
    created or truncated.

    :param data_list: List of record dicts that share the same keys.
    :param filename: Destination path; defaults to 'recruit_info.csv'
        in the current working directory.
    """
    # Guard before opening: indexing an empty list would raise IndexError
    # after the output file had already been truncated.
    if not data_list:
        return
    fieldnames = list(data_list[0])
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data_list)
def main():
    """
    Entry point: download the page, extract the records, and save them.
    """
    page_url = 'https://www.zhipin.com/web/geek/home.html?utm_source=homepage&utm_medium=hao-channel&utm_oc=01'
    records = parse_html(get_html(page_url))
    save_data(records)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
这段代码首先定义了请求头,用于模拟浏览器访问。然后定义了 get_html 函数来获取网页内容,parse_html 函数来解析网页并提取招商信息,以及 save_data 函数来保存数据到 CSV 文件。最后,在 main 函数中控制整个流程的执行。
注意
评论已关闭