A simple Python web crawler (using Lagou job listings as an example)
import requests
from urllib.parse import quote

from bs4 import BeautifulSoup


def crawl_lagou_jobs(position, city, page_num):
    """
    Crawl job listings from Lagou.
    :param position: job title keyword to search for
    :param city: city name
    :param page_num: number of result pages to fetch
    :return: list of job-info dicts
    """
    def text_of(job, cls):
        # Stripped text of the first <div> with the given class, or ''
        # if the element is missing, so one malformed listing card does
        # not crash the whole crawl.
        tag = job.find('div', class_=cls)
        return tag.get_text(strip=True) if tag else ''

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/72.0.3626.121 Safari/537.36',
    }
    jobs_info = []
    for i in range(1, page_num + 1):
        # The original URL hard-coded the search keyword and a
        # session-specific 'kd' token, and never used the loop index, so
        # every iteration fetched the same page. Here the keyword comes
        # from `position` and the page index is passed as 'pn'; 'pn' is
        # an assumed parameter name, so check the site's actual URL
        # scheme before relying on it.
        url = (f'https://www.lagou.com/jobs/list_{quote(position)}'
               f'?city={quote(city)}&district='
               f'&positionName={quote(position)}&first=true&pn={i}')
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            job_list = soup.find_all('div', class_='job-primary')
            for job in job_list:
                job_info = {
                    'company_name': text_of(job, 'company-name'),
                    'position_name': text_of(job, 'name'),
                    'salary': text_of(job, 'money'),
                    'work_year': text_of(job, 'work-year'),
                    'education': text_of(job, 'eduBackground'),
                    'city': text_of(job, 'work-location'),
                }
                jobs_info.append(job_info)
        else:
            print(f'Request failed, status code: {response.status_code}')
    return jobs_info


# Example usage
position = '机器学习工程师'
city = '北京'
page_num = 3
jobs_info = crawl_lagou_jobs(position, city, page_num)
for info in jobs_info:
    print(info)
This code defines a crawl_lagou_jobs function that takes a job title, a city name, and a page count, and returns a list of matching postings scraped from Lagou. This simple crawler example shows how to fetch pages with Python, parse the HTML, and pull out the fields worth keeping.
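Once the postings are collected, they usually need to go somewhere more durable than stdout. Below is a minimal sketch of writing jobs_info to a CSV file with the standard library; the filename and the save_jobs_csv helper are illustrative additions, not part of the code above.

import csv

def save_jobs_csv(jobs_info, path='lagou_jobs.csv'):
    # Write the list of job-info dicts to a CSV file.
    if not jobs_info:
        return
    fieldnames = list(jobs_info[0].keys())
    # utf-8-sig adds a BOM so Excel displays the Chinese fields correctly.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()         # column names come from the dict keys
        writer.writerows(jobs_info)  # one row per posting

save_jobs_csv(jobs_info)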
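One practical caveat: sites like Lagou often reject bare GET requests that arrive without session cookies, and tight request loops get rate-limited quickly. The sketch below shows the usual mitigation, reusing a requests.Session so cookies persist across requests and pausing between pages; whether this satisfies Lagou's current anti-bot checks is an assumption, and the URL is illustrative only.

import time

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
})

# Visit the front page first so the session picks up any cookies the
# site sets (that this is sufficient for Lagou is an assumption).
session.get('https://www.lagou.com/')

for page in range(1, 4):
    # 'pn' as the page parameter is hypothetical; match the site's real scheme.
    resp = session.get(f'https://www.lagou.com/jobs/list_python?pn={page}')
    print(page, resp.status_code, len(resp.text))
    time.sleep(2)  # pause between requests to stay polite and avoid bans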