Search Engine Crawling in Practice: Scraping Google and Bing Search Results with Python
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus

def crawl_google(query, num_results=10):
    # quote_plus escapes spaces and special characters in the query
    url = "https://www.google.com/search?q={query}&num={num}".format(
        query=quote_plus(query),
        num=num_results
    )
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail fast on 4xx/5xx (e.g. 429 rate limiting)
    return response.text
def crawl_bing(query, num_results=10):
    # Bing names the result-count parameter "count" rather than Google's "num"
    url = "https://www.bing.com/search?q={query}&count={num}".format(
        query=quote_plus(query),
        num=num_results
    )
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    }
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response.text
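The two functions above differ only in the host and the name of the result-count parameter, so they can be collapsed into one. Below is a minimal sketch of that refactor; the ENGINES table and the crawl_search name are illustrative additions, not part of the original code. Passing a params dict lets requests build and URL-encode the query string itself, so no manual quoting is needed.

# Per-engine settings: search URL plus the parameter that controls result count
ENGINES = {
    'Google': ('https://www.google.com/search', 'num'),
    'Bing': ('https://www.bing.com/search', 'count'),
}

def crawl_search(engine, query, num_results=10):
    base_url, count_param = ENGINES[engine]
    # requests URL-encodes the params dict for us
    response = requests.get(
        base_url,
        params={'q': query, count_param: num_results},
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=10,
    )
    response.raise_for_status()
    return response.text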
def parse_results(html_content, engine='Google'):
    soup = BeautifulSoup(html_content, 'html.parser')
    # These class names match each engine's markup at the time of writing;
    # both engines change their HTML frequently, so inspect the live page
    # if this returns an empty list.
    if engine == 'Google':
        results = soup.find_all('div', class_='r')
    else:
        results = soup.find_all('li', class_='b_algo')
    parsed_results = []
    for result in results:
        link = result.find('a')
        if link and link.get('href'):  # skip result blocks without a usable link
            parsed_results.append({
                'title': link.get_text(strip=True),
                'link': link['href']
            })
    return parsed_results
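One wrinkle specific to Google: in the plain HTML version of the results page, an anchor's href is often not the target URL itself but a redirect of the form /url?q=<target>&.... Whether you see this wrapper depends on which HTML variant Google serves; the helper below (clean_google_link is an illustrative name, not from the original article) unwraps it when present.

from urllib.parse import urlparse, parse_qs

def clean_google_link(href):
    # Unwrap Google's /url?q=<target>&sa=... redirect links;
    # leave already-direct URLs untouched.
    if href.startswith('/url?'):
        query_params = parse_qs(urlparse(href).query)
        return query_params.get('q', [href])[0]
    return href

parse_results could apply this to each href before appending, so that Google results store the destination URL rather than the redirect.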
# Usage example
google_results = parse_results(crawl_google('Python'), engine='Google')
bing_results = parse_results(crawl_bing('Python'), engine='Bing')

print("Google Results:")
for result in google_results:
    print(f"Title: {result['title']}, Link: {result['link']}")

print("\nBing Results:")
for result in bing_results:
    print(f"Title: {result['title']}, Link: {result['link']}")
This code defines two functions, crawl_google and crawl_bing, that fetch search result pages from Google and Bing respectively, and a parse_results function that parses the returned HTML and extracts each result's title and link. Finally, we use these functions to fetch results for the keyword "Python" and print them. The example demonstrates basic web scraping with Python; note that it does not yet handle JavaScript-rendered content, and its only concession to anti-crawling measures is setting a User-Agent header. Both engines aggressively rate-limit and CAPTCHA-challenge automated clients, so real-world use needs more care (see the sketch below) and should respect each site's terms of service and robots.txt.
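As one sketch of that extra care: spacing requests out and retrying transient failures with backoff makes the traffic less machine-like and more resilient. The delay range, retry count, and the polite_get name below are arbitrary illustrations, not values from the original article.

import random
import time

import requests

def polite_get(url, headers, max_retries=3):
    # Sleep a random delay before each request so timing doesn't look scripted,
    # and back off exponentially on 429/5xx responses before retrying.
    for attempt in range(max_retries):
        time.sleep(random.uniform(2, 5))
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 429 or response.status_code >= 500:
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...
            continue
        response.raise_for_status()
        return response.text
    raise RuntimeError(f"Giving up on {url} after {max_retries} attempts")

The crawl functions above could call polite_get in place of requests.get without any other change.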