```python
# Import the required modules
import pymysql
from twisted.enterprise import adbapi


class MysqlPipeline:
    def __init__(self, host, database, user, password, port, charset):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port
        self.charset = charset
        # Create a Twisted connection pool so inserts run asynchronously
        self.dbpool = adbapi.ConnectionPool('pymysql', host=self.host, db=self.database, user=self.user,
                                            password=self.password, port=self.port, charset=self.charset,
                                            cursorclass=pymysql.cursors.DictCursor)

    @classmethod
    def from_crawler(cls, crawler):
        """
        Read the database connection parameters from the Scrapy settings.
        """
        host = crawler.settings.get('MYSQL_HOST')
        database = crawler.settings.get('MYSQL_DATABASE')
        user = crawler.settings.get('MYSQL_USER')
        password = crawler.settings.get('MYSQL_PASSWORD')
        port = crawler.settings.getint('MYSQL_PORT')  # pymysql expects an int port
        charset = crawler.settings.get('MYSQL_CHARSET')
        return cls(host, database, user, password, port, charset)

    def open_spider(self, spider):
        """
        Called when the spider starts.
        """
        print('MysqlPipeline: database connection opened')

    def close_spider(self, spider):
        """
        Called when the spider closes; shut down the connection pool.
        """
        self.dbpool.close()
        print('MysqlPipeline: database connection closed')

    def process_item(self, item, spider):
        """
        Process an item by inserting it into the database asynchronously.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle insertion errors
        return item

    def do_insert(self, cursor, item):
        """
        Execute the INSERT statement for a single item.
        """
        insert_sql = """
            INSERT INTO job (title, company, salary, address, eduLevel, jobType, publishTime, detailUrl, category)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item['title'], item['company'], item['salary'], item['address'],
                                    item['eduLevel'], item['jobType'], item['publishTime'],
                                    item['detailUrl'], item['category']))

    def handle_error(self, failure, item, spider):
        """
        Log any failure raised during the database interaction.
        """
        print(failure)
```
This code implements a Scrapy item pipeline that stores the scraped data in a MySQL database. It first defines a `MysqlPipeline` class whose `from_crawler` hook reads the connection parameters from the project settings; each item is then inserted asynchronously through a Twisted `adbapi` connection pool, with `handle_error` catching any failure raised during the insert.
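For `from_crawler` to find these parameters, the corresponding keys must exist in the project's `settings.py`, and the pipeline must be registered in `ITEM_PIPELINES`. A minimal sketch, assuming the project module is named `jobspider` and a local MySQL server (all values here are placeholders, not taken from the original):

```python
# settings.py -- hypothetical values; adjust to your environment
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'jobs'          # assumed database name
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your_password'
MYSQL_PORT = 3306
MYSQL_CHARSET = 'utf8mb4'

# Register the pipeline; 'jobspider.pipelines' is an assumed module path
ITEM_PIPELINES = {
    'jobspider.pipelines.MysqlPipeline': 300,
}
```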
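The pipeline indexes the item by the nine field names used in `do_insert`, so the spider's item class must define exactly those fields. A minimal sketch (the class name `JobItem` is an assumption):

```python
import scrapy

class JobItem(scrapy.Item):
    # Field names must match the keys accessed in MysqlPipeline.do_insert
    title = scrapy.Field()
    company = scrapy.Field()
    salary = scrapy.Field()
    address = scrapy.Field()
    eduLevel = scrapy.Field()
    jobType = scrapy.Field()
    publishTime = scrapy.Field()
    detailUrl = scrapy.Field()
    category = scrapy.Field()
```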
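The INSERT statement also assumes that a `job` table with those nine columns already exists. A one-off script that creates it with pymysql could look like the sketch below; the column types and lengths are assumptions, not taken from the original:

```python
import pymysql

# Hypothetical connection values -- keep them in sync with settings.py
conn = pymysql.connect(host='localhost', user='root', password='your_password',
                       database='jobs', port=3306, charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        # Column types are assumptions; adjust them to the real data
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS job (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255),
                company VARCHAR(255),
                salary VARCHAR(64),
                address VARCHAR(255),
                eduLevel VARCHAR(64),
                jobType VARCHAR(64),
                publishTime VARCHAR(64),
                detailUrl VARCHAR(512),
                category VARCHAR(128)
            ) CHARACTER SET utf8mb4
        """)
    conn.commit()
finally:
    conn.close()
```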