import logging
import time
import os
from utils.redisdb import redis_cli
from config import config, env
from scrapy.utils.response import response_status_message
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from utils.lark_bot import sender
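# Note: utils.redisdb.redis_cli and utils.lark_bot.sender are project-local
# helpers not shown in this post. Based on how they are used below, redis_cli()
# presumably returns a redis.Redis-style client (get/setex), and
# sender(content, webhook_url, title) presumably POSTs a text message to the
# Lark custom-bot webhook.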
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('Retry Middleware')
class RetryChangeProxy(RetryMiddleware):
    """
    Retry middleware.
    Applies tiered retry delays and sends an alert once a request
    exceeds the configured maximum number of retries.
    """

    def __init__(self, settings):
        super().__init__(settings)
        self.proxy = config.PROXY_URL
        # RetryMiddleware.__init__ already reads RETRY_TIMES; kept explicit
        # here for the log line below.
        self.max_retry_times = settings.getint("RETRY_TIMES")
        logger.info(f'Configured maximum retry count for the spider: {self.max_retry_times}')
        self.web_hook_api = 'https://open.larksuite.com/open-apis/bot/v2/hook/b11d6e67-d636-4a46-bbfb-42421fea7281'
        self.pod_name = os.getenv("POD_NAME")
        self.redis_cli = redis_cli()
    def process_response(self, request, response, spider):
        """
        Retry on error response status codes.
        Applies a tiered, retry-count-based delay and sends an alert
        once the request has exceeded the maximum retry count.
        """
        # Alert if the request has exceeded the maximum retry count.
        self.beyond_max_retries(request, spider)
        if request.meta.get("dont_retry", False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Back off progressively based on how many retries have happened.
            retry_times = request.meta.get('retry_times', 0)
            if retry_times <= 1:
                delay = 1
            elif retry_times <= 3:
                delay = 2
            elif retry_times <= 5:
                delay = 2 * 60
            else:
                delay = 10 * 60
            # Note: time.sleep() blocks the Twisted reactor, so all
            # concurrent requests stall for the duration of the delay.
            time.sleep(delay)
            return self._retry(request, reason, spider) or response
        return response
    def process_exception(self, request, exception, spider):
        """
        Retry on exceptions (crash-type failures).
        Decides whether the request should be retried.
        """
        # Only exception types configured as retryable are retried.
        if isinstance(exception, self.exceptions_to_retry):
            # Retry unless the request explicitly opts out.
            if not request.meta.get('dont_retry', False):
                # If the retry count is exceeded, alert and stop retrying.
                if self.beyond_max_retries(request, spider):
                    return None
                # Back off progressively based on the retry count.
                retry_times = request.meta.get('retry_times', 0)
                delay = 5  # default delay
                if retry_times > 1:
                    delay = 10
                if retry_times > 3:
                    delay = 2 * 60
                time.sleep(delay)
                # Re-issue the request. self._retry() gives up once the retry
                # count is exceeded and sets dont_filter on the new request.
                return self._retry(request, exception, spider)
    def beyond_max_retries(self, request, spider):
        """
        Check whether the request has exceeded the maximum retry count
        and send an alert. Alerts are only sent in production.
        """
        # A per-request override in meta takes precedence, matching the
        # precedence used internally by self._retry(). Using a local variable
        # avoids permanently mutating self.max_retry_times for all requests.
        max_retry_times = request.meta.get("max_retry_times") or self.max_retry_times
        retry_times = request.meta.get("retry_times", 0) + 1
        if retry_times > max_retry_times:
            author = getattr(spider, 'author', '')
            # Log the failure details.
            content = f'POD_NAME:{self.pod_name} \nauthor: {author} \nspider: {spider.name} ' \
                      f'\nretries exceeded: {retry_times} \n{request.url}'
            logger.error(content)
            if env == 'prod':
                filter_status = self.redis_cli.get(f'noalert:{spider.name}')
                if not filter_status:
                    sender(content, self.web_hook_api, 'retries exceeded')
                    # Alert at most once every 24 hours per spider.
                    self.redis_cli.setex(f'noalert:{spider.name}', 24 * 60 * 60, 1)
            # Exit with a non-zero status so the outer framework detects the
            # crash and restarts the spider.
            # If spider.restart is unset, default it to False; otherwise keep it.
            spider.restart = getattr(spider, 'restart', False)
            logger.info(f'spider.restart: {spider.restart}')
            if spider.restart:
                logger.warning(f'Shutting down spider: {spider.name}')
                os._exit(1)
            return True
        return False
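To wire this middleware into a project, something like the following would go in settings.py. This is a minimal sketch: the module path scraper.middlewares is an assumption, so adjust it to wherever the class actually lives. The built-in RetryMiddleware is disabled so retries are not handled twice, and the custom class is registered at the same default priority (550).

# settings.py -- sketch; the module path below is an assumption
DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in retry middleware so it doesn't run twice.
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
    # Register the custom middleware at the built-in's default priority.
    "scraper.middlewares.RetryChangeProxy": 550,
}

RETRY_TIMES = 5  # read by __init__ via settings.getint("RETRY_TIMES")
RETRY_HTTP_CODES = [500, 502, 503, 504, 429]  # statuses retried in process_response

With this in place, the inherited self._retry() handles the retry bookkeeping (incrementing retry_times in meta, setting dont_filter on the copied request), while the subclass adds the tiered backoff and the Lark alerting on overflow.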