# Implementation: every spider runs in a child process; the parent process
# monitors whether a spider exited abnormally and, if so, restarts that
# spider from the parent's supervision logic.
import subprocess
import time
import os
from threading import Thread
from utils.log import logging
from .live_check import live_listen
def start_flask_service():
    """Start the liveness-check Flask service.

    Thin entry point that simply delegates to ``live_listen`` from the
    ``live_check`` module.
    """
    live_listen()
class SpiderProcessManager:
    """
    Launch and supervise spider child processes.

    Each spider runs as a subprocess of this manager. The manager blocks
    on the child's exit status and restarts the spider whenever it
    terminates abnormally (non-zero exit code).
    """

    def __init__(self, spiders):
        """
        :param spiders: mapping of app name -> ordered list of spider
                        identifiers; the pod name's numeric suffix indexes
                        into that list.
        """
        self.spiders = spiders
        # Read all environment variables.
        env_vars = os.environ
        # NOTE(review): dumping every environment variable can leak secrets
        # (tokens, passwords) into stdout/collected logs — consider removing
        # or restricting this to a whitelist.
        for key, value in env_vars.items():
            print(f"环境变量 {key}: {value}")
        pod_name = os.getenv("POD_NAME")
        print(f'POD_NAME: {pod_name}')
        # Application name selects which spider list to use.
        app_name = os.getenv("APP_NAME")
        print(f'APP_NAME: {app_name}')
        self.pod_name = pod_name
        self.app_name = app_name
        # Spider currently managed by this instance; set by start_spiders().
        self.spider = None

    def log_process_output(self, process):
        """
        Stream the child's stdout and forward each line to the logger.

        Runs until the child closes its stdout (i.e. exits), since stderr
        is merged into stdout at Popen time.
        """
        while True:
            line = process.stdout.readline().decode('utf-8')
            if not line:
                # EOF: the child closed stdout, normally at process exit.
                break
            logging.info(f"{process.pid}: {line.strip()}")

    def start_spiders(self):
        """
        Resolve which spider this pod should run and start it (non-blocking).

        Uses APP_NAME to pick the spider list and the numeric suffix of
        POD_NAME (e.g. ``myapp-2`` -> index 2) to pick the spider within it.

        :return: the ``subprocess.Popen`` object of the launched spider.
        :raises KeyError / IndexError / ValueError: if APP_NAME, POD_NAME or
                the spiders mapping are inconsistent with each other.
        """
        # Match the application.
        spiders = self.spiders[self.app_name]
        # The pod ordinal selects the spider within the app's list.
        spider_num_key = int(self.pod_name.split('-')[-1])
        spider = spiders[spider_num_key]
        # Use an argument list with shell=False instead of an f-string with
        # shell=True: avoids shell injection / quoting bugs if a spider name
        # ever contains shell metacharacters, and spawns one fewer process.
        process = subprocess.Popen(
            ["python", "manage.py", "-s", spider],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        # Forward the child's output to our logger on a background thread.
        log_thread = Thread(target=self.log_process_output, args=(process,))
        log_thread.start()
        # Remember which spider this instance is running.
        self.spider = spider
        return process

    def spider_success_status(self, process):
        """
        Block until *process* exits.

        :return: True when the spider exited normally (code 0), else False.
        """
        # wait() blocks until termination and returns the exit code —
        # replaces the previous poll()/sleep busy loop.
        process_end_status = process.wait()
        if process_end_status == 0:
            print('爬虫正常结束')
            return True
        else:
            print('爬虫挂了')
            return False

    def run(self):
        """
        Supervision entry point: start the spider and restart it whenever
        it exits abnormally; return once it finishes cleanly.
        """
        restart = True
        while restart:
            # Assume a clean run; flip back on abnormal exit.
            restart = False
            # Popen object of the freshly launched spider.
            process = self.start_spiders()
            is_spider_success = self.spider_success_status(process)
            if not is_spider_success:
                # Abnormal exit -> loop around and relaunch.
                restart = True
                print('准备重启爬虫...')
# (removed: stray web-page artifact "Top comments (0)" from scraped source)