drake

Designing the crawler framework so that every spider automatically restarts after a crash

  • How it works: every spider runs in a child process, while the main process monitors the spider's exit status; if a spider exits abnormally, the main process's logic restarts it.
```python
import subprocess
import time
import os
from threading import Thread
from utils.log import logging
from .live_check import live_listen

def start_flask_service():
    # Runs the liveness-check HTTP service (see the note below)
    live_listen()
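
# NOTE: utils/live_check.py is not shown in the post. Judging by the names,
# live_listen() presumably starts a small Flask app that serves a liveness
# endpoint for the orchestrator's health probes -- roughly like the sketch
# below (the route and port are assumptions):
#
#     from flask import Flask
#     app = Flask(__name__)
#
#     @app.route('/healthz')
#     def healthz():
#         return 'ok'
#
#     def live_listen():
#         app.run(host='0.0.0.0', port=8080)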

class SpiderProcessManager:
    """
    Starts and supervises the spiders.
    """
    def __init__(self, spiders):
        self.spiders = spiders

        # Dump all environment variables for debugging
        env_vars = os.environ
        for key, value in env_vars.items():
            print(f"env var {key}: {value}")
        pod_name = os.getenv("POD_NAME")
        print(f'POD_NAME: {pod_name}')
        # Application name
        app_name = os.getenv("APP_NAME")
        print(f'APP_NAME: {app_name}')
        self.pod_name = pod_name
        self.app_name = app_name
        # The spider this pod is responsible for (set in start_spiders)
        self.spider = None

    def log_process_output(self, process):
        """
        Read the child process's output line by line and forward it to the logger.
        """
        while True:
            line = process.stdout.readline().decode('utf-8')
            if not line:
                # EOF: the pipe closes when the child exits
                break
            logging.info(f"{process.pid}: {line.strip()}")

    def start_spiders(self):
        """
        Read APP_NAME and POD_NAME from the environment, look up the matching
        spider, and start it in a child process. Non-blocking.
        """
        # Spider list for this application
        spiders = self.spiders[self.app_name]
        # The pod's ordinal suffix (e.g. the 0 in 'app-0', as produced by a
        # Kubernetes StatefulSet) indexes into the application's spider list
        spider_num_key = self.pod_name.split('-')[-1]
        spider_num_key = int(spider_num_key)
        spider = spiders[spider_num_key]
        cmd = f"python manage.py -s {spider}"
        # Start the child process, merging stderr into stdout
        process = subprocess.Popen(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        # Forward the child's output on a separate thread
        log_thread = Thread(target=self.log_process_output, args=(process,))
        log_thread.start()
        # Remember which spider this pod runs
        self.spider = spider
        return process

    def spider_success_status(self, process):
        """
        Block until the spider exits; return True if it finished normally.
        """
        # poll() returns None while the process is still running, 0 on a
        # clean exit, and any other integer on an abnormal exit
        process_end_status = process.poll()
        while process_end_status is None:
            process_end_status = process.poll()
            time.sleep(0.5)
        if process_end_status == 0:
            print('Spider finished normally')
            return True
        else:
            print('Spider crashed')
            return False

    def run(self):
        """
        Entry point: start the spider and restart it whenever it exits abnormally.
        """
        restart = True
        while restart:
            # Reset the flag for this round
            restart = False
            # start_spiders returns the child-process handle
            process = self.start_spiders()
            is_spider_success = self.spider_success_status(process)
            # Abnormal exit: schedule a restart
            if not is_spider_success:
                restart = True
                print('Restarting spider...')
                # # An earlier version only restarted Scrapy spiders:
                # if self.spider.split('.')[0] == 'scrapy':
                #     restart = True
                #     print('Restarting spider...')
            # Belt and braces: make sure we leave the loop on a clean exit
            if not restart:
                break
```
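
The manager shells out to `python manage.py -s <spider>`, but `manage.py` itself isn't part of the post. Here is a rough sketch of the contract it has to honor, namely the `-s` flag and a non-zero exit code on failure; the `spiders` package layout and the `run()` callable are assumptions:

```python
# Hypothetical manage.py -- only the -s flag and the exit-code contract
# are implied by the post; everything else here is an assumption.
import argparse
import importlib
import sys

def main():
    parser = argparse.ArgumentParser(description='Run a single spider.')
    parser.add_argument('-s', '--spider', required=True,
                        help="spider identifier, e.g. 'scrapy.site_a'")
    args = parser.parse_args()
    # Map the identifier to a module exposing a run() callable
    module = importlib.import_module(f'spiders.{args.spider}')
    try:
        module.run()
    except Exception:
        # A non-zero exit code is what tells the manager to restart us
        sys.exit(1)

if __name__ == '__main__':
    main()
```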

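Putting it together, the container entry point might look like the sketch below, assuming it lives in the same module as `SpiderProcessManager` and `start_flask_service`. The spider manifest and APP_NAME value are made up for illustration; in Kubernetes, POD_NAME is typically injected through the downward API, and the `-0`, `-1` ordinals come from a StatefulSet, which is what makes `pod_name.split('-')[-1]` a stable index.

```python
# Illustrative wiring -- names and values are assumptions, not from the post.
from threading import Thread

# APP_NAME -> ordered spider list; the pod ordinal picks the entry to run.
SPIDERS = {
    'news-crawler': ['scrapy.site_a', 'scrapy.site_b'],
}

if __name__ == '__main__':
    # Liveness endpoint on a daemon thread so it dies with the manager
    Thread(target=start_flask_service, daemon=True).start()
    SpiderProcessManager(SPIDERS).run()  # blocks; restarts on abnormal exit
```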