Merge pull request #2042 from minrk/abort-failures

add Spawner.consecutive_failure_limit
2025-10-08 10:34:10 +00:00 · 2018-07-26 10:33:36 +02:00
parent 204399ee2c 906abcc2f3
commit 0fa5c20f89
2 changed files with 49 additions and 1 deletions
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -657,6 +657,7 @@ class BaseHandler(RequestHandler):
        # hook up spawner._spawn_future so that other requests can await
        # this result
        finish_spawn_future = spawner._spawn_future = maybe_future(finish_user_spawn())
        def _clear_spawn_future(f):
            # clear spawner._spawn_future when it's done
            # keep an exception around, though, to prevent repeated implicit spawns
@@ -665,10 +666,44 @@ class BaseHandler(RequestHandler):
                spawner._spawn_future = None
            # Now we're all done. clear _spawn_pending flag
            spawner._spawn_pending = False
        finish_spawn_future.add_done_callback(_clear_spawn_future)
        # when spawn finishes (success or failure)
        # update failure count and abort if consecutive failure limit
        # is reached
        def _track_failure_count(f):
            """Update the hub-wide count of consecutive spawn failures.

            Done-callback for the spawn future ``f``.

            - A successful spawn resets ``self.settings['failure_count']`` to 0.
            - A failed spawn increments it. If
              ``spawner.consecutive_failure_limit`` is non-zero, a warning is
              logged from the second consecutive failure onwards, and once the
              count reaches the limit the process is scheduled to exit with
              status 1.
            - A cancelled future is ignored: it is neither a success nor a
              failure, and calling ``f.exception()`` on it would raise
              ``CancelledError`` instead of returning a value.
            """
            if f.cancelled():
                # exception() raises on a cancelled future; leave the count as-is
                return
            if f.exception() is None:
                # spawn succeeded, reset failure count
                self.settings['failure_count'] = 0
                return
            # spawn failed, increment count and abort if limit reached
            failure_count = self.settings.get('failure_count', 0) + 1
            self.settings['failure_count'] = failure_count
            failure_limit = spawner.consecutive_failure_limit
            if not failure_limit:
                # a limit of 0 disables both the warning and the abort
                return
            if 1 < failure_count < failure_limit:
                # stay quiet on the first failure; warn once a streak forms
                self.log.warning(
                    "%i consecutive spawns failed.  "
                    "Hub will exit if failure count reaches %i before succeeding",
                    failure_count, failure_limit,
                )
            if failure_count >= failure_limit:
                self.log.critical(
                    "Aborting due to %i consecutive spawn failures", failure_count
                )
                # abort in 2 seconds to allow pending handlers to resolve
                # mostly propagating errors for the current failures
                def abort():
                    raise SystemExit(1)
                IOLoop.current().call_later(2, abort)
        finish_spawn_future.add_done_callback(_track_failure_count)
        try:
-            await gen.with_timeout(timedelta(seconds=self.slow_spawn_timeout), finish_spawn_future)
+            await gen.with_timeout(
                timedelta(seconds=self.slow_spawn_timeout), finish_spawn_future
            )
        except gen.TimeoutError:
            # waiting_for_response indicates server process has started,
            # but is yet to become responsive.
--- a/jupyterhub/spawner.py
+++ b/jupyterhub/spawner.py
@@ -195,6 +195,19 @@ class Spawner(LoggingConfigurable):
        """
    ).tag(config=True)
    # Threshold for the Hub's consecutive-spawn-failure abort.
    # The spawn handler in jupyterhub/handlers/base.py reads this value after a
    # failed spawn and compares it with the hub-wide settings['failure_count'];
    # when the count reaches the limit it logs a critical message and schedules
    # SystemExit(1).
    # NOTE(review): with the default of 0 the handler never warns or aborts,
    # but it still increments settings['failure_count'] on every failure.
    consecutive_failure_limit = Integer(
        0,
        help="""
        Maximum number of consecutive failures to allow before
        shutting down JupyterHub.
        This helps JupyterHub recover from a certain class of problem preventing launch
        in contexts where the Hub is automatically restarted (e.g. systemd, docker, kubernetes).
        A limit of 0 means no limit and consecutive failures will not be tracked.
        """,
    ).tag(config=True)
    start_timeout = Integer(60,
        help="""
        Timeout (in seconds) before giving up on starting of single-user server.