Merge pull request #2042 from minrk/abort-failures

add Spawner.consecutive_failure_limit
2025-10-08 10:34:10 +00:00 · 2018-07-26 10:33:36 +02:00
parent 204399ee2c 906abcc2f3
commit 0fa5c20f89
2 changed files with 49 additions and 1 deletions
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -657,6 +657,7 @@ class BaseHandler(RequestHandler):
        # hook up spawner._spawn_future so that other requests can await
        # this result
        finish_spawn_future = spawner._spawn_future = maybe_future(finish_user_spawn())
+
        def _clear_spawn_future(f):
            # clear spawner._spawn_future when it's done
            # keep an exception around, though, to prevent repeated implicit spawns
@@ -665,10 +666,44 @@ class BaseHandler(RequestHandler):
                spawner._spawn_future = None
            # Now we're all done. clear _spawn_pending flag
            spawner._spawn_pending = False
+
        finish_spawn_future.add_done_callback(_clear_spawn_future)

+        # when spawn finishes (success or failure)
+        # update failure count and abort if consecutive failure limit
+        # is reached
+        def _track_failure_count(f):  # done-callback registered on finish_spawn_future
+            if f.exception() is None:
+                # spawn succeeded: the consecutive-failure streak is over, reset the count
+                self.settings['failure_count'] = 0
+                return
+            # spawn failed: bump the count (even when the limit is 0), then check the limit
+            self.settings.setdefault('failure_count', 0)
+            self.settings['failure_count'] += 1
+            failure_count = self.settings['failure_count']
+            failure_limit = spawner.consecutive_failure_limit
+            if failure_limit and 1 < failure_count < failure_limit:  # warn from the 2nd failure up to limit - 1
+                self.log.warning(
+                    "%i consecutive spawns failed.  "
+                    "Hub will exit if failure count reaches %i before succeeding",
+                    failure_count, failure_limit,
+                )
+            if failure_limit and failure_count >= failure_limit:  # a limit of 0 is falsy, so it never aborts
+                self.log.critical(
+                    "Aborting due to %i consecutive spawn failures", failure_count
+                )
+                # delay the exit by 2 seconds so that pending handlers can resolve,
+                # mostly so that the errors for the current failures are propagated
+                def abort():
+                    raise SystemExit(1)  # exit status 1
+                IOLoop.current().call_later(2, abort)  # schedule abort() 2 seconds from now
+
+        finish_spawn_future.add_done_callback(_track_failure_count)
+
        try:
-            await gen.with_timeout(timedelta(seconds=self.slow_spawn_timeout), finish_spawn_future)
+            await gen.with_timeout(
+                timedelta(seconds=self.slow_spawn_timeout), finish_spawn_future
+            )
        except gen.TimeoutError:
            # waiting_for_response indicates server process has started,
            # but is yet to become responsive.
--- a/jupyterhub/spawner.py
+++ b/jupyterhub/spawner.py
@@ -195,6 +195,19 @@ class Spawner(LoggingConfigurable):
        """
    ).tag(config=True)

+    consecutive_failure_limit = Integer(  # read as spawner.consecutive_failure_limit after a failed spawn
+        0,  # default value; per the help text below, 0 means no limit
+        help="""
+        Maximum number of consecutive failures to allow before
+        shutting down JupyterHub.
+
+        This helps JupyterHub recover from a certain class of problem preventing launch
+        in contexts where the Hub is automatically restarted (e.g. systemd, docker, kubernetes).
+
+        A limit of 0 means no limit and consecutive failures will not be tracked.
+        """,
+    ).tag(config=True)  # tagged config=True, like the other traits in this class
+
+
    start_timeout = Integer(60,
        help="""
        Timeout (in seconds) before giving up on starting of single-user server.