diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index 8e508fce..f5365d6a 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -657,6 +657,7 @@ class BaseHandler(RequestHandler):
         # hook up spawner._spawn_future so that other requests can await
         # this result
         finish_spawn_future = spawner._spawn_future = maybe_future(finish_user_spawn())
+
         def _clear_spawn_future(f):
             # clear spawner._spawn_future when it's done
             # keep an exception around, though, to prevent repeated implicit spawns
@@ -665,10 +666,48 @@ class BaseHandler(RequestHandler):
                 spawner._spawn_future = None
             # Now we're all done. clear _spawn_pending flag
             spawner._spawn_pending = False
+
         finish_spawn_future.add_done_callback(_clear_spawn_future)
 
+        # when spawn finishes (success or failure)
+        # update failure count and abort if consecutive failure limit
+        # is reached
+        def _track_failure_count(f):
+            if f.cancelled():
+                # a cancelled spawn is neither a success nor a failure;
+                # calling f.exception() on it would raise CancelledError
+                return
+            if f.exception() is None:
+                # spawn succeeded, reset failure count
+                self.settings['failure_count'] = 0
+                return
+            # spawn failed, increment count and abort if limit reached
+            self.settings.setdefault('failure_count', 0)
+            self.settings['failure_count'] += 1
+            failure_count = self.settings['failure_count']
+            failure_limit = spawner.consecutive_failure_limit
+            if failure_limit and 1 < failure_count < failure_limit:
+                self.log.warning(
+                    "%i consecutive spawns failed. "
+                    "Hub will exit if failure count reaches %i before succeeding",
+                    failure_count, failure_limit,
+                )
+            if failure_limit and failure_count >= failure_limit:
+                self.log.critical(
+                    "Aborting due to %i consecutive spawn failures", failure_count
+                )
+                # abort in 2 seconds to allow pending handlers to resolve
+                # mostly propagating errors for the current failures
+                def abort():
+                    raise SystemExit(1)
+                IOLoop.current().call_later(2, abort)
+
+        finish_spawn_future.add_done_callback(_track_failure_count)
+
         try:
-            await gen.with_timeout(timedelta(seconds=self.slow_spawn_timeout), finish_spawn_future)
+            await gen.with_timeout(
+                timedelta(seconds=self.slow_spawn_timeout), finish_spawn_future
+            )
         except gen.TimeoutError:
             # waiting_for_response indicates server process has started,
             # but is yet to become responsive.
diff --git a/jupyterhub/spawner.py b/jupyterhub/spawner.py
index ff76b66c..cca8f96f 100644
--- a/jupyterhub/spawner.py
+++ b/jupyterhub/spawner.py
@@ -195,6 +195,19 @@ class Spawner(LoggingConfigurable):
         """
     ).tag(config=True)
 
+    consecutive_failure_limit = Integer(
+        0,
+        help="""
+        Maximum number of consecutive failures to allow before
+        shutting down JupyterHub.
+
+        This helps JupyterHub recover from a certain class of problem preventing launch
+        in contexts where the Hub is automatically restarted (e.g. systemd, docker, kubernetes).
+
+        A limit of 0 means no limit and consecutive failures will not be tracked.
+        """,
+    ).tag(config=True)
+
     start_timeout = Integer(60,
         help="""
         Timeout (in seconds) before giving up on starting of single-user server.