Merge pull request #2721 from minrk/async-init-spawners

Add JupyterHub.init_spawners_timeout
2025-10-17 06:52:59 +00:00 · 2019-09-24 11:08:16 +02:00
parent fafbe86b55 74958d9397
commit 54baa0c31a
5 changed files with 108 additions and 11 deletions
--- a/jupyterhub/apihandlers/users.py
+++ b/jupyterhub/apihandlers/users.py
@@ -589,11 +589,14 @@ class SpawnProgressAPIHandler(APIHandler):
        async with aclosing(
            iterate_until(spawn_future, spawner._generate_progress())
        ) as events:
-            async for event in events:
-                # don't allow events to sneakily set the 'ready' flag
-                if 'ready' in event:
-                    event.pop('ready', None)
-                await self.send_event(event)
+            try:
+                async for event in events:
+                    # don't allow events to sneakily set the 'ready' flag
+                    if 'ready' in event:
+                        event.pop('ready', None)
+                    await self.send_event(event)
+            except asyncio.CancelledError:
+                pass

        # progress finished, wait for spawn to actually resolve,
        # in case progress finished early
--- a/jupyterhub/app.py
+++ b/jupyterhub/app.py
@@ -11,8 +11,10 @@ import re
 import signal
 import socket
 import sys
+import time
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
+from datetime import timedelta
 from datetime import timezone
 from functools import partial
 from getpass import getuser
@@ -984,6 +986,28 @@ class JupyterHub(Application):
        """,
    ).tag(config=True)

+    init_spawners_timeout = Integer(
+        10,
+        help="""
+        Timeout (in seconds) to wait for spawners to initialize
+
+        Checking if spawners are healthy can take a long time
+        if many spawners are active at hub start time.
+
+        If it takes longer than this timeout to check,
+        init_spawners will be left to complete in the background
+        and the http server is allowed to start.
+
+        A timeout of -1 means wait forever,
+        which can mean a slow startup of the Hub
+        but ensures that the Hub is fully consistent by the time it starts responding to requests.
+        This matches the behavior of jupyterhub 1.0.
+
+        .. versionadded:: 1.1.0
+
+        """,
+    ).tag(config=True)
+
    db_url = Unicode(
        'sqlite:///jupyterhub.sqlite',
        help="url for the database. e.g. `sqlite:///jupyterhub.sqlite`",
@@ -1835,6 +1859,7 @@ class JupyterHub(Application):
                )

    async def init_spawners(self):
+        self.log.debug("Initializing spawners")
        db = self.db

        def _user_summary(user):
@@ -1925,6 +1950,8 @@ class JupyterHub(Application):
                else:
                    self.log.debug("%s not running", spawner._log_name)

+            spawner._check_pending = False
+
        # parallelize checks for running Spawners
        check_futures = []
        for orm_user in db.query(orm.User):
@@ -1935,11 +1962,22 @@ class JupyterHub(Application):
                    # spawner should be running
                    # instantiate Spawner wrapper and check if it's still alive
                    spawner = user.spawners[name]
+                    # signal that check is pending to avoid race conditions
+                    spawner._check_pending = True
                    f = asyncio.ensure_future(check_spawner(user, name, spawner))
                    check_futures.append(f)

+        TOTAL_USERS.set(len(self.users))
+
+        # it's important that we get here before the first await
+        # so that we know all spawners are instantiated and in the check-pending state
+
        # await checks after submitting them all
-        await gen.multi(check_futures)
+        if check_futures:
+            self.log.debug(
+                "Awaiting checks for %i possibly-running spawners", len(check_futures)
+            )
+            await gen.multi(check_futures)
        db.commit()

        # only perform this query if we are going to log it
@@ -1949,7 +1987,7 @@ class JupyterHub(Application):

        active_counts = self.users.count_active_users()
        RUNNING_SERVERS.set(active_counts['active'])
-        TOTAL_USERS.set(len(self.users))
+        return len(check_futures)

    def init_oauth(self):
        base_url = self.hub.base_url
@@ -2112,6 +2150,7 @@ class JupyterHub(Application):
        super().initialize(*args, **kwargs)
        if self.generate_config or self.generate_certs or self.subapp:
            return
+        self._start_future = asyncio.Future()
        self.load_config_file(self.config_file)
        self.init_logging()
        if 'JupyterHubApp' in self.config:
@@ -2162,11 +2201,61 @@ class JupyterHub(Application):
        self.init_services()
        await self.init_api_tokens()
        self.init_tornado_settings()
-        await self.init_spawners()
-        self.cleanup_oauth_clients()
        self.init_handlers()
        self.init_tornado_application()

+        # init_spawners can take a while
+        init_spawners_timeout = self.init_spawners_timeout
+        if init_spawners_timeout < 0:
+            # negative timeout means forever (previous, most stable behavior)
+            init_spawners_timeout = 86400
+        self.log.debug("init_spawners timeout: %i seconds", init_spawners_timeout)
+
+        init_start_time = time.perf_counter()
+        init_spawners_future = asyncio.ensure_future(self.init_spawners())
+
+        def log_init_time(f):
+            n_spawners = f.result()
+            self.log.info(
+                "Initialized %i spawners in %.3f seconds",
+                n_spawners,
+                time.perf_counter() - init_start_time,
+            )
+
+        init_spawners_future.add_done_callback(log_init_time)
+
+        try:
+
+            # don't allow a zero timeout because we still need to be sure
+            # that the Spawner objects are defined and pending
+            await gen.with_timeout(
+                timedelta(seconds=max(init_spawners_timeout, 1)), init_spawners_future
+            )
+        except gen.TimeoutError:
+            self.log.warning(
+                "init_spawners did not complete within %i seconds. "
+                "Allowing to complete in the background.",
+                max(init_spawners_timeout, 1),
+            )
+
+        if init_spawners_future.done():
+            self.cleanup_oauth_clients()
+        else:
+            # schedule async operations after init_spawners finishes
+            async def finish_init_spawners():
+                await init_spawners_future
+                # schedule cleanup after spawners are all set up
+                # because it relies on the state resolved by init_spawners
+                self.cleanup_oauth_clients()
+                # trigger a proxy check as soon as all spawners are ready
+                # because this may be *after* the check made as part of normal startup.
+                # To avoid races with partially-complete start,
+                # ensure that start is complete before running this check.
+                await self._start_future
+                await self.proxy.check_routes(self.users, self._service_map)
+
+            asyncio.ensure_future(finish_init_spawners())
+
    async def cleanup(self):
        """Shutdown managed services and various subprocesses. Cleanup runtime files."""

@@ -2452,6 +2541,7 @@ class JupyterHub(Application):
            atexit.register(self.atexit)
        # register cleanup on both TERM and INT
        self.init_signal()
+        self._start_future.set_result(None)

    def init_signal(self):
        loop = asyncio.get_event_loop()
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -879,7 +879,7 @@ class BaseHandler(RequestHandler):
            # clear spawner._spawn_future when it's done
            # keep an exception around, though, to prevent repeated implicit spawns
            # if spawn is failing
-            if f.exception() is None:
+            if f.cancelled() or f.exception() is None:
                spawner._spawn_future = None
            # Now we're all done. clear _spawn_pending flag
            spawner._spawn_pending = False
@@ -890,7 +890,7 @@ class BaseHandler(RequestHandler):
        # update failure count and abort if consecutive failure limit
        # is reached
        def _track_failure_count(f):
-            if f.exception() is None:
+            if f.cancelled() or f.exception() is None:
                # spawn succeeded, reset failure count
                self.settings['failure_count'] = 0
                return
--- a/jupyterhub/spawner.py
+++ b/jupyterhub/spawner.py
@@ -86,6 +86,7 @@ class Spawner(LoggingConfigurable):
    _start_pending = False
    _stop_pending = False
    _proxy_pending = False
+    _check_pending = False
    _waiting_for_response = False
    _jupyterhub_version = None
    _spawn_future = None
@@ -121,6 +122,8 @@ class Spawner(LoggingConfigurable):
            return 'spawn'
        elif self._stop_pending:
            return 'stop'
+        elif self._check_pending:
+            return 'check'
        return None

    @property
--- a/jupyterhub/user.py
+++ b/jupyterhub/user.py
@@ -727,6 +727,7 @@ class User:
        spawner = self.spawners[server_name]
        spawner._spawn_pending = False
        spawner._start_pending = False
+        spawner._check_pending = False
        spawner.stop_polling()
        spawner._stop_pending = True