Merge pull request #4561 from minrk/better-wait-up

Improve debugging when waiting for servers
Erik Sundell (committed by GitHub)
2023-09-13 16:57:26 +02:00
2 changed files with 73 additions and 20 deletions

jupyterhub/proxy.py

@@ -48,7 +48,7 @@ from jupyterhub.traitlets import Command
 from . import utils
 from .metrics import CHECK_ROUTES_DURATION_SECONDS, PROXY_POLL_DURATION_SECONDS
 from .objects import Server
-from .utils import AnyTimeoutError, exponential_backoff, url_escape_path, url_path_join
+from .utils import exponential_backoff, url_escape_path, url_path_join


 def _one_at_a_time(method):
@@ -766,24 +766,67 @@ class ConfigurableHTTPProxy(Proxy):
         self._write_pid_file()

-        def _check_process():
-            status = self.proxy_process.poll()
-            if status is not None:
-                with self.proxy_process:
-                    e = RuntimeError("Proxy failed to start with exit code %i" % status)
-                raise e from None
-
-        for server in (public_server, api_server):
-            for i in range(10):
-                _check_process()
-                try:
-                    await server.wait_up(1)
-                except AnyTimeoutError:
-                    continue
-                else:
-                    break
-            await server.wait_up(1)
-
-        _check_process()
+        async def wait_for_process():
+            """Watch proxy process for early termination
+
+            Runs forever, checking every 0.5s if the process has exited
+            so we don't keep waiting for endpoints after the proxy has stopped.
+
+            Raises RuntimeError if/when the proxy process exits,
+            otherwise runs forever.
+
+            Should be cancelled when servers become ready.
+            """
+            while True:
+                status = self.proxy_process.poll()
+                if status is not None:
+                    with self.proxy_process:
+                        e = RuntimeError(
+                            f"Proxy failed to start with exit code {status}"
+                        )
+                    raise e from None
+                await asyncio.sleep(0.5)
+
+        # process_exited can only resolve with a RuntimeError when the process has exited,
+        # otherwise it must be cancelled.
+        process_exited = asyncio.ensure_future(wait_for_process())
+
+        # wait for both servers to be ready (or one server to fail)
+        server_futures = [
+            asyncio.ensure_future(server.wait_up(10))
+            for server in (public_server, api_server)
+        ]
+        servers_ready = asyncio.gather(*server_futures)
+
+        # wait for process to crash or servers to be ready,
+        # whichever comes first
+        wait_timeout = 15
+        ready, pending = await asyncio.wait(
+            [
+                process_exited,
+                servers_ready,
+            ],
+            return_when=asyncio.FIRST_COMPLETED,
+            timeout=wait_timeout,
+        )
+        for task in [servers_ready, process_exited] + server_futures:
+            # cancel any pending tasks
+            if not task.done():
+                task.cancel()
+
+        if not ready:
+            # timeouts passed to wait_up should prevent this,
+            # but weird things like DNS delays may result in
+            # wait_up taking a lot longer than it should
+            raise TimeoutError(
+                f"Waiting for proxy endpoints didn't complete in {wait_timeout}s"
+            )
+
+        if process_exited in ready:
+            # process exited, this will raise RuntimeError
+            await process_exited
+        else:
+            # if we got here, servers_ready is done
+            # await it to make sure exceptions are raised
+            await servers_ready
+
         self.log.debug("Proxy started and appears to be up")
         pc = PeriodicCallback(self.check_running, 1e3 * self.check_running_interval)
         self._check_running_callback = pc
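The heart of this change is the pattern of racing a long-running "did the process die?" watcher against the endpoint readiness checks with asyncio.wait(return_when=FIRST_COMPLETED), then cancelling whichever side lost. The following is a condensed, standalone sketch of that pattern, not JupyterHub code: watch_process, wait_until_ready, and start_and_wait are hypothetical names, and the sleep subprocess only exists to give the watcher something to poll.

import asyncio
import subprocess


async def watch_process(proc):
    # Stand-in for wait_for_process(): poll a subprocess.Popen every 0.5s
    # and raise as soon as it has exited; otherwise run forever.
    while True:
        status = proc.poll()
        if status is not None:
            raise RuntimeError(f"Process exited with code {status}")
        await asyncio.sleep(0.5)


async def wait_until_ready():
    # Stand-in for server.wait_up(10): pretend readiness takes one second.
    await asyncio.sleep(1)


async def start_and_wait(proc, wait_timeout=15):
    # Race "process died" against "endpoints came up", whichever happens first.
    process_exited = asyncio.ensure_future(watch_process(proc))
    servers_ready = asyncio.ensure_future(wait_until_ready())
    ready, pending = await asyncio.wait(
        [process_exited, servers_ready],
        return_when=asyncio.FIRST_COMPLETED,
        timeout=wait_timeout,
    )
    # Cancel whichever task is still running so nothing leaks.
    for task in (process_exited, servers_ready):
        if not task.done():
            task.cancel()
    if not ready:
        raise TimeoutError(f"Nothing completed in {wait_timeout}s")
    if process_exited in ready:
        # Awaiting the finished watcher re-raises its RuntimeError immediately.
        await process_exited
    else:
        await servers_ready


if __name__ == "__main__":
    proc = subprocess.Popen(["sleep", "30"])  # any long-running child (Unix)
    try:
        asyncio.run(start_and_wait(proc))
        print("ready")
    finally:
        proc.terminate()

Awaiting whichever task finished first is what turns a dead proxy into an immediate RuntimeError instead of a slow connection timeout, which is the debugging improvement the commit message describes.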

jupyterhub/utils.py

@@ -15,6 +15,7 @@ import ssl
 import string
 import sys
 import threading
+import time
 import uuid
 import warnings
 from binascii import b2a_hex
@@ -93,6 +94,8 @@ def can_connect(ip, port):
     except OSError as e:
         if e.errno not in {errno.ECONNREFUSED, errno.ETIMEDOUT}:
             app_log.error("Unexpected error connecting to %s:%i %s", ip, port, e)
+        else:
+            app_log.debug("Server at %s:%i not ready: %s", ip, port, e)
         return False
     else:
         return True
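For context, can_connect (only partly visible in this hunk) probes the endpoint with a plain TCP connection, and the change simply routes the expected not-listening-yet errors to DEBUG while keeping anything else at ERROR. A minimal sketch of that error-classification pattern, assuming a socket.create_connection probe (probe_port and the logger name are hypothetical):

import errno
import logging
import socket

log = logging.getLogger("proxy-wait")


def probe_port(ip, port):
    # ECONNREFUSED / ETIMEDOUT just mean "not ready yet" while a server is
    # starting, so they are logged at DEBUG; anything else is unexpected
    # and stays at ERROR so real problems remain visible.
    try:
        socket.create_connection((ip, port), timeout=1).close()
    except OSError as e:
        if e.errno not in {errno.ECONNREFUSED, errno.ETIMEDOUT}:
            log.error("Unexpected error connecting to %s:%i %s", ip, port, e)
        else:
            log.debug("Server at %s:%i not ready: %s", ip, port, e)
        return False
    else:
        return True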
@@ -245,6 +248,8 @@ async def wait_for_server(ip, port, timeout=10):
"""Wait for any server to show up at ip:port.""" """Wait for any server to show up at ip:port."""
if ip in {'', '0.0.0.0', '::'}: if ip in {'', '0.0.0.0', '::'}:
ip = '127.0.0.1' ip = '127.0.0.1'
app_log.debug("Waiting %ss for server at %s:%s", timeout, ip, port)
tic = time.perf_counter()
await exponential_backoff( await exponential_backoff(
lambda: can_connect(ip, port), lambda: can_connect(ip, port),
"Server at {ip}:{port} didn't respond in {timeout} seconds".format( "Server at {ip}:{port} didn't respond in {timeout} seconds".format(
@@ -252,6 +257,8 @@ async def wait_for_server(ip, port, timeout=10):
         ),
         timeout=timeout,
     )
+    toc = time.perf_counter()
+    app_log.debug("Server at %s:%s responded in %.2fs", ip, port, toc - tic)


 async def wait_for_http_server(url, timeout=10, ssl_context=None):
@@ -259,12 +266,13 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None):
     Any non-5XX response code will do, even 404.
     """
-    loop = ioloop.IOLoop.current()
-    tic = loop.time()
     client = AsyncHTTPClient()
     if ssl_context:
         client.ssl_options = ssl_context

+    app_log.debug("Waiting %ss for server at %s", timeout, url)
+    tic = time.perf_counter()
+
     async def is_reachable():
         try:
             r = await client.fetch(url, follow_redirects=False)
@@ -297,6 +305,8 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None):
         ),
         timeout=timeout,
     )
+    toc = time.perf_counter()
+    app_log.debug("Server at %s responded in %.2fs", url, toc - tic)
     return re
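Both wait_for_server and wait_for_http_server now follow the same shape: log what is being waited for (and for how long) before the wait, time it with time.perf_counter(), and log the elapsed time afterwards. A rough standalone sketch of that shape, using a plain fixed-interval poll in place of JupyterHub's exponential_backoff (wait_with_timing and is_ready are hypothetical names):

import asyncio
import logging
import time

log = logging.getLogger("proxy-wait")


async def wait_with_timing(description, is_ready, timeout=10, interval=0.5):
    # Announce the wait up front, then report how long it actually took,
    # so slow startups (DNS delays, slow disks, ...) show up in debug logs.
    log.debug("Waiting %ss for %s", timeout, description)
    tic = time.perf_counter()
    deadline = tic + timeout
    while not await is_ready():
        if time.perf_counter() > deadline:
            raise TimeoutError(f"{description} didn't respond in {timeout} seconds")
        await asyncio.sleep(interval)
    toc = time.perf_counter()
    log.debug("%s responded in %.2fs", description, toc - tic)


# Example: asyncio.run(wait_with_timing("server at 127.0.0.1:8001", check))
# where check is any async callable returning True once the port answers.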