From ece8408381b4fa61f125c175663e5e6828e80b5c Mon Sep 17 00:00:00 2001
From: Min RK
Date: Tue, 12 Sep 2023 09:24:01 +0200
Subject: [PATCH 1/2] Improve debugging when waiting for servers

Add debug messages and timers for start and end waiting for servers
and improve logic for awaiting proxy endpoints
using concurrency primitives instead of a for-loop
---
 jupyterhub/proxy.py | 48 ++++++++++++++++++++++++++++-----------------
 jupyterhub/utils.py | 14 +++++++++++--
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/jupyterhub/proxy.py b/jupyterhub/proxy.py
index 87b47a6c..8d92eb94 100644
--- a/jupyterhub/proxy.py
+++ b/jupyterhub/proxy.py
@@ -48,7 +48,7 @@ from jupyterhub.traitlets import Command
 from . import utils
 from .metrics import CHECK_ROUTES_DURATION_SECONDS, PROXY_POLL_DURATION_SECONDS
 from .objects import Server
-from .utils import AnyTimeoutError, exponential_backoff, url_escape_path, url_path_join
+from .utils import exponential_backoff, url_escape_path, url_path_join
 
 
 def _one_at_a_time(method):
@@ -766,24 +766,36 @@ class ConfigurableHTTPProxy(Proxy):
 
         self._write_pid_file()
 
-        def _check_process():
-            status = self.proxy_process.poll()
-            if status is not None:
-                with self.proxy_process:
-                    e = RuntimeError("Proxy failed to start with exit code %i" % status)
-                raise e from None
+        async def watch_process():
+            """Watch proxy process for early termination
 
-        for server in (public_server, api_server):
-            for i in range(10):
-                _check_process()
-                try:
-                    await server.wait_up(1)
-                except AnyTimeoutError:
-                    continue
-                else:
-                    break
-            await server.wait_up(1)
-        _check_process()
+            so we don't keep waiting for connections after the proxy exits
+            """
+            while True:
+                status = self.proxy_process.poll()
+                if status is not None:
+                    with self.proxy_process:
+                        e = RuntimeError(
+                            f"Proxy failed to start with exit code {status}"
+                        )
+                    raise e from None
+
+                await asyncio.wait([servers_ready], timeout=0.5)
+                # stop watching the process when the connections
+                # have finished (success or failure)
+                if servers_ready.done():
+                    return
+
+        servers_ready = asyncio.gather(
+            *(server.wait_up(10) for server in (public_server, api_server))
+        )
+
+        process_ended = asyncio.ensure_future(watch_process())
+        # wait for process to crash or servers to be ready,
+        # whichever comes first
+        # don't need to handle exceptions here,
+        # as the underlying exceptions have informative error messages
+        await asyncio.wait_for(asyncio.gather(process_ended, servers_ready), timeout=15)
         self.log.debug("Proxy started and appears to be up")
         pc = PeriodicCallback(self.check_running, 1e3 * self.check_running_interval)
         self._check_running_callback = pc
diff --git a/jupyterhub/utils.py b/jupyterhub/utils.py
index 4db7ffe0..e703d9fd 100644
--- a/jupyterhub/utils.py
+++ b/jupyterhub/utils.py
@@ -15,6 +15,7 @@ import ssl
 import string
 import sys
 import threading
+import time
 import uuid
 import warnings
 from binascii import b2a_hex
@@ -93,6 +94,8 @@ def can_connect(ip, port):
     except OSError as e:
         if e.errno not in {errno.ECONNREFUSED, errno.ETIMEDOUT}:
             app_log.error("Unexpected error connecting to %s:%i %s", ip, port, e)
+        else:
+            app_log.debug("Server at %s:%i not ready: %s", ip, port, e)
         return False
     else:
         return True
@@ -245,6 +248,8 @@ async def wait_for_server(ip, port, timeout=10):
     """Wait for any server to show up at ip:port."""
     if ip in {'', '0.0.0.0', '::'}:
         ip = '127.0.0.1'
+    app_log.debug("Waiting %ss for server at %s:%s", timeout, ip, port)
+    tic = time.perf_counter()
     await exponential_backoff(
         lambda: can_connect(ip, port),
"Server at {ip}:{port} didn't respond in {timeout} seconds".format( @@ -252,6 +257,8 @@ async def wait_for_server(ip, port, timeout=10): ), timeout=timeout, ) + toc = time.perf_counter() + app_log.debug("Server at %s:%s responded in %.2fs", ip, port, toc - tic) async def wait_for_http_server(url, timeout=10, ssl_context=None): @@ -259,12 +266,13 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None): Any non-5XX response code will do, even 404. """ - loop = ioloop.IOLoop.current() - tic = loop.time() client = AsyncHTTPClient() if ssl_context: client.ssl_options = ssl_context + app_log.debug("Waiting %ss for server at %s", timeout, url) + tic = time.perf_counter() + async def is_reachable(): try: r = await client.fetch(url, follow_redirects=False) @@ -297,6 +305,8 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None): ), timeout=timeout, ) + toc = time.perf_counter() + app_log.debug("Server at %s responded in %.2fs", url, toc - tic) return re From cbc9f19d6e0441e094111a109b13fe12f1f1e941 Mon Sep 17 00:00:00 2001 From: Min RK Date: Wed, 13 Sep 2023 14:55:43 +0200 Subject: [PATCH 2/2] more detailed, verbose wait for proxy startup --- jupyterhub/proxy.py | 59 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/jupyterhub/proxy.py b/jupyterhub/proxy.py index 8d92eb94..85fb23d7 100644 --- a/jupyterhub/proxy.py +++ b/jupyterhub/proxy.py @@ -766,10 +766,15 @@ class ConfigurableHTTPProxy(Proxy): self._write_pid_file() - async def watch_process(): + async def wait_for_process(): """Watch proxy process for early termination - so we don't keep waiting for connections after the proxy exits + Runs forever, checking every 0.5s if the process has exited + so we don't keep waiting for endpoints after the proxy has stopped. + + Raises RuntimeError if/when the proxy process exits, + otherwise runs forever. + Should be cancelled when servers become ready. """ while True: status = self.proxy_process.poll() @@ -779,23 +784,49 @@ class ConfigurableHTTPProxy(Proxy): f"Proxy failed to start with exit code {status}" ) raise e from None + await asyncio.sleep(0.5) - await asyncio.wait([servers_ready], timeout=0.5) - # stop watching the process when the connections - # have finished (success or failure) - if servers_ready.done(): - return + # process_exited can only resolve with a RuntimeError when the process has exited, + # otherwise it must be cancelled. 
+        process_exited = asyncio.ensure_future(wait_for_process())
 
-        servers_ready = asyncio.gather(
-            *(server.wait_up(10) for server in (public_server, api_server))
-        )
+        # wait for both servers to be ready (or one server to fail)
+        server_futures = [
+            asyncio.ensure_future(server.wait_up(10))
+            for server in (public_server, api_server)
+        ]
+        servers_ready = asyncio.gather(*server_futures)
 
-        process_ended = asyncio.ensure_future(watch_process())
         # wait for process to crash or servers to be ready,
         # whichever comes first
-        # don't need to handle exceptions here,
-        # as the underlying exceptions have informative error messages
-        await asyncio.wait_for(asyncio.gather(process_ended, servers_ready), timeout=15)
+        wait_timeout = 15
+        ready, pending = await asyncio.wait(
+            [
+                process_exited,
+                servers_ready,
+            ],
+            return_when=asyncio.FIRST_COMPLETED,
+            timeout=wait_timeout,
+        )
+        for task in [servers_ready, process_exited] + server_futures:
+            # cancel any pending tasks
+            if not task.done():
+                task.cancel()
+        if not ready:
+            # timeouts passed to wait_up should prevent this,
+            # but weird things like DNS delays may result in
+            # wait_up taking a lot longer than it should
+            raise TimeoutError(
+                f"Waiting for proxy endpoints didn't complete in {wait_timeout}s"
+            )
+        if process_exited in ready:
+            # process exited, this will raise RuntimeError
+            await process_exited
+        else:
+            # if we got here, servers_ready is done
+            # await it to make sure exceptions are raised
+            await servers_ready
+
         self.log.debug("Proxy started and appears to be up")
         pc = PeriodicCallback(self.check_running, 1e3 * self.check_running_interval)
         self._check_running_callback = pc
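Note (editorial addition, not part of the patches above): the heart of PATCH 2/2 is racing a long-running process watcher against the endpoint checks with asyncio.wait(..., return_when=FIRST_COMPLETED), then cancelling whichever side lost and awaiting the winner so its exception propagates. Below is a minimal standalone sketch of that pattern; the names and delays (watch_process, wait_up, 0.1s/0.2s) are illustrative stand-ins, not JupyterHub code.

import asyncio


async def watch_process():
    # stand-in for wait_for_process(): poll the child process and raise if it dies;
    # here it simply never finishes, so the servers win the race
    while True:
        await asyncio.sleep(0.5)


async def wait_up(delay):
    # stand-in for server.wait_up(): resolves once the endpoint answers
    await asyncio.sleep(delay)


async def main():
    process_exited = asyncio.ensure_future(watch_process())
    server_futures = [asyncio.ensure_future(wait_up(d)) for d in (0.1, 0.2)]
    servers_ready = asyncio.gather(*server_futures)

    # race the watcher against readiness; whichever finishes first wins
    ready, pending = await asyncio.wait(
        [process_exited, servers_ready],
        return_when=asyncio.FIRST_COMPLETED,
        timeout=15,
    )
    # cancel anything still running so no task leaks
    for task in [process_exited, servers_ready, *server_futures]:
        if not task.done():
            task.cancel()
    if not ready:
        raise TimeoutError("neither the watcher nor the servers finished in time")
    # await the completed future(s) so any stored exception is re-raised
    for task in ready:
        await task
    print("servers ready")


asyncio.run(main())

The losing side is cancelled rather than awaited, which is why the patch documents that wait_for_process "should be cancelled when servers become ready".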