From ece8408381b4fa61f125c175663e5e6828e80b5c Mon Sep 17 00:00:00 2001
From: Min RK
Date: Tue, 12 Sep 2023 09:24:01 +0200
Subject: [PATCH 1/2] Improve debugging when waiting for servers

Add debug messages and timers for start and end waiting for servers
and improve logic for awaiting proxy endpoints
using concurrency primitives instead of a for-loop
---
 jupyterhub/proxy.py | 48 ++++++++++++++++++++++++++++-----------------
 jupyterhub/utils.py | 14 +++++++++++--
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/jupyterhub/proxy.py b/jupyterhub/proxy.py
index 87b47a6c..8d92eb94 100644
--- a/jupyterhub/proxy.py
+++ b/jupyterhub/proxy.py
@@ -48,7 +48,7 @@ from jupyterhub.traitlets import Command
 from . import utils
 from .metrics import CHECK_ROUTES_DURATION_SECONDS, PROXY_POLL_DURATION_SECONDS
 from .objects import Server
-from .utils import AnyTimeoutError, exponential_backoff, url_escape_path, url_path_join
+from .utils import exponential_backoff, url_escape_path, url_path_join
 
 
 def _one_at_a_time(method):
@@ -766,24 +766,36 @@ class ConfigurableHTTPProxy(Proxy):
 
         self._write_pid_file()
 
-        def _check_process():
-            status = self.proxy_process.poll()
-            if status is not None:
-                with self.proxy_process:
-                    e = RuntimeError("Proxy failed to start with exit code %i" % status)
-                raise e from None
+        async def watch_process():
+            """Watch proxy process for early termination
 
-        for server in (public_server, api_server):
-            for i in range(10):
-                _check_process()
-                try:
-                    await server.wait_up(1)
-                except AnyTimeoutError:
-                    continue
-                else:
-                    break
-            await server.wait_up(1)
-        _check_process()
+            so we don't keep waiting for connections after the proxy exits
+            """
+            while True:
+                status = self.proxy_process.poll()
+                if status is not None:
+                    with self.proxy_process:
+                        e = RuntimeError(
+                            f"Proxy failed to start with exit code {status}"
+                        )
+                    raise e from None
+
+                await asyncio.wait([servers_ready], timeout=0.5)
+                # stop watching the process when the connections
+                # have finished (success or failure)
+                if servers_ready.done():
+                    return
+
+        servers_ready = asyncio.gather(
+            *(server.wait_up(10) for server in (public_server, api_server))
+        )
+
+        process_ended = asyncio.ensure_future(watch_process())
+        # wait for process to crash or servers to be ready,
+        # whichever comes first
+        # don't need to handle exceptions here,
+        # as the underlying exceptions have informative error messages
+        await asyncio.wait_for(asyncio.gather(process_ended, servers_ready), timeout=15)
         self.log.debug("Proxy started and appears to be up")
         pc = PeriodicCallback(self.check_running, 1e3 * self.check_running_interval)
         self._check_running_callback = pc
diff --git a/jupyterhub/utils.py b/jupyterhub/utils.py
index 4db7ffe0..e703d9fd 100644
--- a/jupyterhub/utils.py
+++ b/jupyterhub/utils.py
@@ -15,6 +15,7 @@ import ssl
 import string
 import sys
 import threading
+import time
 import uuid
 import warnings
 from binascii import b2a_hex
@@ -93,6 +94,8 @@ def can_connect(ip, port):
     except OSError as e:
         if e.errno not in {errno.ECONNREFUSED, errno.ETIMEDOUT}:
             app_log.error("Unexpected error connecting to %s:%i %s", ip, port, e)
+        else:
+            app_log.debug("Server at %s:%i not ready: %s", ip, port, e)
         return False
     else:
         return True
@@ -245,6 +248,8 @@ async def wait_for_server(ip, port, timeout=10):
     """Wait for any server to show up at ip:port."""
     if ip in {'', '0.0.0.0', '::'}:
         ip = '127.0.0.1'
+    app_log.debug("Waiting %ss for server at %s:%s", timeout, ip, port)
+    tic = time.perf_counter()
     await exponential_backoff(
         lambda: can_connect(ip, port),
"Server at {ip}:{port} didn't respond in {timeout} seconds".format( @@ -252,6 +257,8 @@ async def wait_for_server(ip, port, timeout=10): ), timeout=timeout, ) + toc = time.perf_counter() + app_log.debug("Server at %s:%s responded in %.2fs", ip, port, toc - tic) async def wait_for_http_server(url, timeout=10, ssl_context=None): @@ -259,12 +266,13 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None): Any non-5XX response code will do, even 404. """ - loop = ioloop.IOLoop.current() - tic = loop.time() client = AsyncHTTPClient() if ssl_context: client.ssl_options = ssl_context + app_log.debug("Waiting %ss for server at %s", timeout, url) + tic = time.perf_counter() + async def is_reachable(): try: r = await client.fetch(url, follow_redirects=False) @@ -297,6 +305,8 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None): ), timeout=timeout, ) + toc = time.perf_counter() + app_log.debug("Server at %s responded in %.2fs", url, toc - tic) return re From cbc9f19d6e0441e094111a109b13fe12f1f1e941 Mon Sep 17 00:00:00 2001 From: Min RK Date: Wed, 13 Sep 2023 14:55:43 +0200 Subject: [PATCH 2/2] more detailed, verbose wait for proxy startup --- jupyterhub/proxy.py | 59 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/jupyterhub/proxy.py b/jupyterhub/proxy.py index 8d92eb94..85fb23d7 100644 --- a/jupyterhub/proxy.py +++ b/jupyterhub/proxy.py @@ -766,10 +766,15 @@ class ConfigurableHTTPProxy(Proxy): self._write_pid_file() - async def watch_process(): + async def wait_for_process(): """Watch proxy process for early termination - so we don't keep waiting for connections after the proxy exits + Runs forever, checking every 0.5s if the process has exited + so we don't keep waiting for endpoints after the proxy has stopped. + + Raises RuntimeError if/when the proxy process exits, + otherwise runs forever. + Should be cancelled when servers become ready. """ while True: status = self.proxy_process.poll() @@ -779,23 +784,49 @@ class ConfigurableHTTPProxy(Proxy): f"Proxy failed to start with exit code {status}" ) raise e from None + await asyncio.sleep(0.5) - await asyncio.wait([servers_ready], timeout=0.5) - # stop watching the process when the connections - # have finished (success or failure) - if servers_ready.done(): - return + # process_exited can only resolve with a RuntimeError when the process has exited, + # otherwise it must be cancelled. 
+        process_exited = asyncio.ensure_future(wait_for_process())
 
-        servers_ready = asyncio.gather(
-            *(server.wait_up(10) for server in (public_server, api_server))
-        )
+        # wait for both servers to be ready (or one server to fail)
+        server_futures = [
+            asyncio.ensure_future(server.wait_up(10))
+            for server in (public_server, api_server)
+        ]
+        servers_ready = asyncio.gather(*server_futures)
 
-        process_ended = asyncio.ensure_future(watch_process())
         # wait for process to crash or servers to be ready,
         # whichever comes first
-        # don't need to handle exceptions here,
-        # as the underlying exceptions have informative error messages
-        await asyncio.wait_for(asyncio.gather(process_ended, servers_ready), timeout=15)
+        wait_timeout = 15
+        ready, pending = await asyncio.wait(
+            [
+                process_exited,
+                servers_ready,
+            ],
+            return_when=asyncio.FIRST_COMPLETED,
+            timeout=wait_timeout,
+        )
+        for task in [servers_ready, process_exited] + server_futures:
+            # cancel any pending tasks
+            if not task.done():
+                task.cancel()
+        if not ready:
+            # timeouts passed to wait_up should prevent this,
+            # but weird things like DNS delays may result in
+            # wait_up taking a lot longer than it should
+            raise TimeoutError(
+                f"Waiting for proxy endpoints didn't complete in {wait_timeout}s"
+            )
+        if process_exited in ready:
+            # process exited, this will raise RuntimeError
+            await process_exited
+        else:
+            # if we got here, servers_ready is done
+            # await it to make sure exceptions are raised
+            await servers_ready
+
         self.log.debug("Proxy started and appears to be up")
         pc = PeriodicCallback(self.check_running, 1e3 * self.check_running_interval)
         self._check_running_callback = pc
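Note (editorial addition, not part of the patches above): the heart of PATCH 2/2 is racing a long-running process watcher against the endpoint checks with asyncio.wait(..., return_when=FIRST_COMPLETED), then cancelling whichever side lost and awaiting the winner so its exception propagates. Below is a minimal standalone sketch of that pattern; the names and delays (watch_process, wait_up, 0.1s/0.2s) are illustrative stand-ins, not JupyterHub code.

import asyncio


async def watch_process():
    # stand-in for wait_for_process(): poll the child process and raise if it dies;
    # here it simply never finishes, so the servers win the race
    while True:
        await asyncio.sleep(0.5)


async def wait_up(delay):
    # stand-in for server.wait_up(): resolves once the endpoint answers
    await asyncio.sleep(delay)


async def main():
    process_exited = asyncio.ensure_future(watch_process())
    server_futures = [asyncio.ensure_future(wait_up(d)) for d in (0.1, 0.2)]
    servers_ready = asyncio.gather(*server_futures)

    # race the watcher against readiness; whichever finishes first wins
    ready, pending = await asyncio.wait(
        [process_exited, servers_ready],
        return_when=asyncio.FIRST_COMPLETED,
        timeout=15,
    )
    # cancel anything still running so no task leaks
    for task in [process_exited, servers_ready, *server_futures]:
        if not task.done():
            task.cancel()
    if not ready:
        raise TimeoutError("neither the watcher nor the servers finished in time")
    # await the completed future(s) so any stored exception is re-raised
    for task in ready:
        await task
    print("servers ready")


asyncio.run(main())

The losing side is cancelled rather than awaited, which is why the patch documents that wait_for_process "should be cancelled when servers become ready".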