Merge pull request #4561 from minrk/better-wait-up

Improve debugging when waiting for servers
This commit is contained in:
Erik Sundell
2023-09-13 16:57:26 +02:00
committed by GitHub
2 changed files with 73 additions and 20 deletions

View File

@@ -48,7 +48,7 @@ from jupyterhub.traitlets import Command
from . import utils
from .metrics import CHECK_ROUTES_DURATION_SECONDS, PROXY_POLL_DURATION_SECONDS
from .objects import Server
from .utils import AnyTimeoutError, exponential_backoff, url_escape_path, url_path_join
from .utils import exponential_backoff, url_escape_path, url_path_join
def _one_at_a_time(method):
@@ -766,24 +766,67 @@ class ConfigurableHTTPProxy(Proxy):
self._write_pid_file()
def _check_process():
status = self.proxy_process.poll()
if status is not None:
with self.proxy_process:
e = RuntimeError("Proxy failed to start with exit code %i" % status)
raise e from None
async def wait_for_process():
"""Watch proxy process for early termination
Runs forever, checking every 0.5s if the process has exited
so we don't keep waiting for endpoints after the proxy has stopped.
Raises RuntimeError if/when the proxy process exits,
otherwise runs forever.
Should be cancelled when servers become ready.
"""
while True:
status = self.proxy_process.poll()
if status is not None:
with self.proxy_process:
e = RuntimeError(
f"Proxy failed to start with exit code {status}"
)
raise e from None
await asyncio.sleep(0.5)
# process_exited can only resolve with a RuntimeError when the process has exited,
# otherwise it must be cancelled.
process_exited = asyncio.ensure_future(wait_for_process())
# wait for both servers to be ready (or one server to fail)
server_futures = [
asyncio.ensure_future(server.wait_up(10))
for server in (public_server, api_server)
]
servers_ready = asyncio.gather(*server_futures)
# wait for process to crash or servers to be ready,
# whichever comes first
wait_timeout = 15
ready, pending = await asyncio.wait(
[
process_exited,
servers_ready,
],
return_when=asyncio.FIRST_COMPLETED,
timeout=wait_timeout,
)
for task in [servers_ready, process_exited] + server_futures:
# cancel any pending tasks
if not task.done():
task.cancel()
if not ready:
# timeouts passed to wait_up should prevent this,
# but weird things like DNS delays may result in
# wait_up taking a lot longer than it should
raise TimeoutError(
f"Waiting for proxy endpoints didn't complete in {wait_timeout}s"
)
if process_exited in ready:
# process exited, this will raise RuntimeError
await process_exited
else:
# if we got here, servers_ready is done
# await it to make sure exceptions are raised
await servers_ready
for server in (public_server, api_server):
for i in range(10):
_check_process()
try:
await server.wait_up(1)
except AnyTimeoutError:
continue
else:
break
await server.wait_up(1)
_check_process()
self.log.debug("Proxy started and appears to be up")
pc = PeriodicCallback(self.check_running, 1e3 * self.check_running_interval)
self._check_running_callback = pc

View File

@@ -15,6 +15,7 @@ import ssl
import string
import sys
import threading
import time
import uuid
import warnings
from binascii import b2a_hex
@@ -93,6 +94,8 @@ def can_connect(ip, port):
except OSError as e:
if e.errno not in {errno.ECONNREFUSED, errno.ETIMEDOUT}:
app_log.error("Unexpected error connecting to %s:%i %s", ip, port, e)
else:
app_log.debug("Server at %s:%i not ready: %s", ip, port, e)
return False
else:
return True
@@ -245,6 +248,8 @@ async def wait_for_server(ip, port, timeout=10):
"""Wait for any server to show up at ip:port."""
if ip in {'', '0.0.0.0', '::'}:
ip = '127.0.0.1'
app_log.debug("Waiting %ss for server at %s:%s", timeout, ip, port)
tic = time.perf_counter()
await exponential_backoff(
lambda: can_connect(ip, port),
"Server at {ip}:{port} didn't respond in {timeout} seconds".format(
@@ -252,6 +257,8 @@ async def wait_for_server(ip, port, timeout=10):
),
timeout=timeout,
)
toc = time.perf_counter()
app_log.debug("Server at %s:%s responded in %.2fs", ip, port, toc - tic)
async def wait_for_http_server(url, timeout=10, ssl_context=None):
@@ -259,12 +266,13 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None):
Any non-5XX response code will do, even 404.
"""
loop = ioloop.IOLoop.current()
tic = loop.time()
client = AsyncHTTPClient()
if ssl_context:
client.ssl_options = ssl_context
app_log.debug("Waiting %ss for server at %s", timeout, url)
tic = time.perf_counter()
async def is_reachable():
try:
r = await client.fetch(url, follow_redirects=False)
@@ -297,6 +305,8 @@ async def wait_for_http_server(url, timeout=10, ssl_context=None):
),
timeout=timeout,
)
toc = time.perf_counter()
app_log.debug("Server at %s responded in %.2fs", url, toc - tic)
return re