Merge pull request #1223 from minrk/wait-up-fall-off

apply exponential backoff to all waits
2025-10-17 15:03:02 +00:00 · 2017-07-17 09:15:48 -07:00
parent 0c5a9e8347 efa6a33b0a
commit 1bafdf9130
2 changed files with 24 additions and 10 deletions
--- a/jupyterhub/spawner.py
+++ b/jupyterhub/spawner.py
@@ -16,7 +16,7 @@ from subprocess import Popen
 from tempfile import mkdtemp

 from tornado import gen
-from tornado.ioloop import PeriodicCallback
+from tornado.ioloop import PeriodicCallback, IOLoop

 from traitlets.config import LoggingConfigurable
 from traitlets import (
@@ -25,7 +25,7 @@ from traitlets import (
 )

 from .traitlets import Command, ByteSpecification
-from .utils import random_port, url_path_join
+from .utils import random_port, url_path_join, DT_MIN, DT_MAX, DT_SCALE


 class Spawner(LoggingConfigurable):
@@ -628,17 +628,21 @@ class Spawner(LoggingConfigurable):
                self.log.exception("Unhandled error in poll callback for %s", self)
        return status

-    death_interval = Float(0.1)
+    death_interval = Float(DT_MIN)

    @gen.coroutine
    def wait_for_death(self, timeout=10):
        """Wait for the single-user server to die, up to timeout seconds"""
-        for i in range(int(timeout / self.death_interval)):
+        loop = IOLoop.current()
+        tic = loop.time()
+        dt = self.death_interval
+        while dt > 0:
            status = yield self.poll()
            if status is not None:
                break
            else:
-                yield gen.sleep(self.death_interval)
+                yield gen.sleep(dt)
+            dt = min(dt * DT_SCALE, DT_MAX, timeout - (loop.time() - tic))


 def _try_setcwd(path):
--- a/jupyterhub/utils.py
+++ b/jupyterhub/utils.py
@@ -48,6 +48,12 @@ def can_connect(ip, port):
    else:
        return True

+# exponential backoff parameters for polling loops:
+# first wait is DT_MIN (100ms), each later wait is DT_SCALE (2x) longer,
+# and no single wait exceeds DT_MAX (5s)
+DT_MIN = 0.1
+DT_SCALE = 2
+DT_MAX = 5

 @gen.coroutine
 def wait_for_server(ip, port, timeout=10):
@@ -56,11 +62,13 @@ def wait_for_server(ip, port, timeout=10):
        ip = '127.0.0.1'
    loop = ioloop.IOLoop.current()
    tic = loop.time()
-    while loop.time() - tic < timeout:
+    dt = DT_MIN
+    while dt > 0:
        if can_connect(ip, port):
            return
        else:
-            yield gen.sleep(0.1)
+            yield gen.sleep(dt)
+        dt = min(dt * DT_SCALE, DT_MAX, timeout - (loop.time() - tic))
    raise TimeoutError(
        "Server at {ip}:{port} didn't respond in {timeout} seconds".format(**locals())
    )
@@ -75,7 +83,8 @@ def wait_for_http_server(url, timeout=10):
    loop = ioloop.IOLoop.current()
    tic = loop.time()
    client = AsyncHTTPClient()
-    while loop.time() - tic < timeout:
+    dt = DT_MIN
+    while dt > 0:
        try:
            r = yield client.fetch(url, follow_redirects=False)
        except HTTPError as e:
@@ -86,16 +95,17 @@ def wait_for_http_server(url, timeout=10):
                    # but 502 or other proxy error is conceivable
                    app_log.warning(
                        "Server at %s responded with error: %s", url, e.code)
-                yield gen.sleep(0.1)
+                yield gen.sleep(dt)
            else:
                app_log.debug("Server at %s responded with %s", url, e.code)
                return e.response
        except (OSError, socket.error) as e:
            if e.errno not in {errno.ECONNABORTED, errno.ECONNREFUSED, errno.ECONNRESET}:
                app_log.warning("Failed to connect to %s (%s)", url, e)
-            yield gen.sleep(0.1)
+            yield gen.sleep(dt)
        else:
            return r
+        dt = min(dt * DT_SCALE, DT_MAX, timeout - (loop.time() - tic))

    raise TimeoutError(
        "Server at {url} didn't respond in {timeout} seconds".format(**locals())