From 9f8033a147270877b0c0ef095fe890ea6a0c4b3b Mon Sep 17 00:00:00 2001 From: yuvipanda Date: Tue, 25 Jul 2017 18:13:59 -0700 Subject: [PATCH 1/5] Move exponential backoff into a function Also use the 'Full Jitter' jitter algorithm from https://www.awsarchitectureblog.com/2015/03/backoff.html --- jupyterhub/handlers/base.py | 4 +- jupyterhub/spawner.py | 22 +++++---- jupyterhub/utils.py | 89 ++++++++++++++++++++++++------------- 3 files changed, 71 insertions(+), 44 deletions(-) diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py index 2d1943ca..0e2be809 100644 --- a/jupyterhub/handlers/base.py +++ b/jupyterhub/handlers/base.py @@ -20,7 +20,7 @@ from .. import __version__ from .. import orm from ..objects import Server from ..spawner import LocalProcessSpawner -from ..utils import url_path_join, DT_SCALE +from ..utils import url_path_join, exponential_backoff # pattern for the authentication token header auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE) @@ -597,7 +597,7 @@ class UserSpawnHandler(BaseHandler): # record redirect count in query parameter if redirects: self.log.warning("Redirect loop detected on %s", self.request.uri) - yield gen.sleep(min(1 * (DT_SCALE ** redirects), 10)) + yield gen.sleep(min(1 * (2 ** redirects), 10)) # rewrite target url with new `redirects` query value url_parts = urlparse(target) query_parts = parse_qs(url_parts.query) diff --git a/jupyterhub/spawner.py b/jupyterhub/spawner.py index 0abada4b..d2073b18 100644 --- a/jupyterhub/spawner.py +++ b/jupyterhub/spawner.py @@ -26,7 +26,7 @@ from traitlets import ( from .objects import Server from .traitlets import Command, ByteSpecification -from .utils import random_port, url_path_join, DT_MIN, DT_MAX, DT_SCALE +from .utils import random_port, url_path_join, exponential_backoff class Spawner(LoggingConfigurable): @@ -666,21 +666,19 @@ class Spawner(LoggingConfigurable): self.log.exception("Unhandled error in poll callback for %s", self) return status - death_interval = Float(DT_MIN) - @gen.coroutine def wait_for_death(self, timeout=10): """Wait for the single-user server to die, up to timeout seconds""" - loop = IOLoop.current() - tic = loop.time() - dt = self.death_interval - while dt > 0: + @gen.coroutine + def _wait_for_death(): status = yield self.poll() - if status is not None: - break - else: - yield gen.sleep(dt) - dt = min(dt * DT_SCALE, DT_MAX, timeout - (loop.time() - tic)) + return status is not None + + yield exponential_backoff( + _wait_for_death, + 'Process did not die in {timeout} seconds'.format(timeout=timeout), + timeout=timeout + ) def _try_setcwd(path): diff --git a/jupyterhub/utils.py b/jupyterhub/utils.py index 38ece304..40cab13b 100644 --- a/jupyterhub/utils.py +++ b/jupyterhub/utils.py @@ -4,6 +4,7 @@ # Distributed under the terms of the Modified BSD License. from binascii import b2a_hex +import random import errno import hashlib from hmac import compare_digest @@ -48,29 +49,62 @@ def can_connect(ip, port): else: return True -# exponential falloff factors: -# start at 100ms, falloff by 2x -# never longer than 5s -DT_MIN = 0.1 -DT_SCALE = 2 -DT_MAX = 5 +@gen.coroutine +def exponential_backoff( + pass_func, + fail_message, + start_wait=0.1, + scale_factor=2, + max_wait=5, + timeout=10, + *args, **kwargs): + """ + Exponentially backoff until pass_func is true. + + This function will wait with exponential backoff + random jitter for as + many iterations as needed, with maximum timeout timeout. If pass_func is + still returning false at the end of timeout, a TimeoutError will be raised. + + It'll start waiting at start_wait, scaling up by continuously multiplying itself + by scale_factor until pass_func returns true. It'll never wait for more than + max_wait seconds per iteration. + + *args and **kwargs are passed to pass_func. pass_func maybe a future, although + that is not entirely recommended. + + It'll return the value of pass_func when it's truthy! + """ + loop = ioloop.IOLoop.current() + start_tic = loop.time() + dt = start_wait + while True: + if (loop.time() - start_tic) > timeout: + # We time out! + break + ret = yield gen.maybe_future(pass_func(*args, **kwargs)) + # Truthy! + if ret: + return ret + else: + yield gen.sleep(dt) + # Add some random jitter to improve performance + # This makes sure that we don't overload any single iteration + # of the tornado loop with too many things + # See https://www.awsarchitectureblog.com/2015/03/backoff.html + # for a good example of why and how this helps. We're using their + # full Jitter implementation equivalent. + dt = min(max_wait, random.uniform(0, dt * scale_factor)) + raise TimeoutError(fail_message) + @gen.coroutine def wait_for_server(ip, port, timeout=10): """Wait for any server to show up at ip:port.""" if ip in {'', '0.0.0.0'}: ip = '127.0.0.1' - loop = ioloop.IOLoop.current() - tic = loop.time() - dt = DT_MIN - while dt > 0: - if can_connect(ip, port): - return - else: - yield gen.sleep(dt) - dt = min(dt * DT_SCALE, DT_MAX, timeout - (loop.time() - tic)) - raise TimeoutError( - "Server at {ip}:{port} didn't respond in {timeout} seconds".format(**locals()) + yield exponential_backoff( + lambda: can_connect(ip, port), + "Server at {ip}:{port} didn't respond in {timeout} seconds".format(ip=ip, port=port, timeout=timeout) ) @@ -80,13 +114,12 @@ def wait_for_http_server(url, timeout=10): Any non-5XX response code will do, even 404. """ - loop = ioloop.IOLoop.current() - tic = loop.time() client = AsyncHTTPClient() - dt = DT_MIN - while dt > 0: + @gen.coroutine + def is_reachable(): try: r = yield client.fetch(url, follow_redirects=False) + return r except HTTPError as e: if e.code >= 500: # failed to respond properly, wait and try again @@ -95,25 +128,21 @@ def wait_for_http_server(url, timeout=10): # but 502 or other proxy error is conceivable app_log.warning( "Server at %s responded with error: %s", url, e.code) - yield gen.sleep(dt) else: app_log.debug("Server at %s responded with %s", url, e.code) return e.response except (OSError, socket.error) as e: if e.errno not in {errno.ECONNABORTED, errno.ECONNREFUSED, errno.ECONNRESET}: app_log.warning("Failed to connect to %s (%s)", url, e) - yield gen.sleep(dt) - else: - return r - dt = min(dt * DT_SCALE, DT_MAX, timeout - (loop.time() - tic)) - - raise TimeoutError( - "Server at {url} didn't respond in {timeout} seconds".format(**locals()) + return False + re = yield exponential_backoff( + is_reachable, + "Server at {url} didn't respond in {timeout} seconds".format(url=url, timeout=timeout) ) + return re # Decorators for authenticated Handlers - def auth_decorator(check_auth): """Make an authentication decorator. From 0ddf6bf57947f35dae7760b944bf28c0f7115153 Mon Sep 17 00:00:00 2001 From: Min RK Date: Wed, 26 Jul 2017 11:11:53 +0200 Subject: [PATCH 2/5] restore wait_for_death - don't raise TimeoutError - keep Spawner.death_interval for subclasses --- jupyterhub/spawner.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/jupyterhub/spawner.py b/jupyterhub/spawner.py index d2073b18..e22d9acd 100644 --- a/jupyterhub/spawner.py +++ b/jupyterhub/spawner.py @@ -666,6 +666,7 @@ class Spawner(LoggingConfigurable): self.log.exception("Unhandled error in poll callback for %s", self) return status + death_interval = Float(0.1) @gen.coroutine def wait_for_death(self, timeout=10): """Wait for the single-user server to die, up to timeout seconds""" @@ -673,12 +674,16 @@ class Spawner(LoggingConfigurable): def _wait_for_death(): status = yield self.poll() return status is not None - - yield exponential_backoff( - _wait_for_death, - 'Process did not die in {timeout} seconds'.format(timeout=timeout), - timeout=timeout - ) + try: + r = yield exponential_backoff( + _wait_for_death, + 'Process did not die in {timeout} seconds'.format(timeout=timeout), + start_wait=self.death_interval, + timeout=timeout, + ) + return r + except TimeoutError: + return False def _try_setcwd(path): From 6db987972ab0985a71c339c4b85fd17d9871b9cd Mon Sep 17 00:00:00 2001 From: Min RK Date: Wed, 26 Jul 2017 11:14:51 +0200 Subject: [PATCH 3/5] exponential backoff - apply jitter to first iteration - due to jitter, double start_wait to 0.2 so that is still 0.1 - keep scaling by start_wait, rather than previous dt - limit last wait to deadline so timeout is not overshot --- jupyterhub/utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/jupyterhub/utils.py b/jupyterhub/utils.py index 40cab13b..1a9a9c71 100644 --- a/jupyterhub/utils.py +++ b/jupyterhub/utils.py @@ -53,7 +53,7 @@ def can_connect(ip, port): def exponential_backoff( pass_func, fail_message, - start_wait=0.1, + start_wait=0.2, scale_factor=2, max_wait=5, timeout=10, @@ -75,25 +75,26 @@ def exponential_backoff( It'll return the value of pass_func when it's truthy! """ loop = ioloop.IOLoop.current() - start_tic = loop.time() - dt = start_wait + deadline = loop.time() + timeout + scale = 1 while True: - if (loop.time() - start_tic) > timeout: - # We time out! - break ret = yield gen.maybe_future(pass_func(*args, **kwargs)) # Truthy! if ret: return ret - else: - yield gen.sleep(dt) + remaining = deadline - loop.time() + if remaining < 0: + # timeout exceeded + break # Add some random jitter to improve performance # This makes sure that we don't overload any single iteration # of the tornado loop with too many things # See https://www.awsarchitectureblog.com/2015/03/backoff.html # for a good example of why and how this helps. We're using their # full Jitter implementation equivalent. - dt = min(max_wait, random.uniform(0, dt * scale_factor)) + dt = min(max_wait, remaining, random.uniform(0, start_wait * scale)) + scale *= scale_factor + yield gen.sleep(dt) raise TimeoutError(fail_message) From 8de38b1708c19d0d4dcdbd3c24ae0c9501d68122 Mon Sep 17 00:00:00 2001 From: Min RK Date: Wed, 26 Jul 2017 13:28:28 +0200 Subject: [PATCH 4/5] add some jitter to the deadline itself MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit so that we don't re-align a bunch of timing out calls once the deadline is reached. (±10% of timeout by default) --- jupyterhub/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/jupyterhub/utils.py b/jupyterhub/utils.py index 1a9a9c71..d4a0c07f 100644 --- a/jupyterhub/utils.py +++ b/jupyterhub/utils.py @@ -57,6 +57,7 @@ def exponential_backoff( scale_factor=2, max_wait=5, timeout=10, + timeout_tolerance=0.1, *args, **kwargs): """ Exponentially backoff until pass_func is true. @@ -76,6 +77,11 @@ def exponential_backoff( """ loop = ioloop.IOLoop.current() deadline = loop.time() + timeout + # add some jitter to the deadline itself, so that we don't + # re-align a bunch of timing out calls once the deadline is reached. + if timeout_tolerance: + tol = timeout_tolerance * timeout + deadline = random.uniform(deadline - tol, deadline + tol) scale = 1 while True: ret = yield gen.maybe_future(pass_func(*args, **kwargs)) From da10a8e7dd2c920c553d409f164e998db9843958 Mon Sep 17 00:00:00 2001 From: Carol Willing Date: Wed, 26 Jul 2017 12:38:21 -0700 Subject: [PATCH 5/5] Edit docstring and comments --- jupyterhub/handlers/base.py | 1 + jupyterhub/spawner.py | 1 + jupyterhub/utils.py | 67 +++++++++++++++++++++++++++---------- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py index 0e2be809..68a47e9c 100644 --- a/jupyterhub/handlers/base.py +++ b/jupyterhub/handlers/base.py @@ -597,6 +597,7 @@ class UserSpawnHandler(BaseHandler): # record redirect count in query parameter if redirects: self.log.warning("Redirect loop detected on %s", self.request.uri) + # add capped exponential backoff where cap is 10s yield gen.sleep(min(1 * (2 ** redirects), 10)) # rewrite target url with new `redirects` query value url_parts = urlparse(target) diff --git a/jupyterhub/spawner.py b/jupyterhub/spawner.py index e22d9acd..2ac5c8c8 100644 --- a/jupyterhub/spawner.py +++ b/jupyterhub/spawner.py @@ -674,6 +674,7 @@ class Spawner(LoggingConfigurable): def _wait_for_death(): status = yield self.poll() return status is not None + try: r = yield exponential_backoff( _wait_for_death, diff --git a/jupyterhub/utils.py b/jupyterhub/utils.py index d4a0c07f..fd833b0b 100644 --- a/jupyterhub/utils.py +++ b/jupyterhub/utils.py @@ -60,25 +60,59 @@ def exponential_backoff( timeout_tolerance=0.1, *args, **kwargs): """ - Exponentially backoff until pass_func is true. + Exponentially backoff until `pass_func` is true. - This function will wait with exponential backoff + random jitter for as - many iterations as needed, with maximum timeout timeout. If pass_func is - still returning false at the end of timeout, a TimeoutError will be raised. + The `pass_func` function will wait with **exponential backoff** and + **random jitter** for as many needed iterations of the Tornado loop, + until reaching maximum `timeout` or truthiness. If `pass_func` is still + returning false at `timeout`, a `TimeoutError` will be raised. - It'll start waiting at start_wait, scaling up by continuously multiplying itself - by scale_factor until pass_func returns true. It'll never wait for more than - max_wait seconds per iteration. + The first iteration will begin with a wait time of `start_wait` seconds. + Each subsequent iteration's wait time will scale up by continuously + multiplying itself by `scale_factor`. This continues for each iteration + until `pass_func` returns true or an iteration's wait time has reached + the `max_wait` seconds per iteration. - *args and **kwargs are passed to pass_func. pass_func maybe a future, although - that is not entirely recommended. + `pass_func` may be a future, although that is not entirely recommended. - It'll return the value of pass_func when it's truthy! + Parameters + ---------- + pass_func + function that is to be run + fail_message : str + message for a `TimeoutError` + start_wait : optional + initial wait time for the first iteration in seconds + scale_factor : optional + a multiplier to increase the wait time for each iteration + max_wait : optional + maximum wait time per iteration in seconds + timeout : optional + maximum time of total wait in seconds + timeout_tolerance : optional + a small multiplier used to add jitter to `timeout`'s deadline + *args, **kwargs + passed to `pass_func(*args, **kwargs)` + + Returns + ------- + value of `pass_func(*args, **kwargs)` + + Raises + ------ + TimeoutError + If `pass_func` is still false at the end of the `timeout` period. + + Notes + ----- + See https://www.awsarchitectureblog.com/2015/03/backoff.html + for information about the algorithm and examples. We're using their + full Jitter implementation equivalent. """ loop = ioloop.IOLoop.current() deadline = loop.time() + timeout - # add some jitter to the deadline itself, so that we don't - # re-align a bunch of timing out calls once the deadline is reached. + # add jitter to the deadline itself to prevent re-align of a bunch of + # timing out calls once the deadline is reached. if timeout_tolerance: tol = timeout_tolerance * timeout deadline = random.uniform(deadline - tol, deadline + tol) @@ -92,12 +126,9 @@ def exponential_backoff( if remaining < 0: # timeout exceeded break - # Add some random jitter to improve performance - # This makes sure that we don't overload any single iteration - # of the tornado loop with too many things - # See https://www.awsarchitectureblog.com/2015/03/backoff.html - # for a good example of why and how this helps. We're using their - # full Jitter implementation equivalent. + # add some random jitter to improve performance + # this prevents overloading any single tornado loop iteration with + # too many things dt = min(max_wait, remaining, random.uniform(0, start_wait * scale)) scale *= scale_factor yield gen.sleep(dt)