From 8898faa141fdce8948a11a1ac0d65c175f94e4e8 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Thu, 8 Mar 2018 02:50:53 -0800
Subject: [PATCH] Suggest retry timing when we throttle server starts

When concurrent_spawn_limit is exceeded, respond with a 429 that carries
a Retry-After header suggesting a randomized wait, bounded by two new
options (throttle_retry_suggest_min and throttle_retry_suggest_max), so
throttled users do not all retry at the same moment.

Fixes #1706
---
 jupyterhub/app.py           | 27 +++++++++++++++++++++++++++
 jupyterhub/handlers/base.py | 23 ++++++++++++++++-------
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/jupyterhub/app.py b/jupyterhub/app.py
index 29e23ef0..424de282 100644
--- a/jupyterhub/app.py
+++ b/jupyterhub/app.py
@@ -616,6 +616,31 @@ class JupyterHub(Application):
         """
     ).tag(config=True)
 
+    throttle_retry_suggest_min = Integer(
+        30,
+        help="""
+        Minimum number of seconds we suggest a user wait before retrying a throttled spawn.
+
+        When `concurrent_spawn_limit` is exceeded, we recommend users retry
+        after a random period of time, bounded by throttle_retry_suggest_min
+        and throttle_retry_suggest_max.
+
+        throttle_retry_suggest_min should ideally be set to the median
+        spawn time of servers in your installation.
+        """
+    ).tag(config=True)
+
+    throttle_retry_suggest_max = Integer(
+        60,
+        help="""
+        Maximum number of seconds we suggest a user wait before retrying a throttled spawn.
+
+        When `concurrent_spawn_limit` is exceeded, we recommend users retry
+        after a random period of time, bounded by throttle_retry_suggest_min
+        and throttle_retry_suggest_max.
+        """
+    ).tag(config=True)
+
     active_server_limit = Integer(
         0,
         help="""
@@ -1423,6 +1448,8 @@ class JupyterHub(Application):
             allow_named_servers=self.allow_named_servers,
             oauth_provider=self.oauth_provider,
             concurrent_spawn_limit=self.concurrent_spawn_limit,
+            throttle_retry_suggest_min=self.throttle_retry_suggest_min,
+            throttle_retry_suggest_max=self.throttle_retry_suggest_max,
             active_server_limit=self.active_server_limit,
         )
         # allow configured settings to have priority
diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index 082c1ad5..3aaf0878 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -10,6 +10,7 @@
 from datetime import datetime, timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
 import uuid
+import random
 
 from jinja2 import TemplateNotFound
@@ -526,17 +527,25 @@ class BaseHandler(RequestHandler):
         active_server_limit = self.active_server_limit
 
         if concurrent_spawn_limit and spawn_pending_count >= concurrent_spawn_limit:
-            self.log.info(
-                '%s pending spawns, throttling',
-                spawn_pending_count,
-            )
             SERVER_SPAWN_DURATION_SECONDS.labels(
                 status=ServerSpawnStatus.throttled
             ).observe(time.perf_counter() - spawn_start_time)
-            raise web.HTTPError(
-                429,
-                "User startup rate limit exceeded. Try again in a few minutes.",
+            # Suggest a number of seconds the client should wait before retrying.
+            # This helps prevent thundering-herd problems, where users all
+            # retry immediately while we are overloaded.
+            retry_time = int(random.uniform(
+                self.settings['throttle_retry_suggest_min'],
+                self.settings['throttle_retry_suggest_max']
+            ))
+            self.set_header('Retry-After', str(retry_time))
+            self.log.info(
+                '%s pending spawns, throttling. Retry in %s seconds',
+                spawn_pending_count, retry_time
             )
+            self.set_status(429, "Too many users are starting their servers right now. Try again in {} seconds.".format(retry_time))
+            # Use set_status and raise web.Finish here, since raising web.HTTPError resets any headers we want to send.
+            raise web.Finish()
+
         if active_server_limit and active_count >= active_server_limit:
             self.log.info(
                 '%s servers active, no space available',
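
Usage note (illustrative only, not part of the patch): with the two new options
tagged config=True, a deployment can tune them in jupyterhub_config.py, e.g.
c.JupyterHub.throttle_retry_suggest_min = 30 and
c.JupyterHub.throttle_retry_suggest_max = 120. The sketch below shows how an
API client might honor the Retry-After header the Hub now sends with 429
responses. The endpoint is JupyterHub's standard REST call for starting a
user's server; the helper name, token handling, and retry policy are
assumptions made for this example.

    import time
    import requests  # third-party HTTP client, assumed available

    def start_server_with_backoff(hub_url, api_token, username, max_attempts=5):
        """Start a user's server, sleeping for the suggested Retry-After on 429."""
        url = '{}/hub/api/users/{}/server'.format(hub_url.rstrip('/'), username)
        headers = {'Authorization': 'token {}'.format(api_token)}
        for attempt in range(max_attempts):
            r = requests.post(url, headers=headers)
            if r.status_code != 429:
                r.raise_for_status()
                return r
            # The Hub suggests a randomized wait to spread out retries;
            # fall back to 30 seconds if the header is ever missing.
            wait = int(r.headers.get('Retry-After', 30))
            time.sleep(wait)
        raise RuntimeError('spawn still throttled after {} attempts'.format(max_attempts))

Because retry_time is drawn uniformly from [min, max] per request, clients that
simply sleep for the suggested value naturally spread their retries out instead
of hammering the Hub in lockstep.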