Suggest retry timing when we throttle server starts

Fixes #1706
2025-10-15 14:03:02 +00:00 · 2018-03-08 02:50:53 -08:00
parent 6a47123ec9
commit 8898faa141
2 changed files with 43 additions and 7 deletions
--- a/jupyterhub/app.py
+++ b/jupyterhub/app.py
@@ -616,6 +616,31 @@ class JupyterHub(Application):
        """
    ).tag(config=True)

+    throttle_retry_suggest_min = Integer(
+        30,
+        help="""
+        Minimum seconds after which we suggest the user retry spawning.
+
+        When `concurrent_spawn_limit` is exceeded, we recommend users retry
+        after a random period of time, bounded by throttle_retry_suggest_min
+        and throttle_retry_suggest_max.
+
+        throttle_retry_suggest_min should ideally be set to the median
+        spawn time of servers in your installation.
+        """
+    ).tag(config=True)
+
+    throttle_retry_suggest_max = Integer(
+        60,
+        help="""
+        Maximum seconds after which we suggest the user retry spawning.
+
+        When `concurrent_spawn_limit` is exceeded, we recommend users retry
+        after a random period of time, bounded by throttle_retry_suggest_min
+        and throttle_retry_suggest_max.
+        """
+    ).tag(config=True)
+
    active_server_limit = Integer(
        0,
        help="""
@@ -1423,6 +1448,8 @@ class JupyterHub(Application):
            allow_named_servers=self.allow_named_servers,
            oauth_provider=self.oauth_provider,
            concurrent_spawn_limit=self.concurrent_spawn_limit,
+            throttle_retry_suggest_min=self.throttle_retry_suggest_min,
+            throttle_retry_suggest_max=self.throttle_retry_suggest_max,
            active_server_limit=self.active_server_limit,
        )
        # allow configured settings to have priority
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -10,6 +10,7 @@ from datetime import datetime, timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
 import uuid
+import random

 from jinja2 import TemplateNotFound

@@ -526,17 +527,25 @@ class BaseHandler(RequestHandler):
        active_server_limit = self.active_server_limit

        if concurrent_spawn_limit and spawn_pending_count >= concurrent_spawn_limit:
-            self.log.info(
-                '%s pending spawns, throttling',
-                spawn_pending_count,
-            )
            SERVER_SPAWN_DURATION_SECONDS.labels(
                status=ServerSpawnStatus.throttled
            ).observe(time.perf_counter() - spawn_start_time)
-            raise web.HTTPError(
-                429,
-                "User startup rate limit exceeded. Try again in a few minutes.",
+            # Suggest the number of seconds the client should wait before
+            # retrying. Picking it at random between the configured bounds helps
+            # prevent thundering herd problems when we are overloaded.
+            retry_time = int(random.uniform(
+                self.settings['throttle_retry_suggest_min'],
+                self.settings['throttle_retry_suggest_max']
+            ))
+            self.set_header('Retry-After', str(retry_time))
+            self.log.info(
+                '%s pending spawns, throttling. Retry in %s seconds',
+                spawn_pending_count, retry_time
            )
+            self.set_status(429, "Too many users trying to log in right now. Try again in {}s".format(retry_time))
+            # Use set_status and then raise web.Finish: raising web.HTTPError would reset the headers we want to send.
+            raise web.Finish()
+
        if active_server_limit and active_count >= active_server_limit:
            self.log.info(
                '%s servers active, no space available',