From 8898faa141fdce8948a11a1ac0d65c175f94e4e8 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Thu, 8 Mar 2018 02:50:53 -0800
Subject: [PATCH] Suggest retry timing when we throttle server starts

When concurrent_spawn_limit is exceeded, respond with a 429 that carries
a Retry-After header suggesting a randomized wait, bounded by two new
options (throttle_retry_suggest_min and throttle_retry_suggest_max), so
throttled users do not all retry at the same moment.

Fixes #1706
---
 jupyterhub/app.py           | 27 +++++++++++++++++++++++++++
 jupyterhub/handlers/base.py | 23 ++++++++++++++++-------
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/jupyterhub/app.py b/jupyterhub/app.py
index 29e23ef0..424de282 100644
--- a/jupyterhub/app.py
+++ b/jupyterhub/app.py
@@ -616,6 +616,31 @@ class JupyterHub(Application):
         """
     ).tag(config=True)
 
+    throttle_retry_suggest_min = Integer(
+        30,
+        help="""
+        Minimum number of seconds we suggest a user wait before retrying a throttled spawn.
+
+        When `concurrent_spawn_limit` is exceeded, we recommend users retry
+        after a random period of time, bounded by throttle_retry_suggest_min
+        and throttle_retry_suggest_max.
+
+        throttle_retry_suggest_min should ideally be set to the median
+        spawn time of servers in your installation.
+        """
+    ).tag(config=True)
+
+    throttle_retry_suggest_max = Integer(
+        60,
+        help="""
+        Maximum number of seconds we suggest a user wait before retrying a throttled spawn.
+
+        When `concurrent_spawn_limit` is exceeded, we recommend users retry
+        after a random period of time, bounded by throttle_retry_suggest_min
+        and throttle_retry_suggest_max.
+        """
+    ).tag(config=True)
+
     active_server_limit = Integer(
         0,
         help="""
@@ -1423,6 +1448,8 @@ class JupyterHub(Application):
             allow_named_servers=self.allow_named_servers,
             oauth_provider=self.oauth_provider,
             concurrent_spawn_limit=self.concurrent_spawn_limit,
+            throttle_retry_suggest_min=self.throttle_retry_suggest_min,
+            throttle_retry_suggest_max=self.throttle_retry_suggest_max,
             active_server_limit=self.active_server_limit,
         )
         # allow configured settings to have priority
diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index 082c1ad5..3aaf0878 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -10,6 +10,7 @@
 from datetime import datetime, timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
 import uuid
+import random
 
 from jinja2 import TemplateNotFound
@@ -526,17 +527,25 @@ class BaseHandler(RequestHandler):
         active_server_limit = self.active_server_limit
 
         if concurrent_spawn_limit and spawn_pending_count >= concurrent_spawn_limit:
-            self.log.info(
-                '%s pending spawns, throttling',
-                spawn_pending_count,
-            )
             SERVER_SPAWN_DURATION_SECONDS.labels(
                 status=ServerSpawnStatus.throttled
             ).observe(time.perf_counter() - spawn_start_time)
-            raise web.HTTPError(
-                429,
-                "User startup rate limit exceeded. Try again in a few minutes.",
+            # Suggest a number of seconds the client should wait before retrying.
+            # This helps prevent thundering-herd problems, where users all
+            # retry immediately while we are overloaded.
+            retry_time = int(random.uniform(
+                self.settings['throttle_retry_suggest_min'],
+                self.settings['throttle_retry_suggest_max']
+            ))
+            self.set_header('Retry-After', str(retry_time))
+            self.log.info(
+                '%s pending spawns, throttling. Retry in %s seconds',
+                spawn_pending_count, retry_time
             )
+            self.set_status(429, "Too many users are starting their servers right now. Try again in {} seconds.".format(retry_time))
+            # Use set_status and raise web.Finish here, since raising web.HTTPError resets any headers we want to send.
+            raise web.Finish()
+
         if active_server_limit and active_count >= active_server_limit:
             self.log.info(
                 '%s servers active, no space available',
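
Usage note (illustrative only, not part of the patch): with the two new options
tagged config=True, a deployment can tune them in jupyterhub_config.py, e.g.
c.JupyterHub.throttle_retry_suggest_min = 30 and
c.JupyterHub.throttle_retry_suggest_max = 120. The sketch below shows how an
API client might honor the Retry-After header the Hub now sends with 429
responses. The endpoint is JupyterHub's standard REST call for starting a
user's server; the helper name, token handling, and retry policy are
assumptions made for this example.

    import time
    import requests  # third-party HTTP client, assumed available

    def start_server_with_backoff(hub_url, api_token, username, max_attempts=5):
        """Start a user's server, sleeping for the suggested Retry-After on 429."""
        url = '{}/hub/api/users/{}/server'.format(hub_url.rstrip('/'), username)
        headers = {'Authorization': 'token {}'.format(api_token)}
        for attempt in range(max_attempts):
            r = requests.post(url, headers=headers)
            if r.status_code != 429:
                r.raise_for_status()
                return r
            # The Hub suggests a randomized wait to spread out retries;
            # fall back to 30 seconds if the header is ever missing.
            wait = int(r.headers.get('Retry-After', 30))
            time.sleep(wait)
        raise RuntimeError('spawn still throttled after {} attempts'.format(max_attempts))

Because retry_time is drawn uniformly from [min, max] per request, clients that
simply sleep for the suggested value naturally spread their retries out instead
of hammering the Hub in lockstep.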