Add metric recording spawn durations

Try to hit every possible exit point from the spawn_single_user
method, recording each with an appropriate status label.

The default histogram buckets are meant for request latencies,
but spawning usually takes longer, so we use custom buckets.
yuvipanda
2017-12-10 14:54:34 -08:00
parent 339758ec42
commit 6594e88390
2 changed files with 35 additions and 0 deletions
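
The instrumentation in the diff follows the usual prometheus_client pattern: take a start timestamp with time.perf_counter() and, on every exit path, call .labels(status=...).observe(elapsed) on a Histogram that carries a single 'status' label. A minimal standalone sketch of that pattern (not part of the diff; the spawn_server function and its failure modes are invented for illustration):

import time

from prometheus_client import Histogram

# Same shape as the metric defined in the second changed file: one 'status'
# label, and wider buckets than the prometheus_client defaults because
# spawns are slow.
SPAWN_DURATION_SECONDS = Histogram(
    'spawn_duration_seconds',
    'spawn duration for all server spawns',
    ['status'],
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")],
)

def spawn_server(pending=False, fail=False):
    """Hypothetical spawn routine: every exit path records its own status."""
    start = time.perf_counter()
    if pending:
        # early exit: a spawn is already pending for this server
        SPAWN_DURATION_SECONDS.labels(status='already-pending').observe(
            time.perf_counter() - start)
        raise RuntimeError("spawn already pending")
    try:
        time.sleep(0.1)  # stand-in for the real spawn work
        if fail:
            raise RuntimeError("spawner exited")
    except RuntimeError:
        SPAWN_DURATION_SECONDS.labels(status='failed').observe(
            time.perf_counter() - start)
        raise
    SPAWN_DURATION_SECONDS.labels(status='success').observe(
        time.perf_counter() - start)

The commit does the same thing inline in spawn_single_user, with one status value per exit point: already-pending, throttled, too-many-users, success, and failed.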

jupyterhub/handlers/base.py

@@ -5,6 +5,7 @@
import copy
import re
import time
from datetime import timedelta
from http.client import responses
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,7 @@ from .. import orm
from ..objects import Server
from ..spawner import LocalProcessSpawner
from ..utils import url_path_join
from ..metrics import SPAWN_DURATION_SECONDS
# pattern for the authentication token header
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +390,7 @@ class BaseHandler(RequestHandler):
    @gen.coroutine
    def spawn_single_user(self, user, server_name='', options=None):
        # in case of error, include 'try again from /hub/home' message
        spawn_starttime = time.perf_counter()
        self.extra_error_html = self.spawn_home_error
        user_server_name = user.name
@@ -397,6 +400,11 @@ class BaseHandler(RequestHandler):
        if server_name in user.spawners and user.spawners[server_name].pending:
            pending = user.spawners[server_name].pending
            SPAWN_DURATION_SECONDS.labels(
                status='already-pending'
            ).observe(
                time.perf_counter() - spawn_starttime
            )
            raise RuntimeError("%s pending %s" % (user_server_name, pending))
        # count active servers and pending spawns
@@ -415,6 +423,11 @@ class BaseHandler(RequestHandler):
                '%s pending spawns, throttling',
                spawn_pending_count,
            )
            SPAWN_DURATION_SECONDS.labels(
                status='throttled'
            ).observe(
                time.perf_counter() - spawn_starttime
            )
            raise web.HTTPError(
                429,
                "User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +437,11 @@ class BaseHandler(RequestHandler):
                '%s servers active, no space available',
                active_count,
            )
            SPAWN_DURATION_SECONDS.labels(
                status='too-many-users'
            ).observe(
                time.perf_counter() - spawn_starttime
            )
            raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
        tic = IOLoop.current().time()
@@ -456,6 +474,11 @@ class BaseHandler(RequestHandler):
            toc = IOLoop.current().time()
            self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
            self.statsd.timing('spawner.success', (toc - tic) * 1000)
            SPAWN_DURATION_SECONDS.labels(
                status='success'
            ).observe(
                time.perf_counter() - spawn_starttime
            )
            spawner._proxy_pending = True
            try:
                yield self.proxy.add_user(user, server_name)
@@ -499,6 +522,11 @@ class BaseHandler(RequestHandler):
            if status is not None:
                toc = IOLoop.current().time()
                self.statsd.timing('spawner.failure', (toc - tic) * 1000)
                SPAWN_DURATION_SECONDS.labels(
                    status='failed'
                ).observe(
                    time.perf_counter() - spawn_starttime
                )
                raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                    status, spawner._log_name))

jupyterhub/metrics.py

@@ -9,6 +9,13 @@ REQUEST_DURATION_SECONDS = Histogram(
    ['method', 'handler', 'code']
)
SPAWN_DURATION_SECONDS = Histogram(
    'spawn_duration_seconds',
    'spawn duration for all server spawns',
    ['status'],
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
)
def prometheus_log_method(handler):
"""
Tornado log handler for recording RED metrics
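
As a quick check of what the new metric exposes (illustrative only, not from the commit: the isolated registry and the observed values are made up), each status label gets its own set of cumulative _bucket series plus _sum and _count in the Prometheus text exposition format, which is what makes per-status spawn-latency quantiles possible:

from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()  # isolated registry just for this demo
demo = Histogram(
    'spawn_duration_seconds',
    'spawn duration for all server spawns',
    ['status'],
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")],
    registry=registry,
)
demo.labels(status='success').observe(3.2)  # counted in the le="5.0" bucket and above
demo.labels(status='failed').observe(42.0)  # counted in the le="60.0" bucket and above
print(generate_latest(registry).decode())   # text exposition: one series per (status, le)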