Make sure our metrics don't appear & disappear intermittently

Create all timeseries from the beginning, regardless of whether the events they track have happened yet. Also rename metric objects for consistency.
2025-10-17 23:13:00 +00:00 · 2017-12-10 21:23:32 -08:00
parent ea99c58da5
commit 3cd526c019
2 changed files with 67 additions and 30 deletions
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)

 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):

        if server_name in user.spawners and user.spawners[server_name].pending:
            pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
            raise RuntimeError("%s pending %s" % (user_server_name, pending))

        # count active servers and pending spawns
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
                '%s pending spawns, throttling',
                spawn_pending_count,
            )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
            raise web.HTTPError(
                429,
                "User startup rate limit exceeded. Try again in a few minutes.",
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
                '%s servers active, no space available',
                active_count,
            )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
            raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")

        tic = IOLoop.current().time()
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
            toc = IOLoop.current().time()
            self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
            self.statsd.timing('spawner.success', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='success'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
            proxy_add_start_time = time.perf_counter()
            spawner._proxy_pending = True
            try:
@@ -534,11 +529,9 @@ class BaseHandler(RequestHandler):
            if status is not None:
                toc = IOLoop.current().time()
                self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-                SPAWN_DURATION_SECONDS.labels(
-                    status='failed'
-                ).observe(
-                    time.perf_counter() - spawn_start_time
-                )
+                SERVER_SPAWN_DURATION_SECONDS.labels(
+                    status=ServerSpawnStatus.failure
+                ).observe(time.perf_counter() - spawn_start_time)
                raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                    status, spawner._log_name))