Make sure our metrics don't appear & disappear intermittently

Create all timeseries from the beginning, regardless of whether they happen or not. Also rename metric objects for consistency.
2025-10-18 15:33:02 +00:00 · 2017-12-10 21:23:32 -08:00
parent ea99c58da5
commit 3cd526c019
2 changed files with 67 additions and 30 deletions
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)

 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):

        if server_name in user.spawners and user.spawners[server_name].pending:
            pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
            raise RuntimeError("%s pending %s" % (user_server_name, pending))

        # count active servers and pending spawns
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
                '%s pending spawns, throttling',
                spawn_pending_count,
            )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
            raise web.HTTPError(
                429,
                "User startup rate limit exceeded. Try again in a few minutes.",
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
                '%s servers active, no space available',
                active_count,
            )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
            raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")

        tic = IOLoop.current().time()
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
            toc = IOLoop.current().time()
            self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
            self.statsd.timing('spawner.success', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='success'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
            proxy_add_start_time = time.perf_counter()
            spawner._proxy_pending = True
            try:
@@ -534,11 +529,9 @@ class BaseHandler(RequestHandler):
            if status is not None:
                toc = IOLoop.current().time()
                self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-                SPAWN_DURATION_SECONDS.labels(
-                    status='failed'
-                ).observe(
-                    time.perf_counter() - spawn_start_time
-                )
+                SERVER_SPAWN_DURATION_SECONDS.labels(
+                    status=ServerSpawnStatus.failure
+                ).observe(time.perf_counter() - spawn_start_time)
                raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                    status, spawner._log_name))

--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -2,8 +2,21 @@
 Prometheus metrics exported by JupyterHub

 Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
 """
+from enum import Enum
+
 from prometheus_client import Histogram

 REQUEST_DURATION_SECONDS = Histogram(
@@ -12,21 +25,52 @@ REQUEST_DURATION_SECONDS = Histogram(
    ['method', 'handler', 'code']
 )

-SPAWN_DURATION_SECONDS = Histogram(
-    'spawn_duration_seconds',
-    'spawn duration for all server spawns',
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
    ['status'],
    # Use custom bucket sizes, since the default bucket ranges
    # are meant for quick running processes. Spawns can take a while!
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )

+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
 PROXY_ADD_DURATION_SECONDS = Histogram(
    'proxy_add_duration_seconds',
    'duration for adding user routes to proxy',
    ['status']
 )

+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
 def prometheus_log_method(handler):
    """
    Tornado log handler for recording RED metrics.