Make sure our metrics don't appear & disappear intermittently

Create all time series up front, regardless of whether the corresponding
events have occurred yet. Also rename metric objects for consistency.
commit 3cd526c019
parent ea99c58da5
author yuvipanda
date   2017-12-10 21:23:32 -08:00

2 changed files with 67 additions and 30 deletions


@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)

 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))

         # count active servers and pending spawns
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")

         tic = IOLoop.current().time()
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
         toc = IOLoop.current().time()
         self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
         self.statsd.timing('spawner.success', (toc - tic) * 1000)
-        SPAWN_DURATION_SECONDS.labels(
-            status='success'
-        ).observe(
-            time.perf_counter() - spawn_start_time
-        )
+        SERVER_SPAWN_DURATION_SECONDS.labels(
+            status=ServerSpawnStatus.success
+        ).observe(time.perf_counter() - spawn_start_time)
         proxy_add_start_time = time.perf_counter()
         spawner._proxy_pending = True
         try:
@@ -534,11 +529,9 @@ class BaseHandler(RequestHandler):
         if status is not None:
             toc = IOLoop.current().time()
             self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='failed'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.failure
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                 status, spawner._log_name))
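
Every call site in the handler diff above follows the same shape: capture a start
time with time.perf_counter(), then record the elapsed duration under a status
label on whichever exit path is taken. A minimal sketch of that pattern, assuming
the metrics introduced by this commit; the helper name timed_spawn and the
do_spawn callable are hypothetical stand-ins for the real spawn logic:

import time

from jupyterhub.metrics import SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus


async def timed_spawn(do_spawn):
    """Run `do_spawn` and record how long it took, labelled by outcome."""
    start = time.perf_counter()
    try:
        await do_spawn()
    except Exception:
        # Failure path: record the duration under status="failure", then re-raise.
        SERVER_SPAWN_DURATION_SECONDS.labels(
            status=ServerSpawnStatus.failure
        ).observe(time.perf_counter() - start)
        raise
    # Success path: same histogram, different status label.
    SERVER_SPAWN_DURATION_SECONDS.labels(
        status=ServerSpawnStatus.success
    ).observe(time.perf_counter() - start)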


@@ -2,8 +2,21 @@
 Prometheus metrics exported by JupyterHub

 Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
 """
+from enum import Enum
+
 from prometheus_client import Histogram

 REQUEST_DURATION_SECONDS = Histogram(
@@ -12,21 +25,52 @@ REQUEST_DURATION_SECONDS = Histogram(
     ['method', 'handler', 'code']
 )

-SPAWN_DURATION_SECONDS = Histogram(
-    'spawn_duration_seconds',
-    'spawn duration for all server spawns',
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )

+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
 PROXY_ADD_DURATION_SECONDS = Histogram(
     'proxy_add_duration_seconds',
     'duration for adding user routes to proxy',
     ['status']
 )

+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics.
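
The effect the module-level for loops are after can be seen with a standalone
registry. This is an illustrative sketch, not part of the commit: DemoStatus and
DEMO_DURATION_SECONDS are made-up names, and prometheus_client's generate_latest
is used only to show that every status series is already present, with zero
observations, before anything has been recorded:

from enum import Enum

from prometheus_client import CollectorRegistry, Histogram, generate_latest


class DemoStatus(Enum):
    success = 'success'
    failure = 'failure'

    def __str__(self):
        # prometheus_client stringifies label values, so this controls the
        # exported label: status="failure" rather than "DemoStatus.failure"
        return self.value


registry = CollectorRegistry()
DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds',
    'time taken for a demo operation',
    ['status'],
    registry=registry,
)

# Touch each label combination once so its _count/_sum/_bucket series
# exist at 0 from the first scrape onward.
for status in DemoStatus:
    DEMO_DURATION_SECONDS.labels(status=status)

# Both status="success" and status="failure" series appear in the scrape
# output even though nothing has been observed yet.
print(generate_latest(registry).decode())

Without that loop, a query such as rate(demo_duration_seconds_count{status="failure"}[5m])
returns no data until the first failure is observed, which is exactly the
intermittent appearance and disappearance the commit message describes.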