diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index 5167da82..b5e489b3 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -423,11 +424,9 @@
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
             )
@@ -437,11 +436,9 @@
                 '%s servers active, no space available',
                 active_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -474,11 +471,9 @@
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='success'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
             proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
@@ -534,11 +529,9 @@
             if status is not None:
                 toc = IOLoop.current().time()
                 self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-                SPAWN_DURATION_SECONDS.labels(
-                    status='failed'
-                ).observe(
-                    time.perf_counter() - spawn_start_time
-                )
+                SERVER_SPAWN_DURATION_SECONDS.labels(
+                    status=ServerSpawnStatus.failure
+                ).observe(time.perf_counter() - spawn_start_time)
                 raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                     status, spawner._log_name))
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index ba885446..68d6673e 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -2,8 +2,21 @@
 Prometheus metrics exported by JupyterHub
 
 Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
 """
+from enum import Enum
+
 from prometheus_client import Histogram
 
 REQUEST_DURATION_SECONDS = Histogram(
@@ -12,21 +25,52 @@ REQUEST_DURATION_SECONDS = Histogram(
     ['method', 'handler', 'code']
 )
 
-SPAWN_DURATION_SECONDS = Histogram(
-    'spawn_duration_seconds',
-    'spawn duration for all server spawns',
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )
 
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
 PROXY_ADD_DURATION_SECONDS = Histogram(
     'proxy_add_duration_seconds',
     'duration for adding user routes to proxy',
     ['status']
 )
 
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics.
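
Note (not part of the patch): the docstring above explains why each 'status' value is pre-created. prometheus_client calls str() on label values, which is why the enums define __str__, and calling .labels() once per value makes every child time series appear in the exposition output with zero observations before the first event. Below is a minimal, self-contained sketch of that same pattern; the metric and enum names here are purely illustrative and only assume prometheus_client is installed.

    from enum import Enum

    from prometheus_client import Histogram, generate_latest

    DEMO_DURATION_SECONDS = Histogram(
        'demo_duration_seconds',
        'duration of a demo operation',
        ['status'],
    )

    class DemoStatus(Enum):
        success = 'success'
        failure = 'failure'

        def __str__(self):
            # prometheus_client stringifies label values, so this makes the
            # exported label read 'failure' rather than 'DemoStatus.failure'
            return self.value

    # Pre-create every child so demo_duration_seconds{status="failure"}
    # is exported (with zero counts) even before the first failure happens.
    for s in DemoStatus:
        DEMO_DURATION_SECONDS.labels(status=s)

    # Record one observation and dump the exposition text: both status
    # series are present, one populated and one still at zero.
    DEMO_DURATION_SECONDS.labels(status=DemoStatus.success).observe(1.5)
    print(generate_latest().decode())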