diff --git a/jupyterhub/handlers/__init__.py b/jupyterhub/handlers/__init__.py
index 8b2ffd58..0823c183 100644
--- a/jupyterhub/handlers/__init__.py
+++ b/jupyterhub/handlers/__init__.py
@@ -1,8 +1,8 @@
 from .base import *
 from .login import *
 
-from . import base, pages, login
+from . import base, pages, login, metrics
 
 default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
     default_handlers.extend(mod.default_handlers)
diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index aefea6fb..b5e489b3 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -5,6 +5,7 @@
 
 import copy
 import re
+import time
 from datetime import timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +393,7 @@ class BaseHandler(RequestHandler):
     @gen.coroutine
     def spawn_single_user(self, user, server_name='', options=None):
         # in case of error, include 'try again from /hub/home' message
+        spawn_start_time = time.perf_counter()
         self.extra_error_html = self.spawn_home_error
 
         user_server_name = user.name
@@ -397,6 +403,9 @@
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -415,6 +424,9 @@
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429, "User startup rate limit exceeded. Try again in a few minutes.",
             )
@@ -424,6 +436,9 @@
                 '%s servers active, no space available',
                 active_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -456,13 +471,28 @@
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
+            proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
+
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='success'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             except Exception:
                 self.log.exception("Failed to add %s to proxy!", user_server_name)
                 self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
                 yield user.stop()
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='failure'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             else:
                 spawner.add_poll_callback(self.user_stopped, user, server_name)
             finally:
@@ -499,6 +529,9 @@
         if status is not None:
             toc = IOLoop.current().time()
             self.statsd.timing('spawner.failure', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.failure
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                 status, spawner._log_name))
diff --git a/jupyterhub/handlers/metrics.py b/jupyterhub/handlers/metrics.py
new file mode 100644
index 00000000..60b934f9
--- /dev/null
+++ b/jupyterhub/handlers/metrics.py
@@ -0,0 +1,17 @@
+from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
+from tornado import gen
+
+from .base import BaseHandler
+
+class MetricsHandler(BaseHandler):
+    """
+    Handler to serve Prometheus metrics
+    """
+    @gen.coroutine
+    def get(self):
+        self.set_header('Content-Type', CONTENT_TYPE_LATEST)
+        self.write(generate_latest(REGISTRY))
+
+default_handlers = [
+    (r'/metrics$', MetricsHandler)
+]
diff --git a/jupyterhub/log.py b/jupyterhub/log.py
index 60b6288d..2405edf3 100644
--- a/jupyterhub/log.py
+++ b/jupyterhub/log.py
@@ -8,6 +8,7 @@ import traceback
 from tornado.log import LogFormatter, access_log
 from tornado.web import StaticFileHandler, HTTPError
 
+from .metrics import prometheus_log_method
 
 def coroutine_traceback(typ, value, tb):
     """Scrub coroutine frames from a traceback
@@ -68,6 +69,7 @@
     - get proxied IP instead of proxy IP
     - log referer for redirect and failed requests
     - log user-agent for failed requests
+    - record per-request metrics in prometheus
     """
     status = handler.get_status()
     request = handler.request
@@ -120,3 +122,4 @@
         if location:
             ns['location'] = ' → {}'.format(location)
     log_method(msg.format(**ns))
+    prometheus_log_method(handler)
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
new file mode 100644
index 00000000..68d6673e
--- /dev/null
+++ b/jupyterhub/metrics.py
@@ -0,0 +1,94 @@
+"""
+Prometheus metrics exported by JupyterHub
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
+"""
+from enum import Enum
+
+from prometheus_client import Histogram
+
+REQUEST_DURATION_SECONDS = Histogram(
+    'request_duration_seconds',
+    'request duration for all HTTP requests',
+    ['method', 'handler', 'code']
+)
+
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
+    ['status'],
+    # Use custom bucket sizes, since the default bucket ranges
+    # are meant for quick running processes. Spawns can take a while!
+    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
+)
+
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
+PROXY_ADD_DURATION_SECONDS = Histogram(
+    'proxy_add_duration_seconds',
+    'duration for adding user routes to proxy',
+    ['status']
+)
+
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
+def prometheus_log_method(handler):
+    """
+    Tornado log handler for recording RED metrics.
+
+    We record the following metrics:
+       Rate – the number of requests, per second, your services are serving.
+       Errors – the number of failed requests per second.
+       Duration – The amount of time each request takes expressed as a time interval.
+
+    We use a fully qualified name of the handler as a label,
+    rather than every url path to reduce cardinality.
+
+    This function should be either the value of or called from a function
+    that is the 'log_function' tornado setting. This makes it get called
+    at the end of every request, allowing us to record the metrics we need.
+    """
+    REQUEST_DURATION_SECONDS.labels(
+        method=handler.request.method,
+        handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
+        code=handler.get_status()
+    ).observe(handler.request.request_time())
diff --git a/requirements.txt b/requirements.txt
index a4b660af..7a71f61a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ pamela
 python-oauth2>=1.0
 SQLAlchemy>=1.1
 requests
+prometheus_client>=0.0.21
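The patch leans on two prometheus_client idioms: touching every labelled series up front so a status such as "failure" is exported with zero counts before it ever happens, and rendering the default registry in the text exposition format that MetricsHandler returns from /metrics. The short standalone sketch below shows both in isolation. It is not part of the patch; the metric name, buckets, and statuses are illustrative stand-ins, and the only assumption is that prometheus_client is installed.

# Standalone sketch (illustrative names, not part of the patch above).
from enum import Enum
import time

from prometheus_client import REGISTRY, Histogram, generate_latest

DEMO_SPAWN_SECONDS = Histogram(
    'demo_spawn_duration_seconds',
    'illustrative spawn duration histogram',
    ['status'],
    buckets=[0.5, 1, 2.5, 5, 10, float("inf")],
)

class DemoStatus(Enum):
    success = 'success'
    failure = 'failure'

    def __str__(self):
        # labels() stringifies its values, so an enum member becomes 'success'/'failure'
        return self.value

# Touch every label value up front so demo_spawn_duration_seconds{status="failure"}
# is exported (with zero counts) before the first failure is ever observed.
for status in DemoStatus:
    DEMO_SPAWN_SECONDS.labels(status=status)

# Record one observation: the same labels()/observe() pattern used in spawn_single_user.
start = time.perf_counter()
time.sleep(0.1)  # stand-in for the work being timed
DEMO_SPAWN_SECONDS.labels(status=DemoStatus.success).observe(time.perf_counter() - start)

# generate_latest(REGISTRY) is exactly what MetricsHandler writes back on GET /metrics.
print(generate_latest(REGISTRY).decode('utf-8'))

Running it prints bucket, count, and sum series for both statuses, with the failure series present but zero, which is the property the module-level for loops in metrics.py exist to guarantee.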