Merge pull request #1581 from yuvipanda/prometheus

Add RED prometheus metrics for all requests
Min RK, 2017-12-11 12:00:25 +01:00 (committed by GitHub)
6 changed files with 150 additions and 2 deletions

jupyterhub/handlers/__init__.py

@@ -1,8 +1,8 @@
from .base import *
from .login import *
-from . import base, pages, login
+from . import base, pages, login, metrics
default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
default_handlers.extend(mod.default_handlers)

jupyterhub/handlers/base.py

@@ -5,6 +5,7 @@
import copy
import re
import time
from datetime import timedelta
from http.client import responses
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,10 @@ from .. import orm
from ..objects import Server
from ..spawner import LocalProcessSpawner
from ..utils import url_path_join
from ..metrics import (
SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
)
# pattern for the authentication token header
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +393,7 @@ class BaseHandler(RequestHandler):
@gen.coroutine
def spawn_single_user(self, user, server_name='', options=None):
# in case of error, include 'try again from /hub/home' message
spawn_start_time = time.perf_counter()
self.extra_error_html = self.spawn_home_error
user_server_name = user.name
@@ -397,6 +403,9 @@ class BaseHandler(RequestHandler):
if server_name in user.spawners and user.spawners[server_name].pending:
pending = user.spawners[server_name].pending
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.already_pending
).observe(time.perf_counter() - spawn_start_time)
raise RuntimeError("%s pending %s" % (user_server_name, pending))
# count active servers and pending spawns
@@ -415,6 +424,9 @@ class BaseHandler(RequestHandler):
'%s pending spawns, throttling',
spawn_pending_count,
)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.throttled
).observe(time.perf_counter() - spawn_start_time)
raise web.HTTPError(
429,
"User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +436,9 @@ class BaseHandler(RequestHandler):
'%s servers active, no space available',
active_count,
)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.too_many_users
).observe(time.perf_counter() - spawn_start_time)
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
tic = IOLoop.current().time()
@@ -456,13 +471,28 @@ class BaseHandler(RequestHandler):
toc = IOLoop.current().time()
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
self.statsd.timing('spawner.success', (toc - tic) * 1000)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.success
).observe(time.perf_counter() - spawn_start_time)
proxy_add_start_time = time.perf_counter()
spawner._proxy_pending = True
try:
yield self.proxy.add_user(user, server_name)
PROXY_ADD_DURATION_SECONDS.labels(
status='success'
).observe(
time.perf_counter() - proxy_add_start_time
)
except Exception:
self.log.exception("Failed to add %s to proxy!", user_server_name)
self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
yield user.stop()
PROXY_ADD_DURATION_SECONDS.labels(
status='failure'
).observe(
time.perf_counter() - proxy_add_start_time
)
else:
spawner.add_poll_callback(self.user_stopped, user, server_name)
finally:
@@ -499,6 +529,9 @@ class BaseHandler(RequestHandler):
if status is not None:
toc = IOLoop.current().time()
self.statsd.timing('spawner.failure', (toc - tic) * 1000)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.failure
).observe(time.perf_counter() - spawn_start_time)
raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
status, spawner._log_name))
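The instrumentation above repeats one pattern on every exit path: take a time.perf_counter() reading when the spawn starts, then observe the elapsed time on the histogram with a status label describing the outcome. A minimal standalone sketch of that pattern (the histogram name and the operation are hypothetical, not JupyterHub code):

import time
from prometheus_client import Histogram

DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds',
    'duration of a demo operation',
    ['status'],
)

def do_operation(fail=False):
    start = time.perf_counter()
    try:
        if fail:
            raise RuntimeError("simulated failure")
    except RuntimeError:
        # record the failed attempt before re-raising
        DEMO_DURATION_SECONDS.labels(status='failure').observe(time.perf_counter() - start)
        raise
    DEMO_DURATION_SECONDS.labels(status='success').observe(time.perf_counter() - start)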

jupyterhub/handlers/metrics.py (new file)

@@ -0,0 +1,17 @@
from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
from tornado import gen
from .base import BaseHandler
class MetricsHandler(BaseHandler):
"""
Handler to serve Prometheus metrics
"""
@gen.coroutine
def get(self):
self.set_header('Content-Type', CONTENT_TYPE_LATEST)
self.write(generate_latest(REGISTRY))
default_handlers = [
(r'/metrics$', MetricsHandler)
]
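With this handler installed, the endpoint can be scraped like any other Prometheus target. A minimal sketch of fetching and parsing it by hand; the address and the /hub/metrics path are assumptions based on a default hub deployment:

import requests
from prometheus_client.parser import text_string_to_metric_families

# hypothetical hub address; adjust to the actual deployment
resp = requests.get('http://127.0.0.1:8000/hub/metrics')
resp.raise_for_status()
for family in text_string_to_metric_families(resp.text):
    print(family.name, len(family.samples))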

jupyterhub/log.py

@@ -8,6 +8,7 @@ import traceback
from tornado.log import LogFormatter, access_log
from tornado.web import StaticFileHandler, HTTPError
from .metrics import prometheus_log_method
def coroutine_traceback(typ, value, tb):
"""Scrub coroutine frames from a traceback
@@ -68,6 +69,7 @@ def log_request(handler):
- get proxied IP instead of proxy IP
- log referer for redirect and failed requests
- log user-agent for failed requests
- record per-request metrics in prometheus
"""
status = handler.get_status()
request = handler.request
@@ -120,3 +122,4 @@ def log_request(handler):
if location:
ns['location'] = '{}'.format(location)
log_method(msg.format(**ns))
prometheus_log_method(handler)
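prometheus_log_method is not registered as a handler; it rides along with log_request, which Tornado invokes once per completed request when it is installed as the application's log_function setting. A rough sketch of that wiring, with a placeholder handler list:

from tornado import web
from jupyterhub.log import log_request

app = web.Application(
    handlers=[],               # placeholder; the real hub installs its own handlers
    log_function=log_request,  # Tornado calls this once per finished request
)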

jupyterhub/metrics.py (new file, 94 lines)

@@ -0,0 +1,94 @@
"""
Prometheus metrics exported by JupyterHub
Read https://prometheus.io/docs/practices/naming/ for naming
conventions for metrics & labels. We generally prefer naming them
`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
the duration (in seconds) of servers spawning would be called
SERVER_SPAWN_DURATION_SECONDS.
We also create an Enum for each 'status' label used by a metric.
This ensures the labelled series exist whether or not the corresponding
condition has occurred yet. For example, if we don't explicitly create it,
the series server_spawn_duration_seconds{status="failure"} will not exist
until the first failure, which makes dashboarding and alerting difficult.
So we list the possible statuses and create the series up front here.
"""
from enum import Enum
from prometheus_client import Histogram
REQUEST_DURATION_SECONDS = Histogram(
'request_duration_seconds',
'request duration for all HTTP requests',
['method', 'handler', 'code']
)
SERVER_SPAWN_DURATION_SECONDS = Histogram(
'server_spawn_duration_seconds',
'time taken for server spawning operation',
['status'],
# Use custom bucket sizes, since the default buckets are geared
# toward quick-running operations. Spawns can take a while!
buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
)
class ServerSpawnStatus(Enum):
"""
Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
"""
success = 'success'
failure = 'failure'
already_pending = 'already-pending'
throttled = 'throttled'
too_many_users = 'too-many-users'
def __str__(self):
return self.value
for s in ServerSpawnStatus:
# Create empty metrics with the given status
SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
PROXY_ADD_DURATION_SECONDS = Histogram(
'proxy_add_duration_seconds',
'duration for adding user routes to proxy',
['status']
)
class ProxyAddStatus(Enum):
"""
Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
"""
success = 'success'
failure = 'failure'
def __str__(self):
return self.value
for s in ProxyAddStatus:
PROXY_ADD_DURATION_SECONDS.labels(status=s)
def prometheus_log_method(handler):
"""
Tornado log handler for recording RED metrics.
We record the following metrics:
    Rate: the number of requests per second the service is serving.
    Errors: the number of failed requests per second.
    Duration: the amount of time each request takes, expressed as a time interval.
We use the fully qualified name of the handler as a label,
rather than every URL path, to reduce cardinality.
This function should either be the Tornado 'log_function' setting itself,
or be called from that function, so that it runs at the end of every
request and records the metrics we need.
"""
REQUEST_DURATION_SECONDS.labels(
method=handler.request.method,
handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
code=handler.get_status()
).observe(handler.request.request_time())
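The pre-creation loops above are what the module docstring alludes to: calling .labels() for every status up front makes each labelled series appear in the exposition with zero counts, before anything has been observed. A small sketch of that effect on a throwaway registry (the metric name here is hypothetical):

from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()  # separate registry, just for this demonstration
demo = Histogram(
    'demo_spawn_duration_seconds', 'demo histogram', ['status'],
    registry=registry,
)
for status in ('success', 'failure', 'throttled'):
    demo.labels(status=status)  # pre-create the series; nothing observed yet

print(generate_latest(registry).decode())
# the output is expected to include zero-valued series such as:
#   demo_spawn_duration_seconds_count{status="failure"} 0.0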

requirements.txt

@@ -6,3 +6,4 @@ pamela
python-oauth2>=1.0
SQLAlchemy>=1.1
requests
prometheus_client>=0.0.21