Mirror of https://github.com/jupyterhub/jupyterhub.git
Merge pull request #1581 from yuvipanda/prometheus
Add RED prometheus metrics for all requests
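For context on the title: RED stands for Rate, Errors, and Duration. A single labelled prometheus_client Histogram covers all three, because every histogram exports a _count series (rate, and errors when sliced by the code label) alongside _sum and _bucket series (duration). A minimal sketch of that pattern, reusing the metric this PR defines but with otherwise hypothetical values:

from prometheus_client import Histogram, generate_latest

# One histogram, labelled by method/handler/code, yields all three RED signals.
REQUEST_DURATION_SECONDS = Histogram(
    'request_duration_seconds',
    'request duration for all HTTP requests',
    ['method', 'handler', 'code']
)

# Hypothetical observation: a 42ms GET served by a hub page handler.
REQUEST_DURATION_SECONDS.labels(
    method='GET',
    handler='jupyterhub.handlers.pages.HomeHandler',
    code=200,
).observe(0.042)

# The exposition text a Prometheus scrape would receive.
print(generate_latest().decode())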
jupyterhub/handlers/__init__.py
@@ -1,8 +1,8 @@
 from .base import *
 from .login import *
 
-from . import base, pages, login
+from . import base, pages, login, metrics
 
 default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
     default_handlers.extend(mod.default_handlers)
jupyterhub/handlers/base.py
@@ -5,6 +5,7 @@
 
 import copy
 import re
+import time
 from datetime import timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +393,7 @@ class BaseHandler(RequestHandler):
     @gen.coroutine
     def spawn_single_user(self, user, server_name='', options=None):
         # in case of error, include 'try again from /hub/home' message
+        spawn_start_time = time.perf_counter()
         self.extra_error_html = self.spawn_home_error
 
         user_server_name = user.name
@@ -397,6 +403,9 @@ class BaseHandler(RequestHandler):
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -415,6 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -456,13 +471,28 @@ class BaseHandler(RequestHandler):
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
+            proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
+
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='success'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             except Exception:
                 self.log.exception("Failed to add %s to proxy!", user_server_name)
                 self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
                 yield user.stop()
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='failure'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             else:
                 spawner.add_poll_callback(self.user_stopped, user, server_name)
             finally:
@@ -499,6 +529,9 @@ class BaseHandler(RequestHandler):
         if status is not None:
             toc = IOLoop.current().time()
             self.statsd.timing('spawner.failure', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.failure
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                 status, spawner._log_name))
 
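Every hunk above applies one instrumentation pattern: capture time.perf_counter() when spawn_single_user starts, then observe the elapsed time under whichever ServerSpawnStatus label the code path ends in. The same pattern in isolation, as a hedged sketch rather than code from the diff (do_spawn is a hypothetical stand-in):

import time

from jupyterhub.metrics import SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus

def do_spawn():
    """Hypothetical stand-in for the real spawn work."""
    time.sleep(0.1)

spawn_start_time = time.perf_counter()
try:
    do_spawn()
except Exception:
    # Failed paths observe under their own status label, then re-raise.
    SERVER_SPAWN_DURATION_SECONDS.labels(
        status=ServerSpawnStatus.failure
    ).observe(time.perf_counter() - spawn_start_time)
    raise
else:
    SERVER_SPAWN_DURATION_SECONDS.labels(
        status=ServerSpawnStatus.success
    ).observe(time.perf_counter() - spawn_start_time)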
jupyterhub/handlers/metrics.py (new file, +17 lines)
@@ -0,0 +1,17 @@
+from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
+from tornado import gen
+
+from .base import BaseHandler
+
+class MetricsHandler(BaseHandler):
+    """
+    Handler to serve Prometheus metrics
+    """
+    @gen.coroutine
+    def get(self):
+        self.set_header('Content-Type', CONTENT_TYPE_LATEST)
+        self.write(generate_latest(REGISTRY))
+
+default_handlers = [
+    (r'/metrics$', MetricsHandler)
+]
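With this handler registered, the hub exposes a scrapeable endpoint. A quick smoke test, assuming a hub running locally on the default port 8000 with the default /hub base URL (both assumptions, not stated in the diff):

import requests  # already listed in requirements.txt

# Assumed URL: default local hub, default /hub prefix in front of /metrics.
resp = requests.get('http://127.0.0.1:8000/hub/metrics')
print(resp.headers['Content-Type'])  # the Prometheus exposition content type
print(resp.text[:300])               # e.g. request_duration_seconds_bucket{...}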
jupyterhub/log.py
@@ -8,6 +8,7 @@ import traceback
 from tornado.log import LogFormatter, access_log
 from tornado.web import StaticFileHandler, HTTPError
 
+from .metrics import prometheus_log_method
 
 def coroutine_traceback(typ, value, tb):
     """Scrub coroutine frames from a traceback
@@ -68,6 +69,7 @@ def log_request(handler):
     - get proxied IP instead of proxy IP
     - log referer for redirect and failed requests
     - log user-agent for failed requests
+    - record per-request metrics in prometheus
     """
     status = handler.get_status()
     request = handler.request
@@ -120,3 +122,4 @@ def log_request(handler):
     if location:
         ns['location'] = ' → {}'.format(location)
     log_method(msg.format(**ns))
+    prometheus_log_method(handler)
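These hunks are the whole integration: log_request is already wired in as JupyterHub's tornado log_function, so appending prometheus_log_method(handler) records one observation per finished request. Roughly what that wiring looks like in any tornado app, as a hedged sketch (JupyterHub's own Application setup is not part of this diff):

from tornado import web

from jupyterhub.log import log_request

application = web.Application(
    [],                        # handlers elided for the sketch
    log_function=log_request,  # tornado calls this as each request finishes
)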
jupyterhub/metrics.py (new file, +94 lines)
@@ -0,0 +1,94 @@
+"""
+Prometheus metrics exported by JupyterHub
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
+"""
+from enum import Enum
+
+from prometheus_client import Histogram
+
+REQUEST_DURATION_SECONDS = Histogram(
+    'request_duration_seconds',
+    'request duration for all HTTP requests',
+    ['method', 'handler', 'code']
+)
+
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
+    ['status'],
+    # Use custom bucket sizes, since the default bucket ranges
+    # are meant for quick running processes. Spawns can take a while!
+    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
+)
+
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
+PROXY_ADD_DURATION_SECONDS = Histogram(
+    'proxy_add_duration_seconds',
+    'duration for adding user routes to proxy',
+    ['status']
+)
+
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
+def prometheus_log_method(handler):
+    """
+    Tornado log handler for recording RED metrics.
+
+    We record the following metrics:
+       Rate – the number of requests, per second, your services are serving.
+       Errors – the number of failed requests per second.
+       Duration – the amount of time each request takes, expressed as a time interval.
+
+    We use a fully qualified name of the handler as a label,
+    rather than every url path to reduce cardinality.
+
+    This function should be either the value of or called from a function
+    that is the 'log_function' tornado setting. This makes it get called
+    at the end of every request, allowing us to record the metrics we need.
+    """
+    REQUEST_DURATION_SECONDS.labels(
+        method=handler.request.method,
+        handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
+        code=handler.get_status()
+    ).observe(handler.request.request_time())
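The docstring's claim about pre-creating series can be checked directly: one .labels() call per status makes each {status="..."} series appear in the scrape output with a count of 0, rather than materializing on the first event. A self-contained sketch using a throwaway registry (the demo metric and enum are hypothetical):

from enum import Enum

from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()  # isolated registry so the demo doesn't touch the default one
DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds', 'demo histogram', ['status'], registry=registry
)

class DemoStatus(Enum):
    success = 'success'
    failure = 'failure'

    def __str__(self):
        # prometheus_client str()s label values, so the enum renders as its value
        return self.value

for s in DemoStatus:
    DEMO_DURATION_SECONDS.labels(status=s)  # materialize the series up front

# Both demo_duration_seconds_count{status="success"} and {status="failure"}
# now appear with value 0.0, before any .observe() call.
print(generate_latest(registry).decode())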
requirements.txt
@@ -6,3 +6,4 @@ pamela
 python-oauth2>=1.0
 SQLAlchemy>=1.1
 requests
+prometheus_client>=0.0.21