Mirror of https://github.com/jupyterhub/jupyterhub.git (synced 2025-10-14 13:33:00 +00:00)
Merge pull request #1581 from yuvipanda/prometheus
Add RED prometheus metrics for all requests
jupyterhub/handlers/__init__.py
@@ -1,8 +1,8 @@
 from .base import *
 from .login import *
 
-from . import base, pages, login
+from . import base, pages, login, metrics
 
 default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
     default_handlers.extend(mod.default_handlers)
jupyterhub/handlers/base.py
@@ -5,6 +5,7 @@
 
 import copy
 import re
+import time
 from datetime import timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +393,7 @@ class BaseHandler(RequestHandler):
     @gen.coroutine
     def spawn_single_user(self, user, server_name='', options=None):
         # in case of error, include 'try again from /hub/home' message
+        spawn_start_time = time.perf_counter()
         self.extra_error_html = self.spawn_home_error
 
         user_server_name = user.name
@@ -397,6 +403,9 @@ class BaseHandler(RequestHandler):
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -415,6 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -456,13 +471,28 @@ class BaseHandler(RequestHandler):
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
+            proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
+
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='success'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             except Exception:
                 self.log.exception("Failed to add %s to proxy!", user_server_name)
                 self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
                 yield user.stop()
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='failure'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             else:
                 spawner.add_poll_callback(self.user_stopped, user, server_name)
             finally:
@@ -499,6 +529,9 @@ class BaseHandler(RequestHandler):
             if status is not None:
                 toc = IOLoop.current().time()
                 self.statsd.timing('spawner.failure', (toc - tic) * 1000)
+                SERVER_SPAWN_DURATION_SECONDS.labels(
+                    status=ServerSpawnStatus.failure
+                ).observe(time.perf_counter() - spawn_start_time)
                 raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                     status, spawner._log_name))
 
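Every hunk in base.py above applies the same exit-path timing pattern: capture time.perf_counter() once when spawn_single_user begins, then observe the elapsed time under a status label on whichever path the coroutine leaves by (already_pending, throttled, too_many_users, failure, or success). A condensed, standalone sketch of that pattern, with hypothetical metric and function names:

import time
from prometheus_client import Histogram

OPERATION_DURATION_SECONDS = Histogram(
    'operation_duration_seconds',
    'demo of per-exit-path duration timing',
    ['status'],
)

def do_operation(work):
    # One start timestamp; one labelled observation per way out.
    start = time.perf_counter()
    try:
        result = work()
    except Exception:
        OPERATION_DURATION_SECONDS.labels(status='failure').observe(
            time.perf_counter() - start)
        raise
    OPERATION_DURATION_SECONDS.labels(status='success').observe(
        time.perf_counter() - start)
    return result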
jupyterhub/handlers/metrics.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
+from tornado import gen
+
+from .base import BaseHandler
+
+class MetricsHandler(BaseHandler):
+    """
+    Handler to serve Prometheus metrics
+    """
+    @gen.coroutine
+    def get(self):
+        self.set_header('Content-Type', CONTENT_TYPE_LATEST)
+        self.write(generate_latest(REGISTRY))
+
+default_handlers = [
+    (r'/metrics$', MetricsHandler)
+]
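With the handler registered, the hub serves the Prometheus text exposition format over plain HTTP. A hedged usage sketch (the address is an assumption for illustration, and since hub handlers are mounted under the hub's base URL the path is likely /hub/metrics):

import requests  # already in requirements.txt
from prometheus_client.parser import text_string_to_metric_families

resp = requests.get('http://127.0.0.1:8000/hub/metrics')  # hypothetical local hub
for family in text_string_to_metric_families(resp.text):
    for sample in family.samples:
        print(sample)  # one (name, labels, value, ...) tuple per series

In practice a Prometheus server would scrape this URL on an interval rather than a one-off script.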
jupyterhub/log.py
@@ -8,6 +8,7 @@ import traceback
 from tornado.log import LogFormatter, access_log
 from tornado.web import StaticFileHandler, HTTPError
 
+from .metrics import prometheus_log_method
 
 def coroutine_traceback(typ, value, tb):
     """Scrub coroutine frames from a traceback
@@ -68,6 +69,7 @@ def log_request(handler):
     - get proxied IP instead of proxy IP
     - log referer for redirect and failed requests
     - log user-agent for failed requests
+    - record per-request metrics in prometheus
     """
     status = handler.get_status()
     request = handler.request
@@ -120,3 +122,4 @@ def log_request(handler):
     if location:
         ns['location'] = ' → {}'.format(location)
     log_method(msg.format(**ns))
+    prometheus_log_method(handler)
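For context, tornado calls the application's `log_function` setting once per finished request, which is how prometheus_log_method ends up running for every response once the hub passes log_request into its tornado settings. A minimal toy app showing the same hookup (not JupyterHub's actual wiring; PingHandler and the route are hypothetical):

from tornado import web
from jupyterhub.metrics import prometheus_log_method

class PingHandler(web.RequestHandler):
    def get(self):
        self.write('pong')

def log_request(handler):
    # normal access logging would happen here, then the metrics hook
    prometheus_log_method(handler)

app = web.Application([(r'/ping', PingHandler)], log_function=log_request)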
jupyterhub/metrics.py (new file, 94 lines)
@@ -0,0 +1,94 @@
+"""
+Prometheus metrics exported by JupyterHub
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
+"""
+from enum import Enum
+
+from prometheus_client import Histogram
+
+REQUEST_DURATION_SECONDS = Histogram(
+    'request_duration_seconds',
+    'request duration for all HTTP requests',
+    ['method', 'handler', 'code']
+)
+
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
+    ['status'],
+    # Use custom bucket sizes, since the default bucket ranges
+    # are meant for quick running processes. Spawns can take a while!
+    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
+)
+
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
+PROXY_ADD_DURATION_SECONDS = Histogram(
+    'proxy_add_duration_seconds',
+    'duration for adding user routes to proxy',
+    ['status']
+)
+
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
+def prometheus_log_method(handler):
+    """
+    Tornado log handler for recording RED metrics.
+
+    We record the following metrics:
+       Rate - the number of requests, per second, your services are serving.
+       Errors - the number of failed requests per second.
+       Duration - the amount of time each request takes, expressed as a time interval.
+
+    We use the fully qualified name of the handler as a label,
+    rather than every url path, to reduce cardinality.
+
+    This function should be either the value of, or called from, a function
+    that is the 'log_function' tornado setting. This makes it get called
+    at the end of every request, allowing us to record the metrics we need.
+    """
+    REQUEST_DURATION_SECONDS.labels(
+        method=handler.request.method,
+        handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
+        code=handler.get_status()
+    ).observe(handler.request.request_time())
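The module docstring's point about pre-creating enum-labelled series is easy to verify: a labelled histogram series only appears in the exposition output once .labels() has been called for it. A standalone sketch with hypothetical names:

from enum import Enum
from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()
DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds', 'demo histogram', ['status'], registry=registry)

class DemoStatus(Enum):
    success = 'success'
    failure = 'failure'

    def __str__(self):
        return self.value

for s in DemoStatus:
    DEMO_DURATION_SECONDS.labels(status=str(s))  # create the empty series up front

# Both status series now exist with zero counts, before any .observe():
print(generate_latest(registry).decode())

Without the loop, demo_duration_seconds_count{status="failure"} would be absent until the first failure, which is exactly the dashboarding gap the docstring describes.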
requirements.txt
@@ -6,3 +6,4 @@ pamela
 python-oauth2>=1.0
 SQLAlchemy>=1.1
 requests
+prometheus_client>=0.0.21