Mirror of https://github.com/jupyterhub/jupyterhub.git
Merge pull request #1581 from yuvipanda/prometheus
Add RED prometheus metrics for all requests
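For context on the title: RED stands for Rate, Errors, and Duration. A single labelled prometheus_client Histogram covers all three, because every histogram exports a _count series (rate, and errors when sliced by the code label) alongside _sum and _bucket series (duration). A minimal sketch of that pattern, reusing the metric this PR defines but with otherwise hypothetical values:

from prometheus_client import Histogram, generate_latest

# One histogram, labelled by method/handler/code, yields all three RED signals.
REQUEST_DURATION_SECONDS = Histogram(
    'request_duration_seconds',
    'request duration for all HTTP requests',
    ['method', 'handler', 'code']
)

# Hypothetical observation: a 42ms GET served by a hub page handler.
REQUEST_DURATION_SECONDS.labels(
    method='GET',
    handler='jupyterhub.handlers.pages.HomeHandler',
    code=200,
).observe(0.042)

# The exposition text a Prometheus scrape would receive.
print(generate_latest().decode())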
jupyterhub/handlers/__init__.py
@@ -1,8 +1,8 @@
 from .base import *
 from .login import *
 
-from . import base, pages, login
+from . import base, pages, login, metrics
 
 default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
     default_handlers.extend(mod.default_handlers)
jupyterhub/handlers/base.py
@@ -5,6 +5,7 @@
 
 import copy
 import re
+import time
 from datetime import timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +393,7 @@ class BaseHandler(RequestHandler):
     @gen.coroutine
     def spawn_single_user(self, user, server_name='', options=None):
         # in case of error, include 'try again from /hub/home' message
+        spawn_start_time = time.perf_counter()
         self.extra_error_html = self.spawn_home_error
 
         user_server_name = user.name
@@ -397,6 +403,9 @@ class BaseHandler(RequestHandler):
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -415,6 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -456,13 +471,28 @@ class BaseHandler(RequestHandler):
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
+            proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
+
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='success'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             except Exception:
                 self.log.exception("Failed to add %s to proxy!", user_server_name)
                 self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
                 yield user.stop()
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='failure'
+                ).observe(
+                    time.perf_counter() - proxy_add_start_time
+                )
             else:
                 spawner.add_poll_callback(self.user_stopped, user, server_name)
             finally:
@@ -499,6 +529,9 @@ class BaseHandler(RequestHandler):
         if status is not None:
             toc = IOLoop.current().time()
             self.statsd.timing('spawner.failure', (toc - tic) * 1000)
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.failure
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                 status, spawner._log_name))
 
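Every hunk above applies one instrumentation pattern: capture time.perf_counter() when spawn_single_user starts, then observe the elapsed time under whichever ServerSpawnStatus label the code path ends in. The same pattern in isolation, as a hedged sketch rather than code from the diff (do_spawn is a hypothetical stand-in):

import time

from jupyterhub.metrics import SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus

def do_spawn():
    """Hypothetical stand-in for the real spawn work."""
    time.sleep(0.1)

spawn_start_time = time.perf_counter()
try:
    do_spawn()
except Exception:
    # Failed paths observe under their own status label, then re-raise.
    SERVER_SPAWN_DURATION_SECONDS.labels(
        status=ServerSpawnStatus.failure
    ).observe(time.perf_counter() - spawn_start_time)
    raise
else:
    SERVER_SPAWN_DURATION_SECONDS.labels(
        status=ServerSpawnStatus.success
    ).observe(time.perf_counter() - spawn_start_time)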
jupyterhub/handlers/metrics.py (new file, +17 lines)
@@ -0,0 +1,17 @@
+from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
+from tornado import gen
+
+from .base import BaseHandler
+
+class MetricsHandler(BaseHandler):
+    """
+    Handler to serve Prometheus metrics
+    """
+    @gen.coroutine
+    def get(self):
+        self.set_header('Content-Type', CONTENT_TYPE_LATEST)
+        self.write(generate_latest(REGISTRY))
+
+default_handlers = [
+    (r'/metrics$', MetricsHandler)
+]
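With this handler registered, the hub exposes a scrapeable endpoint. A quick smoke test, assuming a hub running locally on the default port 8000 with the default /hub base URL (both assumptions, not stated in the diff):

import requests  # already listed in requirements.txt

# Assumed URL: default local hub, default /hub prefix in front of /metrics.
resp = requests.get('http://127.0.0.1:8000/hub/metrics')
print(resp.headers['Content-Type'])  # the Prometheus exposition content type
print(resp.text[:300])               # e.g. request_duration_seconds_bucket{...}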
jupyterhub/log.py
@@ -8,6 +8,7 @@ import traceback
 from tornado.log import LogFormatter, access_log
 from tornado.web import StaticFileHandler, HTTPError
 
+from .metrics import prometheus_log_method
 
 def coroutine_traceback(typ, value, tb):
     """Scrub coroutine frames from a traceback
@@ -68,6 +69,7 @@ def log_request(handler):
     - get proxied IP instead of proxy IP
     - log referer for redirect and failed requests
     - log user-agent for failed requests
+    - record per-request metrics in prometheus
     """
     status = handler.get_status()
     request = handler.request
@@ -120,3 +122,4 @@ def log_request(handler):
     if location:
         ns['location'] = ' → {}'.format(location)
     log_method(msg.format(**ns))
+    prometheus_log_method(handler)
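These hunks are the whole integration: log_request is already wired in as JupyterHub's tornado log_function, so appending prometheus_log_method(handler) records one observation per finished request. Roughly what that wiring looks like in any tornado app, as a hedged sketch (JupyterHub's own Application setup is not part of this diff):

from tornado import web

from jupyterhub.log import log_request

application = web.Application(
    [],                        # handlers elided for the sketch
    log_function=log_request,  # tornado calls this as each request finishes
)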
jupyterhub/metrics.py (new file, +94 lines)
@@ -0,0 +1,94 @@
+"""
+Prometheus metrics exported by JupyterHub
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
+"""
+from enum import Enum
+
+from prometheus_client import Histogram
+
+REQUEST_DURATION_SECONDS = Histogram(
+    'request_duration_seconds',
+    'request duration for all HTTP requests',
+    ['method', 'handler', 'code']
+)
+
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
+    ['status'],
+    # Use custom bucket sizes, since the default bucket ranges
+    # are meant for quick running processes. Spawns can take a while!
+    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
+)
+
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
+PROXY_ADD_DURATION_SECONDS = Histogram(
+    'proxy_add_duration_seconds',
+    'duration for adding user routes to proxy',
+    ['status']
+)
+
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
+def prometheus_log_method(handler):
+    """
+    Tornado log handler for recording RED metrics.
+
+    We record the following metrics:
+       Rate – the number of requests, per second, your services are serving.
+       Errors – the number of failed requests per second.
+       Duration – the amount of time each request takes, expressed as a time interval.
+
+    We use a fully qualified name of the handler as a label,
+    rather than every url path to reduce cardinality.
+
+    This function should be either the value of or called from a function
+    that is the 'log_function' tornado setting. This makes it get called
+    at the end of every request, allowing us to record the metrics we need.
+    """
+    REQUEST_DURATION_SECONDS.labels(
+        method=handler.request.method,
+        handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
+        code=handler.get_status()
+    ).observe(handler.request.request_time())
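The docstring's claim about pre-creating series can be checked directly: one .labels() call per status makes each {status="..."} series appear in the scrape output with a count of 0, rather than materializing on the first event. A self-contained sketch using a throwaway registry (the demo metric and enum are hypothetical):

from enum import Enum

from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()  # isolated registry so the demo doesn't touch the default one
DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds', 'demo histogram', ['status'], registry=registry
)

class DemoStatus(Enum):
    success = 'success'
    failure = 'failure'

    def __str__(self):
        # prometheus_client str()s label values, so the enum renders as its value
        return self.value

for s in DemoStatus:
    DEMO_DURATION_SECONDS.labels(status=s)  # materialize the series up front

# Both demo_duration_seconds_count{status="success"} and {status="failure"}
# now appear with value 0.0, before any .observe() call.
print(generate_latest(registry).decode())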
requirements.txt
@@ -6,3 +6,4 @@ pamela
 python-oauth2>=1.0
 SQLAlchemy>=1.1
 requests
+prometheus_client>=0.0.21