Merge pull request #1581 from yuvipanda/prometheus

Add RED prometheus metrics for all requests
Min RK, 2017-12-11 12:00:25 +01:00 (committed by GitHub)
6 changed files with 150 additions and 2 deletions

jupyterhub/handlers/__init__.py

@@ -1,8 +1,8 @@
from .base import *
from .login import *
-from . import base, pages, login
+from . import base, pages, login, metrics
default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
default_handlers.extend(mod.default_handlers)

jupyterhub/handlers/base.py

@@ -5,6 +5,7 @@
import copy
import re
import time
from datetime import timedelta
from http.client import responses
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,10 @@ from .. import orm
from ..objects import Server
from ..spawner import LocalProcessSpawner
from ..utils import url_path_join
from ..metrics import (
SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
)
# pattern for the authentication token header
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +393,7 @@ class BaseHandler(RequestHandler):
@gen.coroutine
def spawn_single_user(self, user, server_name='', options=None):
# in case of error, include 'try again from /hub/home' message
spawn_start_time = time.perf_counter()
self.extra_error_html = self.spawn_home_error
user_server_name = user.name
@@ -397,6 +403,9 @@ class BaseHandler(RequestHandler):
if server_name in user.spawners and user.spawners[server_name].pending:
pending = user.spawners[server_name].pending
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.already_pending
).observe(time.perf_counter() - spawn_start_time)
raise RuntimeError("%s pending %s" % (user_server_name, pending))
# count active servers and pending spawns
@@ -415,6 +424,9 @@ class BaseHandler(RequestHandler):
'%s pending spawns, throttling',
spawn_pending_count,
)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.throttled
).observe(time.perf_counter() - spawn_start_time)
raise web.HTTPError(
429,
"User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +436,9 @@ class BaseHandler(RequestHandler):
'%s servers active, no space available',
active_count,
)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.too_many_users
).observe(time.perf_counter() - spawn_start_time)
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
tic = IOLoop.current().time()
@@ -456,13 +471,28 @@ class BaseHandler(RequestHandler):
toc = IOLoop.current().time()
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
self.statsd.timing('spawner.success', (toc - tic) * 1000)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.success
).observe(time.perf_counter() - spawn_start_time)
proxy_add_start_time = time.perf_counter()
spawner._proxy_pending = True
try:
yield self.proxy.add_user(user, server_name)
PROXY_ADD_DURATION_SECONDS.labels(
status='success'
).observe(
time.perf_counter() - proxy_add_start_time
)
except Exception:
self.log.exception("Failed to add %s to proxy!", user_server_name)
self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
yield user.stop()
PROXY_ADD_DURATION_SECONDS.labels(
status='failure'
).observe(
time.perf_counter() - proxy_add_start_time
)
else:
spawner.add_poll_callback(self.user_stopped, user, server_name)
finally:
@@ -499,6 +529,9 @@ class BaseHandler(RequestHandler):
if status is not None:
toc = IOLoop.current().time()
self.statsd.timing('spawner.failure', (toc - tic) * 1000)
SERVER_SPAWN_DURATION_SECONDS.labels(
status=ServerSpawnStatus.failure
).observe(time.perf_counter() - spawn_start_time)
raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
status, spawner._log_name))
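The instrumentation above repeats one pattern on every exit path: take a time.perf_counter() reading when the spawn starts, then observe the elapsed time on the histogram with a status label describing the outcome. A minimal standalone sketch of that pattern (the histogram name and the operation are hypothetical, not JupyterHub code):

import time
from prometheus_client import Histogram

DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds',
    'duration of a demo operation',
    ['status'],
)

def do_operation(fail=False):
    start = time.perf_counter()
    try:
        if fail:
            raise RuntimeError("simulated failure")
    except RuntimeError:
        # record the failed attempt before re-raising
        DEMO_DURATION_SECONDS.labels(status='failure').observe(time.perf_counter() - start)
        raise
    DEMO_DURATION_SECONDS.labels(status='success').observe(time.perf_counter() - start)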

jupyterhub/handlers/metrics.py (new file)

@@ -0,0 +1,17 @@
from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
from tornado import gen
from .base import BaseHandler
class MetricsHandler(BaseHandler):
"""
Handler to serve Prometheus metrics
"""
@gen.coroutine
def get(self):
self.set_header('Content-Type', CONTENT_TYPE_LATEST)
self.write(generate_latest(REGISTRY))
default_handlers = [
(r'/metrics$', MetricsHandler)
]
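With this handler installed, the endpoint can be scraped like any other Prometheus target. A minimal sketch of fetching and parsing it by hand; the address and the /hub/metrics path are assumptions based on a default hub deployment:

import requests
from prometheus_client.parser import text_string_to_metric_families

# hypothetical hub address; adjust to the actual deployment
resp = requests.get('http://127.0.0.1:8000/hub/metrics')
resp.raise_for_status()
for family in text_string_to_metric_families(resp.text):
    print(family.name, len(family.samples))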

jupyterhub/log.py

@@ -8,6 +8,7 @@ import traceback
from tornado.log import LogFormatter, access_log
from tornado.web import StaticFileHandler, HTTPError
from .metrics import prometheus_log_method
def coroutine_traceback(typ, value, tb):
"""Scrub coroutine frames from a traceback
@@ -68,6 +69,7 @@ def log_request(handler):
- get proxied IP instead of proxy IP
- log referer for redirect and failed requests
- log user-agent for failed requests
- record per-request metrics in prometheus
"""
status = handler.get_status()
request = handler.request
@@ -120,3 +122,4 @@ def log_request(handler):
if location:
ns['location'] = '{}'.format(location)
log_method(msg.format(**ns))
prometheus_log_method(handler)
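prometheus_log_method is not registered as a handler; it rides along with log_request, which Tornado invokes once per completed request when it is installed as the application's log_function setting. A rough sketch of that wiring, with a placeholder handler list:

from tornado import web
from jupyterhub.log import log_request

app = web.Application(
    handlers=[],               # placeholder; the real hub installs its own handlers
    log_function=log_request,  # Tornado calls this once per finished request
)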

jupyterhub/metrics.py (new file, 94 lines)

@@ -0,0 +1,94 @@
"""
Prometheus metrics exported by JupyterHub
Read https://prometheus.io/docs/practices/naming/ for naming
conventions for metrics & labels. We generally prefer naming them
`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
the duration (in seconds) of servers spawning would be called
SERVER_SPAWN_DURATION_SECONDS.
We also create an Enum for each 'status' label used by a metric.
This ensures the labelled series exist whether or not the corresponding
condition has occurred yet. For example, if we don't explicitly create it,
the series server_spawn_duration_seconds{status="failure"} will not exist
until the first failure, which makes dashboarding and alerting difficult.
So we list the possible statuses and create the series up front here.
"""
from enum import Enum
from prometheus_client import Histogram
REQUEST_DURATION_SECONDS = Histogram(
'request_duration_seconds',
'request duration for all HTTP requests',
['method', 'handler', 'code']
)
SERVER_SPAWN_DURATION_SECONDS = Histogram(
'server_spawn_duration_seconds',
'time taken for server spawning operation',
['status'],
# Use custom bucket sizes, since the default buckets are geared
# toward quick-running operations. Spawns can take a while!
buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
)
class ServerSpawnStatus(Enum):
"""
Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
"""
success = 'success'
failure = 'failure'
already_pending = 'already-pending'
throttled = 'throttled'
too_many_users = 'too-many-users'
def __str__(self):
return self.value
for s in ServerSpawnStatus:
# Create empty metrics with the given status
SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
PROXY_ADD_DURATION_SECONDS = Histogram(
'proxy_add_duration_seconds',
'duration for adding user routes to proxy',
['status']
)
class ProxyAddStatus(Enum):
"""
Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
"""
success = 'success'
failure = 'failure'
def __str__(self):
return self.value
for s in ProxyAddStatus:
PROXY_ADD_DURATION_SECONDS.labels(status=s)
def prometheus_log_method(handler):
"""
Tornado log handler for recording RED metrics.
We record the following metrics:
    Rate: the number of requests per second the service is serving.
    Errors: the number of failed requests per second.
    Duration: the amount of time each request takes, expressed as a time interval.
We use the fully qualified name of the handler as a label,
rather than every URL path, to reduce cardinality.
This function should either be the Tornado 'log_function' setting itself,
or be called from that function, so that it runs at the end of every
request and records the metrics we need.
"""
REQUEST_DURATION_SECONDS.labels(
method=handler.request.method,
handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
code=handler.get_status()
).observe(handler.request.request_time())
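The pre-creation loops above are what the module docstring alludes to: calling .labels() for every status up front makes each labelled series appear in the exposition with zero counts, before anything has been observed. A small sketch of that effect on a throwaway registry (the metric name here is hypothetical):

from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()  # separate registry, just for this demonstration
demo = Histogram(
    'demo_spawn_duration_seconds', 'demo histogram', ['status'],
    registry=registry,
)
for status in ('success', 'failure', 'throttled'):
    demo.labels(status=status)  # pre-create the series; nothing observed yet

print(generate_latest(registry).decode())
# the output is expected to include zero-valued series such as:
#   demo_spawn_duration_seconds_count{status="failure"} 0.0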

requirements.txt

@@ -6,3 +6,4 @@ pamela
python-oauth2>=1.0
SQLAlchemy>=1.1
requests
prometheus_client>=0.0.21