Make sure our metrics don't appear & disappear intermittently

Create all time series up front, regardless of whether the corresponding
events have occurred yet. Also rename metric objects for consistency.
commit 3cd526c019
parent ea99c58da5
author yuvipanda
date   2017-12-10 21:23:32 -08:00

2 changed files with 67 additions and 30 deletions


@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)

 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))

         # count active servers and pending spawns
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")

         tic = IOLoop.current().time()
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
         toc = IOLoop.current().time()
         self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
         self.statsd.timing('spawner.success', (toc - tic) * 1000)
-        SPAWN_DURATION_SECONDS.labels(
-            status='success'
-        ).observe(
-            time.perf_counter() - spawn_start_time
-        )
+        SERVER_SPAWN_DURATION_SECONDS.labels(
+            status=ServerSpawnStatus.success
+        ).observe(time.perf_counter() - spawn_start_time)
         proxy_add_start_time = time.perf_counter()
         spawner._proxy_pending = True
         try:
@@ -534,11 +529,9 @@ class BaseHandler(RequestHandler):
         if status is not None:
             toc = IOLoop.current().time()
             self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='failed'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.failure
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                 status, spawner._log_name))
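
Every call site in the handler diff above follows the same shape: capture a start
time with time.perf_counter(), then record the elapsed duration under a status
label on whichever exit path is taken. A minimal sketch of that pattern, assuming
the metrics introduced by this commit; the helper name timed_spawn and the
do_spawn callable are hypothetical stand-ins for the real spawn logic:

import time

from jupyterhub.metrics import SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus


async def timed_spawn(do_spawn):
    """Run `do_spawn` and record how long it took, labelled by outcome."""
    start = time.perf_counter()
    try:
        await do_spawn()
    except Exception:
        # Failure path: record the duration under status="failure", then re-raise.
        SERVER_SPAWN_DURATION_SECONDS.labels(
            status=ServerSpawnStatus.failure
        ).observe(time.perf_counter() - start)
        raise
    # Success path: same histogram, different status label.
    SERVER_SPAWN_DURATION_SECONDS.labels(
        status=ServerSpawnStatus.success
    ).observe(time.perf_counter() - start)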


@@ -2,8 +2,21 @@
 Prometheus metrics exported by JupyterHub

 Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
 """
+from enum import Enum
+
 from prometheus_client import Histogram

 REQUEST_DURATION_SECONDS = Histogram(
@@ -12,21 +25,52 @@ REQUEST_DURATION_SECONDS = Histogram(
     ['method', 'handler', 'code']
 )

-SPAWN_DURATION_SECONDS = Histogram(
-    'spawn_duration_seconds',
-    'spawn duration for all server spawns',
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )

+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
 PROXY_ADD_DURATION_SECONDS = Histogram(
     'proxy_add_duration_seconds',
     'duration for adding user routes to proxy',
     ['status']
 )

+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics.
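
The effect the module-level for loops are after can be seen with a standalone
registry. This is an illustrative sketch, not part of the commit: DemoStatus and
DEMO_DURATION_SECONDS are made-up names, and prometheus_client's generate_latest
is used only to show that every status series is already present, with zero
observations, before anything has been recorded:

from enum import Enum

from prometheus_client import CollectorRegistry, Histogram, generate_latest


class DemoStatus(Enum):
    success = 'success'
    failure = 'failure'

    def __str__(self):
        # prometheus_client stringifies label values, so this controls the
        # exported label: status="failure" rather than "DemoStatus.failure"
        return self.value


registry = CollectorRegistry()
DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds',
    'time taken for a demo operation',
    ['status'],
    registry=registry,
)

# Touch each label combination once so its _count/_sum/_bucket series
# exist at 0 from the first scrape onward.
for status in DemoStatus:
    DEMO_DURATION_SECONDS.labels(status=status)

# Both status="success" and status="failure" series appear in the scrape
# output even though nothing has been observed yet.
print(generate_latest(registry).decode())

Without that loop, a query such as rate(demo_duration_seconds_count{status="failure"}[5m])
returns no data until the first failure is observed, which is exactly the
intermittent appearance and disappearance the commit message describes.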