mirror of
https://github.com/jupyterhub/jupyterhub.git
synced 2025-10-18 15:33:02 +00:00
Make sure our metrics don't appear & disappear intermittently
Create all timeseries from the beginning, regardless of wether they happen or not. Also rename metric objects for consistency.
This commit is contained in:
@@ -23,7 +23,10 @@ from .. import orm
|
||||
from ..objects import Server
|
||||
from ..spawner import LocalProcessSpawner
|
||||
from ..utils import url_path_join
|
||||
from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
|
||||
from ..metrics import (
|
||||
SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
|
||||
PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
|
||||
)
|
||||
|
||||
# pattern for the authentication token header
|
||||
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
|
||||
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):
|
||||
|
||||
if server_name in user.spawners and user.spawners[server_name].pending:
|
||||
pending = user.spawners[server_name].pending
|
||||
SPAWN_DURATION_SECONDS.labels(
|
||||
status='already-pending'
|
||||
).observe(
|
||||
time.perf_counter() - spawn_start_time
|
||||
)
|
||||
SERVER_SPAWN_DURATION_SECONDS.labels(
|
||||
status=ServerSpawnStatus.already_pending
|
||||
).observe(time.perf_counter() - spawn_start_time)
|
||||
raise RuntimeError("%s pending %s" % (user_server_name, pending))
|
||||
|
||||
# count active servers and pending spawns
|
||||
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
|
||||
'%s pending spawns, throttling',
|
||||
spawn_pending_count,
|
||||
)
|
||||
SPAWN_DURATION_SECONDS.labels(
|
||||
status='throttled'
|
||||
).observe(
|
||||
time.perf_counter() - spawn_start_time
|
||||
)
|
||||
SERVER_SPAWN_DURATION_SECONDS.labels(
|
||||
status=ServerSpawnStatus.throttled
|
||||
).observe(time.perf_counter() - spawn_start_time)
|
||||
raise web.HTTPError(
|
||||
429,
|
||||
"User startup rate limit exceeded. Try again in a few minutes.",
|
||||
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
|
||||
'%s servers active, no space available',
|
||||
active_count,
|
||||
)
|
||||
SPAWN_DURATION_SECONDS.labels(
|
||||
status='too-many-users'
|
||||
).observe(
|
||||
time.perf_counter() - spawn_start_time
|
||||
)
|
||||
SERVER_SPAWN_DURATION_SECONDS.labels(
|
||||
status=ServerSpawnStatus.too_many_users
|
||||
).observe(time.perf_counter() - spawn_start_time)
|
||||
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
|
||||
|
||||
tic = IOLoop.current().time()
|
||||
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
|
||||
toc = IOLoop.current().time()
|
||||
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
|
||||
self.statsd.timing('spawner.success', (toc - tic) * 1000)
|
||||
SPAWN_DURATION_SECONDS.labels(
|
||||
status='success'
|
||||
).observe(
|
||||
time.perf_counter() - spawn_start_time
|
||||
)
|
||||
SERVER_SPAWN_DURATION_SECONDS.labels(
|
||||
status=ServerSpawnStatus.success
|
||||
).observe(time.perf_counter() - spawn_start_time)
|
||||
proxy_add_start_time = time.perf_counter()
|
||||
spawner._proxy_pending = True
|
||||
try:
|
||||
@@ -534,11 +529,9 @@ class BaseHandler(RequestHandler):
|
||||
if status is not None:
|
||||
toc = IOLoop.current().time()
|
||||
self.statsd.timing('spawner.failure', (toc - tic) * 1000)
|
||||
SPAWN_DURATION_SECONDS.labels(
|
||||
status='failed'
|
||||
).observe(
|
||||
time.perf_counter() - spawn_start_time
|
||||
)
|
||||
SERVER_SPAWN_DURATION_SECONDS.labels(
|
||||
status=ServerSpawnStatus.failure
|
||||
).observe(time.perf_counter() - spawn_start_time)
|
||||
raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
|
||||
status, spawner._log_name))
|
||||
|
||||
|
@@ -2,8 +2,21 @@
|
||||
Prometheus metrics exported by JupyterHub
|
||||
|
||||
Read https://prometheus.io/docs/practices/naming/ for naming
|
||||
conventions for metrics & labels.
|
||||
conventions for metrics & labels. We generally prefer naming them
|
||||
`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
|
||||
the duration (in seconds) of servers spawning would be called
|
||||
SERVER_SPAWN_DURATION_SECONDS.
|
||||
|
||||
We also create an Enum for each 'status' type label in every metric
|
||||
we collect. This is to make sure that the metrics exist regardless
|
||||
of the condition happening or not. For example, if we don't explicitly
|
||||
create them, the metric spawn_duration_seconds{status="failure"}
|
||||
will not actually exist until the first failure. This makes dashboarding
|
||||
and alerting difficult, so we explicitly list statuses and create
|
||||
them manually here.
|
||||
"""
|
||||
from enum import Enum
|
||||
|
||||
from prometheus_client import Histogram
|
||||
|
||||
REQUEST_DURATION_SECONDS = Histogram(
|
||||
@@ -12,21 +25,52 @@ REQUEST_DURATION_SECONDS = Histogram(
|
||||
['method', 'handler', 'code']
|
||||
)
|
||||
|
||||
SPAWN_DURATION_SECONDS = Histogram(
|
||||
'spawn_duration_seconds',
|
||||
'spawn duration for all server spawns',
|
||||
SERVER_SPAWN_DURATION_SECONDS = Histogram(
|
||||
'server_spawn_duration_seconds',
|
||||
'time taken for server spawning operation',
|
||||
['status'],
|
||||
# Use custom bucket sizes, since the default bucket ranges
|
||||
# are meant for quick running processes. Spawns can take a while!
|
||||
buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
|
||||
)
|
||||
|
||||
class ServerSpawnStatus(Enum):
|
||||
"""
|
||||
Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
|
||||
"""
|
||||
success = 'success'
|
||||
failure = 'failure'
|
||||
already_pending = 'already-pending'
|
||||
throttled = 'throttled'
|
||||
too_many_users = 'too-many-users'
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
for s in ServerSpawnStatus:
|
||||
# Create empty metrics with the given status
|
||||
SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
|
||||
|
||||
|
||||
PROXY_ADD_DURATION_SECONDS = Histogram(
|
||||
'proxy_add_duration_seconds',
|
||||
'duration for adding user routes to proxy',
|
||||
['status']
|
||||
)
|
||||
|
||||
class ProxyAddStatus(Enum):
|
||||
"""
|
||||
Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
|
||||
"""
|
||||
success = 'success'
|
||||
failure = 'failure'
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
for s in ProxyAddStatus:
|
||||
PROXY_ADD_DURATION_SECONDS.labels(status=s)
|
||||
|
||||
def prometheus_log_method(handler):
|
||||
"""
|
||||
Tornado log handler for recording RED metrics.
|
||||
|
Reference in New Issue
Block a user