Mirror of https://github.com/jupyterhub/jupyterhub.git, synced 2025-10-18 15:33:02 +00:00
Make sure our metrics don't appear & disappear intermittently
Create all timeseries from the beginning, regardless of whether the corresponding events have occurred yet. Also rename metric objects for consistency.
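The underlying technique is to touch every label combination at import time so Prometheus exports each series with a zero count before any event is observed. A minimal sketch with prometheus_client follows; the metric and status names here are illustrative stand-ins, not the JupyterHub objects themselves:

from prometheus_client import Histogram, generate_latest

# Hypothetical histogram with a 'status' label, for illustration only.
DEMO_DURATION_SECONDS = Histogram(
    'demo_duration_seconds',
    'time taken for a demo operation',
    ['status'],
)

# Touch every label combination up front; prometheus_client registers each
# child series immediately, so it is exported even with zero observations.
for status in ('success', 'failure'):
    DEMO_DURATION_SECONDS.labels(status=status)

# Every series now appears in the exposition output, e.g.
# demo_duration_seconds_count{status="failure"} 0.0
print(generate_latest().decode())

The same idea appears in the diff below, where a for loop over an Enum of statuses pre-registers each labelled series.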
@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='success'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
             proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
@@ -534,11 +529,9 @@ class BaseHandler(RequestHandler):
             if status is not None:
                 toc = IOLoop.current().time()
                 self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-                SPAWN_DURATION_SECONDS.labels(
-                    status='failed'
-                ).observe(
-                    time.perf_counter() - spawn_start_time
-                )
+                SERVER_SPAWN_DURATION_SECONDS.labels(
+                    status=ServerSpawnStatus.failure
+                ).observe(time.perf_counter() - spawn_start_time)
                 raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                     status, spawner._log_name))
 
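The handler-side change above relies on the enum members stringifying to the old label values; prometheus_client coerces label values to strings, which is why the enums below define __str__. A minimal sketch of that behaviour, using a stand-in enum and histogram rather than the JupyterHub objects:

from enum import Enum
from prometheus_client import Histogram

class DemoStatus(Enum):
    # Hypothetical stand-in for ServerSpawnStatus, illustration only.
    success = 'success'
    already_pending = 'already-pending'

    def __str__(self):
        # Without this override, str(DemoStatus.success) would be
        # 'DemoStatus.success' rather than 'success'.
        return self.value

DEMO_SPAWN_SECONDS = Histogram(
    'demo_spawn_duration_seconds', 'illustrative histogram', ['status']
)

# Passing the enum member directly yields the label {status="already-pending"}.
DEMO_SPAWN_SECONDS.labels(status=DemoStatus.already_pending).observe(1.5)

The remaining hunks below are against the metrics module itself (its docstring, "Prometheus metrics exported by JupyterHub", identifies it).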
|
@@ -2,8 +2,21 @@
 Prometheus metrics exported by JupyterHub
 
 Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
 """
+from enum import Enum
+
 from prometheus_client import Histogram
 
 REQUEST_DURATION_SECONDS = Histogram(
@@ -12,21 +25,52 @@ REQUEST_DURATION_SECONDS = Histogram(
     ['method', 'handler', 'code']
 )
 
-SPAWN_DURATION_SECONDS = Histogram(
-    'spawn_duration_seconds',
-    'spawn duration for all server spawns',
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )
 
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
 PROXY_ADD_DURATION_SECONDS = Histogram(
     'proxy_add_duration_seconds',
     'duration for adding user routes to proxy',
     ['status']
 )
 
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics.
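To verify the effect of the pre-created series, one can inspect the default registry after importing the metrics module. A small sketch, assuming jupyterhub is importable and that the module path follows from the `from ..metrics import ...` statement in the first hunk:

from prometheus_client import generate_latest

import jupyterhub.metrics  # noqa: F401  (importing registers the series)

# Print the _count sample for each status; all appear with 0.0 before any
# spawn has happened.
for line in generate_latest().decode().splitlines():
    if line.startswith('server_spawn_duration_seconds_count'):
        print(line)
# Expected shape (one line per ServerSpawnStatus value), e.g.:
# server_spawn_duration_seconds_count{status="already-pending"} 0.0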