mirror of
https://github.com/jupyterhub/jupyterhub.git
synced 2025-10-15 14:03:02 +00:00
Add metric recording spawn durations
Try to hit every possible exit point from the spawn_single_server method, with an appropriate status code. The default histogram buckets are also meant for request latencies, but spawning usually takes longer so we use custom buckets
This commit is contained in:
@@ -5,6 +5,7 @@
|
|||||||
|
|
||||||
import copy
|
import copy
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from http.client import responses
|
from http.client import responses
|
||||||
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
|
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
|
||||||
@@ -22,6 +23,7 @@ from .. import orm
|
|||||||
from ..objects import Server
|
from ..objects import Server
|
||||||
from ..spawner import LocalProcessSpawner
|
from ..spawner import LocalProcessSpawner
|
||||||
from ..utils import url_path_join
|
from ..utils import url_path_join
|
||||||
|
from ..metrics import SPAWN_DURATION_SECONDS
|
||||||
|
|
||||||
# pattern for the authentication token header
|
# pattern for the authentication token header
|
||||||
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
|
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
|
||||||
@@ -388,6 +390,7 @@ class BaseHandler(RequestHandler):
|
|||||||
@gen.coroutine
|
@gen.coroutine
|
||||||
def spawn_single_user(self, user, server_name='', options=None):
|
def spawn_single_user(self, user, server_name='', options=None):
|
||||||
# in case of error, include 'try again from /hub/home' message
|
# in case of error, include 'try again from /hub/home' message
|
||||||
|
spawn_starttime = time.perf_counter()
|
||||||
self.extra_error_html = self.spawn_home_error
|
self.extra_error_html = self.spawn_home_error
|
||||||
|
|
||||||
user_server_name = user.name
|
user_server_name = user.name
|
||||||
@@ -397,6 +400,11 @@ class BaseHandler(RequestHandler):
|
|||||||
|
|
||||||
if server_name in user.spawners and user.spawners[server_name].pending:
|
if server_name in user.spawners and user.spawners[server_name].pending:
|
||||||
pending = user.spawners[server_name].pending
|
pending = user.spawners[server_name].pending
|
||||||
|
SPAWN_DURATION_SECONDS.labels(
|
||||||
|
status='already-pending'
|
||||||
|
).observe(
|
||||||
|
time.perf_counter() - spawn_starttime
|
||||||
|
)
|
||||||
raise RuntimeError("%s pending %s" % (user_server_name, pending))
|
raise RuntimeError("%s pending %s" % (user_server_name, pending))
|
||||||
|
|
||||||
# count active servers and pending spawns
|
# count active servers and pending spawns
|
||||||
@@ -415,6 +423,11 @@ class BaseHandler(RequestHandler):
|
|||||||
'%s pending spawns, throttling',
|
'%s pending spawns, throttling',
|
||||||
spawn_pending_count,
|
spawn_pending_count,
|
||||||
)
|
)
|
||||||
|
SPAWN_DURATION_SECONDS.labels(
|
||||||
|
status='throttled'
|
||||||
|
).observe(
|
||||||
|
time.perf_counter() - spawn_starttime
|
||||||
|
)
|
||||||
raise web.HTTPError(
|
raise web.HTTPError(
|
||||||
429,
|
429,
|
||||||
"User startup rate limit exceeded. Try again in a few minutes.",
|
"User startup rate limit exceeded. Try again in a few minutes.",
|
||||||
@@ -424,6 +437,11 @@ class BaseHandler(RequestHandler):
|
|||||||
'%s servers active, no space available',
|
'%s servers active, no space available',
|
||||||
active_count,
|
active_count,
|
||||||
)
|
)
|
||||||
|
SPAWN_DURATION_SECONDS.labels(
|
||||||
|
status='too-many-users'
|
||||||
|
).observe(
|
||||||
|
time.perf_counter() - spawn_starttime
|
||||||
|
)
|
||||||
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
|
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
|
||||||
|
|
||||||
tic = IOLoop.current().time()
|
tic = IOLoop.current().time()
|
||||||
@@ -456,6 +474,11 @@ class BaseHandler(RequestHandler):
|
|||||||
toc = IOLoop.current().time()
|
toc = IOLoop.current().time()
|
||||||
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
|
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
|
||||||
self.statsd.timing('spawner.success', (toc - tic) * 1000)
|
self.statsd.timing('spawner.success', (toc - tic) * 1000)
|
||||||
|
SPAWN_DURATION_SECONDS.labels(
|
||||||
|
status='success'
|
||||||
|
).observe(
|
||||||
|
time.perf_counter() - spawn_starttime
|
||||||
|
)
|
||||||
spawner._proxy_pending = True
|
spawner._proxy_pending = True
|
||||||
try:
|
try:
|
||||||
yield self.proxy.add_user(user, server_name)
|
yield self.proxy.add_user(user, server_name)
|
||||||
@@ -499,6 +522,11 @@ class BaseHandler(RequestHandler):
|
|||||||
if status is not None:
|
if status is not None:
|
||||||
toc = IOLoop.current().time()
|
toc = IOLoop.current().time()
|
||||||
self.statsd.timing('spawner.failure', (toc - tic) * 1000)
|
self.statsd.timing('spawner.failure', (toc - tic) * 1000)
|
||||||
|
SPAWN_DURATION_SECONDS.labels(
|
||||||
|
status='failed'
|
||||||
|
).observe(
|
||||||
|
time.perf_counter() - spawn_starttime
|
||||||
|
)
|
||||||
raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
|
raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
|
||||||
status, spawner._log_name))
|
status, spawner._log_name))
|
||||||
|
|
||||||
|
@@ -9,6 +9,13 @@ REQUEST_DURATION_SECONDS = Histogram(
|
|||||||
['method', 'handler', 'code']
|
['method', 'handler', 'code']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
SPAWN_DURATION_SECONDS = Histogram(
|
||||||
|
'spawn_duration_seconds',
|
||||||
|
'spawn duration for all server spawns',
|
||||||
|
['status'],
|
||||||
|
buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
|
||||||
|
)
|
||||||
|
|
||||||
def prometheus_log_method(handler):
|
def prometheus_log_method(handler):
|
||||||
"""
|
"""
|
||||||
Tornado log handler for recording RED metrics
|
Tornado log handler for recording RED metrics
|
||||||
|
Reference in New Issue
Block a user