Add metric recording spawn durations

Try to hit every possible exit point from the spawn_single_server
method, with an appropriate status code.

The default histogram buckets are also meant for request latencies,
but spawning usually takes longer so we use custom buckets
This commit is contained in:
yuvipanda
2017-12-10 14:54:34 -08:00
parent 339758ec42
commit 6594e88390
2 changed files with 35 additions and 0 deletions

View File

@@ -5,6 +5,7 @@
import copy import copy
import re import re
import time
from datetime import timedelta from datetime import timedelta
from http.client import responses from http.client import responses
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,7 @@ from .. import orm
from ..objects import Server from ..objects import Server
from ..spawner import LocalProcessSpawner from ..spawner import LocalProcessSpawner
from ..utils import url_path_join from ..utils import url_path_join
from ..metrics import SPAWN_DURATION_SECONDS
# pattern for the authentication token header # pattern for the authentication token header
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE) auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +390,7 @@ class BaseHandler(RequestHandler):
@gen.coroutine @gen.coroutine
def spawn_single_user(self, user, server_name='', options=None): def spawn_single_user(self, user, server_name='', options=None):
# in case of error, include 'try again from /hub/home' message # in case of error, include 'try again from /hub/home' message
spawn_starttime = time.perf_counter()
self.extra_error_html = self.spawn_home_error self.extra_error_html = self.spawn_home_error
user_server_name = user.name user_server_name = user.name
@@ -397,6 +400,11 @@ class BaseHandler(RequestHandler):
if server_name in user.spawners and user.spawners[server_name].pending: if server_name in user.spawners and user.spawners[server_name].pending:
pending = user.spawners[server_name].pending pending = user.spawners[server_name].pending
SPAWN_DURATION_SECONDS.labels(
status='already-pending'
).observe(
time.perf_counter() - spawn_starttime
)
raise RuntimeError("%s pending %s" % (user_server_name, pending)) raise RuntimeError("%s pending %s" % (user_server_name, pending))
# count active servers and pending spawns # count active servers and pending spawns
@@ -415,6 +423,11 @@ class BaseHandler(RequestHandler):
'%s pending spawns, throttling', '%s pending spawns, throttling',
spawn_pending_count, spawn_pending_count,
) )
SPAWN_DURATION_SECONDS.labels(
status='throttled'
).observe(
time.perf_counter() - spawn_starttime
)
raise web.HTTPError( raise web.HTTPError(
429, 429,
"User startup rate limit exceeded. Try again in a few minutes.", "User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +437,11 @@ class BaseHandler(RequestHandler):
'%s servers active, no space available', '%s servers active, no space available',
active_count, active_count,
) )
SPAWN_DURATION_SECONDS.labels(
status='too-many-users'
).observe(
time.perf_counter() - spawn_starttime
)
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.") raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
tic = IOLoop.current().time() tic = IOLoop.current().time()
@@ -456,6 +474,11 @@ class BaseHandler(RequestHandler):
toc = IOLoop.current().time() toc = IOLoop.current().time()
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic) self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
self.statsd.timing('spawner.success', (toc - tic) * 1000) self.statsd.timing('spawner.success', (toc - tic) * 1000)
SPAWN_DURATION_SECONDS.labels(
status='success'
).observe(
time.perf_counter() - spawn_starttime
)
spawner._proxy_pending = True spawner._proxy_pending = True
try: try:
yield self.proxy.add_user(user, server_name) yield self.proxy.add_user(user, server_name)
@@ -499,6 +522,11 @@ class BaseHandler(RequestHandler):
if status is not None: if status is not None:
toc = IOLoop.current().time() toc = IOLoop.current().time()
self.statsd.timing('spawner.failure', (toc - tic) * 1000) self.statsd.timing('spawner.failure', (toc - tic) * 1000)
SPAWN_DURATION_SECONDS.labels(
status='failed'
).observe(
time.perf_counter() - spawn_starttime
)
raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % ( raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
status, spawner._log_name)) status, spawner._log_name))

View File

@@ -9,6 +9,13 @@ REQUEST_DURATION_SECONDS = Histogram(
['method', 'handler', 'code'] ['method', 'handler', 'code']
) )
SPAWN_DURATION_SECONDS = Histogram(
'spawn_duration_seconds',
'spawn duration for all server spawns',
['status'],
buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
)
def prometheus_log_method(handler): def prometheus_log_method(handler):
""" """
Tornado log handler for recording RED metrics Tornado log handler for recording RED metrics