"""
Prometheus metrics exported by JupyterHub

Read https://prometheus.io/docs/practices/naming/ for naming
conventions for metrics & labels. We generally prefer naming them
`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
the duration (in seconds) of servers spawning would be called
server_spawn_duration_seconds.

A namespace prefix is always added, so this metric is accessed as
`jupyterhub_server_spawn_duration_seconds` by default.

We also create an Enum for each 'status' type label in every metric
we collect. This is to make sure that the metrics exist regardless
of the condition happening or not. For example, if we don't explicitly
create them, the metric server_spawn_duration_seconds{status="failure"}
will not actually exist until the first failure. This makes dashboarding
and alerting difficult, so we explicitly list statuses and create
them manually here.

.. versionchanged:: 1.3

    added ``jupyterhub_`` prefix to metric names.
"""

import asyncio
import os
import time
from datetime import timedelta
from enum import Enum

from prometheus_client import Gauge, Histogram
from tornado.ioloop import PeriodicCallback
from traitlets import Any, Bool, Dict, Float, Integer
from traitlets.config import LoggingConfigurable

from . import orm
from .utils import utcnow

metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub')
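# For example (hypothetical value), starting the Hub with
# JUPYTERHUB_METRICS_PREFIX=myhub publishes myhub_total_users
# instead of jupyterhub_total_users.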

_env_buckets = os.environ.get(
    'JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS', ""
).strip()

if _env_buckets:
    spawn_duration_buckets = [float(_s) for _s in _env_buckets.split(",")]
else:
    spawn_duration_buckets = [
        0.5,
        1,
        2.5,
        5,
        10,
        15,
        30,
        60,
        120,
        180,
        300,
        600,
        float("inf"),
    ]
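
# The override is a comma-separated list of bucket boundaries,
# e.g. (hypothetical values):
#
#   JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS="1,5,15,60,300,inf"
#
# float() accepts the string "inf", so "inf" is a valid final boundary.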

REQUEST_DURATION_SECONDS = Histogram(
    'request_duration_seconds',
    'Request duration for all HTTP requests',
    ['method', 'handler', 'code'],
    namespace=metrics_prefix,
)

SERVER_SPAWN_DURATION_SECONDS = Histogram(
    'server_spawn_duration_seconds',
    'Time taken for server spawning operation',
    ['status'],
    # Use custom bucket sizes, since the default bucket ranges
    # are meant for quick-running processes. Spawns can take a while!
    buckets=spawn_duration_buckets,
    namespace=metrics_prefix,
)

RUNNING_SERVERS = Gauge(
    'running_servers',
    'The number of user servers currently running',
    namespace=metrics_prefix,
)

TOTAL_USERS = Gauge(
    'total_users',
    'Total number of users',
    namespace=metrics_prefix,
)

ACTIVE_USERS = Gauge(
    'active_users',
    'Number of users who were active in the given time period',
    ['period'],
    namespace=metrics_prefix,
)

CHECK_ROUTES_DURATION_SECONDS = Histogram(
    'check_routes_duration_seconds',
    'Time taken to validate all routes in proxy',
    namespace=metrics_prefix,
)

HUB_STARTUP_DURATION_SECONDS = Histogram(
    'hub_startup_duration_seconds',
    'Time taken for Hub to start',
    namespace=metrics_prefix,
)

INIT_SPAWNERS_DURATION_SECONDS = Histogram(
    'init_spawners_duration_seconds',
    'Time taken for spawners to initialize',
    namespace=metrics_prefix,
)

PROXY_POLL_DURATION_SECONDS = Histogram(
    'proxy_poll_duration_seconds',
    'Duration for polling all routes from proxy',
    namespace=metrics_prefix,
)


class ServerSpawnStatus(Enum):
    """
    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
    """

    success = 'success'
    failure = 'failure'
    already_pending = 'already-pending'
    throttled = 'throttled'
    too_many_users = 'too-many-users'

    def __str__(self):
        # prometheus_client stringifies label values, so return the plain
        # value: 'failure' rather than 'ServerSpawnStatus.failure'
        return self.value


for s in ServerSpawnStatus:
    # Create empty metrics with the given status
    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
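# After this loop, zero-valued series such as
# jupyterhub_server_spawn_duration_seconds_count{status="failure"}
# are already exported, so dashboards and alerts can reference them
# before the first spawn failure ever happens.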


PROXY_ADD_DURATION_SECONDS = Histogram(
    'proxy_add_duration_seconds',
    'Duration for adding user routes to proxy',
    ['status'],
    namespace=metrics_prefix,
)


class ProxyAddStatus(Enum):
    """
    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
    """

    success = 'success'
    failure = 'failure'

    def __str__(self):
        return self.value


for s in ProxyAddStatus:
    PROXY_ADD_DURATION_SECONDS.labels(status=s)


SERVER_POLL_DURATION_SECONDS = Histogram(
    'server_poll_duration_seconds',
    'Time taken to poll if server is running',
    ['status'],
    namespace=metrics_prefix,
)


class ServerPollStatus(Enum):
    """
    Possible values for 'status' label of SERVER_POLL_DURATION_SECONDS
    """

    running = 'running'
    stopped = 'stopped'

    @classmethod
    def from_status(cls, status):
        """Return enum member for a given poll status"""
        if status is None:
            return cls.running
        return cls.stopped
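
    # Spawner.poll() returns None while the server process is running
    # and an exit status otherwise, hence the mapping above:
    #
    #     ServerPollStatus.from_status(None)  # -> ServerPollStatus.running
    #     ServerPollStatus.from_status(0)     # -> ServerPollStatus.stopped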


for s in ServerPollStatus:
    SERVER_POLL_DURATION_SECONDS.labels(status=s)


SERVER_STOP_DURATION_SECONDS = Histogram(
    # note: published as 'server_stop_seconds', which does not follow the
    # `_duration_seconds` naming convention used elsewhere in this module
    'server_stop_seconds',
    'Time taken for server stopping operation',
    ['status'],
    namespace=metrics_prefix,
)


class ServerStopStatus(Enum):
    """
    Possible values for 'status' label of SERVER_STOP_DURATION_SECONDS
    """

    success = 'success'
    failure = 'failure'

    def __str__(self):
        return self.value


for s in ServerStopStatus:
    SERVER_STOP_DURATION_SECONDS.labels(status=s)


PROXY_DELETE_DURATION_SECONDS = Histogram(
    'proxy_delete_duration_seconds',
    'Duration for deleting user routes from proxy',
    ['status'],
    namespace=metrics_prefix,
)


class ProxyDeleteStatus(Enum):
    """
    Possible values for 'status' label of PROXY_DELETE_DURATION_SECONDS
    """

    success = 'success'
    failure = 'failure'

    def __str__(self):
        return self.value


for s in ProxyDeleteStatus:
    PROXY_DELETE_DURATION_SECONDS.labels(status=s)


class ActiveUserPeriods(Enum):
    """
    Possible values for 'period' label of ACTIVE_USERS
    """

    twenty_four_hours = '24h'
    seven_days = '7d'
    thirty_days = '30d'


for s in ActiveUserPeriods:
    ACTIVE_USERS.labels(period=s.value)


EVENT_LOOP_INTERVAL_SECONDS = Histogram(
    'event_loop_interval_seconds',
    'Distribution of measured event loop intervals',
    namespace=metrics_prefix,
    # No need for buckets below 50ms, our default measurement resolution:
    # we only measure every 50ms, so we can't observe anything shorter.
    # Use 5ms-wide buckets from 50-75ms,
    # because this is where we are most sensitive.
    buckets=[
        # 5ms resolution from 50-75ms
        50e-3,
        55e-3,
        60e-3,
        65e-3,
        70e-3,
        # from here, default prometheus buckets
        75e-3,
        0.1,
        0.25,
        0.5,
        0.75,
        1,
        2.5,
        5,
        7.5,
        10,
        float("inf"),
    ],
)


def prometheus_log_method(handler):
    """
    Tornado log handler for recording RED metrics.

    We record the following metrics:
        Rate: the number of requests, per second, your services are serving.
        Errors: the number of failed requests per second.
        Duration: the amount of time each request takes,
        expressed as a time interval.

    We use the fully qualified name of the handler as a label,
    rather than every URL path, to reduce cardinality.

    This function should be (or be called from) the function set as the
    'log_function' tornado setting, so that it runs at the end of every
    request, allowing us to record the metrics we need.
    """
    REQUEST_DURATION_SECONDS.labels(
        method=handler.request.method,
        handler=f'{handler.__class__.__module__}.{type(handler).__name__}',
        code=handler.get_status(),
    ).observe(handler.request.request_time())
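

# A minimal wiring sketch (hypothetical `handlers` list; JupyterHub itself
# sets this up when building its tornado application):
#
#     import tornado.web
#     app = tornado.web.Application(
#         handlers, log_function=prometheus_log_method
#     )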


class PeriodicMetricsCollector(LoggingConfigurable):
    """
    Collect metrics to be calculated periodically
    """

    active_users_enabled = Bool(
        True,
        help="""
        Enable the active_users prometheus metric.

        Populates an `active_users` prometheus metric with a `period` label,
        counting the number of users who were active within that time period.
        Periods are 24h (24 hours), 7d (7 days) and 30d (30 days).
        """,
        config=True,
    )

    active_users_update_interval = Integer(
        60 * 60,
        help="""
        Number of seconds between updates of the active_users metrics.

        To avoid extra load on the database, this is only calculated
        periodically rather than at per-minute intervals.
        Defaults to once an hour.
        """,
        config=True,
    )

    event_loop_interval_enabled = Bool(
        True,
        config=True,
        help="""
        Enable the event_loop_interval_seconds metric.

        Measures event-loop responsiveness.
        """,
    )

    event_loop_interval_resolution = Float(
        0.05,
        config=True,
        help="""
        Interval (in seconds) on which to measure the event loop interval.

        This is the _sensitivity_ of the `event_loop_interval` metric.
        Setting it too low (e.g. below 20ms) can end up slowing down the
        whole event loop by measuring too often,
        while setting it too high (e.g. above a few seconds) may limit its
        resolution and usefulness.
        The Prometheus Histogram populated by this metric
        doesn't resolve differences below 25ms,
        so setting this below ~20ms won't result in increased resolution of
        the histogram metric, except for the average value, computed by::

            event_loop_interval_seconds_sum / event_loop_interval_seconds_count
        """,
    )

    event_loop_interval_log_threshold = Float(
        1,
        config=True,
        help="""Log when the event loop blocks for at least this many seconds.""",
    )

    # internal state
    _tasks = Dict()
    _periodic_callbacks = Dict()

    db = Any(help="SQLAlchemy db session to use for performing queries")

    def update_active_users(self):
        """Update active users metrics."""

        # All the metrics should be based off a cutoff from a *fixed* point,
        # so we calculate the fixed point here - and then calculate the
        # individual cutoffs in relation to this fixed point.
        now = utcnow()
        cutoffs = {
            ActiveUserPeriods.twenty_four_hours: now - timedelta(hours=24),
            ActiveUserPeriods.seven_days: now - timedelta(days=7),
            ActiveUserPeriods.thirty_days: now - timedelta(days=30),
        }
        for period, cutoff in cutoffs.items():
            value = (
                self.db.query(orm.User)
                .filter(orm.User.last_activity >= cutoff)
                .count()
            )

            self.log.info(
                f'Found {value} active users in the last {period.value}'
            )
            ACTIVE_USERS.labels(period=period.value).set(value)

    async def _measure_event_loop_interval(self):
        """Measure the event loop responsiveness

        A single long-running coroutine because PeriodicCallback is too
        expensive to measure small intervals.
        """
        tick = time.perf_counter

        last_tick = tick()
        resolution = self.event_loop_interval_resolution
        lower_bound = 2 * resolution
        # This loop runs _very_ often, so try to keep it efficient.
        # Even excess comparisons and assignments have a measurable effect
        # on overall cpu usage.
        while True:
            await asyncio.sleep(resolution)
            now = tick()
            # measure the _difference_ between the sleep time and the
            # measured time; the event loop blocked for somewhere in the
            # range [delay, delay + resolution]
            tick_duration = now - last_tick
            last_tick = now
            if tick_duration < lower_bound:
                # don't report numbers less than the measurement resolution;
                # we don't really have that information
                delay = resolution
            else:
                delay = tick_duration - resolution
                if delay >= self.event_loop_interval_log_threshold:
                    # warn about slow ticks
                    self.log.warning(
                        "Event loop was unresponsive for at least %.2fs!",
                        delay,
                    )

            EVENT_LOOP_INTERVAL_SECONDS.observe(delay)
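            # e.g. with the default 50ms resolution, a tick measured at
            # 120ms records delay = 120ms - 50ms = 70ms, while a 60ms tick
            # (below lower_bound) records the 50ms resolution floor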

    def start(self):
        """
        Start the periodic update process
        """
        if self.active_users_enabled:
            # Setup periodic refresh of the metric
            self._periodic_callbacks["active_users"] = PeriodicCallback(
                self.update_active_users,
                self.active_users_update_interval * 1000,
                jitter=0.01,
            )

            # Update the metrics once on startup too
            self.update_active_users()

        if self.event_loop_interval_enabled:
            self._tasks["event_loop_tick"] = asyncio.create_task(
                self._measure_event_loop_interval()
            )

        # start callbacks
        for pc in self._periodic_callbacks.values():
            pc.start()

    def stop(self):
        """
        Stop collecting metrics
        """
        for pc in self._periodic_callbacks.values():
            pc.stop()
        for task in self._tasks.values():
            task.cancel()
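

# A minimal usage sketch (hypothetical: inside JupyterHub the Hub application
# constructs and manages the collector itself). Assumes an existing SQLAlchemy
# session `db_session` and a running asyncio event loop, which start()
# requires for asyncio.create_task():
#
#     collector = PeriodicMetricsCollector(db=db_session)
#     collector.start()
#     ...
#     collector.stop()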