Merge pull request #4525 from danilopeixoto/metrics-prefix

Add `JUPYTERHUB_METRICS_PREFIX` environment variable to customize metrics prefix
Min RK
2023-08-10 12:48:14 +02:00
committed by GitHub
4 changed files with 76 additions and 25 deletions


@@ -18,3 +18,17 @@ tool like [Grafana](https://grafana.com).
 /reference/metrics
 ```
+
+## Customizing the metrics prefix
+
+JupyterHub metrics all have a `jupyterhub_` prefix.
+As of JupyterHub 5.0, this can be overridden with the `$JUPYTERHUB_METRICS_PREFIX` environment variable
+in the Hub's environment.
+For example,
+
+```bash
+export JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod
+```
+
+would result in the metric `jupyterhub_prod_active_users`, etc.
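
For reference, here is a minimal standalone sketch (not part of this diff) of how `prometheus_client`'s `namespace` argument produces the prefixed names described above. The metric name and label mirror `active_users` from the changes below; the prefix value and the throwaway registry are just illustrative assumptions:

```python
import os

from prometheus_client import CollectorRegistry, Gauge, generate_latest

# Read the prefix the same way the Hub does, defaulting to "jupyterhub".
prefix = os.getenv("JUPYTERHUB_METRICS_PREFIX", "jupyterhub")

# Use a throwaway registry so this sketch doesn't touch the global one.
registry = CollectorRegistry()
ACTIVE_USERS = Gauge(
    "active_users",
    "Number of users who were active in the given time period",
    ["period"],
    namespace=prefix,
    registry=registry,
)
ACTIVE_USERS.labels(period="24h").set(3)

# With JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod this prints, among the
# HELP/TYPE comment lines, `jupyterhub_prod_active_users{period="24h"} 3.0`.
print(generate_latest(registry).decode())
```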


@@ -3,9 +3,11 @@ Prometheus metrics exported by JupyterHub
 Read https://prometheus.io/docs/practices/naming/ for naming
 conventions for metrics & labels. We generally prefer naming them
-`jupyterhub_<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
 the duration (in seconds) of servers spawning would be called
-jupyterhub_server_spawn_duration_seconds.
+server_spawn_duration_seconds.
+A namespace prefix is always added, so this metric is accessed as
+`jupyterhub_server_spawn_duration_seconds` by default.
 
 We also create an Enum for each 'status' type label in every metric
 we collect. This is to make sure that the metrics exist regardless
@@ -19,6 +21,8 @@ them manually here.
 added ``jupyterhub_`` prefix to metric names.
 """
+
+import os
 from datetime import timedelta
 from enum import Enum
@@ -30,49 +34,66 @@ from traitlets.config import LoggingConfigurable
 from . import orm
 from .utils import utcnow
 
+metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub')
+
 REQUEST_DURATION_SECONDS = Histogram(
-    'jupyterhub_request_duration_seconds',
-    'request duration for all HTTP requests',
+    'request_duration_seconds',
+    'Request duration for all HTTP requests',
     ['method', 'handler', 'code'],
+    namespace=metrics_prefix,
 )
 
 SERVER_SPAWN_DURATION_SECONDS = Histogram(
-    'jupyterhub_server_spawn_duration_seconds',
-    'time taken for server spawning operation',
+    'server_spawn_duration_seconds',
+    'Time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 180, 300, 600, float("inf")],
+    namespace=metrics_prefix,
 )
 
 RUNNING_SERVERS = Gauge(
-    'jupyterhub_running_servers', 'the number of user servers currently running'
+    'running_servers',
+    'The number of user servers currently running',
+    namespace=metrics_prefix,
 )
 
-TOTAL_USERS = Gauge('jupyterhub_total_users', 'total number of users')
+TOTAL_USERS = Gauge(
+    'total_users',
+    'Total number of users',
+    namespace=metrics_prefix,
+)
 
 ACTIVE_USERS = Gauge(
-    'jupyterhub_active_users',
-    'number of users who were active in the given time period',
+    'active_users',
+    'Number of users who were active in the given time period',
     ['period'],
+    namespace=metrics_prefix,
 )
 
 CHECK_ROUTES_DURATION_SECONDS = Histogram(
-    'jupyterhub_check_routes_duration_seconds',
     'Time taken to validate all routes in proxy',
+    'check_routes_duration_seconds',
+    'Time taken to validate all routes in proxy',
+    namespace=metrics_prefix,
 )
 
 HUB_STARTUP_DURATION_SECONDS = Histogram(
-    'jupyterhub_hub_startup_duration_seconds', 'Time taken for Hub to start'
+    'hub_startup_duration_seconds',
+    'Time taken for Hub to start',
+    namespace=metrics_prefix,
 )
 
 INIT_SPAWNERS_DURATION_SECONDS = Histogram(
-    'jupyterhub_init_spawners_duration_seconds', 'Time taken for spawners to initialize'
+    'init_spawners_duration_seconds',
+    'Time taken for spawners to initialize',
+    namespace=metrics_prefix,
 )
 
 PROXY_POLL_DURATION_SECONDS = Histogram(
-    'jupyterhub_proxy_poll_duration_seconds',
-    'duration for polling all routes from proxy',
+    'proxy_poll_duration_seconds',
+    'Duration for polling all routes from proxy',
+    namespace=metrics_prefix,
 )
@@ -97,9 +118,10 @@ for s in ServerSpawnStatus:
 PROXY_ADD_DURATION_SECONDS = Histogram(
-    'jupyterhub_proxy_add_duration_seconds',
-    'duration for adding user routes to proxy',
+    'proxy_add_duration_seconds',
+    'Duration for adding user routes to proxy',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -120,9 +142,10 @@ for s in ProxyAddStatus:
 SERVER_POLL_DURATION_SECONDS = Histogram(
-    'jupyterhub_server_poll_duration_seconds',
-    'time taken to poll if server is running',
+    'server_poll_duration_seconds',
+    'Time taken to poll if server is running',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -147,9 +170,10 @@ for s in ServerPollStatus:
 SERVER_STOP_DURATION_SECONDS = Histogram(
-    'jupyterhub_server_stop_seconds',
-    'time taken for server stopping operation',
+    'server_stop_seconds',
+    'Time taken for server stopping operation',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -170,9 +194,10 @@ for s in ServerStopStatus:
 PROXY_DELETE_DURATION_SECONDS = Histogram(
-    'jupyterhub_proxy_delete_duration_seconds',
-    'duration for deleting user routes from proxy',
+    'proxy_delete_duration_seconds',
+    'Duration for deleting user routes from proxy',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -239,7 +264,7 @@ class PeriodicMetricsCollector(LoggingConfigurable):
help=""" help="""
Enable active_users prometheus metric. Enable active_users prometheus metric.
Populates a `jupyterhub_active_users` prometheus metric, with a label `period` that counts the time period Populates a `active_users` prometheus metric, with a label `period` that counts the time period
over which these many users were active. Periods are 24h (24 hours), 7d (7 days) and 30d (30 days). over which these many users were active. Periods are 24h (24 hours), 7d (7 days) and 30d (30 days).
""", """,
config=True, config=True,
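
As a usage note (not part of this diff): once the Hub is running, the `active_users` samples can be checked by scraping the metrics endpoint. A hedged sketch, assuming a Hub reachable at `http://127.0.0.1:8000`, an API token allowed to read metrics, and the default `authenticate_prometheus = True`; the URL and token are placeholders for your deployment:

```python
import requests

# Hypothetical base URL and token; substitute your deployment's values.
hub_url = "http://127.0.0.1:8000"
api_token = "<api-token>"

resp = requests.get(
    f"{hub_url}/hub/metrics",
    headers={"Authorization": f"token {api_token}"},
)
resp.raise_for_status()

# Print only the active_users sample lines, skipping HELP/TYPE comments.
for line in resp.text.splitlines():
    if "active_users" in line and not line.startswith("#"):
        print(line)  # e.g. jupyterhub_active_users{period="24h"} 3.0
```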


@@ -10,6 +10,18 @@ from ..utils import utcnow
 from .utils import add_user, api_request, get_page
 
 
+@pytest.mark.parametrize(
+    "metric_object, expected_names",
+    [
+        (metrics.TOTAL_USERS, ['jupyterhub_total_users']),
+        (metrics.REQUEST_DURATION_SECONDS, ['jupyterhub_request_duration_seconds']),
+    ],
+)
+def test_metric_names(metric_object, expected_names):
+    for metric, expected_name in zip(metric_object.describe(), expected_names):
+        assert metric.name == expected_name
+
+
 async def test_total_users(app):
     num_users = app.db.query(orm.User).count()
     sample = metrics.TOTAL_USERS.collect()[0].samples[0]
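
For a quick interactive check outside pytest (a sketch assuming jupyterhub is installed), `describe()` reports the fully prefixed name that will be exported, which is exactly what the new test asserts:

```python
from jupyterhub import metrics

# With the default prefix this prints "jupyterhub_total_users"; if
# JUPYTERHUB_METRICS_PREFIX is set before the import, that prefix appears.
print(metrics.TOTAL_USERS.describe()[0].name)
```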


@@ -7,7 +7,7 @@ jupyter_telemetry>=0.1.0
 oauthlib>=3.0
 packaging
 pamela>=1.1.0; sys_platform != 'win32'
-prometheus_client>=0.4.0
+prometheus_client>=0.5.0
 psutil>=5.6.5; sys_platform == 'win32'
 python-dateutil
 requests