diff --git a/docs/source/reference/monitoring.md b/docs/source/reference/monitoring.md index a1a01635..a3649c82 100644 --- a/docs/source/reference/monitoring.md +++ b/docs/source/reference/monitoring.md @@ -18,3 +18,17 @@ tool like [Grafana](https://grafana.com). /reference/metrics ``` + +## Customizing the metrics prefix + +JupyterHub metrics all have a `jupyterhub_` prefix. +As of JupyterHub 5.0, this can be overridden with the `$JUPYTERHUB_METRICS_PREFIX` environment variable +in the Hub's environment. + +For example, + +```bash +export JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod +``` + +would result in the metric `jupyterhub_prod_active_users`, etc. diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py index 24e9a0c4..c3bfd42d 100644 --- a/jupyterhub/metrics.py +++ b/jupyterhub/metrics.py @@ -3,9 +3,11 @@ Prometheus metrics exported by JupyterHub Read https://prometheus.io/docs/practices/naming/ for naming conventions for metrics & labels. We generally prefer naming them -`jupyterhub_<noun>_<verb>_<type_suffix>`. So a histogram that's tracking +`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking the duration (in seconds) of servers spawning would be called -jupyterhub_server_spawn_duration_seconds. +server_spawn_duration_seconds. +A namespace prefix is always added, so this metric is accessed as +`jupyterhub_server_spawn_duration_seconds` by default. We also create an Enum for each 'status' type label in every metric we collect. This is to make sure that the metrics exist regardless @@ -19,6 +21,8 @@ them manually here. added ``jupyterhub_`` prefix to metric names. """ + +import os from datetime import timedelta from enum import Enum @@ -30,49 +34,66 @@ from traitlets.config import LoggingConfigurable from . 
import orm from .utils import utcnow +metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub') + REQUEST_DURATION_SECONDS = Histogram( - 'jupyterhub_request_duration_seconds', - 'request duration for all HTTP requests', + 'request_duration_seconds', + 'Request duration for all HTTP requests', ['method', 'handler', 'code'], + namespace=metrics_prefix, ) SERVER_SPAWN_DURATION_SECONDS = Histogram( - 'jupyterhub_server_spawn_duration_seconds', - 'time taken for server spawning operation', + 'server_spawn_duration_seconds', + 'Time taken for server spawning operation', ['status'], # Use custom bucket sizes, since the default bucket ranges # are meant for quick running processes. Spawns can take a while! buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 180, 300, 600, float("inf")], + namespace=metrics_prefix, ) RUNNING_SERVERS = Gauge( - 'jupyterhub_running_servers', 'the number of user servers currently running' + 'running_servers', + 'The number of user servers currently running', + namespace=metrics_prefix, ) -TOTAL_USERS = Gauge('jupyterhub_total_users', 'total number of users') +TOTAL_USERS = Gauge( + 'total_users', + 'Total number of users', + namespace=metrics_prefix, +) ACTIVE_USERS = Gauge( - 'jupyterhub_active_users', - 'number of users who were active in the given time period', + 'active_users', + 'Number of users who were active in the given time period', ['period'], + namespace=metrics_prefix, ) CHECK_ROUTES_DURATION_SECONDS = Histogram( - 'jupyterhub_check_routes_duration_seconds', + 'check_routes_duration_seconds', 'Time taken to validate all routes in proxy', + namespace=metrics_prefix, ) HUB_STARTUP_DURATION_SECONDS = Histogram( - 'jupyterhub_hub_startup_duration_seconds', 'Time taken for Hub to start' + 'hub_startup_duration_seconds', + 'Time taken for Hub to start', + namespace=metrics_prefix, ) INIT_SPAWNERS_DURATION_SECONDS = Histogram( - 'jupyterhub_init_spawners_duration_seconds', 'Time taken for spawners to initialize' + 
'init_spawners_duration_seconds', + 'Time taken for spawners to initialize', + namespace=metrics_prefix, ) PROXY_POLL_DURATION_SECONDS = Histogram( - 'jupyterhub_proxy_poll_duration_seconds', - 'duration for polling all routes from proxy', + 'proxy_poll_duration_seconds', + 'Duration for polling all routes from proxy', + namespace=metrics_prefix, ) @@ -97,9 +118,10 @@ for s in ServerSpawnStatus: PROXY_ADD_DURATION_SECONDS = Histogram( - 'jupyterhub_proxy_add_duration_seconds', - 'duration for adding user routes to proxy', + 'proxy_add_duration_seconds', + 'Duration for adding user routes to proxy', ['status'], + namespace=metrics_prefix, ) @@ -120,9 +142,10 @@ for s in ProxyAddStatus: SERVER_POLL_DURATION_SECONDS = Histogram( - 'jupyterhub_server_poll_duration_seconds', - 'time taken to poll if server is running', + 'server_poll_duration_seconds', + 'Time taken to poll if server is running', ['status'], + namespace=metrics_prefix, ) @@ -147,9 +170,10 @@ for s in ServerPollStatus: SERVER_STOP_DURATION_SECONDS = Histogram( - 'jupyterhub_server_stop_seconds', - 'time taken for server stopping operation', + 'server_stop_seconds', + 'Time taken for server stopping operation', ['status'], + namespace=metrics_prefix, ) @@ -170,9 +194,10 @@ for s in ServerStopStatus: PROXY_DELETE_DURATION_SECONDS = Histogram( - 'jupyterhub_proxy_delete_duration_seconds', - 'duration for deleting user routes from proxy', + 'proxy_delete_duration_seconds', + 'Duration for deleting user routes from proxy', ['status'], + namespace=metrics_prefix, ) @@ -239,7 +264,7 @@ class PeriodicMetricsCollector(LoggingConfigurable): help=""" Enable active_users prometheus metric. - Populates a `jupyterhub_active_users` prometheus metric, with a label `period` that counts the time period + Populates an `active_users` prometheus metric, with a label `period` that counts the time period over which these many users were active. Periods are 24h (24 hours), 7d (7 days) and 30d (30 days). 
""", config=True, diff --git a/jupyterhub/tests/test_metrics.py b/jupyterhub/tests/test_metrics.py index 4f3091cf..7e263095 100644 --- a/jupyterhub/tests/test_metrics.py +++ b/jupyterhub/tests/test_metrics.py @@ -10,6 +10,18 @@ from ..utils import utcnow from .utils import add_user, api_request, get_page +@pytest.mark.parametrize( + "metric_object, expected_names", + [ + (metrics.TOTAL_USERS, ['jupyterhub_total_users']), + (metrics.REQUEST_DURATION_SECONDS, ['jupyterhub_request_duration_seconds']), + ], +) +def test_metric_names(metric_object, expected_names): + for metric, expected_name in zip(metric_object.describe(), expected_names): + assert metric.name == expected_name + + async def test_total_users(app): num_users = app.db.query(orm.User).count() sample = metrics.TOTAL_USERS.collect()[0].samples[0] diff --git a/requirements.txt b/requirements.txt index b44d811e..84bda83e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ jupyter_telemetry>=0.1.0 oauthlib>=3.0 packaging pamela>=1.1.0; sys_platform != 'win32' -prometheus_client>=0.4.0 +prometheus_client>=0.5.0 psutil>=5.6.5; sys_platform == 'win32' python-dateutil requests