Merge pull request #4525 from danilopeixoto/metrics-prefix

Add `JUPYTERHUB_METRICS_PREFIX` environment variable to customize metrics prefix
Min RK authored on 2023-08-10 12:48:14 +02:00, committed by GitHub
4 changed files with 76 additions and 25 deletions


@@ -18,3 +18,17 @@ tool like [Grafana](https://grafana.com).
 /reference/metrics
 ```
+
+## Customizing the metrics prefix
+
+JupyterHub metrics all have a `jupyterhub_` prefix.
+As of JupyterHub 5.0, this can be overridden with the `$JUPYTERHUB_METRICS_PREFIX` environment variable
+in the Hub's environment.
+
+For example,
+
+```bash
+export JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod
+```
+
+would result in the metric `jupyterhub_prod_active_users`, etc.
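
As a standalone illustration of how that override propagates (a minimal sketch using only `prometheus_client`, not code from this diff), the `namespace` argument is joined to the metric name with an underscore, so whatever `$JUPYTERHUB_METRICS_PREFIX` contains becomes the leading segment of every exported sample name:

```python
# Standalone sketch (not JupyterHub code): prometheus_client joins the
# namespace and the metric name with an underscore, which is how the
# JUPYTERHUB_METRICS_PREFIX override reaches the exported sample names.
import os

from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub')

active_users = Gauge(
    'active_users',
    'Number of users who were active in the given time period',
    ['period'],
    namespace=prefix,
    registry=registry,
)
active_users.labels(period='24h').set(3)

# With JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod the output includes a sample
# named jupyterhub_prod_active_users{period="24h"}; with the default prefix
# it is jupyterhub_active_users{period="24h"}.
print(generate_latest(registry).decode())
```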


@@ -3,9 +3,11 @@ Prometheus metrics exported by JupyterHub
 Read https://prometheus.io/docs/practices/naming/ for naming
 conventions for metrics & labels. We generally prefer naming them
-`jupyterhub_<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
 the duration (in seconds) of servers spawning would be called
-jupyterhub_server_spawn_duration_seconds.
+server_spawn_duration_seconds.
+A namespace prefix is always added, so this metric is accessed as
+`jupyterhub_server_spawn_duration_seconds` by default.
 
 We also create an Enum for each 'status' type label in every metric
 we collect. This is to make sure that the metrics exist regardless
@@ -19,6 +21,8 @@ them manually here.
 added ``jupyterhub_`` prefix to metric names.
 """
+import os
 from datetime import timedelta
 from enum import Enum
@@ -30,49 +34,66 @@ from traitlets.config import LoggingConfigurable
 from . import orm
 from .utils import utcnow
 
+metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub')
+
 REQUEST_DURATION_SECONDS = Histogram(
-    'jupyterhub_request_duration_seconds',
-    'request duration for all HTTP requests',
+    'request_duration_seconds',
+    'Request duration for all HTTP requests',
     ['method', 'handler', 'code'],
+    namespace=metrics_prefix,
 )
 
 SERVER_SPAWN_DURATION_SECONDS = Histogram(
-    'jupyterhub_server_spawn_duration_seconds',
-    'time taken for server spawning operation',
+    'server_spawn_duration_seconds',
+    'Time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 180, 300, 600, float("inf")],
+    namespace=metrics_prefix,
 )
 
 RUNNING_SERVERS = Gauge(
-    'jupyterhub_running_servers', 'the number of user servers currently running'
+    'running_servers',
+    'The number of user servers currently running',
+    namespace=metrics_prefix,
 )
 
-TOTAL_USERS = Gauge('jupyterhub_total_users', 'total number of users')
+TOTAL_USERS = Gauge(
+    'total_users',
+    'Total number of users',
+    namespace=metrics_prefix,
+)
 
 ACTIVE_USERS = Gauge(
-    'jupyterhub_active_users',
-    'number of users who were active in the given time period',
+    'active_users',
+    'Number of users who were active in the given time period',
    ['period'],
+    namespace=metrics_prefix,
 )
 
 CHECK_ROUTES_DURATION_SECONDS = Histogram(
-    'jupyterhub_check_routes_duration_seconds',
+    'check_routes_duration_seconds',
     'Time taken to validate all routes in proxy',
+    namespace=metrics_prefix,
 )
 
 HUB_STARTUP_DURATION_SECONDS = Histogram(
-    'jupyterhub_hub_startup_duration_seconds', 'Time taken for Hub to start'
+    'hub_startup_duration_seconds',
+    'Time taken for Hub to start',
+    namespace=metrics_prefix,
 )
 
 INIT_SPAWNERS_DURATION_SECONDS = Histogram(
-    'jupyterhub_init_spawners_duration_seconds', 'Time taken for spawners to initialize'
+    'init_spawners_duration_seconds',
+    'Time taken for spawners to initialize',
+    namespace=metrics_prefix,
 )
 
 PROXY_POLL_DURATION_SECONDS = Histogram(
-    'jupyterhub_proxy_poll_duration_seconds',
-    'duration for polling all routes from proxy',
+    'proxy_poll_duration_seconds',
+    'Duration for polling all routes from proxy',
+    namespace=metrics_prefix,
 )
@@ -97,9 +118,10 @@ for s in ServerSpawnStatus:
 PROXY_ADD_DURATION_SECONDS = Histogram(
-    'jupyterhub_proxy_add_duration_seconds',
-    'duration for adding user routes to proxy',
+    'proxy_add_duration_seconds',
+    'Duration for adding user routes to proxy',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -120,9 +142,10 @@ for s in ProxyAddStatus:
 SERVER_POLL_DURATION_SECONDS = Histogram(
-    'jupyterhub_server_poll_duration_seconds',
-    'time taken to poll if server is running',
+    'server_poll_duration_seconds',
+    'Time taken to poll if server is running',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -147,9 +170,10 @@ for s in ServerPollStatus:
 SERVER_STOP_DURATION_SECONDS = Histogram(
-    'jupyterhub_server_stop_seconds',
-    'time taken for server stopping operation',
+    'server_stop_seconds',
+    'Time taken for server stopping operation',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -170,9 +194,10 @@ for s in ServerStopStatus:
 PROXY_DELETE_DURATION_SECONDS = Histogram(
-    'jupyterhub_proxy_delete_duration_seconds',
-    'duration for deleting user routes from proxy',
+    'proxy_delete_duration_seconds',
+    'Duration for deleting user routes from proxy',
     ['status'],
+    namespace=metrics_prefix,
 )
@@ -239,7 +264,7 @@ class PeriodicMetricsCollector(LoggingConfigurable):
         help="""
         Enable active_users prometheus metric.
 
-        Populates a `jupyterhub_active_users` prometheus metric, with a label `period` that counts the time period
+        Populates an `active_users` prometheus metric, with a label `period` that counts the time period
         over which these many users were active. Periods are 24h (24 hours), 7d (7 days) and 30d (30 days).
         """,
         config=True,
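
To see what the `namespace=metrics_prefix` argument does for the histogram definitions above, here is a minimal standalone sketch (only `prometheus_client` assumed, not JupyterHub code) of the spawn-duration histogram with its custom buckets; the prefix ends up on every generated series:

```python
# Standalone sketch: a namespaced Histogram exports _bucket, _sum and _count
# series that all carry the same prefix, so overriding the prefix renames
# every series of the metric at once.
from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()

server_spawn_duration_seconds = Histogram(
    'server_spawn_duration_seconds',
    'Time taken for server spawning operation',
    ['status'],
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 180, 300, 600, float("inf")],
    namespace='jupyterhub',
    registry=registry,
)
server_spawn_duration_seconds.labels(status='success').observe(12.3)

# The exposition now contains, among others:
#   jupyterhub_server_spawn_duration_seconds_bucket
#   jupyterhub_server_spawn_duration_seconds_sum
#   jupyterhub_server_spawn_duration_seconds_count
print(generate_latest(registry).decode())
```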


@@ -10,6 +10,18 @@ from ..utils import utcnow
 from .utils import add_user, api_request, get_page
 
 
+@pytest.mark.parametrize(
+    "metric_object, expected_names",
+    [
+        (metrics.TOTAL_USERS, ['jupyterhub_total_users']),
+        (metrics.REQUEST_DURATION_SECONDS, ['jupyterhub_request_duration_seconds']),
+    ],
+)
+def test_metric_names(metric_object, expected_names):
+    for metric, expected_name in zip(metric_object.describe(), expected_names):
+        assert metric.name == expected_name
+
+
 async def test_total_users(app):
     num_users = app.db.query(orm.User).count()
     sample = metrics.TOTAL_USERS.collect()[0].samples[0]
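
The new `test_metric_names` relies on `describe()` returning metric families whose `name` already carries the namespace; a small standalone check of that behavior (hypothetical, not part of the test suite) looks like this:

```python
# Hypothetical standalone check (not part of the diff): .describe() returns
# metric families whose .name already includes the namespace prefix, which
# is exactly what test_metric_names asserts against.
from prometheus_client import CollectorRegistry, Gauge

registry = CollectorRegistry()
total_users = Gauge(
    'total_users',
    'Total number of users',
    namespace='jupyterhub',
    registry=registry,
)

assert [m.name for m in total_users.describe()] == ['jupyterhub_total_users']
```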


@@ -7,7 +7,7 @@ jupyter_telemetry>=0.1.0
 oauthlib>=3.0
 packaging
 pamela>=1.1.0; sys_platform != 'win32'
-prometheus_client>=0.4.0
+prometheus_client>=0.5.0
 psutil>=5.6.5; sys_platform == 'win32'
 python-dateutil
 requests