Allow configuration of stop duration metric buckets

Issue #4833 proposed allowing configuration of buckets for server spawn
duration. It was resolved with PR #4967.

This commit follows a similar pattern to support the same kind of
configuration for server stop duration.
This commit is contained in:
Srikanth Chelluri
2025-04-10 10:49:38 -04:00
parent 9fe7822098
commit 2e5fc51b6b
2 changed files with 41 additions and 8 deletions

View File

@@ -33,14 +33,19 @@ export JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod
would result in the metric `jupyterhub_prod_active_users`, etc. would result in the metric `jupyterhub_prod_active_users`, etc.
## Customizing spawn bucket sizes ## Customizing bucket sizes
As of JupyterHub 5.3, override `JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS` env variable in Hub's environment to allow custom bucket sizes. Otherwise default to, [0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 180, 300, 600, float("inf")] As of JupyterHub 5.3, the following environment variables in the Hub's environment can be overridden to support custom bucket sizes - below are the defaults:
| Variable | Default |
| -------------------------------------------------- | ------------------------------------------------------------------ |
| `JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS` | `0.5,1,2.5,5,10,15,30,60,120,180,300,600,inf` |
| `JUPYTERHUB_SERVER_STOP_DURATION_SECONDS_BUCKETS` | `0.005,0.01,0.025,0.05,0.075,0.1,0.25,0.5,0.75,1,2.5,5,7.5,10,inf` |
For example, For example,
```bash ```bash
export JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS="1,2,4,6,12,30,60,120" export JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS="1,2,4,6,12,30,60,120,inf"
``` ```
## Configuring metrics ## Configuring metrics

View File

@@ -37,12 +37,16 @@ from . import orm
from .utils import utcnow from .utils import utcnow
metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub') metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub')
_env_buckets = os.environ.get(
_env_spawn_duration_buckets = os.environ.get(
'JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS', "" 'JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS', ""
).strip() ).strip()
_env_stop_duration_buckets = os.environ.get(
"JUPYTERHUB_SERVER_STOP_DURATION_SECONDS_BUCKETS", ""
).strip()
if _env_buckets: if _env_spawn_duration_buckets:
spawn_duration_buckets = [float(_s) for _s in _env_buckets.split(",")] spawn_duration_buckets = [float(_s) for _s in _env_spawn_duration_buckets.split(",")]
else: else:
spawn_duration_buckets = [ spawn_duration_buckets = [
0.5, 0.5,
@@ -60,6 +64,29 @@ else:
float("inf"), float("inf"),
] ]
# Choose the histogram buckets for the server-stop duration metric.
# `_env_stop_duration_buckets` holds the stripped value of the
# JUPYTERHUB_SERVER_STOP_DURATION_SECONDS_BUCKETS environment variable
# (an empty string when it is unset).
if not _env_stop_duration_buckets:
    # No override given: keep the upstream Prometheus default buckets that
    # this metric used before bucket configuration was possible, so existing
    # consumers of the metric are not broken.
    stop_duration_buckets = [
        0.005,
        0.01,
        0.025,
        0.05,
        0.075,
        0.1,
        0.25,
        0.5,
        0.75,
        1,
        2.5,
        5,
        7.5,
        10,
        float("inf"),
    ]
else:
    # Override given: a comma-separated list of numbers, e.g. "0.5,1,5,inf".
    # Each entry is converted with float(); a malformed entry raises
    # ValueError at import time.
    stop_duration_buckets = list(
        map(float, _env_stop_duration_buckets.split(","))
    )
REQUEST_DURATION_SECONDS = Histogram( REQUEST_DURATION_SECONDS = Histogram(
'request_duration_seconds', 'request_duration_seconds',
'Request duration for all HTTP requests', 'Request duration for all HTTP requests',
@@ -197,6 +224,7 @@ SERVER_STOP_DURATION_SECONDS = Histogram(
'server_stop_seconds', 'server_stop_seconds',
'Time taken for server stopping operation', 'Time taken for server stopping operation',
['status'], ['status'],
buckets=stop_duration_buckets,
namespace=metrics_prefix, namespace=metrics_prefix,
) )
@@ -341,7 +369,7 @@ class PeriodicMetricsCollector(LoggingConfigurable):
config=True, config=True,
help=""" help="""
Enable event_loop_interval_seconds metric. Enable event_loop_interval_seconds metric.
Measures event-loop responsiveness. Measures event-loop responsiveness.
""", """,
) )
@@ -350,7 +378,7 @@ class PeriodicMetricsCollector(LoggingConfigurable):
config=True, config=True,
help=""" help="""
Interval (in seconds) on which to measure the event loop interval. Interval (in seconds) on which to measure the event loop interval.
This is the _sensitivity_ of the `event_loop_interval` metric. This is the _sensitivity_ of the `event_loop_interval` metric.
Setting it too low (e.g. below 20ms) can end up slowing down the whole event loop Setting it too low (e.g. below 20ms) can end up slowing down the whole event loop
by measuring too often, by measuring too often,