diff --git a/docs/source/reference/monitoring.md b/docs/source/reference/monitoring.md index 36078877..78bb9e33 100644 --- a/docs/source/reference/monitoring.md +++ b/docs/source/reference/monitoring.md @@ -33,14 +33,19 @@ export JUPYTERHUB_METRICS_PREFIX=jupyterhub_prod would result in the metric `jupyterhub_prod_active_users`, etc. -## Customizing spawn bucket sizes +## Customizing bucket sizes -As of JupyterHub 5.3, override `JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS` env variable in Hub's environment to allow custom bucket sizes. Otherwise default to, [0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, 180, 300, 600, float("inf")] +As of JupyterHub 5.3, the following environment variables in the Hub's environment can be overridden to support custom bucket sizes - below are the defaults: + +| Variable | Default | +| -------------------------------------------------- | ------------------------------------------------------------------ | +| `JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS` | `0.5,1,2.5,5,10,15,30,60,120,180,300,600,inf` | +| `JUPYTERHUB_SERVER_STOP_DURATION_SECONDS_BUCKETS` | `0.005,0.01,0.025,0.05,0.075,0.1,0.25,0.5,0.75,1,2.5,5,7.5,10,inf` | For example, ```bash -export JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS="1,2,4,6,12,30,60,120" +export JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS="1,2,4,6,12,30,60,120,inf" ``` ## Configuring metrics diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py index fbff828e..6e718cdb 100644 --- a/jupyterhub/metrics.py +++ b/jupyterhub/metrics.py @@ -37,12 +37,16 @@ from . import orm from .utils import utcnow metrics_prefix = os.getenv('JUPYTERHUB_METRICS_PREFIX', 'jupyterhub') -_env_buckets = os.environ.get( + +_env_spawn_duration_buckets = os.environ.get( 'JUPYTERHUB_SERVER_SPAWN_DURATION_SECONDS_BUCKETS', "" ).strip() +_env_stop_duration_buckets = os.environ.get( + "JUPYTERHUB_SERVER_STOP_DURATION_SECONDS_BUCKETS", "" +).strip() -if _env_buckets: - spawn_duration_buckets = [float(_s) for _s in _env_buckets.split(",")] +if _env_spawn_duration_buckets: + spawn_duration_buckets = [float(_s) for _s in _env_spawn_duration_buckets.split(",")] else: spawn_duration_buckets = [ 0.5, @@ -60,6 +64,29 @@ else: float("inf"), ] +if _env_stop_duration_buckets: + stop_duration_buckets = [float(_s) for _s in _env_stop_duration_buckets.split(",")] +else: + # We default to the same buckets as upstream Prometheus (as it was before) so we don't + # break anything that was consuming this metric before bucket configuration was possible + stop_duration_buckets = [ + 0.005, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.25, + 0.5, + 0.75, + 1, + 2.5, + 5, + 7.5, + 10, + float("inf"), + ] + REQUEST_DURATION_SECONDS = Histogram( 'request_duration_seconds', 'Request duration for all HTTP requests', @@ -197,6 +224,7 @@ SERVER_STOP_DURATION_SECONDS = Histogram( 'server_stop_seconds', 'Time taken for server stopping operation', ['status'], + buckets=stop_duration_buckets, namespace=metrics_prefix, ) @@ -341,7 +369,7 @@ class PeriodicMetricsCollector(LoggingConfigurable): config=True, help=""" Enable event_loop_interval_seconds metric. - + Measures event-loop responsiveness. """, ) @@ -350,7 +378,7 @@ class PeriodicMetricsCollector(LoggingConfigurable): config=True, help=""" Interval (in seconds) on which to measure the event loop interval. - + This is the _sensitivity_ of the `event_loop_interval` metric. Setting it too low (e.g. below 20ms) can end up slowing down the whole event loop by measuring too often,