Merge pull request #4214 from yuvipanda/metricsss

Add active users prometheus metrics
This commit is contained in:
Erik Sundell
2022-12-01 15:45:59 +01:00
committed by GitHub
3 changed files with 161 additions and 1 deletions

View File

@@ -74,6 +74,7 @@ from .metrics import (
    INIT_SPAWNERS_DURATION_SECONDS,
    RUNNING_SERVERS,
    TOTAL_USERS,
    PeriodicMetricsCollector,
)
from .oauth.provider import make_provider
from .objects import Hub, Server
@@ -2914,6 +2915,8 @@ class JupyterHub(Application):
        await self.proxy.check_routes(self.users, self._service_map)
        asyncio.ensure_future(finish_init_spawners())

        metrics_updater = PeriodicMetricsCollector(parent=self, db=self.db)
        metrics_updater.start()

    async def cleanup(self):
        """Shutdown managed services and various subprocesses. Cleanup runtime files."""

View File

@@ -19,9 +19,16 @@ them manually here.
added ``jupyterhub_`` prefix to metric names.
"""
from datetime import timedelta
from enum import Enum

from prometheus_client import Gauge, Histogram
from tornado.ioloop import PeriodicCallback
from traitlets import Any, Bool, Integer
from traitlets.config import LoggingConfigurable
from . import orm
from .utils import utcnow
REQUEST_DURATION_SECONDS = Histogram(
    'jupyterhub_request_duration_seconds',
@@ -44,6 +51,12 @@ RUNNING_SERVERS = Gauge(
TOTAL_USERS = Gauge('jupyterhub_total_users', 'total number of users')
# Gauge of users active within each trailing time window; one time series
# per 'period' label value. Values are set by PeriodicMetricsCollector.
ACTIVE_USERS = Gauge(
'jupyterhub_active_users',
'number of users who were active in the given time period',
['period'],
)
CHECK_ROUTES_DURATION_SECONDS = Histogram(
    'jupyterhub_check_routes_duration_seconds',
    'Time taken to validate all routes in proxy',
@@ -179,6 +192,20 @@ for s in ProxyDeleteStatus:
    PROXY_DELETE_DURATION_SECONDS.labels(status=s)
class ActiveUserPeriods(Enum):
    """Allowed values for the ``period`` label of the ACTIVE_USERS gauge."""

    twenty_four_hours = '24h'
    seven_days = '7d'
    thirty_days = '30d'


# Pre-register a time series for every period so all labels exist (at 0)
# before the first periodic collection runs.
for period in ActiveUserPeriods:
    ACTIVE_USERS.labels(period=period.value)
def prometheus_log_method(handler):
    """
    Tornado log handler for recording RED metrics.
@@ -200,3 +227,69 @@ def prometheus_log_method(handler):
        handler=f'{handler.__class__.__module__}.{type(handler).__name__}',
        code=handler.get_status(),
    ).observe(handler.request.request_time())
class PeriodicMetricsCollector(LoggingConfigurable):
    """
    Periodically recompute metrics that must be derived from the database.
    """

    # Toggle for the jupyterhub_active_users gauge.
    active_users_enabled = Bool(
        True,
        help="""
        Enable active_users prometheus metric.

        Populates a `jupyterhub_active_users` prometheus metric, with a label `period` that counts the time period
        over which these many users were active. Periods are 24h (24 hours), 7d (7 days) and 30d (30 days).
        """,
        config=True,
    )

    # Refresh cadence for the active-user queries.
    active_users_update_interval = Integer(
        60 * 60,
        help="""
        Number of seconds between updating active_users metrics.

        To avoid extra load on the database, this is only calculated periodically rather than
        at per-minute intervals. Defaults to once an hour.
        """,
        config=True,
    )

    db = Any(help="SQLAlchemy db session to use for performing queries")

    def update_active_users(self):
        """Recompute and publish the per-period active-user gauges."""
        # Every window is measured back from one fixed instant so the
        # three counts are mutually consistent.
        reference = utcnow()
        windows = {
            ActiveUserPeriods.twenty_four_hours: timedelta(hours=24),
            ActiveUserPeriods.seven_days: timedelta(days=7),
            ActiveUserPeriods.thirty_days: timedelta(days=30),
        }
        for period, window in windows.items():
            cutoff = reference - window
            value = (
                self.db.query(orm.User).filter(orm.User.last_activity >= cutoff).count()
            )
            self.log.info(f'Found {value} active users in the last {period}')
            ACTIVE_USERS.labels(period=period.value).set(value)

    def start(self):
        """
        Start the periodic update process
        """
        if not self.active_users_enabled:
            return
        # Refresh on a timer; the jitter spreads load when several hubs
        # share one database.
        PeriodicCallback(
            self.update_active_users,
            self.active_users_update_interval * 1000,
            jitter=0.01,
        ).start()
        # Prime the gauges now instead of waiting a full interval.
        self.update_active_users()

View File

@@ -1,11 +1,13 @@
import json
from datetime import timedelta
from unittest import mock

import pytest

from jupyterhub import metrics, orm, roles

from ..utils import utcnow
from .utils import add_user, api_request, get_page


async def test_total_users(app):
@@ -73,3 +75,65 @@ async def test_metrics_auth(
    else:
        assert r.status_code == 403
        assert 'read:metrics' in r.text
async def test_active_users(app):
    db = app.db
    collector = metrics.PeriodicMetricsCollector(db=db)
    collector.update_active_users()
    now = utcnow()

    def snapshot():
        """Read the current ACTIVE_USERS samples, keyed by period enum."""
        samples = metrics.ACTIVE_USERS.collect()[0].samples
        by_period = {
            metrics.ActiveUserPeriods(sample.labels["period"]): sample.value
            for sample in samples
        }
        print(by_period)
        return by_period

    baseline = snapshot()

    # One user per bucket boundary: index 0 has no activity at all,
    # the rest straddle the 24h / 7d / 30d cutoffs on either side.
    offsets = [
        None,
        # in 24h
        timedelta(hours=23, minutes=30),
        # in 7d
        timedelta(hours=24, minutes=1),
        timedelta(days=6, hours=23, minutes=30),
        # in 30d
        timedelta(days=7, minutes=1),
        timedelta(days=29, hours=23, minutes=30),
        # not in any
        timedelta(days=30, minutes=1),
    ]
    for i, offset in enumerate(offsets):
        user = add_user(db, name=f"active-{i}")
        user.last_activity = None if offset is None else now - offset
        db.commit()

    # Gauges are only refreshed by the collector, so the new users must
    # not show up until update_active_users() runs again.
    counts = snapshot()
    for period in metrics.ActiveUserPeriods:
        assert period in counts
        assert counts[period] == baseline[period]

    # After an explicit refresh each window gains the expected users.
    collector.update_active_users()
    counts = snapshot()
    expected_delta = {
        metrics.ActiveUserPeriods.twenty_four_hours: 1,
        metrics.ActiveUserPeriods.seven_days: 3,
        metrics.ActiveUserPeriods.thirty_days: 5,
    }
    for period, delta in expected_delta.items():
        assert counts[period] == baseline[period] + delta