Merge pull request #4214 from yuvipanda/metricsss

Add active users Prometheus metrics
This commit is contained in:
Erik Sundell
2022-12-01 15:45:59 +01:00
committed by GitHub
3 changed files with 161 additions and 1 deletion

View File

@@ -74,6 +74,7 @@ from .metrics import (
INIT_SPAWNERS_DURATION_SECONDS,
RUNNING_SERVERS,
TOTAL_USERS,
PeriodicMetricsCollector,
)
from .oauth.provider import make_provider
from .objects import Hub, Server
@@ -2914,6 +2915,8 @@ class JupyterHub(Application):
await self.proxy.check_routes(self.users, self._service_map)
asyncio.ensure_future(finish_init_spawners())
metrics_updater = PeriodicMetricsCollector(parent=self, db=self.db)
metrics_updater.start()
async def cleanup(self):
"""Shutdown managed services and various subprocesses. Cleanup runtime files."""

View File

@@ -19,9 +19,16 @@ them manually here.
added ``jupyterhub_`` prefix to metric names.
"""
from datetime import timedelta
from enum import Enum
from prometheus_client import Gauge, Histogram
from tornado.ioloop import PeriodicCallback
from traitlets import Any, Bool, Integer
from traitlets.config import LoggingConfigurable
from . import orm
from .utils import utcnow
REQUEST_DURATION_SECONDS = Histogram(
'jupyterhub_request_duration_seconds',
@@ -44,6 +51,12 @@ RUNNING_SERVERS = Gauge(
# Gauge: total number of users known to the hub.
TOTAL_USERS = Gauge('jupyterhub_total_users', 'total number of users')
# Gauge: number of users who were active within a recent time window.
# The 'period' label selects the window ('24h', '7d', '30d' — see
# ActiveUserPeriods); values are refreshed by PeriodicMetricsCollector.
ACTIVE_USERS = Gauge(
    'jupyterhub_active_users',
    'number of users who were active in the given time period',
    ['period'],
)
CHECK_ROUTES_DURATION_SECONDS = Histogram(
'jupyterhub_check_routes_duration_seconds',
'Time taken to validate all routes in proxy',
@@ -179,6 +192,20 @@ for s in ProxyDeleteStatus:
PROXY_DELETE_DURATION_SECONDS.labels(status=s)
class ActiveUserPeriods(Enum):
    """Allowed values for the 'period' label of the ACTIVE_USERS gauge."""

    twenty_four_hours = '24h'
    seven_days = '7d'
    thirty_days = '30d'
# Touch each period's labelled child up front so every period is present
# in the exported metrics from startup (mirrors the ProxyDeleteStatus loop above).
for period in ActiveUserPeriods:
    ACTIVE_USERS.labels(period=period.value)
def prometheus_log_method(handler):
"""
Tornado log handler for recording RED metrics.
@@ -200,3 +227,69 @@ def prometheus_log_method(handler):
handler=f'{handler.__class__.__module__}.{type(handler).__name__}',
code=handler.get_status(),
).observe(handler.request.request_time())
class PeriodicMetricsCollector(LoggingConfigurable):
    """
    Collect metrics to be calculated periodically
    """

    active_users_enabled = Bool(
        True,
        help="""
        Enable active_users prometheus metric.

        Populates a `jupyterhub_active_users` prometheus metric, with a label `period` that counts the time period
        over which that many users were active. Periods are 24h (24 hours), 7d (7 days) and 30d (30 days).
        """,
        config=True,
    )

    active_users_update_interval = Integer(
        60 * 60,
        help="""
        Number of seconds between updating active_users metrics.

        To avoid extra load on the database, this is only calculated periodically rather than
        at per-minute intervals. Defaults to once an hour.
        """,
        config=True,
    )

    # SQLAlchemy session used for the active-user count queries
    db = Any(help="SQLAlchemy db session to use for performing queries")

    def update_active_users(self):
        """Count active users per period and publish to the ACTIVE_USERS gauge."""
        # All the metrics should be based off a cutoff from a *fixed* point, so we calculate
        # the fixed point here - and then calculate the individual cutoffs in relation to this
        # fixed point.
        now = utcnow()
        cutoffs = {
            ActiveUserPeriods.twenty_four_hours: now - timedelta(hours=24),
            ActiveUserPeriods.seven_days: now - timedelta(days=7),
            ActiveUserPeriods.thirty_days: now - timedelta(days=30),
        }
        for period, cutoff in cutoffs.items():
            value = (
                self.db.query(orm.User).filter(orm.User.last_activity >= cutoff).count()
            )
            # Log the human-readable label ('24h') rather than the enum repr
            # ('ActiveUserPeriods.twenty_four_hours'), matching the metric label.
            self.log.info(f'Found {value} active users in the last {period.value}')
            ACTIVE_USERS.labels(period=period.value).set(value)

    def start(self):
        """
        Start the periodic update process
        """
        if self.active_users_enabled:
            # Setup periodic refresh of the metric.
            # Keep a reference on self so the callback can be inspected or
            # stopped later instead of being an anonymous local.
            self._active_users_callback = PeriodicCallback(
                self.update_active_users,
                # PeriodicCallback takes its interval in milliseconds
                self.active_users_update_interval * 1000,
                jitter=0.01,
            )
            self._active_users_callback.start()

            # Update the metrics once on startup too
            self.update_active_users()

View File

@@ -1,11 +1,13 @@
import json
from datetime import timedelta
from unittest import mock
import pytest
from jupyterhub import metrics, orm, roles
from .utils import api_request, get_page
from ..utils import utcnow
from .utils import add_user, api_request, get_page
async def test_total_users(app):
@@ -73,3 +75,65 @@ async def test_metrics_auth(
else:
assert r.status_code == 403
assert 'read:metrics' in r.text
async def test_active_users(app):
    """ACTIVE_USERS should bucket users by how recent their last_activity is."""
    db = app.db
    collector = metrics.PeriodicMetricsCollector(db=db)
    collector.update_active_users()
    now = utcnow()

    def snapshot():
        # Read the current ACTIVE_USERS samples, keyed by the period enum
        samples = metrics.ACTIVE_USERS.collect()[0].samples
        per_period = {}
        for sample in samples:
            key = metrics.ActiveUserPeriods(sample.labels["period"])
            per_period[key] = sample.value
        print(per_period)
        return per_period

    before = snapshot()

    offsets = [
        None,
        # in 24h
        timedelta(hours=23, minutes=30),
        # in 7d
        timedelta(hours=24, minutes=1),
        timedelta(days=6, hours=23, minutes=30),
        # in 30d
        timedelta(days=7, minutes=1),
        timedelta(days=29, hours=23, minutes=30),
        # not in any
        timedelta(days=30, minutes=1),
    ]
    for idx, offset in enumerate(offsets):
        user = add_user(db, name=f"active-{idx}")
        user.last_activity = (now - offset) if offset else None
    db.commit()

    # collect before update is called, don't include new users
    observed = snapshot()
    for period in metrics.ActiveUserPeriods:
        assert period in observed
        assert observed[period] == before[period]

    # collect after updates, check updated counts
    collector.update_active_users()
    observed = snapshot()
    assert (
        observed[metrics.ActiveUserPeriods.twenty_four_hours]
        == before[metrics.ActiveUserPeriods.twenty_four_hours] + 1
    )
    assert (
        observed[metrics.ActiveUserPeriods.seven_days]
        == before[metrics.ActiveUserPeriods.seven_days] + 3
    )
    assert (
        observed[metrics.ActiveUserPeriods.thirty_days]
        == before[metrics.ActiveUserPeriods.thirty_days] + 5
    )