From 339758ec42135e92687a29ed04ca17993ce3fe6e Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 13:23:29 -0800
Subject: [PATCH 1/9] Add RED prometheus metrics for all requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces Prometheus for exposing metrics about
JupyterHub's operation. We expose a standard /metrics endpoint that
can be queried without authentication.

We take on prometheus_client as an unconditional dependency, both to
simplify the code & because it is a pure python package with no
dependencies itself.

The first pass adds 'RED' style metrics for all HTTP requests.
http://rancher.com/red-method-for-prometheus-3-key-metrics-for-monitoring/
has some info on the RED method, but to summarize:

For each request type, record at least the following metrics:

Rate – the number of requests, per second, your services are serving.
Errors – the number of failed requests per second.
Duration – The amount of time each request takes expressed as a time interval.

This instantly gives us a lot of useful metrics in a very compact form.
---
 jupyterhub/handlers/__init__.py |  4 ++--
 jupyterhub/handlers/metrics.py  | 17 +++++++++++++++++
 jupyterhub/log.py               |  2 ++
 jupyterhub/metrics.py           | 28 ++++++++++++++++++++++++++++
 requirements.txt                |  1 +
 5 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 jupyterhub/handlers/metrics.py
 create mode 100644 jupyterhub/metrics.py

diff --git a/jupyterhub/handlers/__init__.py b/jupyterhub/handlers/__init__.py
index 8b2ffd58..0823c183 100644
--- a/jupyterhub/handlers/__init__.py
+++ b/jupyterhub/handlers/__init__.py
@@ -1,8 +1,8 @@
 from .base import *
 from .login import *
 
-from . import base, pages, login
+from . import base, pages, login, metrics
 
 default_handlers = []
-for mod in (base, pages, login):
+for mod in (base, pages, login, metrics):
     default_handlers.extend(mod.default_handlers)
diff --git a/jupyterhub/handlers/metrics.py b/jupyterhub/handlers/metrics.py
new file mode 100644
index 00000000..60b934f9
--- /dev/null
+++ b/jupyterhub/handlers/metrics.py
@@ -0,0 +1,17 @@
+from prometheus_client import REGISTRY, CONTENT_TYPE_LATEST, generate_latest
+from tornado import gen
+
+from .base import BaseHandler
+
+class MetricsHandler(BaseHandler):
+    """
+    Handler to serve Prometheus metrics
+    """
+    @gen.coroutine
+    def get(self):
+        self.set_header('Content-Type', CONTENT_TYPE_LATEST)
+        self.write(generate_latest(REGISTRY))
+
+default_handlers = [
+    (r'/metrics$', MetricsHandler)
+]
diff --git a/jupyterhub/log.py b/jupyterhub/log.py
index 60b6288d..36a3640e 100644
--- a/jupyterhub/log.py
+++ b/jupyterhub/log.py
@@ -8,6 +8,7 @@ import traceback
 
 from tornado.log import LogFormatter, access_log
 from tornado.web import StaticFileHandler, HTTPError
+from .metrics import prometheus_log_method
 
 def coroutine_traceback(typ, value, tb):
     """Scrub coroutine frames from a traceback
@@ -120,3 +121,4 @@ def log_request(handler):
         if location:
             ns['location'] = ' → {}'.format(location)
     log_method(msg.format(**ns))
+    prometheus_log_method(handler)
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
new file mode 100644
index 00000000..749ac8e6
--- /dev/null
+++ b/jupyterhub/metrics.py
@@ -0,0 +1,28 @@
+"""
+Prometheus metrics exported by JupyterHub
+"""
+from prometheus_client import Histogram
+
+REQUEST_DURATION_SECONDS = Histogram(
+    'request_duration_seconds',
+    'request duration for all HTTP requests',
+    ['method', 'handler', 'code']
+)
+
+def prometheus_log_method(handler):
+    """
+    Tornado log handler for recording RED metrics
+
+    We record the following metrics:
+    Rate – the number of requests, per second, your services are serving.
+    Errors – the number of failed requests per second.
+    Duration – The amount of time each request takes expressed as a time interval.
+
+    We use a fully qualified name of the handler as a label,
+    rather than every url path to reduce cardinality.
+    """
+    REQUEST_DURATION_SECONDS.labels(
+        method=handler.request.method,
+        handler='{}.{}'.format(handler.__class__.__module__, type(handler).__name__),
+        code=handler.get_status()
+    ).observe(handler.request.request_time())
diff --git a/requirements.txt b/requirements.txt
index a4b660af..d4fcc25a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ pamela
 python-oauth2>=1.0
 SQLAlchemy>=1.1
 requests
+prometheus_client

From 6594e8839076f3246d38a904958df184a78dd651 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 14:54:34 -0800
Subject: [PATCH 2/9] Add metric recording spawn durations

Try to hit every possible exit point from the spawn_single_user
method, with an appropriate status code.

The default histogram buckets are meant for request latencies, but
spawning usually takes longer, so we use custom buckets.
---
 jupyterhub/handlers/base.py | 28 ++++++++++++++++++++++++++++
 jupyterhub/metrics.py       |  7 +++++++
 2 files changed, 35 insertions(+)

diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index aefea6fb..62e498c2 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -5,6 +5,7 @@
 
 import copy
 import re
+import time
 from datetime import timedelta
 from http.client import responses
 from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
@@ -22,6 +23,7 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
+from ..metrics import SPAWN_DURATION_SECONDS
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -388,6 +390,7 @@ class BaseHandler(RequestHandler):
     @gen.coroutine
     def spawn_single_user(self, user, server_name='', options=None):
         # in case of error, include 'try again from /hub/home' message
+        spawn_starttime = time.perf_counter()
         self.extra_error_html = self.spawn_home_error
 
         user_server_name = user.name
@@ -397,6 +400,11 @@ class BaseHandler(RequestHandler):
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
+            SPAWN_DURATION_SECONDS.labels(
+                status='already-pending'
+            ).observe(
+                time.perf_counter() - spawn_starttime
+            )
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -415,6 +423,11 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
+            SPAWN_DURATION_SECONDS.labels(
+                status='throttled'
+            ).observe(
+                time.perf_counter() - spawn_starttime
+            )
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -424,6 +437,11 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
+            SPAWN_DURATION_SECONDS.labels(
+                status='too-many-users'
+            ).observe(
+                time.perf_counter() - spawn_starttime
+            )
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -456,6 +474,11 @@ class BaseHandler(RequestHandler):
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
+            SPAWN_DURATION_SECONDS.labels(
+                status='success'
+            ).observe(
+                time.perf_counter() - spawn_starttime
+            )
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
@@ -499,6 +522,11 @@ class BaseHandler(RequestHandler):
             if status is not None:
                 toc = IOLoop.current().time()
                 self.statsd.timing('spawner.failure', (toc - tic) * 1000)
+                SPAWN_DURATION_SECONDS.labels(
+                    status='failed'
+                ).observe(
+                    time.perf_counter() - spawn_starttime
+                )
                 raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                     status, spawner._log_name))
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index 749ac8e6..89ce2b34 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -9,6 +9,13 @@ REQUEST_DURATION_SECONDS = Histogram(
     ['method', 'handler', 'code']
 )
 
+SPAWN_DURATION_SECONDS = Histogram(
+    'spawn_duration_seconds',
+    'spawn duration for all server spawns',
+    ['status'],
+    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
+)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics

From ce3a940b112c36c9c09514022d5b305b5d1cda76 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 15:01:47 -0800
Subject: [PATCH 3/9] Add histogram metric for proxy route addition

---
 jupyterhub/handlers/base.py | 14 +++++++++++++-
 jupyterhub/metrics.py       |  6 ++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index 62e498c2..c51e92d4 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -23,7 +23,7 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS
+from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -479,13 +479,25 @@ class BaseHandler(RequestHandler):
             ).observe(
                 time.perf_counter() - spawn_starttime
             )
+            proxy_add_starttime = time.perf_counter()
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
+
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='success'
+                ).observe(
+                    time.perf_counter() - proxy_add_starttime
+                )
             except Exception:
                 self.log.exception("Failed to add %s to proxy!", user_server_name)
                 self.log.error("Stopping %s to avoid inconsistent state", user_server_name)
                 yield user.stop()
+                PROXY_ADD_DURATION_SECONDS.labels(
+                    status='failure'
+                ).observe(
+                    time.perf_counter() - proxy_add_starttime
+                )
             else:
                 spawner.add_poll_callback(self.user_stopped, user, server_name)
         finally:
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index 89ce2b34..d36d853c 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -16,6 +16,12 @@ SPAWN_DURATION_SECONDS = Histogram(
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )
 
+PROXY_ADD_DURATION_SECONDS = Histogram(
+    'proxy_add_duration_seconds',
+    'duration for adding user routes to proxy',
+    ['status']
+)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics

From 352df39454e52ec2189efdb927ca8dff38811389 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 16:52:19 -0800
Subject: [PATCH 4/9] Add version requirement for prometheus_client

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d4fcc25a..7a71f61a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,4 @@ pamela
 python-oauth2>=1.0
 SQLAlchemy>=1.1
 requests
-prometheus_client
+prometheus_client>=0.0.21

From 2559632079f8ec7e0bfa009238ae1dc4c62a0045 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 16:59:35 -0800
Subject: [PATCH 5/9] Expand prometheus related docstrings a bit more

---
 jupyterhub/log.py     | 1 +
 jupyterhub/metrics.py | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/jupyterhub/log.py b/jupyterhub/log.py
index 36a3640e..2405edf3 100644
--- a/jupyterhub/log.py
+++ b/jupyterhub/log.py
@@ -69,6 +69,7 @@ def log_request(handler):
     - get proxied IP instead of proxy IP
     - log referer for redirect and failed requests
     - log user-agent for failed requests
+    - record per-request metrics in prometheus
     """
     status = handler.get_status()
     request = handler.request
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index d36d853c..b75dde1c 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -24,7 +24,7 @@ PROXY_ADD_DURATION_SECONDS = Histogram(
 
 def prometheus_log_method(handler):
     """
-    Tornado log handler for recording RED metrics
+    Tornado log handler for recording RED metrics.
 
     We record the following metrics:
     Rate – the number of requests, per second, your services are serving.
@@ -33,6 +33,10 @@ def prometheus_log_method(handler):
 
     We use a fully qualified name of the handler as a label,
     rather than every url path to reduce cardinality.
+
+    This function should be either the value of or called from a function
+    that is the 'log_function' tornado setting. This makes it get called
+    at the end of every request, allowing us to record the metrics we need.
     """
     REQUEST_DURATION_SECONDS.labels(
         method=handler.request.method,

From 2099cd37fa9f86708e41e3731fb6a4fc6f171a06 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 17:00:15 -0800
Subject: [PATCH 6/9] s/starttime/start_time/

---
 jupyterhub/handlers/base.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index c51e92d4..5167da82 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -390,7 +390,7 @@ class BaseHandler(RequestHandler):
     @gen.coroutine
     def spawn_single_user(self, user, server_name='', options=None):
         # in case of error, include 'try again from /hub/home' message
-        spawn_starttime = time.perf_counter()
+        spawn_start_time = time.perf_counter()
         self.extra_error_html = self.spawn_home_error
 
         user_server_name = user.name
@@ -403,7 +403,7 @@ class BaseHandler(RequestHandler):
             SPAWN_DURATION_SECONDS.labels(
                 status='already-pending'
             ).observe(
-                time.perf_counter() - spawn_starttime
+                time.perf_counter() - spawn_start_time
             )
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
@@ -426,7 +426,7 @@ class BaseHandler(RequestHandler):
             SPAWN_DURATION_SECONDS.labels(
                 status='throttled'
             ).observe(
-                time.perf_counter() - spawn_starttime
+                time.perf_counter() - spawn_start_time
             )
             raise web.HTTPError(
                 429,
@@ -440,7 +440,7 @@ class BaseHandler(RequestHandler):
             SPAWN_DURATION_SECONDS.labels(
                 status='too-many-users'
             ).observe(
-                time.perf_counter() - spawn_starttime
+                time.perf_counter() - spawn_start_time
             )
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
@@ -477,9 +477,9 @@ class BaseHandler(RequestHandler):
             SPAWN_DURATION_SECONDS.labels(
                 status='success'
             ).observe(
-                time.perf_counter() - spawn_starttime
+                time.perf_counter() - spawn_start_time
             )
-            proxy_add_starttime = time.perf_counter()
+            proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
                 yield self.proxy.add_user(user, server_name)
@@ -487,7 +487,7 @@ class BaseHandler(RequestHandler):
                 PROXY_ADD_DURATION_SECONDS.labels(
                     status='success'
                 ).observe(
-                    time.perf_counter() - proxy_add_starttime
+                    time.perf_counter() - proxy_add_start_time
                 )
             except Exception:
                 self.log.exception("Failed to add %s to proxy!", user_server_name)
@@ -496,7 +496,7 @@ class BaseHandler(RequestHandler):
                 PROXY_ADD_DURATION_SECONDS.labels(
                     status='failure'
                 ).observe(
-                    time.perf_counter() - proxy_add_starttime
+                    time.perf_counter() - proxy_add_start_time
                 )
             else:
                 spawner.add_poll_callback(self.user_stopped, user, server_name)
@@ -537,7 +537,7 @@ class BaseHandler(RequestHandler):
                 SPAWN_DURATION_SECONDS.labels(
                     status='failed'
                 ).observe(
-                    time.perf_counter() - spawn_starttime
+                    time.perf_counter() - spawn_start_time
                 )
                 raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                     status, spawner._log_name))

From c64f23a64ae8d4c15bd779b05b6939957217e9a8 Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 17:04:10 -0800
Subject: [PATCH 7/9] Add note about metric naming conventions

---
 jupyterhub/metrics.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index b75dde1c..df724b08 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -1,5 +1,8 @@
 """
 Prometheus metrics exported by JupyterHub
+
+Read https://prometheus.io/docs/practices/naming/ for naming
+conventions for metrics & labels.
 """
 from prometheus_client import Histogram

From ea99c58da5f24362632bc6c181135da2d33f19cd Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 17:04:44 -0800
Subject: [PATCH 8/9] Clarify custom bucket sizes for spawn time histogram

---
 jupyterhub/metrics.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index df724b08..ba885446 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -16,6 +16,8 @@ SPAWN_DURATION_SECONDS = Histogram(
     'spawn_duration_seconds',
     'spawn duration for all server spawns',
     ['status'],
+    # Use custom bucket sizes, since the default bucket ranges
+    # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )

From 3cd526c0193429e114bfa80fb90ecb307376793b Mon Sep 17 00:00:00 2001
From: yuvipanda
Date: Sun, 10 Dec 2017 21:23:32 -0800
Subject: [PATCH 9/9] Make sure our metrics don't appear & disappear intermittently

Create all timeseries from the beginning, regardless of whether they
happen or not.

Also rename metric objects for consistency.
---
 jupyterhub/handlers/base.py | 45 +++++++++++++++------------------
 jupyterhub/metrics.py       | 52 ++++++++++++++++++++++++++++++++---
 2 files changed, 67 insertions(+), 30 deletions(-)

diff --git a/jupyterhub/handlers/base.py b/jupyterhub/handlers/base.py
index 5167da82..b5e489b3 100644
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -23,7 +23,10 @@ from .. import orm
 from ..objects import Server
 from ..spawner import LocalProcessSpawner
 from ..utils import url_path_join
-from ..metrics import SPAWN_DURATION_SECONDS, PROXY_ADD_DURATION_SECONDS
+from ..metrics import (
+    SERVER_SPAWN_DURATION_SECONDS, ServerSpawnStatus,
+    PROXY_ADD_DURATION_SECONDS, ProxyAddStatus
+)
 
 # pattern for the authentication token header
 auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -400,11 +403,9 @@ class BaseHandler(RequestHandler):
 
         if server_name in user.spawners and user.spawners[server_name].pending:
             pending = user.spawners[server_name].pending
-            SPAWN_DURATION_SECONDS.labels(
-                status='already-pending'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.already_pending
+            ).observe(time.perf_counter() - spawn_start_time)
             raise RuntimeError("%s pending %s" % (user_server_name, pending))
 
         # count active servers and pending spawns
@@ -423,11 +424,9 @@ class BaseHandler(RequestHandler):
                 '%s pending spawns, throttling',
                 spawn_pending_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='throttled'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.throttled
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(
                 429,
                 "User startup rate limit exceeded. Try again in a few minutes.",
@@ -437,11 +436,9 @@ class BaseHandler(RequestHandler):
                 '%s servers active, no space available',
                 active_count,
             )
-            SPAWN_DURATION_SECONDS.labels(
-                status='too-many-users'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.too_many_users
+            ).observe(time.perf_counter() - spawn_start_time)
             raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
 
         tic = IOLoop.current().time()
@@ -474,11 +471,9 @@ class BaseHandler(RequestHandler):
             toc = IOLoop.current().time()
             self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
             self.statsd.timing('spawner.success', (toc - tic) * 1000)
-            SPAWN_DURATION_SECONDS.labels(
-                status='success'
-            ).observe(
-                time.perf_counter() - spawn_start_time
-            )
+            SERVER_SPAWN_DURATION_SECONDS.labels(
+                status=ServerSpawnStatus.success
+            ).observe(time.perf_counter() - spawn_start_time)
             proxy_add_start_time = time.perf_counter()
             spawner._proxy_pending = True
             try:
@@ -529,11 +524,9 @@ class BaseHandler(RequestHandler):
             if status is not None:
                 toc = IOLoop.current().time()
                 self.statsd.timing('spawner.failure', (toc - tic) * 1000)
-                SPAWN_DURATION_SECONDS.labels(
-                    status='failed'
-                ).observe(
-                    time.perf_counter() - spawn_start_time
-                )
+                SERVER_SPAWN_DURATION_SECONDS.labels(
+                    status=ServerSpawnStatus.failure
+                ).observe(time.perf_counter() - spawn_start_time)
                 raise web.HTTPError(500, "Spawner failed to start [status=%s]. The logs for %s may contain details." % (
                     status, spawner._log_name))
diff --git a/jupyterhub/metrics.py b/jupyterhub/metrics.py
index ba885446..68d6673e 100644
--- a/jupyterhub/metrics.py
+++ b/jupyterhub/metrics.py
@@ -2,8 +2,21 @@
 Prometheus metrics exported by JupyterHub
 
 Read https://prometheus.io/docs/practices/naming/ for naming
-conventions for metrics & labels.
+conventions for metrics & labels. We generally prefer naming them
+`<noun>_<verb>_<type_suffix>`. So a histogram that's tracking
+the duration (in seconds) of servers spawning would be called
+SERVER_SPAWN_DURATION_SECONDS.
+
+We also create an Enum for each 'status' type label in every metric
+we collect. This is to make sure that the metrics exist regardless
+of the condition happening or not. For example, if we don't explicitly
+create them, the metric spawn_duration_seconds{status="failure"}
+will not actually exist until the first failure. This makes dashboarding
+and alerting difficult, so we explicitly list statuses and create
+them manually here.
 """
+from enum import Enum
+
 from prometheus_client import Histogram
 
 REQUEST_DURATION_SECONDS = Histogram(
     'request_duration_seconds',
     'request duration for all HTTP requests',
     ['method', 'handler', 'code']
 )
 
-SPAWN_DURATION_SECONDS = Histogram(
-    'spawn_duration_seconds',
-    'spawn duration for all server spawns',
+SERVER_SPAWN_DURATION_SECONDS = Histogram(
+    'server_spawn_duration_seconds',
+    'time taken for server spawning operation',
     ['status'],
     # Use custom bucket sizes, since the default bucket ranges
     # are meant for quick running processes. Spawns can take a while!
     buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
 )
 
+class ServerSpawnStatus(Enum):
+    """
+    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+    already_pending = 'already-pending'
+    throttled = 'throttled'
+    too_many_users = 'too-many-users'
+
+    def __str__(self):
+        return self.value
+
+for s in ServerSpawnStatus:
+    # Create empty metrics with the given status
+    SERVER_SPAWN_DURATION_SECONDS.labels(status=s)
+
+
 PROXY_ADD_DURATION_SECONDS = Histogram(
     'proxy_add_duration_seconds',
     'duration for adding user routes to proxy',
     ['status']
 )
 
+class ProxyAddStatus(Enum):
+    """
+    Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS
+    """
+    success = 'success'
+    failure = 'failure'
+
+    def __str__(self):
+        return self.value
+
+for s in ProxyAddStatus:
+    PROXY_ADD_DURATION_SECONDS.labels(status=s)
+
 def prometheus_log_method(handler):
     """
     Tornado log handler for recording RED metrics.
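
A minimal, self-contained sketch (not part of the patch series) of the behaviour the last patch relies on: a labelled child of a Histogram only appears in the exposition output once labels() has been called for that label value, so pre-creating every status keeps dashboards and alerts from seeing time series appear and disappear. The metric name and buckets below mirror the patch; the isolated registry and the sample observation are illustrative only.

from prometheus_client import CollectorRegistry, Histogram, generate_latest

# Isolated registry for the demo, so we don't collide with the default REGISTRY.
registry = CollectorRegistry()

SERVER_SPAWN_DURATION_SECONDS = Histogram(
    'server_spawn_duration_seconds',
    'time taken for server spawning operation',
    ['status'],
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")],
    registry=registry,
)

# Pre-create the labelled children so every status is exported even
# before anything has been observed for it.
for status in ('success', 'failure', 'already-pending', 'throttled', 'too-many-users'):
    SERVER_SPAWN_DURATION_SECONDS.labels(status=status)

# Record one successful 3.2 second spawn; only the 'success' child changes.
SERVER_SPAWN_DURATION_SECONDS.labels(status='success').observe(3.2)

# generate_latest() renders the Prometheus exposition format that the
# /metrics handler serves.
print(generate_latest(registry).decode('utf-8'))

Running this prints a server_spawn_duration_seconds_count sample for every status, with a value of 0 for the statuses that have not been observed yet, which is exactly what makes rate() and alerting queries stable from the first scrape.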