Merge pull request #4479 from minrk/jupyterhub-public-url

add JupyterHub.public_url config
2025-10-18 15:33:02 +00:00 · 2024-01-24 23:56:58 +01:00
parent ab588c28ce 68c12d4d32
commit 92da2c12fd
10 changed files with 224 additions and 29 deletions
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -182,6 +182,7 @@ html_context = {
 linkcheck_ignore = [
    r"(.*)github\.com(.*)#",  # javascript based anchors
    r"(.*)/#%21(.*)/(.*)",  # /#!forum/jupyter - encoded anchor edge case
+    r"https?://(.*\.)?example\.(org|com)(/.*)?",  # example links
    r"https://github.com/[^/]*$",  # too many github usernames / searches in changelog
    "https://github.com/jupyterhub/jupyterhub/pull/",  # too many PRs in changelog
    "https://github.com/jupyterhub/jupyterhub/compare/",  # too many comparisons in changelog
--- a/docs/source/reference/services.md
+++ b/docs/source/reference/services.md
@@ -138,6 +138,14 @@ JUPYTERHUB_OAUTH_SCOPES:   JSON-serialized list of scopes to use for allowing ac
                           (deprecated in 3.0, use JUPYTERHUB_OAUTH_ACCESS_SCOPES).
 JUPYTERHUB_OAUTH_ACCESS_SCOPES: JSON-serialized list of scopes to use for allowing access to the service (new in 3.0).
 JUPYTERHUB_OAUTH_CLIENT_ALLOWED_SCOPES: JSON-serialized list of scopes that can be requested by the oauth client on behalf of users (new in 3.0).
+JUPYTERHUB_PUBLIC_URL: the public URL of the service,
+  e.g. `https://jupyterhub.example.org/services/name/`.
+  Empty if no public URL is specified (default).
+  Set automatically from `JupyterHub.subdomain_host` if subdomains are configured.
+JUPYTERHUB_PUBLIC_HUB_URL: the public URL of JupyterHub as a whole,
+  e.g. `https://jupyterhub.example.org/`.
+  Empty if no public URL is specified (default).
+  Set automatically from `JupyterHub.subdomain_host` if subdomains are configured.
 ```

 For the previous 'cull idle' Service example, these environment variables
--- a/docs/source/reference/spawners.md
+++ b/docs/source/reference/spawners.md
@@ -315,6 +315,14 @@ The process environment is returned by `Spawner.get_env`, which specifies the fo
 - `JUPYTERHUB_OAUTH_ACCESS_SCOPES` - the scopes required to access the server (called `JUPYTERHUB_OAUTH_SCOPES` prior to 3.0)
 - `JUPYTERHUB_OAUTH_CLIENT_ALLOWED_SCOPES` - the scopes the service is allowed to request.
  If no scopes are requested explicitly, these scopes will be requested.
+- `JUPYTERHUB_PUBLIC_URL` - the public URL of the server,
+  e.g. `https://jupyterhub.example.org/user/name/`.
+  Empty if no public URL is specified (default).
+  Set automatically from `JupyterHub.subdomain_host` if subdomains are configured.
+- `JUPYTERHUB_PUBLIC_HUB_URL` - the public URL of JupyterHub as a whole,
+  e.g. `https://jupyterhub.example.org/`.
+  Empty if no public URL is specified (default).
+  Set automatically from `JupyterHub.subdomain_host` if subdomains are configured.

 Optional environment variables, depending on configuration:

--- a/jupyterhub/apihandlers/auth.py
+++ b/jupyterhub/apihandlers/auth.py
@@ -109,14 +109,20 @@ class OAuthHandler:
        redirect_uri = self.get_argument('redirect_uri')
        if not redirect_uri or not redirect_uri.startswith('/'):
            return uri
+
        # make absolute local redirects full URLs
        # to satisfy oauthlib's absolute URI requirement
-        redirect_uri = (
-            get_browser_protocol(self.request)
-            + "://"
-            + self.request.host
-            + redirect_uri
-        )
+
+        public_url = self.settings.get("public_url")
+        if public_url:
+            proto = public_url.scheme
+            host = public_url.netloc
+        else:
+            # guess from request
+            proto = get_browser_protocol(self.request)
+            host = self.request.host
+        redirect_uri = f"{proto}://{host}{redirect_uri}"
+
        parsed_url = urlparse(uri)
        query_list = parse_qsl(parsed_url.query, keep_blank_values=True)
        for idx, item in enumerate(query_list):
--- a/jupyterhub/app.py
+++ b/jupyterhub/app.py
@@ -698,6 +698,61 @@ class JupyterHub(Application):
        proto = 'https' if self.ssl_cert else 'http'
        return proto + '://:8000'

+    public_url = Unicode(
+        "",
+        config=True,
+        help="""Set the public URL of JupyterHub
+
+        This will skip any detection of URL and protocol from requests,
+        which isn't always correct when JupyterHub is behind
+        multiple layers of proxies, etc.
+        Usually the failure is detecting http when it's really https.
+
+        Should include the full, public URL of JupyterHub,
+        including the public-facing base_url prefix
+        (i.e. it should include a trailing slash), e.g.
+        https://jupyterhub.example.org/prefix/
+        """,
+    )
+
+    @default("public_url")
+    def _default_public_url(self):
+        if self.subdomain_host:
+            # if subdomain_host is specified, use it by default
+            return self.subdomain_host + self.base_url
+        else:
+            return ""
+
+    @validate("public_url")
+    def _validate_public_url(self, proposal):
+        url = proposal.value
+        if not url:
+            # explicitly empty (default)
+            return url
+        if not url.endswith("/"):
+            # ensure we have a trailing slash
+            # for consistency with base_url
+            url = url + "/"
+        if not url.endswith(self.base_url):
+            if not urlparse(url).path.strip("/"):
+                # no path specified, add base_url and warn
+                url = url.rstrip("/") + self.base_url
+                self.log.warning(
+                    f"Adding missing base_url {self.base_url!r} to JupyterHub.public_url = {url!r}"
+                )
+            else:
+                # path specified but it doesn't match, raise
+                raise ValueError(
+                    f"JupyterHub.public_url = {url!r} must include base_url: {self.base_url!r}"
+                )
+        if "://" not in url:
+            # https by default; should be specified
+            url = 'https://' + url
+            self.log.warning(
+                f"Adding missing protocol 'https://' to JupyterHub.public_url = {url!r}"
+            )
+        return url
+
    subdomain_host = Unicode(
        '',
        help="""Run single-user servers on subdomains of this host.
@@ -721,15 +776,18 @@ class JupyterHub(Application):
            # host should include '://'
            # if not specified, assume https: You have to be really explicit about HTTP!
            new = 'https://' + new
+            self.log.warning(
+                f"Adding missing protocol 'https://' to JupyterHub.subdomain_host = {new!r}"
+            )
        return new

    domain = Unicode(help="domain name, e.g. 'example.com' (excludes protocol, port)")

    @default('domain')
    def _domain_default(self):
-        if not self.subdomain_host:
+        if not (self.public_url or self.subdomain_host):
            return ''
-        return urlparse(self.subdomain_host).hostname
+        return urlparse(self.public_url or self.subdomain_host).hostname

    subdomain_hook = Union(
        [Callable(), Unicode()],
@@ -1941,10 +1999,15 @@ class JupyterHub(Application):

    def init_hub(self):
        """Load the Hub URL config"""
+        if self.public_url:
+            # host = scheme://hostname:port (no path)
+            public_host = urlunparse(urlparse(self.public_url)._replace(path=""))
+        else:
+            public_host = self.subdomain_host
        hub_args = dict(
            base_url=self.hub_prefix,
            routespec=self.hub_routespec,
-            public_host=self.subdomain_host,
+            public_host=public_host,
            certfile=self.internal_ssl_cert,
            keyfile=self.internal_ssl_key,
            cafile=self.internal_ssl_ca,
@@ -2462,9 +2525,9 @@ class JupyterHub(Application):
        """

        name = orm_service.name
-        if self.domain:
+        if self.subdomain_host:
            parsed_host = urlparse(self.subdomain_host)
-            domain = self.subdomain_hook(name, self.domain, kind="service")
+            domain = self.subdomain_hook(name, parsed_host.hostname, kind="service")
            host = f"{parsed_host.scheme}://{domain}"
            if parsed_host.port:
                host = f"{host}:{parsed_host.port}"
@@ -2526,9 +2589,9 @@ class JupyterHub(Application):

        name = spec['name']

-        if self.domain:
+        if self.subdomain_host:
            parsed_host = urlparse(self.subdomain_host)
-            domain = self.subdomain_hook(name, self.domain, kind="service")
+            domain = self.subdomain_hook(name, parsed_host.hostname, kind="service")
            host = f"{parsed_host.scheme}://{domain}"
            if parsed_host.port:
                host = f"{host}:{parsed_host.port}"
@@ -2974,6 +3037,7 @@ class JupyterHub(Application):
            spawner_class=self.spawner_class,
            base_url=self.base_url,
            default_url=self.default_url,
+            public_url=urlparse(self.public_url) if self.public_url else "",
            cookie_secret=self.cookie_secret,
            cookie_max_age_days=self.cookie_max_age_days,
            redirect_to_server=self.redirect_to_server,
--- a/jupyterhub/handlers/base.py
+++ b/jupyterhub/handlers/base.py
@@ -138,6 +138,10 @@ class BaseHandler(RequestHandler):
    def domain(self):
        return self.settings['domain']

+    @property
+    def public_url(self):
+        return self.settings['public_url']
+
    @property
    def db(self):
        return self.settings['db']
@@ -577,8 +581,13 @@ class BaseHandler(RequestHandler):
        # tornado <4.2 have a bug that consider secure==True as soon as
        # 'secure' kwarg is passed to set_secure_cookie
        kwargs = {'httponly': True}
-        if self.request.protocol == 'https':
-            kwargs['secure'] = True
+        public_url = self.settings.get("public_url")
+        if public_url:
+            if public_url.scheme == 'https':
+                kwargs['secure'] = True
+        else:
+            if self.request.protocol == 'https':
+                kwargs['secure'] = True

        kwargs.update(self.settings.get('cookie_options', {}))
        kwargs.update(overrides)
@@ -670,8 +679,15 @@ class BaseHandler(RequestHandler):
        next_url = self.get_argument('next', default='')
        # protect against some browsers' buggy handling of backslash as slash
        next_url = next_url.replace('\\', '%5C')
-        proto = get_browser_protocol(self.request)
-        host = self.request.host
+        public_url = self.settings.get("public_url")
+        if public_url:
+            proto = public_url.scheme
+            host = public_url.netloc
+        else:
+            # guess from request
+            proto = get_browser_protocol(self.request)
+            host = self.request.host
+
        if next_url.startswith("///"):
            # strip more than 2 leading // down to 2
            # because urlparse treats that as empty netloc,
--- a/jupyterhub/services/auth.py
+++ b/jupyterhub/services/auth.py
@@ -37,7 +37,7 @@ import uuid
 import warnings
 from http import HTTPStatus
 from unittest import mock
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urlparse

 from tornado.httpclient import AsyncHTTPClient, HTTPRequest
 from tornado.httputil import url_concat
@@ -924,8 +924,13 @@ class HubOAuth(HubAuth):
            # OAuth that doesn't complete shouldn't linger too long.
            'max_age': 600,
        }
-        if get_browser_protocol(handler.request) == 'https':
-            kwargs['secure'] = True
+        public_url = os.getenv("JUPYTERHUB_PUBLIC_URL")
+        if public_url:
+            if urlparse(public_url).scheme == 'https':
+                kwargs['secure'] = True
+        else:
+            if get_browser_protocol(handler.request) == 'https':
+                kwargs['secure'] = True
        # load user cookie overrides
        kwargs.update(self.cookie_options)
        handler.set_secure_cookie(cookie_name, b64_state, **kwargs)
--- a/jupyterhub/spawner.py
+++ b/jupyterhub/spawner.py
@@ -162,6 +162,8 @@ class Spawner(LoggingConfigurable):
    hub = Any()
    orm_spawner = Any()
    cookie_options = Dict()
+    public_url = Unicode(help="Public URL of this spawner's server")
+    public_hub_url = Unicode(help="Public URL of the Hub itself")

    db = Any()

@@ -1047,6 +1049,10 @@ class Spawner(LoggingConfigurable):
        bind_url = f"{proto}://{self.ip}:{self.port}{base_url}"
        env["JUPYTERHUB_SERVICE_URL"] = bind_url

+        # the public URLs of this server and the Hub
+        env["JUPYTERHUB_PUBLIC_URL"] = self.public_url
+        env["JUPYTERHUB_PUBLIC_HUB_URL"] = self.public_hub_url
+
        # Put in limit and guarantee info if they exist.
        # Note that this is for use by the humans / notebook extensions in the
        # single-user notebook server, and not for direct usage by the spawners
--- a/jupyterhub/tests/test_user.py
+++ b/jupyterhub/tests/test_user.py
@@ -1,3 +1,6 @@
+from unittest import mock
+from urllib.parse import urlparse
+
 import pytest

 from .. import orm
@@ -66,3 +69,42 @@ def test_sync_groups(app, user, group_names):
 def test_server_url(app, user, server_name, path):
    user_url = user.url
    assert user.server_url(server_name) == user_url + path
+
+
+@pytest.mark.parametrize(
+    "server_name, public_url, subdomain_host, expected_url",
+    [
+        ("", "", "", ""),
+        ("name", "", "", ""),
+        ("", "https://hub.tld/PREFIX/", "", "https://hub.tld/PREFIX/user/USERNAME/"),
+        (
+            "name",
+            "https://hub.tld/PREFIX/",
+            "",
+            "https://hub.tld/PREFIX/user/USERNAME/name/",
+        ),
+        (
+            "name",
+            "",
+            "https://hub.tld:123",
+            "https://USERNAME.hub.tld:123/PREFIX/user/USERNAME/name/",
+        ),
+    ],
+)
+def test_public_url(app, user, server_name, public_url, subdomain_host, expected_url):
+    expected_url = expected_url.replace("USERNAME", user.escaped_name).replace(
+        "PREFIX", app.base_url.strip("/")
+    )
+    if public_url:
+        public_url = public_url.replace("PREFIX", app.base_url.strip("/"))
+        public_url = urlparse(public_url)
+    with mock.patch.dict(
+        user.settings,
+        {
+            "subdomain_host": subdomain_host,
+            "domain": urlparse(subdomain_host).hostname,
+            "public_url": public_url,
+        },
+    ):
+        public_server_url = user.public_url(server_name)
+    assert public_server_url == expected_url
--- a/jupyterhub/user.py
+++ b/jupyterhub/user.py
@@ -4,7 +4,7 @@ import json
 import warnings
 from collections import defaultdict
 from datetime import timedelta
-from urllib.parse import quote, urlparse
+from urllib.parse import quote, urlparse, urlunparse

 from sqlalchemy import inspect
 from tornado import gen, web
@@ -438,6 +438,20 @@ class User:
            )
            spawn_kwargs.update(ssl_kwargs)

+        # public URLs
+        if self.settings.get("public_url"):
+            public_url = self.settings["public_url"]
+            hub = self.settings.get('hub')
+            if hub is None:
+                # only in mock tests
+                hub_path = "/hub/"
+            else:
+                hub_path = hub.base_url
+            spawn_kwargs["public_hub_url"] = urlunparse(
+                public_url._replace(path=hub_path)
+            )
+        spawn_kwargs["public_url"] = self.public_url(server_name)
+
        # update with kwargs. Mainly for testing.
        spawn_kwargs.update(kwargs)
        spawner = spawner_class(**spawn_kwargs)
@@ -541,12 +555,19 @@ class User:
    @property
    def host(self):
        """Get the *host* for my server (proto://domain[:port])"""
-        # FIXME: escaped_name probably isn't escaped enough in general for a domain fragment
-        parsed = urlparse(self.settings['subdomain_host'])
-        h = f'{parsed.scheme}://{self.domain}'
-        if parsed.port:
-            h += ':%i' % parsed.port
-        return h
+        # if subdomains are used, use our domain
+
+        if self.settings.get('subdomain_host'):
+            parsed = urlparse(self.settings['subdomain_host'])
+            h = f"{parsed.scheme}://{self.domain}"
+            if parsed.port:
+                h = f"{h}:{parsed.port}"
+            return h
+        elif self.settings.get("public_url"):
+            # no subdomain, use public host url without path
+            return urlunparse(self.settings["public_url"]._replace(path=""))
+        else:
+            return ""

    @property
    def url(self):
@@ -554,8 +575,8 @@ class User:

        Full name.domain/path if using subdomains, otherwise just my /base/url
        """
-        if self.settings.get('subdomain_host'):
-            return f'{self.host}{self.base_url}'
+        if self.settings.get("subdomain_host"):
+            return f"{self.host}{self.base_url}"
        else:
            return self.base_url

@@ -566,6 +587,24 @@ class User:
        else:
            return url_path_join(self.url, url_escape_path(server_name), "/")

+    def public_url(self, server_name=''):
+        """Get the public URL of a server by name
+
+        Like server_url, but empty if no public URL is specified
+        """
+        # server_url will be a full URL if using subdomains
+        url = self.server_url(server_name)
+        if "://" not in url:
+            # not using subdomains, public URL may be specified
+            if self.settings.get("public_url"):
+                # add server's base_url path prefix to public host
+                url = urlunparse(self.settings["public_url"]._replace(path=url))
+            else:
+                # no public url (from subdomain or host),
+                # leave unspecified
+                url = ""
+        return url
+
    def progress_url(self, server_name=''):
        """API URL for progress endpoint for a server with a given name"""
        url_parts = [self.settings['hub'].base_url, 'api/users', self.escaped_name]