Compare commits

...

17 Commits

Author SHA1 Message Date
Min RK
17851b7586 0.8.0b3 2017-08-26 13:51:12 -04:00
Min RK
118e2fa610 Merge pull request #1364 from minrk/test-start-stop-race
exercise start/stop race conditions
2017-08-26 13:37:41 -04:00
Min RK
8e3553462c exercise start/stop race conditions
this doesn’t cover all the edge cases of each possible stage for the races, but it gets the basics covered.
2017-08-26 11:57:05 -04:00
Carol Willing
37da47d811 Merge pull request #1356 from minrk/proxy-race
rework spawn futures to fix races
2017-08-26 11:07:55 -04:00
Min RK
a640a468fb Merge pull request #1362 from stuartcampbell/master
Improve help comments for SSL key/certs configuration parameters
2017-08-26 09:41:47 -04:00
Min RK
92f034766e Merge pull request #1355 from minrk/update-oauth-secret
update oauth secret if API tokens change
2017-08-26 09:41:14 -04:00
Min RK
f7ea451df8 get the tests running 2017-08-25 18:12:15 -04:00
Stuart Campbell
1b7f54b462 Make SSL cert/key help clearer. 2017-08-25 14:52:23 -04:00
Stuart Campbell
b14b12231a Correct typo to have consistent comments 2017-08-24 16:53:25 -04:00
Min RK
2866be9462 don’t allow start while stop is pending
- start fails with 400 if stop is pending
- set spawn_pending across a whole spawn (including proxy)
- proxy_pending is only around the proxy
2017-08-23 23:35:19 -04:00
Min RK
f8648644bf ensure _stop_pending is always True on stop_single_user
previously there was a race during `delete_route`

apply the same logic as _start_pending
2017-08-23 18:30:49 -04:00
Min RK
69d4d48db0 rework spawn futures to fix races
1. set _proxy_pending before first wait to ensure that there is never a gap between setting spawn flags
2. always call `finish_user_spawn` to reduce the number of finalization cases
3. wait for proxy to finish on the slow_spawn timeout, not just start, because we are only interested in the total duration for page responsiveness
2017-08-21 11:27:30 +02:00
Min RK
df309749f2 update oauth secret if API tokens change
handle will_resume case correctly, where an API token *may* be re-used.

Previously, we only did it right if the token was *always* reused,
but clearing out a container would get it into a bad state.
2017-08-21 11:23:17 +02:00
Min RK
58751067db Merge pull request #1354 from minrk/log-typo
typo: use app_log, not self.log
2017-08-20 15:49:56 +02:00
Min RK
4fd70cf79b app_log typo 2017-08-20 15:48:55 +02:00
Carol Willing
ff15bad375 Merge pull request #1353 from minrk/log-connection-error
log error when failing to connect to Hub
2017-08-20 10:45:32 +02:00
Min RK
cba5bb1676 log error when failing to connect to Hub
for better diagnosis
2017-08-20 10:03:52 +02:00
13 changed files with 233 additions and 115 deletions

View File

@@ -7,7 +7,7 @@ version_info = (
0,
8,
0,
'b2',
'b3',
)
__version__ = '.'.join(map(str, version_info))

View File

@@ -104,22 +104,17 @@ class APIHandler(BaseHandler):
'pending': None,
'last_activity': user.last_activity.isoformat(),
}
if user.spawners['']._spawn_pending:
model['pending'] = 'spawn'
elif user.spawners['']._stop_pending:
model['pending'] = 'stop'
model['pending'] = user.spawners[''].pending or None
if self.allow_named_servers:
servers = model['servers'] = {}
for name, spawner in user.spawners.items():
if spawner.ready:
servers[name] = s = {'name': name}
if spawner._spawn_pending:
s['pending'] = 'spawn'
elif spawner._stop_pending:
s['pending'] = 'stop'
if spawner.pending:
s['pending'] = spawner.pending
if spawner.server:
s['url'] = user.url + name
s['url'] = user.url + name + '/'
return model
def group_model(self, group):

View File

@@ -178,19 +178,34 @@ class UserAPIHandler(APIHandler):
class UserServerAPIHandler(APIHandler):
"""Start and stop single-user servers"""
@gen.coroutine
@admin_or_self
def post(self, name, server_name=''):
user = self.find_user(name)
if server_name:
if not self.allow_named_servers:
if server_name and not self.allow_named_servers:
raise web.HTTPError(400, "Named servers are not enabled.")
if self.allow_named_servers and not server_name:
server_name = user.default_server_name()
spawner = user.spawners[server_name]
pending = spawner.pending
if pending == 'spawn':
self.set_header('Content-Type', 'text/plain')
self.set_status(202)
return
elif pending:
raise web.HTTPError(400, "%s is pending %s" % (spawner._log_name, pending))
if spawner.ready:
# include notify, so that a server that died is noticed immediately
# set _spawn_pending flag to prevent races while we wait
spawner._spawn_pending = True
try:
state = yield spawner.poll_and_notify()
finally:
spawner._spawn_pending = False
if state is None:
raise web.HTTPError(400, "%s's server %s is already running" % (name, server_name))
raise web.HTTPError(400, "%s is already running" % spawner._log_name)
options = self.get_json_body()
yield self.spawn_single_user(user, server_name, options=options)
@@ -209,17 +224,21 @@ class UserServerAPIHandler(APIHandler):
raise web.HTTPError(404, "%s has no server named '%s'" % (name, server_name))
spawner = user.spawners[server_name]
if spawner._stop_pending:
if spawner.pending == 'stop':
self.log.debug("%s already stopping", spawner._log_name)
self.set_header('Content-Type', 'text/plain')
self.set_status(202)
return
if not spawner.ready:
raise web.HTTPError(400, "%s's server %s is not running" % (name, server_name))
raise web.HTTPError(
400, "%s is not running %s" %
(spawner._log_name, '(pending: %s)' % spawner.pending if spawner.pending else '')
)
# include notify, so that a server that died is noticed immediately
status = yield spawner.poll_and_notify()
if status is not None:
raise web.HTTPError(400, "%s's server %s is not running" % (name, server_name))
raise web.HTTPError(400, "%s is not running" % spawner._log_name)
yield self.stop_single_user(user, server_name)
status = 202 if spawner._stop_pending else 204
self.set_header('Content-Type', 'text/plain')

View File

@@ -291,13 +291,13 @@ class JupyterHub(Application):
ssl_key = Unicode('',
help="""Path to SSL key file for the public facing interface of the proxy
Use with ssl_cert
When setting this, you should also set ssl_cert
"""
).tag(config=True)
ssl_cert = Unicode('',
help="""Path to SSL certificate file for the public facing interface of the proxy
Use with ssl_key
When setting this, you should also set ssl_key
"""
).tag(config=True)
ip = Unicode('',

View File

@@ -20,7 +20,7 @@ from .. import __version__
from .. import orm
from ..objects import Server
from ..spawner import LocalProcessSpawner
from ..utils import url_path_join, exponential_backoff
from ..utils import default_server_name, url_path_join, exponential_backoff
# pattern for the authentication token header
auth_header_pat = re.compile(r'^(?:token|bearer)\s+([^\s]+)$', flags=re.IGNORECASE)
@@ -376,8 +376,16 @@ class BaseHandler(RequestHandler):
@gen.coroutine
def spawn_single_user(self, user, server_name='', options=None):
if server_name in user.spawners and user.spawners[server_name].pending == 'spawn':
raise RuntimeError("Spawn already pending for: %s" % user.name)
user_server_name = user.name
if self.allow_named_servers and not server_name:
server_name = default_server_name(user)
if server_name:
user_server_name = '%s:%s' % (user.name, server_name)
if server_name in user.spawners and user.spawners[server_name].pending:
pending = user.spawners[server_name].pending
raise RuntimeError("%s pending %s" % (user_server_name, pending))
# count active servers and pending spawns
# we could do careful bookkeeping to avoid
@@ -397,26 +405,20 @@ class BaseHandler(RequestHandler):
)
raise web.HTTPError(
429,
"User startup rate limit exceeded. Try again in a few minutes.")
"User startup rate limit exceeded. Try again in a few minutes.",
)
if active_server_limit and active_count >= active_server_limit:
self.log.info(
'%s servers active, no space available',
active_count,
)
raise web.HTTPError(
429,
"Active user limit exceeded. Try again in a few minutes.")
raise web.HTTPError(429, "Active user limit exceeded. Try again in a few minutes.")
tic = IOLoop.current().time()
user_server_name = user.name
if server_name:
user_server_name = '%s:%s' % (user.name, server_name)
else:
user_server_name = user.name
self.log.debug("Initiating spawn for %s", user_server_name)
f = user.spawn(server_name, options)
spawn_future = user.spawn(server_name, options)
self.log.debug("%i%s concurrent spawns",
spawn_pending_count,
@@ -426,22 +428,28 @@ class BaseHandler(RequestHandler):
'/%i' % active_server_limit if active_server_limit else '')
spawner = user.spawners[server_name]
# set spawn_pending now, so there's no gap where _spawn_pending is False
# while we are waiting for _proxy_pending to be set
spawner._spawn_pending = True
@gen.coroutine
def finish_user_spawn(f=None):
def finish_user_spawn():
"""Finish the user spawn by registering listeners and notifying the proxy.
If the spawner is slow to start, this is passed as an async callback,
otherwise it is called immediately.
"""
if f and f.exception() is not None:
# failed, don't add to the proxy
return
# wait for spawn Future
try:
yield spawn_future
except Exception:
spawner._spawn_pending = False
raise
toc = IOLoop.current().time()
self.log.info("User %s took %.3f seconds to start", user_server_name, toc-tic)
self.statsd.timing('spawner.success', (toc - tic) * 1000)
try:
spawner._proxy_pending = True
try:
yield self.proxy.add_user(user, server_name)
except Exception:
self.log.exception("Failed to add %s to proxy!", user_server_name)
@@ -451,37 +459,41 @@ class BaseHandler(RequestHandler):
spawner.add_poll_callback(self.user_stopped, user, server_name)
finally:
spawner._proxy_pending = False
spawner._spawn_pending = False
try:
yield gen.with_timeout(timedelta(seconds=self.slow_spawn_timeout), f)
yield gen.with_timeout(timedelta(seconds=self.slow_spawn_timeout), finish_user_spawn())
except gen.TimeoutError:
# waiting_for_response indicates server process has started,
# but is yet to become responsive.
if not spawner._waiting_for_response:
if spawner._spawn_pending and not spawner._waiting_for_response:
# still in Spawner.start, which is taking a long time
# we shouldn't poll while spawn is incomplete.
self.log.warning("User %s is slow to start (timeout=%s)",
user_server_name, self.slow_spawn_timeout)
# schedule finish for when the user finishes spawning
IOLoop.current().add_future(f, finish_user_spawn)
else:
return
# start has finished, but the server hasn't come up
# check if the server died while we were waiting
status = yield user.spawner.poll()
if status is None:
# hit timeout, but server's running. Hope that it'll show up soon enough,
# though it's possible that it started at the wrong URL
self.log.warning("User %s is slow to become responsive (timeout=%s)",
user_server_name, self.slow_spawn_timeout)
self.log.debug("Expecting server for %s at: %s", user_server_name, spawner.server.url)
# schedule finish for when the user finishes spawning
IOLoop.current().add_future(f, finish_user_spawn)
else:
status = yield spawner.poll()
if status is not None:
toc = IOLoop.current().time()
self.statsd.timing('spawner.failure', (toc - tic) * 1000)
raise web.HTTPError(500, "Spawner failed to start [status=%s]" % status)
else:
yield finish_user_spawn()
if spawner._waiting_for_response:
# hit timeout waiting for response, but server's running.
# Hope that it'll show up soon enough,
# though it's possible that it started at the wrong URL
self.log.warning("User %s is slow to become responsive (timeout=%s)",
user_server_name, self.slow_spawn_timeout)
self.log.debug("Expecting server for %s at: %s",
user_server_name, spawner.server.url)
if spawner._proxy_pending:
# User.spawn finished, but it hasn't been added to the proxy
# Could be due to load or a slow proxy
self.log.warning("User %s is slow to be added to the proxy (timeout=%s)",
user_server_name, self.slow_spawn_timeout)
@gen.coroutine
def user_stopped(self, user, server_name):
@@ -501,36 +513,37 @@ class BaseHandler(RequestHandler):
if name not in user.spawners:
raise KeyError("User %s has no such spawner %r", user.name, name)
spawner = user.spawners[name]
if spawner._stop_pending:
raise RuntimeError("Stop already pending for: %s:%s" % (user.name, name))
tic = IOLoop.current().time()
yield self.proxy.delete_user(user, name)
f = user.stop()
@gen.coroutine
def finish_stop(f=None):
"""Finish the stop action by noticing that the user is stopped.
if spawner.pending:
raise RuntimeError("%s pending %s" % (spawner._log_name, spawner.pending))
# set user._stop_pending before doing anything async
# to avoid races
spawner._stop_pending = True
If the spawner is slow to stop, this is passed as an async callback,
otherwise it is called immediately.
@gen.coroutine
def stop():
"""Stop the server
1. remove it from the proxy
2. stop the server
3. notice that it stopped
"""
if f and f.exception() is not None:
# failed, don't do anything
return
tic = IOLoop.current().time()
try:
yield self.proxy.delete_user(user, name)
yield user.stop(name)
finally:
spawner._stop_pending = False
toc = IOLoop.current().time()
self.log.info("User %s server took %.3f seconds to stop", user.name, toc - tic)
try:
yield gen.with_timeout(timedelta(seconds=self.slow_stop_timeout), f)
yield gen.with_timeout(timedelta(seconds=self.slow_stop_timeout), stop())
except gen.TimeoutError:
if spawner._stop_pending:
# hit timeout, but stop is still pending
self.log.warning("User %s:%s server is slow to stop", user.name, name)
# schedule finish for when the server finishes stopping
IOLoop.current().add_future(f, finish_stop)
else:
raise
else:
yield finish_stop()
#---------------------------------------------------------------
# template rendering
@@ -653,7 +666,8 @@ class UserSpawnHandler(BaseHandler):
# logged in as correct user, spawn the server
spawner = current_user.spawner
if spawner._spawn_pending or spawner._proxy_pending:
if spawner.pending:
self.log.info("%s is pending %s", spawner._log_name, spawner.pending)
# spawn has started, but not finished
self.statsd.incr('redirects.user_spawn_pending', 1)
html = self.render_template("spawn_pending.html", user=current_user)

View File

@@ -84,10 +84,11 @@ class LoginHandler(BaseHandler):
if user:
already_running = False
if user.spawner:
if user.spawner.ready:
status = yield user.spawner.poll()
already_running = (status is None)
if not already_running and not user.spawner.options_form:
if not already_running and not user.spawner.options_form \
and not user.spawner.pending:
# logging in triggers spawn
yield self.spawn_single_user(user)
self.redirect(self.get_next_url())

View File

@@ -115,6 +115,10 @@ class SpawnHandler(BaseHandler):
self.log.warning("User is already running: %s", url)
self.redirect(url)
return
if user.spawner.pending:
raise web.HTTPError(
400, "%s is pending %s" % (user.spawner._log_name, user.spawner.pending)
)
form_options = {}
for key, byte_list in self.request.body_arguments.items():
form_options[key] = [ bs.decode('utf8') for bs in byte_list ]

View File

@@ -231,9 +231,10 @@ class Proxy(LoggingConfigurable):
user.name, spawner.proxy_spec, spawner.server.host,
)
if spawner._spawn_pending:
if spawner.pending and spawner.pending != 'spawn':
raise RuntimeError(
"User %s's spawn is pending, shouldn't be added to the proxy yet!", user.name)
"%s is pending %s, shouldn't be added to the proxy yet!" % (spawner._log_name, spawner.pending)
)
yield self.add_route(
spawner.proxy_spec,
@@ -326,7 +327,7 @@ class Proxy(LoggingConfigurable):
spec, route['target'], spawner.server,
)
futures.append(self.add_user(user, name))
elif spawner._proxy_pending:
elif spawner._spawn_pending:
good_routes.add(spawner.proxy_spec)
# check service routes

View File

@@ -243,7 +243,8 @@ class HubAuth(Configurable):
headers.setdefault('Authorization', 'token %s' % self.api_token)
try:
r = requests.request(method, url, **kwargs)
except requests.ConnectionError:
except requests.ConnectionError as e:
app_log.error("Error connecting to %s: %s", self.api_url, e)
msg = "Failed to connect to Hub API at %r." % self.api_url
msg += " Is the Hub accessible at this URL (from host: %s)?" % socket.gethostname()
if '127.0.0.1' in self.api_url:

View File

@@ -49,17 +49,29 @@ class Spawner(LoggingConfigurable):
# private attributes for tracking status
_spawn_pending = False
_start_pending = False
_stop_pending = False
_proxy_pending = False
_waiting_for_response = False
@property
def _log_name(self):
"""Return username:servername or username
Used in logging for consistency with named servers.
"""
if self.name:
return '%s:%s' % (self.user.name, self.name)
else:
return self.user.name
@property
def pending(self):
"""Return the current pending event, if any
Return False if nothing is pending.
"""
if self._spawn_pending or self._proxy_pending:
if self._spawn_pending:
return 'spawn'
elif self._stop_pending:
return 'stop'

View File

@@ -654,6 +654,50 @@ def test_active_server_limit(app, request):
assert counts['pending'] == 0
@mark.gen_test
def test_start_stop_race(app, no_patience, slow_spawn):
user = add_user(app.db, app, name='panda')
spawner = user.spawner
# start the server
r = yield api_request(app, 'users', user.name, 'server', method='post')
assert r.status_code == 202
assert spawner.pending == 'spawn'
# additional spawns while spawning shouldn't trigger a new spawn
with mock.patch.object(spawner, 'start') as m:
r = yield api_request(app, 'users', user.name, 'server', method='post')
assert r.status_code == 202
assert m.call_count == 0
# stop while spawning is not okay
r = yield api_request(app, 'users', user.name, 'server', method='delete')
assert r.status_code == 400
while not spawner.ready:
yield gen.sleep(0.1)
spawner.delay = 3
# stop the spawner
r = yield api_request(app, 'users', user.name, 'server', method='delete')
assert r.status_code == 202
assert spawner.pending == 'stop'
# make sure we get past deleting from the proxy
yield gen.sleep(1)
# additional stops while stopping shouldn't trigger a new stop
with mock.patch.object(spawner, 'stop') as m:
r = yield api_request(app, 'users', user.name, 'server', method='delete')
assert r.status_code == 202
assert m.call_count == 0
# start while stopping is not allowed
with mock.patch.object(spawner, 'start') as m:
r = yield api_request(app, 'users', user.name, 'server', method='post')
assert r.status_code == 400
while spawner.active:
yield gen.sleep(0.1)
# start after stop is okay
r = yield api_request(app, 'users', user.name, 'server', method='post')
assert r.status_code == 202
@mark.gen_test
def test_get_proxy(app):
r = yield api_request(app, 'proxy')

View File

@@ -38,6 +38,27 @@ def test_create_named_server(app, named_servers):
assert prefix == user.spawners[servername].server.base_url
assert prefix.endswith('/user/%s/%s/' % (username, servername))
r = yield api_request(app, 'users', username)
r.raise_for_status()
user_model = r.json()
user_model.pop('last_activity')
assert user_model == {
'name': username,
'groups': [],
'kind': 'user',
'admin': False,
'pending': None,
'server': None,
'servers': {
name: {
'name': name,
'url': url_path_join(user.url, name, '/'),
}
for name in ['1', servername]
},
}
@pytest.mark.gen_test
def test_delete_named_server(app, named_servers):
@@ -69,9 +90,9 @@ def test_delete_named_server(app, named_servers):
'servers': {
name: {
'name': name,
'url': url_path_join(user.url, name),
'url': url_path_join(user.url, name, '/'),
}
for name in ['1', servername]
for name in ['1']
},
}

View File

@@ -317,8 +317,6 @@ class User(HasTraits):
url of the server will be /user/:name/:server_name
"""
db = self.db
if self.allow_named_servers and not server_name:
server_name = default_server_name(self)
base_url = url_path_join(self.base_url, server_name) + '/'
@@ -356,11 +354,10 @@ class User(HasTraits):
oauth_client = client_store.fetch_by_client_id(client_id)
except ClientNotFoundError:
oauth_client = None
# create a new OAuth client + secret on every launch,
# except for resuming containers.
if oauth_client is None or not spawner.will_resume:
# create a new OAuth client + secret on every launch
# containers that resume will be updated below
client_store.add_client(client_id, api_token,
url_path_join(self.url, 'oauth_callback'),
url_path_join(self.url, server_name, 'oauth_callback'),
)
db.commit()
@@ -369,7 +366,7 @@ class User(HasTraits):
if (authenticator):
yield gen.maybe_future(authenticator.pre_spawn_start(self, spawner))
spawner._spawn_pending = True
spawner._start_pending = True
# wait for spawner.start to return
try:
# run optional preparation work to bootstrap the notebook
@@ -409,6 +406,13 @@ class User(HasTraits):
# use generated=False because we don't trust this token
# to have been generated properly
self.new_api_token(spawner.api_token, generated=False)
# update OAuth client secret with updated API token
if oauth_provider:
client_store = oauth_provider.client_authenticator.client_store
client_store.add_client(client_id, spawner.api_token,
url_path_join(self.url, server_name, 'oauth_callback'),
)
db.commit()
except Exception as e:
if isinstance(e, gen.TimeoutError):
@@ -428,6 +432,7 @@ class User(HasTraits):
user=self.name,
), exc_info=True)
# raise original exception
spawner._start_pending = False
raise e
spawner.start_polling()
@@ -469,7 +474,7 @@ class User(HasTraits):
_check_version(__version__, server_version, self.log)
finally:
spawner._waiting_for_response = False
spawner._spawn_pending = False
spawner._start_pending = False
return self
@gen.coroutine
@@ -480,6 +485,7 @@ class User(HasTraits):
"""
spawner = self.spawners[server_name]
spawner._spawn_pending = False
spawner._start_pending = False
spawner.stop_polling()
spawner._stop_pending = True
try: