handle Spawners that are slow to stop

e.g. docker, which can take a long time to stop,
especially if several docker actions are already queued.

Use status `202: Accepted` for API replies sent with spawn/stop still pending
This commit is contained in:
Min RK
2015-01-28 11:35:38 -08:00
parent dee8d4af00
commit 09d9efb44c
5 changed files with 103 additions and 15 deletions

View File

@@ -17,7 +17,7 @@ class BaseUserHandler(APIHandler):
return {
'name': user.name,
'admin': user.admin,
'server': user.server.base_url if user.server else None,
'server': user.server.base_url if user.server and not (user.spawn_pending or user.stop_pending) else None,
'last_activity': user.last_activity.isoformat(),
}
@@ -101,8 +101,12 @@ class UserAPIHandler(BaseUserHandler):
raise web.HTTPError(404)
if user.name == self.get_current_user().name:
raise web.HTTPError(400, "Cannot delete yourself!")
if user.stop_pending:
raise web.HTTPError(400, "%s's server is in the process of stopping, please wait." % name)
if user.spawner is not None:
yield self.stop_single_user(user)
if user.stop_pending:
raise web.HTTPError(400, "%s's server is in the process of stopping, please wait." % name)
yield gen.maybe_future(self.authenticator.delete_user(user))
@@ -136,16 +140,24 @@ class UserServerAPIHandler(BaseUserHandler):
raise web.HTTPError(400, "%s's server is already running" % name)
yield self.spawn_single_user(user)
self.set_status(201)
status = 202 if user.spawn_pending else 201
self.set_status(status)
@gen.coroutine
@admin_or_self
def delete(self, name):
user = self.find_user(name)
if user.stop_pending:
self.set_status(202)
return
if user.spawner is None:
raise web.HTTPError(400, "%s's server is not running" % name)
status = yield user.spawner.poll()
if status is not None:
raise web.HTTPError(400, "%s's server is not running" % name)
yield self.stop_single_user(user)
self.set_status(204)
status = 202 if user.stop_pending else 204
self.set_status(status)
default_handlers = [
(r"/api/users", UserListAPIHandler),

View File

@@ -163,12 +163,20 @@ class BaseHandler(RequestHandler):
def slow_spawn_timeout(self):
return self.settings.get('slow_spawn_timeout', 10)
@property
def slow_stop_timeout(self):
return self.settings.get('slow_stop_timeout', 10)
@property
def spawner_class(self):
return self.settings.get('spawner_class', LocalProcessSpawner)
@gen.coroutine
def spawn_single_user(self, user):
if user.spawn_pending:
raise RuntimeError("Spawn already pending for: %s" % user.name)
tic = IOLoop.current().time()
f = user.spawn(
spawner_class=self.spawner_class,
base_url=self.base_url,
@@ -185,6 +193,8 @@ class BaseHandler(RequestHandler):
if f and f.exception() is not None:
# failed, don't add to the proxy
return
toc = IOLoop.current().time()
self.log.info("User %s server took %.3f seconds to start", user.name, toc-tic)
yield self.proxy.add_user(user)
user.spawner.add_poll_callback(self.user_stopped, user)
@@ -215,8 +225,36 @@ class BaseHandler(RequestHandler):
@gen.coroutine
def stop_single_user(self, user):
if user.stop_pending:
raise RuntimeError("Stop already pending for: %s" % user.name)
tic = IOLoop.current().time()
f = user.stop()
yield self.proxy.delete_user(user)
yield user.stop()
@gen.coroutine
def finish_stop(f=None):
"""Finish the stop action by noticing that the user is stopped.
If the spawner is slow to stop, this is passed as an async callback,
otherwise it is called immediately.
"""
if f and f.exception() is not None:
# failed, don't do anything
return
toc = IOLoop.current().time()
self.log.info("User %s server took %.3f seconds to stop", user.name, toc-tic)
try:
yield gen.with_timeout(timedelta(seconds=self.slow_stop_timeout), f)
except gen.TimeoutError:
if user.stop_pending:
# hit timeout, but stop is still pending
self.log.warn("User %s server is slow to stop", user.name)
# schedule finish for when the server finishes stopping
IOLoop.current().add_future(f, finish_stop)
else:
raise
else:
yield finish_stop()
#---------------------------------------------------------------
# template rendering

View File

@@ -251,6 +251,7 @@ class User(Base):
state = Column(JSONDict)
spawner = None
spawn_pending = False
stop_pending = False
def __repr__(self):
if self.server:
@@ -362,6 +363,8 @@ class User(Base):
if self.spawner is None:
return
self.spawner.stop_polling()
self.stop_pending = True
try:
status = yield self.spawner.poll()
if status is None:
yield self.spawner.stop()
@@ -370,6 +373,8 @@ class User(Base):
self.last_activity = datetime.utcnow()
self.server = None
inspect(self).session.commit()
finally:
self.stop_pending = False
class APIToken(Base):

View File

@@ -49,9 +49,14 @@ class SlowSpawner(MockSpawner):
@gen.coroutine
def start(self):
yield gen.Task(IOLoop.current().add_timeout, timedelta(seconds=5))
yield gen.Task(IOLoop.current().add_timeout, timedelta(seconds=2))
yield super().start()
@gen.coroutine
def stop(self):
yield gen.Task(IOLoop.current().add_timeout, timedelta(seconds=2))
yield super().stop()
class NeverSpawner(MockSpawner):
"""A spawner that will never start"""

View File

@@ -207,25 +207,53 @@ def test_spawn(app, io_loop):
def test_slow_spawn(app, io_loop):
app.tornado_application.settings['spawner_class'] = mocking.SlowSpawner
app.tornado_application.settings['slow_spawn_timeout'] = 0
app.tornado_application.settings['slow_stop_timeout'] = 0
db = app.db
name = 'zoe'
user = add_user(db, name=name)
r = api_request(app, 'users', name, 'server', method='post')
r.raise_for_status()
assert r.status_code == 202
assert user.spawner is not None
assert user.spawn_pending
assert not user.stop_pending
dt = timedelta(seconds=0.1)
@gen.coroutine
def wait_pending():
def wait_spawn():
while user.spawn_pending:
yield gen.Task(io_loop.add_timeout, dt)
io_loop.run_sync(wait_pending)
io_loop.run_sync(wait_spawn)
assert not user.spawn_pending
status = io_loop.run_sync(user.spawner.poll)
assert status is None
@gen.coroutine
def wait_stop():
while user.stop_pending:
yield gen.Task(io_loop.add_timeout, dt)
r = api_request(app, 'users', name, 'server', method='delete')
r.raise_for_status()
assert r.status_code == 202
assert user.spawner is not None
assert user.stop_pending
r = api_request(app, 'users', name, 'server', method='delete')
r.raise_for_status()
assert r.status_code == 202
assert user.spawner is not None
assert user.stop_pending
io_loop.run_sync(wait_stop)
assert not user.stop_pending
assert user.spawner is not None
r = api_request(app, 'users', name, 'server', method='delete')
assert r.status_code == 400
def test_never_spawn(app, io_loop):
app.tornado_application.settings['spawner_class'] = mocking.NeverSpawner
app.tornado_application.settings['slow_spawn_timeout'] = 0