allow high latency spawners

such as VMs, batch and cloud services, etc. which can take minutes to start.

- Spawner.start_timeout (default: 60 seconds) sets the limit for Spawner.start to return;
  past that limit the spawn is treated as a true failure and the server is stopped.
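- BaseHandler.spawn_single_user only waits up to slow_spawn_timeout (default: 10 seconds)
  before returning. It can now return with a spawner still pending; in that case,
  registration with the proxy is deferred until the spawn finishes.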
- Record User.spawn_pending state, and render 'pending' page
  while server is starting but not started.
This commit is contained in:
Min RK
2014-12-19 16:19:53 -08:00
parent 370b6f18d3
commit 53880f52b8
4 changed files with 100 additions and 8 deletions

View File

@@ -4,13 +4,14 @@
# Distributed under the terms of the Modified BSD License. # Distributed under the terms of the Modified BSD License.
import re import re
from datetime import datetime from datetime import datetime, timedelta
from http.client import responses from http.client import responses
from jinja2 import TemplateNotFound from jinja2 import TemplateNotFound
from tornado.log import app_log from tornado.log import app_log
from tornado.httputil import url_concat from tornado.httputil import url_concat
from tornado.ioloop import IOLoop
from tornado.web import RequestHandler from tornado.web import RequestHandler
from tornado import gen, web from tornado import gen, web
@@ -160,25 +161,54 @@ class BaseHandler(RequestHandler):
# spawning-related # spawning-related
#--------------------------------------------------------------- #---------------------------------------------------------------
@property
def slow_spawn_timeout(self):
return self.settings.get('slow_spawn_timeout', 10)
@property @property
def spawner_class(self): def spawner_class(self):
return self.settings.get('spawner_class', LocalProcessSpawner) return self.settings.get('spawner_class', LocalProcessSpawner)
@gen.coroutine @gen.coroutine
def spawn_single_user(self, user): def spawn_single_user(self, user):
yield user.spawn( f = user.spawn(
spawner_class=self.spawner_class, spawner_class=self.spawner_class,
base_url=self.base_url, base_url=self.base_url,
hub=self.hub, hub=self.hub,
config=self.config, config=self.config,
) )
yield self.proxy.add_user(user) @gen.coroutine
user.spawner.add_poll_callback(self.user_stopped, user) def finish_user_spawn(f=None):
return user """Finish the user spawn by registering listeners and notifying the proxy.
If the spawner is slow to start, this is passed as an async callback,
otherwise it is called immediately.
"""
if f and f.exception() is not None:
# failed, don't add to the proxy
return
yield self.proxy.add_user(user)
user.spawner.add_poll_callback(self.user_stopped, user)
try:
yield gen.with_timeout(timedelta(seconds=self.slow_spawn_timeout), f)
except gen.TimeoutError:
if user.spawn_pending:
# hit timeout, but spawn is still pending
self.log.warn("User %s server is slow to start", user.name)
# schedule finish for when the user finishes spawning
IOLoop.current().add_future(f, finish_user_spawn)
else:
raise
else:
yield finish_user_spawn()
@gen.coroutine @gen.coroutine
def user_stopped(self, user): def user_stopped(self, user):
"""Callback that fires when the spawner has stopped"""
status = yield user.spawner.poll() status = yield user.spawner.poll()
if status is None:
status = 'unknown'
self.log.warn("User %s server stopped, with exit code: %s", self.log.warn("User %s server stopped, with exit code: %s",
user.name, status, user.name, status,
) )
@@ -279,6 +309,13 @@ class UserSpawnHandler(BaseHandler):
if current_user and current_user.name == name: if current_user and current_user.name == name:
# logged in, spawn the server # logged in, spawn the server
if current_user.spawner: if current_user.spawner:
if current_user.spawn_pending:
# spawn has started, but not finished
html = self.render_template("spawn_pending.html", user=current_user)
self.finish(html)
return
# spawn has supposedly finished, check on the status
status = yield current_user.spawner.poll() status = yield current_user.spawner.poll()
if status is not None: if status is not None:
yield self.spawn_single_user(current_user) yield self.spawn_single_user(current_user)

View File

@@ -3,7 +3,7 @@
# Copyright (c) Jupyter Development Team. # Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License. # Distributed under the terms of the Modified BSD License.
from datetime import datetime from datetime import datetime, timedelta
import errno import errno
import json import json
import socket import socket
@@ -250,6 +250,7 @@ class User(Base):
cookie_id = Column(Unicode, default=new_token) cookie_id = Column(Unicode, default=new_token)
state = Column(JSONDict) state = Column(JSONDict)
spawner = None spawner = None
spawn_pending = False
def __repr__(self): def __repr__(self):
if self.server: if self.server:
@@ -310,7 +311,23 @@ class User(Base):
spawner.clear_state() spawner.clear_state()
spawner.api_token = api_token spawner.api_token = api_token
yield spawner.start() self.spawn_pending = True
f = spawner.start()
# wait for spawner.start to return
try:
yield gen.with_timeout(timedelta(seconds=spawner.start_timeout), f)
except gen.TimeoutError as e:
self.log.warn("{user}'s server failed to start in {s} seconds, giving up".format(
user=self.name, s=spawner.start_timeout,
))
try:
yield self.stop()
except Exception:
self.log.error("Failed to cleanup {user}'s server that failed to start".format(
user=self.name,
), exc_info=True)
# raise original TimeoutError
raise e
spawner.start_polling() spawner.start_polling()
# store state # store state
@@ -320,7 +337,7 @@ class User(Base):
try: try:
yield self.server.wait_up(http=True) yield self.server.wait_up(http=True)
except TimeoutError as e: except TimeoutError as e:
self.log.warn("{user}'s server never started at {url}, giving up.".format( self.log.warn("{user}'s server never showed up at {url}, giving up".format(
user=self.name, url=self.server.url, user=self.name, url=self.server.url,
)) ))
try: try:
@@ -331,6 +348,7 @@ class User(Base):
), exc_info=True) ), exc_info=True)
# raise original TimeoutError # raise original TimeoutError
raise e raise e
self.spawn_pending = False
return self return self
@gen.coroutine @gen.coroutine
@@ -339,6 +357,7 @@ class User(Base):
and cleanup after it. and cleanup after it.
""" """
self.spawn_pending = False
if self.spawner is None: if self.spawner is None:
return return
self.spawner.stop_polling() self.spawner.stop_polling()

View File

@@ -40,6 +40,14 @@ class Spawner(LoggingConfigurable):
user = Any() user = Any()
hub = Any() hub = Any()
api_token = Unicode() api_token = Unicode()
start_timeout = Integer(60, config=True,
help="""Timeout (in seconds) before giving up on the spawner.
This is the timeout for start to return, not the timeout for the server to respond.
Callers of spawner.start will assume that startup has failed if it takes longer than this.
start should return when the server process is started and its location is known.
"""
)
poll_interval = Integer(30, config=True, poll_interval = Integer(30, config=True,
help="""Interval (in seconds) on which to poll the spawner.""" help="""Interval (in seconds) on which to poll the spawner."""

View File

@@ -0,0 +1,28 @@
{% extends "page.html" %}
{# Page telling the user that their server is starting up.
   It offers a manual refresh button and also reloads itself periodically. #}
{% block main %}
<div class="container">
<div class="row">
<div class="text-center">
<p>Your server is starting up.</p>
<p>You will be redirected automatically when it's ready for you.</p>
<a id="refresh" class="btn btn-lg btn-primary" href="#">refresh</a>
</div>
</div>
</div>
{% endblock %}
{% block script %}
<script type="text/javascript">
require(["jquery"], function ($) {
// Manual refresh: clicking the button reloads the current page.
$("#refresh").click(function () {
window.location.reload();
})
// Automatic refresh: reload the current page once, 5000 ms after this script runs.
// Each reload re-runs this script, so the page keeps re-checking for as long as
// this template is the one being served.
setTimeout(function () {
window.location.reload();
}, 5000);
});
</script>
{% endblock %}