Merge pull request #2721 from minrk/async-init-spawners

Add JupyterHub.init_spawners_timeout
This commit is contained in:
Min RK
2019-09-24 11:08:16 +02:00
committed by GitHub
5 changed files with 108 additions and 11 deletions

View File

@@ -589,11 +589,14 @@ class SpawnProgressAPIHandler(APIHandler):
async with aclosing(
iterate_until(spawn_future, spawner._generate_progress())
) as events:
try:
async for event in events:
# don't allow events to sneakily set the 'ready' flag
if 'ready' in event:
event.pop('ready', None)
await self.send_event(event)
except asyncio.CancelledError:
pass
# progress finished, wait for spawn to actually resolve,
# in case progress finished early

View File

@@ -11,8 +11,10 @@ import re
import signal
import socket
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from functools import partial
from getpass import getuser
@@ -984,6 +986,28 @@ class JupyterHub(Application):
""",
).tag(config=True)
init_spawners_timeout = Integer(
    10,
    help="""
    Timeout (in seconds) to wait for spawners to initialize

    Checking if spawners are healthy can take a long time
    if many spawners are active at hub start time.

    If it takes longer than this timeout to check,
    init_spawner will be left to complete in the background
    and the http server is allowed to start.

    A timeout of -1 means wait forever,
    which can mean a slow startup of the Hub
    but ensures that the Hub is fully consistent by the time it starts responding to requests.
    This matches the behavior of jupyterhub 1.0.

    .. versionadded:: 1.1.0
    """,
).tag(config=True)
db_url = Unicode(
'sqlite:///jupyterhub.sqlite',
help="url for the database. e.g. `sqlite:///jupyterhub.sqlite`",
@@ -1835,6 +1859,7 @@ class JupyterHub(Application):
)
async def init_spawners(self):
self.log.debug("Initializing spawners")
db = self.db
def _user_summary(user):
@@ -1925,6 +1950,8 @@ class JupyterHub(Application):
else:
self.log.debug("%s not running", spawner._log_name)
spawner._check_pending = False
# parallelize checks for running Spawners
check_futures = []
for orm_user in db.query(orm.User):
@@ -1935,10 +1962,21 @@ class JupyterHub(Application):
# spawner should be running
# instantiate Spawner wrapper and check if it's still alive
spawner = user.spawners[name]
# signal that check is pending to avoid race conditions
spawner._check_pending = True
f = asyncio.ensure_future(check_spawner(user, name, spawner))
check_futures.append(f)
TOTAL_USERS.set(len(self.users))
# it's important that we get here before the first await
# so that we know all spawners are instantiated and in the check-pending state
# await checks after submitting them all
if check_futures:
self.log.debug(
"Awaiting checks for %i possibly-running spawners", len(check_futures)
)
await gen.multi(check_futures)
db.commit()
@@ -1949,7 +1987,7 @@ class JupyterHub(Application):
active_counts = self.users.count_active_users()
RUNNING_SERVERS.set(active_counts['active'])
TOTAL_USERS.set(len(self.users))
return len(check_futures)
def init_oauth(self):
base_url = self.hub.base_url
@@ -2112,6 +2150,7 @@ class JupyterHub(Application):
super().initialize(*args, **kwargs)
if self.generate_config or self.generate_certs or self.subapp:
return
self._start_future = asyncio.Future()
self.load_config_file(self.config_file)
self.init_logging()
if 'JupyterHubApp' in self.config:
@@ -2162,11 +2201,61 @@ class JupyterHub(Application):
self.init_services()
await self.init_api_tokens()
self.init_tornado_settings()
await self.init_spawners()
self.cleanup_oauth_clients()
self.init_handlers()
self.init_tornado_application()
# init_spawners can take a while
init_spawners_timeout = self.init_spawners_timeout
if init_spawners_timeout < 0:
# negative timeout means forever (previous, most stable behavior)
init_spawners_timeout = 86400
print(init_spawners_timeout)
init_start_time = time.perf_counter()
init_spawners_future = asyncio.ensure_future(self.init_spawners())
def log_init_time(f):
n_spawners = f.result()
self.log.info(
"Initialized %i spawners in %.3f seconds",
n_spawners,
time.perf_counter() - init_start_time,
)
init_spawners_future.add_done_callback(log_init_time)
try:
# don't allow a zero timeout because we still need to be sure
# that the Spawner objects are defined and pending
await gen.with_timeout(
timedelta(seconds=max(init_spawners_timeout, 1)), init_spawners_future
)
except gen.TimeoutError:
self.log.warning(
"init_spawners did not complete within %i seconds. "
"Allowing to complete in the background.",
self.init_spawners_timeout,
)
if init_spawners_future.done():
self.cleanup_oauth_clients()
else:
# schedule async operations after init_spawners finishes
async def finish_init_spawners():
await init_spawners_future
# schedule cleanup after spawners are all set up
# because it relies on the state resolved by init_spawners
self.cleanup_oauth_clients()
# trigger a proxy check as soon as all spawners are ready
# because this may be *after* the check made as part of normal startup.
# To avoid races with partially-complete start,
# ensure that start is complete before running this check.
await self._start_future
await self.proxy.check_routes(self.users, self._service_map)
asyncio.ensure_future(finish_init_spawners())
async def cleanup(self):
"""Shutdown managed services and various subprocesses. Cleanup runtime files."""
@@ -2452,6 +2541,7 @@ class JupyterHub(Application):
atexit.register(self.atexit)
# register cleanup on both TERM and INT
self.init_signal()
self._start_future.set_result(None)
def init_signal(self):
loop = asyncio.get_event_loop()

View File

@@ -879,7 +879,7 @@ class BaseHandler(RequestHandler):
# clear spawner._spawn_future when it's done
# keep an exception around, though, to prevent repeated implicit spawns
# if spawn is failing
if f.exception() is None:
if f.cancelled() or f.exception() is None:
spawner._spawn_future = None
# Now we're all done. clear _spawn_pending flag
spawner._spawn_pending = False
@@ -890,7 +890,7 @@ class BaseHandler(RequestHandler):
# update failure count and abort if consecutive failure limit
# is reached
def _track_failure_count(f):
if f.exception() is None:
if f.cancelled() or f.exception() is None:
# spawn succeeded, reset failure count
self.settings['failure_count'] = 0
return

View File

@@ -86,6 +86,7 @@ class Spawner(LoggingConfigurable):
_start_pending = False
_stop_pending = False
_proxy_pending = False
_check_pending = False
_waiting_for_response = False
_jupyterhub_version = None
_spawn_future = None
@@ -121,6 +122,8 @@ class Spawner(LoggingConfigurable):
return 'spawn'
elif self._stop_pending:
return 'stop'
elif self._check_pending:
return 'check'
return None
@property

View File

@@ -727,6 +727,7 @@ class User:
spawner = self.spawners[server_name]
spawner._spawn_pending = False
spawner._start_pending = False
spawner._check_pending = False
spawner.stop_polling()
spawner._stop_pending = True