Merge branch 'master' into asalikhov/ubuntu_focal

This commit is contained in:
Peter Parente
2020-05-29 09:12:34 -04:00
committed by GitHub
18 changed files with 1267 additions and 650 deletions


@@ -0,0 +1,60 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "Error",
"evalue": "Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"traceback": [
"Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"at b.startServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:270430)",
"at async b.createServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:269873)",
"at async connect (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:397876)",
"at async w.ensureConnectionAndNotebookImpl (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556625)",
"at async w.ensureConnectionAndNotebook (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556303)",
"at async w.clearResult (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:552346)",
"at async w.reexecuteCell (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:540374)",
"at async w.reexecuteCells (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:537541)"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Spark session & context\n",
"spark = SparkSession.builder.master('local').getOrCreate()\n",
"sc = spark.sparkContext\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"rdd = sc.parallelize(range(100 + 1))\n",
"rdd.sum()\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(SparkR)\n",
"\n",
"# Spark session & context\n",
"sc <- sparkR.session(\"local\")\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf <- createDataFrame(list(1:100))\n",
"dapplyCollect(sdf,\n",
" function(x) \n",
" { x <- sum(x)}\n",
" )\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(sparklyr)\n",
"\n",
"# get the default config\n",
"conf <- spark_config()\n",
"# Set the catalog implementation in-memory\n",
"conf$spark.sql.catalogImplementation <- \"in-memory\"\n",
"\n",
"# Spark session & context\n",
"sc <- spark_connect(master = \"local\", config = conf)\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf_len(sc, 100, repartition = 1) %>% \n",
" spark_apply(function(e) sum(e))\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,63 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%%init_spark\n",
"# Spark session & context\n",
"launcher.master = \"local\"\n",
"launcher.conf.spark.executor.cores = 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:28\n",
"res4: Double = 5050.0\n"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,89 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"spark://master:7077\n"
]
}
],
"source": [
"// should print the value of --master in the kernel spec\n",
"println(sc.master)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"rdd = ParallelCollectionRDD[0] at parallelize at <console>:28\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5050.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,35 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
import logging
import os

import pytest

LOGGER = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "test_file",
    # TODO: add local_sparklyr
    ["local_pyspark", "local_spylon", "local_toree", "local_sparkR"],
)
def test_nbconvert(container, test_file):
    """Check if Spark notebooks can be executed"""
    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    cont_data_dir = "/home/jovyan/data"
    output_dir = "/tmp"
    # nbconvert cell execution timeout, in seconds
    timeout_s = 600
    LOGGER.info(f"Test that {test_file} notebook can be executed ...")
    command = f"jupyter nbconvert --to markdown --ExecutePreprocessor.timeout={timeout_s} --output-dir {output_dir} --execute {cont_data_dir}/{test_file}.ipynb"
    # Mount the notebooks read-only and execute them inside the container
    c = container.run(
        volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
    rv = c.wait(timeout=timeout_s / 10 + 10)
    assert rv == 0 or rv["StatusCode"] == 0, f"Command {command} failed"
    logs = c.logs(stdout=True).decode("utf-8")
    LOGGER.debug(logs)
    # nbconvert logs the path of the markdown file it writes; check it was generated
    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"
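A hedged way to exercise this test locally, following the `make` convention described in the contributing docs below (the `all-spark-notebook` target is an assumption here):

```bash
# Build the image, then run its test suite (which includes this notebook check)
make build/all-spark-notebook
make test/all-spark-notebook
```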


@@ -25,9 +25,9 @@ If there's agreement that the feature belongs in one or more of the core stacks:
1. Implement the feature in a local clone of the `jupyter/docker-stacks` project.
2. Please build the image locally before submitting a pull request. Building the image locally shortens the debugging cycle by taking some load off [Travis CI](http://travis-ci.org/), which graciously provides free build services for open source projects like this one. If you use `make`, call:
```
make build/somestack-notebook
```
```bash
make build/somestack-notebook
```
3. [Submit a pull request](https://github.com/PointCloudLibrary/pcl/wiki/A-step-by-step-guide-on-preparing-and-submitting-a-pull-request) (PR) with your changes.
4. Watch for Travis to report a build success or failure for your PR on GitHub.
5. Discuss changes with the maintainers and address any build issues.


@@ -7,9 +7,9 @@ Please follow the process below to update a package version:
1. Locate the Dockerfile containing the library you wish to update (e.g., [base-notebook/Dockerfile](https://github.com/jupyter/docker-stacks/blob/master/base-notebook/Dockerfile), [scipy-notebook/Dockerfile](https://github.com/jupyter/docker-stacks/blob/master/scipy-notebook/Dockerfile))
2. Adjust the version number for the package. We prefer to pin the major and minor version numbers of packages to minimize rebuild side effects when users submit pull requests (PRs). For example, you'll find the Jupyter Notebook package, `notebook`, installed using conda with `notebook=5.4.*` (see the sketch after this list).
3. Please build the image locally before submitting a pull request. Building the image locally shortens the debugging cycle by taking some load off [Travis CI](http://travis-ci.org/), which graciously provides free build services for open source projects like this one. If you use `make`, call:
```
make build/somestack-notebook
```
```bash
make build/somestack-notebook
```
4. [Submit a pull request](https://github.com/PointCloudLibrary/pcl/wiki/A-step-by-step-guide-on-preparing-and-submitting-a-pull-request) (PR) with your changes.
5. Watch for Travis to report a build success or failure for your PR on GitHub.
6. Discuss changes with the maintainers and address any build issues. Version conflicts are the most common problem. You may need to upgrade additional packages to fix build failures.
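As an illustration of the pinning convention in step 2, the conda install line might look like the following sketch (the package name and version are placeholders):

```bash
# Pin major.minor; let the patch version float to pick up fixes
conda install --quiet --yes 'notebook=5.4.*'
```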


@@ -13,13 +13,13 @@ This approach mirrors how we build and share the core stack images. Feel free to
First, install [cookiecutter](https://github.com/audreyr/cookiecutter) using pip or conda:
```
```bash
pip install cookiecutter # or conda install cookiecutter
```
Run the cookiecutter command pointing to the [jupyter/cookiecutter-docker-stacks](https://github.com/jupyter/cookiecutter-docker-stacks) project on GitHub.
```
```bash
cookiecutter https://github.com/jupyter/cookiecutter-docker-stacks.git
```


@@ -13,10 +13,10 @@ Please follow the process below to add new tests:
1. If the test should run against every image built, add your test code to one of the modules in [test/](https://github.com/jupyter/docker-stacks/tree/master/test) or create a new module.
2. If your test should run against a single image, add your test code to one of the modules in `some-notebook/test/` or create a new module.
3. Build one or more images you intend to test and run the tests locally. If you use `make`, call:
```
make build/somestack-notebook
make test/somestack-notebook
```
```bash
make build/somestack-notebook
make test/somestack-notebook
```
4. [Submit a pull request](https://github.com/PointCloudLibrary/pcl/wiki/A-step-by-step-guide-on-preparing-and-submitting-a-pull-request) (PR) with your changes.
5. Watch for Travis to report a build success or failure for your PR on GitHub.
6. Discuss changes with the maintainers and address any issues running the tests on Travis.

File diff suppressed because it is too large.


@@ -8,13 +8,13 @@ This page describes the options supported by the startup script as well as how t
You can pass [Jupyter command line options](https://jupyter.readthedocs.io/en/latest/projects/jupyter-command.html) to the `start-notebook.sh` script when launching the container. For example, to secure the Notebook server with a custom password hashed using `IPython.lib.passwd()` instead of the default token, you can run the following:
```
```bash
docker run -d -p 8888:8888 jupyter/base-notebook start-notebook.sh --NotebookApp.password='sha1:74ba40f8a388:c913541b7ee99d15d5ed31d4226bf7838f83a50e'
```
For example, to set the base URL of the notebook server, you can run the following:
```
```bash
docker run -d -p 8888:8888 jupyter/base-notebook start-notebook.sh --NotebookApp.base_url=/some/path
```
@@ -54,7 +54,7 @@ script for execution details.
You may mount SSL key and certificate files into a container and configure Jupyter Notebook to use them to accept HTTPS connections. For example, to mount a host folder containing a `notebook.key` and `notebook.crt` and use them, you might run the following:
```
```bash
docker run -d -p 8888:8888 \
-v /some/host/folder:/etc/ssl/notebook \
jupyter/base-notebook start-notebook.sh \
@@ -64,7 +64,7 @@ docker run -d -p 8888:8888 \
Alternatively, you may mount a single PEM file containing both the key and certificate. For example:
```
```bash
docker run -d -p 8888:8888 \
-v /some/host/folder/notebook.pem:/etc/ssl/notebook.pem \
jupyter/base-notebook start-notebook.sh \
@@ -85,13 +85,13 @@ For additional information about using SSL, see the following:
The `start-notebook.sh` script actually inherits most of its option handling capability from a more generic `start.sh` script. The `start.sh` script supports all of the features described above, but allows you to specify an arbitrary command to execute. For example, to run the text-based `ipython` console in a container, do the following:
```
```bash
docker run -it --rm jupyter/base-notebook start.sh ipython
```
Or, to run JupyterLab instead of the classic notebook, run the following:
```
```bash
docker run -it --rm -p 8888:8888 jupyter/base-notebook start.sh jupyter lab
```
@@ -107,7 +107,7 @@ The default Python 3.x [Conda environment](http://conda.pydata.org/docs/using/en
The `jovyan` user has full read/write access to the `/opt/conda` directory. You can use either `conda` or `pip` to install new packages without any additional permissions.
```
```bash
# install a package into the default (python 3.x) environment
pip install some-package
conda install some-package


@@ -17,7 +17,7 @@ orchestrator config.
For example:
```
```bash
docker run -it -e GRANT_SUDO=yes --user root jupyter/minimal-notebook
```
@@ -75,7 +75,7 @@ Python 2.x was removed from all images on August 10th, 2017, starting in tag `cc
add a Python 2.x environment by defining your own Dockerfile inheriting from one of the images like
so:
```
```dockerfile
# Choose your desired base image
FROM jupyter/scipy-notebook:latest
@@ -103,7 +103,7 @@ Ref:
The default version of Python that ships with conda/ubuntu may not be the version you want.
To add a conda environment with a different version and make it accessible to Jupyter, the instructions are very similar to Python 2.x but are slightly simpler (no need to switch to `root`):
```
```dockerfile
# Choose your desired base image
FROM jupyter/minimal-notebook:latest
@@ -168,12 +168,12 @@ ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root"]
```
And build the image as:
```
```bash
docker build -t jupyter/scipy-dasklabextension:latest .
```
Once built, run using the command:
```
```bash
docker run -it --rm -p 8888:8888 -p 8787:8787 jupyter/scipy-dasklabextension:latest
```
@@ -194,7 +194,7 @@ Ref:
[RISE](https://github.com/damianavila/RISE) lets you create live slideshows of your
notebooks, with no conversion, by adding the Reveal.js JavaScript library:
```
```bash
# Add Live slideshows with RISE
RUN conda install -c damianavila82 rise
```
@@ -207,7 +207,7 @@ Credit: [Paolo D.](https://github.com/pdonorio) based on
You need to install conda's gcc for Python xgboost to work properly. Otherwise, you'll get an
exception about libgomp.so.1 missing GOMP_4.0.
```
```bash
%%bash
conda install -y gcc
pip install xgboost
@@ -312,8 +312,8 @@ Credit: [Justin Tyberg](https://github.com/jtyberg), [quanghoc](https://github.c
To use a specific version of JupyterHub, the version of `jupyterhub` in your image should match the
version in the Hub itself.
```
FROM jupyter/base-notebook:5ded1de07260
```dockerfile
FROM jupyter/base-notebook:5ded1de07260
RUN pip install jupyterhub==0.8.0b1
```
@@ -375,7 +375,7 @@ Ref:
### Using Local Spark JARs
```
```python
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/spark-streaming-kafka-assembly_2.10-1.6.1.jar pyspark-shell'
import pyspark
@@ -404,7 +404,7 @@ Ref:
### Use jupyter/all-spark-notebooks with an existing Spark/YARN cluster
```
```dockerfile
FROM jupyter/all-spark-notebook
# Set env vars for pydoop
@@ -480,13 +480,13 @@ convenient to launch the server without a password or token. In this case, you s
For jupyterlab:
```
```bash
docker run jupyter/base-notebook:6d2a05346196 start.sh jupyter lab --LabApp.token=''
```
For jupyter classic:
```
```bash
docker run jupyter/base-notebook:6d2a05346196 start.sh jupyter notebook --NotebookApp.token=''
```
@@ -494,7 +494,7 @@ docker run jupyter/base-notebook:6d2a05346196 start.sh jupyter notebook --Notebo
NB: this works for classic notebooks only
```
```dockerfile
# Update with your base image of choice
FROM jupyter/minimal-notebook:latest
@@ -513,7 +513,7 @@ Ref:
Using `auto-sklearn` requires `swig`, which the other notebook images lack, so it can't be experimented with. Also, there is no Conda package for `auto-sklearn`.
```
```dockerfile
ARG BASE_CONTAINER=jupyter/scipy-notebook
FROM jupyter/scipy-notebook:latest


@@ -5,7 +5,8 @@ This page provides details about features specific to one or more images.
## Apache Spark
**Specific Docker Image Options**
* `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at default port `4040`, this option map `4040` port inside docker container to `4040` port on host machine . Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`
* `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at default port `4040`; this option maps port `4040` inside the docker container to port `4040` on the host machine. Note that every new Spark context is put onto an incrementing port (i.e. 4040, 4041, 4042, etc.), so it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`.
**Usage Examples**
@@ -13,30 +14,66 @@ The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images support t
### Using Spark Local Mode
Spark local mode is useful for experimentation on small data when you do not have a Spark cluster available.
Spark **local mode** is useful for experimentation on small data when you do not have a Spark cluster available.
#### In a Python Notebook
#### In Python
In a Python notebook.
```python
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
# do something to prove it works
spark.sql('SELECT "Test" as c1').show()
# Spark session & context
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext
# Sum of the first 100 whole numbers
rdd = sc.parallelize(range(100 + 1))
rdd.sum()
# 5050
```
#### In a R Notebook
#### In R
```r
In an R notebook with [SparkR][sparkr].
```R
library(SparkR)
as <- sparkR.session("local[*]")
# Spark session & context
sc <- sparkR.session("local")
# do something to prove it works
df <- as.DataFrame(iris)
head(filter(df, df$Petal_Width > 0.2))
# Sum of the first 100 whole numbers
sdf <- createDataFrame(list(1:100))
dapplyCollect(sdf,
function(x)
{ x <- sum(x)}
)
# 5050
```
#### In a Spylon Kernel Scala Notebook
In an R notebook with [sparklyr][sparklyr].
```R
library(sparklyr)
# Spark configuration
conf <- spark_config()
# Set the catalog implementation in-memory
conf$spark.sql.catalogImplementation <- "in-memory"
# Spark session & context
sc <- spark_connect(master = "local", config = conf)
# Sum of the first 100 whole numbers
sdf_len(sc, 100, repartition = 1) %>%
spark_apply(function(e) sum(e))
# 5050
```
#### In Scala
##### In a Spylon Kernel
Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
options in a `%%init_spark` magic cell.
@@ -44,27 +81,30 @@ options in a `%%init_spark` magic cell.
```python
%%init_spark
# Configure Spark to use a local master
launcher.master = "local[*]"
launcher.master = "local"
```
```scala
// Now run Scala code that uses the initialized SparkContext in sc
val rdd = sc.parallelize(0 to 999)
rdd.takeSample(false, 5)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
#### In an Apache Toree Scala Notebook
##### In an Apache Toree Kernel
Apache Toree instantiates a local `SparkContext` for you in variable `sc` when the kernel starts.
```scala
val rdd = sc.parallelize(0 to 999)
rdd.takeSample(false, 5)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
### Connecting to a Spark Cluster in Standalone Mode
Connection to Spark Cluster on Standalone Mode requires the following set of steps:
Connection to a Spark Cluster in **[Standalone Mode](https://spark.apache.org/docs/latest/spark-standalone.html)** requires the following set of steps:
0. Verify that the docker image (check the Dockerfile) and the Spark cluster being
deployed run the same version of Spark.
@@ -72,98 +112,107 @@ Connection to Spark Cluster on Standalone Mode requires the following set of ste
2. Run the Docker container with `--net=host` in a location that is network addressable by all of
your Spark workers. (This is a [Spark networking
requirement](http://spark.apache.org/docs/latest/cluster-overview.html#components).)
* NOTE: When using `--net=host`, you must also use the flags `--pid=host -e
TINI_SUBREAPER=true`. See https://github.com/jupyter/docker-stacks/issues/64 for details.
* NOTE: When using `--net=host`, you must also use the flags `--pid=host -e
TINI_SUBREAPER=true`. See https://github.com/jupyter/docker-stacks/issues/64 for details.
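Combining step 2 with the note above, a minimal launch command might look like the following sketch (the `jupyter/pyspark-notebook` image is an assumption here; add `-p` port mappings as needed):

```bash
# Host networking plus the tini flags required alongside --net=host
docker run -d --net=host --pid=host -e TINI_SUBREAPER=true jupyter/pyspark-notebook
```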
#### In a Python Notebook
**Note**: The following examples use the Spark master URL `spark://master:7077`, which should be replaced by the URL of your Spark master.
#### In Python
The **same Python version** needs to be used in the notebook (where the driver is located) and on the Spark workers.
The Python version used on the driver and worker sides can be adjusted by setting the environment variables `PYSPARK_PYTHON` and/or `PYSPARK_DRIVER_PYTHON`; see [Spark Configuration][spark-conf] for more information.
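For instance, a hedged sketch of aligning the two (the interpreter path below is illustrative, not prescribed by these images):

```bash
# Illustrative only: point both the workers and the driver at one interpreter
export PYSPARK_PYTHON=/opt/conda/bin/python
export PYSPARK_DRIVER_PYTHON=/opt/conda/bin/python
```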
```python
import os
# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
from pyspark.sql import SparkSession
import pyspark
conf = pyspark.SparkConf()
# Spark session & context
spark = SparkSession.builder.master('spark://master:7077').getOrCreate()
sc = spark.sparkContext
# Point to spark master
conf.setMaster("spark://10.10.10.10:7070")
# point to spark binary package in HDFS or on local filesystem on all slave
# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz)
conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.2.0-bin-hadoop2.7.tgz")
# set other options as desired
conf.set("spark.executor.memory", "8g")
conf.set("spark.core.connection.ack.wait.timeout", "1200")
# create the context
sc = pyspark.SparkContext(conf=conf)
# do something to prove it works
rdd = sc.parallelize(range(100000000))
rdd.sumApprox(3)
# Sum of the first 100 whole numbers
rdd = sc.parallelize(range(100 + 1))
rdd.sum()
# 5050
```
#### In a R Notebook
#### In R
```r
In an R notebook with [SparkR][sparkr].
```R
library(SparkR)
# Point to spark master
# Point to spark binary package in HDFS or on local filesystem on all worker
# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz) in sparkEnvir
# Set other options in sparkEnvir
sc <- sparkR.session("spark://10.10.10.10:7070", sparkEnvir=list(
spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz",
spark.executor.memory="8g"
)
)
# Spark session & context
sc <- sparkR.session("spark://master:7077")
# do something to prove it works
data(iris)
df <- as.DataFrame(iris)
head(filter(df, df$Petal_Width > 0.2))
# Sum of the first 100 whole numbers
sdf <- createDataFrame(list(1:100))
dapplyCollect(sdf,
function(x)
{ x <- sum(x)}
)
# 5050
```
#### In a Spylon Kernel Scala Notebook
In an R notebook with [sparklyr][sparklyr].
```R
library(sparklyr)
# Spark session & context
# Spark configuration
conf <- spark_config()
# Set the catalog implementation in-memory
conf$spark.sql.catalogImplementation <- "in-memory"
sc <- spark_connect(master = "spark://master:7077", config = conf)
# Sum of the first 100 whole numbers
sdf_len(sc, 100, repartition = 1) %>%
spark_apply(function(e) sum(e))
# 5050
```
#### In Scala
##### In a Spylon Kernel
Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
options in a `%%init_spark` magic cell.
```python
%%init_spark
# Point to spark master
launcher.master = "spark://10.10.10.10:7070"
launcher.conf.spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz
# Point Spark at the standalone cluster master
launcher.master = "spark://master:7077"
```
```scala
// Now run Scala code that uses the initialized SparkContext in sc
val rdd = sc.parallelize(0 to 999)
rdd.takeSample(false, 5)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
#### In an Apache Toree Scala Notebook
##### In an Apache Toree Kernel
The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration
information from its command line arguments and environment variables. You can pass information
about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.
The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration information from its command line arguments and environment variables. You can pass information about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.
For instance, to pass information about a standalone Spark master, Spark binary location in HDFS,
and an executor options, you could start the container like so:
For instance, to pass information about a standalone Spark master, you could start the container like so:
```
docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://10.10.10.10:7070 \
--spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz \
--spark.executor.memory=8g' jupyter/all-spark-notebook
```bash
docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://master:7077' \
jupyter/all-spark-notebook
```
Note that this is the same information expressed in a notebook in the Python case above. Once the
kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like
so:
Note that this is the same information expressed in a notebook in the Python case above. Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so:
```scala
// should print the value of --master in the kernel spec
println(sc.master)
// do something to prove it works
val rdd = sc.parallelize(0 to 99999999)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
## Tensorflow
@@ -199,3 +248,7 @@ init = tf.global_variables_initializer()
sess.run(init)
sess.run(hello)
```
[sparkr]: https://spark.apache.org/docs/latest/sparkr.html
[sparklyr]: https://spark.rstudio.com/
[spark-conf]: https://spark.apache.org/docs/latest/configuration.html


@@ -12,7 +12,7 @@ See the [installation instructions](https://docs.docker.com/engine/installation/
Build and run a `jupyter/minimal-notebook` container on a VirtualBox VM on local desktop.
```
```bash
# create a Docker Machine-controlled VirtualBox VM
bin/vbox.sh mymachine
@@ -28,7 +28,7 @@ notebook/up.sh
To stop and remove the container:
```
```bash
notebook/down.sh
```
@@ -39,14 +39,14 @@ notebook/down.sh
You can customize the docker-stack notebook image to deploy by modifying the `notebook/Dockerfile`. For example, you can build and deploy a `jupyter/all-spark-notebook` by modifying the Dockerfile like so:
```
```dockerfile
FROM jupyter/all-spark-notebook:55d5ca6be183
...
```
Once you modify the Dockerfile, don't forget to rebuild the image.
```
```bash
# activate the docker machine
eval "$(docker-machine env mymachine)"
@@ -57,14 +57,14 @@ notebook/build.sh
Yes. Set environment variables to specify unique names and ports when running the `up.sh` command.
```
```bash
NAME=my-notebook PORT=9000 notebook/up.sh
NAME=your-notebook PORT=9001 notebook/up.sh
```
To stop and remove the containers:
```
```bash
NAME=my-notebook notebook/down.sh
NAME=your-notebook notebook/down.sh
```
@@ -78,7 +78,7 @@ The `up.sh` creates a Docker volume named after the notebook container with a `-
Yes. Set the `WORK_VOLUME` environment variable to the same value for each notebook.
```
```bash
NAME=my-notebook PORT=9000 WORK_VOLUME=our-work notebook/up.sh
NAME=your-notebook PORT=9001 WORK_VOLUME=our-work notebook/up.sh
```
@@ -87,7 +87,7 @@ NAME=your-notebook PORT=9001 WORK_VOLUME=our-work notebook/up.sh
To run the notebook server with a self-signed certificate, pass the `--secure` option to the `up.sh` script. You must also provide a password, which will be used to secure the notebook server. You can specify the password by setting the `PASSWORD` environment variable, or by passing it to the `up.sh` script.
```
```bash
PASSWORD=a_secret notebook/up.sh --secure
# or
@@ -103,7 +103,7 @@ This example includes the `bin/letsencrypt.sh` script, which runs the `letsencry
The following command will create a certificate chain and store it in a Docker volume named `mydomain-secrets`.
```
```bash
FQDN=host.mydomain.com EMAIL=myemail@somewhere.com \
SECRETS_VOLUME=mydomain-secrets \
bin/letsencrypt.sh
@@ -111,7 +111,7 @@ FQDN=host.mydomain.com EMAIL=myemail@somewhere.com \
Now run `up.sh` with the `--letsencrypt` option. You must also provide the name of the secrets volume and a password.
```
```bash
PASSWORD=a_secret SECRETS_VOLUME=mydomain-secrets notebook/up.sh --letsencrypt
# or
@@ -120,7 +120,7 @@ notebook/up.sh --letsencrypt --password a_secret --secrets mydomain-secrets
Be aware that Let's Encrypt has a pretty [low rate limit per domain](https://community.letsencrypt.org/t/public-beta-rate-limits/4772/3) at the moment. You can avoid exhausting your limit by testing against the Let's Encrypt staging servers. To hit their staging servers, set the environment variable `CERT_SERVER=--staging`.
```
```bash
FQDN=host.mydomain.com EMAIL=myemail@somewhere.com \
CERT_SERVER=--staging \
bin/letsencrypt.sh
@@ -134,13 +134,13 @@ Yes, you should be able to deploy to any Docker Machine-controlled host. To mak
To create a Docker machine using a VirtualBox VM on local desktop:
```
```bash
bin/vbox.sh mymachine
```
To create a Docker machine using a virtual device on IBM SoftLayer:
```
```bash
export SOFTLAYER_USER=my_softlayer_username
export SOFTLAYER_API_KEY=my_softlayer_api_key
export SOFTLAYER_DOMAIN=my.domain


@@ -11,7 +11,7 @@ This folder contains a Makefile and a set of supporting files demonstrating how
To show what's possible, here's how to run the `jupyter/minimal-notebook` on a brand new local VirtualBox VM.
```
```bash
# create a new VM
make virtualbox-vm NAME=dev
# make the new VM the active docker machine
@@ -30,7 +30,7 @@ The last command will log the IP address and port to visit in your browser.
Yes. Specify a unique name and port on the `make notebook` command.
```
```bash
make notebook NAME=my-notebook PORT=9000
make notebook NAME=your-notebook PORT=9001
```
@@ -39,7 +39,7 @@ make notebook NAME=your-notebook PORT=9001
Yes.
```
```bash
make notebook NAME=my-notebook PORT=9000 WORK_VOLUME=our-work
make notebook NAME=your-notebook PORT=9001 WORK_VOLUME=our-work
```
@@ -52,7 +52,7 @@ Instead of `make notebook`, run `make self-signed-notebook PASSWORD=your_desired
Yes. Please.
```
```bash
make letsencrypt FQDN=host.mydomain.com EMAIL=myemail@somewhere.com
make letsencrypt-notebook
```
@@ -61,7 +61,7 @@ The first command creates a Docker volume named after the notebook container wit
Be aware: Let's Encrypt has a pretty [low rate limit per domain](https://community.letsencrypt.org/t/public-beta-rate-limits/4772/3) at the moment. You can avoid exhausting your limit by testing against the Let's Encrypt staging servers. To hit their staging servers, set the environment variable `CERT_SERVER=--staging`.
```
```bash
make letsencrypt FQDN=host.mydomain.com EMAIL=myemail@somewhere.com CERT_SERVER=--staging
```
@@ -69,7 +69,7 @@ Also, keep in mind Let's Encrypt certificates are short lived: 90 days at the mo
### My pip/conda/apt-get installs disappear every time I restart the container. Can I make them permanent?
```
```bash
# add your pip, conda, apt-get, etc. permanent features to the Dockerfile where
# indicated by the comments in the Dockerfile
vi Dockerfile
@@ -79,7 +79,7 @@ make notebook
### How do I upgrade my Docker container?
```
```bash
make image DOCKER_ARGS=--pull
make notebook
```
@@ -90,7 +90,7 @@ The first line pulls the latest version of the Docker image used in the local Do
Yes. As an example, there's a `softlayer.makefile` included in this repo as an example. You would use it like so:
```
```bash
make softlayer-vm NAME=myhost \
SOFTLAYER_DOMAIN=your_desired_domain \
SOFTLAYER_USER=your_user_id \


@@ -16,7 +16,7 @@ Loading the Templates
To load the templates, login to OpenShift from the command line and run:
```
```bash
oc create -f https://raw.githubusercontent.com/jupyter-on-openshift/docker-stacks/master/examples/openshift/templates.json
```
@@ -33,7 +33,7 @@ Deploying a Notebook
To deploy a notebook from the command line using the template, run:
```
```bash
oc new-app --template jupyter-notebook
```
@@ -71,7 +71,7 @@ A password you can use when accessing the notebook will be auto generated and is
To see the hostname for accessing the notebook run:
```
```bash
oc get routes
```
@@ -95,7 +95,7 @@ Passing Template Parameters
To override the name for the notebook, the image used, and the password, you can pass template parameters using the ``--param`` option.
```
```bash
oc new-app --template jupyter-notebook \
--param APPLICATION_NAME=mynotebook \
--param NOTEBOOK_IMAGE=jupyter/scipy-notebook:latest \
@@ -120,7 +120,7 @@ Deleting the Notebook Instance
To delete the notebook instance, run ``oc delete`` using a label selector for the application name.
```
```bash
oc delete all,configmap --selector app=mynotebook
```
@@ -129,7 +129,7 @@ Enabling Jupyter Lab Interface
To enable the Jupyter Lab interface for a deployed notebook set the ``JUPYTER_ENABLE_LAB`` environment variable.
```
```bash
oc set env dc/mynotebook JUPYTER_ENABLE_LAB=true
```
@@ -140,7 +140,7 @@ Adding Persistent Storage
You can upload notebooks and other files using the web interface of the notebook. Any uploaded files or changes you make to them will be lost when the notebook instance is restarted. If you want to save your work, you need to add persistent storage to the notebook. To add persistent storage run:
```
```bash
oc set volume dc/mynotebook --add \
--type=pvc --claim-size=1Gi --claim-mode=ReadWriteOnce \
--claim-name mynotebook-data --name data \
@@ -149,7 +149,7 @@ oc set volume dc/mynotebook --add \
When you have deleted the notebook instance, if using a persistent volume, you will need to delete it in a separate step.
```
```bash
oc delete pvc/mynotebook-data
```
@@ -158,7 +158,7 @@ Customizing the Configuration
If you want to set any custom configuration for the notebook, you can edit the config map created by the template.
```
```bash
oc edit configmap/mynotebook-cfg
```
@@ -176,19 +176,19 @@ Because the configuration is Python code, ensure any indenting is correct. Any e
If the error is in the config map, edit it again to fix it and trigger a new deployment if necessary by running:
```
```bash
oc rollout latest dc/mynotebook
```
If you make an error in the configuration file stored in the persistent volume, you will need to scale down the notebook so it isn't running.
```
```bash
oc scale dc/mynotebook --replicas 0
```
Then run:
```
```bash
oc debug dc/mynotebook
```
@@ -196,7 +196,7 @@ to run the notebook in debug mode. This will provide you with an interactive ter
Start up the notebook again.
```
```bash
oc scale dc/mynotebook --replicas 1
```
@@ -207,7 +207,7 @@ The password for the notebook is supplied as a template parameter, or if not sup
If you want to change the password, you can do so by editing the environment variable on the deployment configuration.
```
```bash
oc set env dc/mynotebook JUPYTER_NOTEBOOK_PASSWORD=mypassword
```
@@ -232,13 +232,13 @@ If the image is in your OpenShift project, because you imported the image into O
This can be illustrated by first importing an image into the OpenShift project.
```
```bash
oc import-image jupyter/datascience-notebook:latest --confirm
```
Then deploy it using the name of the image stream created.
```
```bash
oc new-app --template jupyter-notebook \
--param APPLICATION_NAME=mynotebook \
--param NOTEBOOK_IMAGE=datascience-notebook \


@@ -22,7 +22,7 @@ Getting Started with S2I
As an example of how S2I can be used to create a custom image with a bundled set of notebooks, run:
```
```bash
s2i build \
--scripts-url https://raw.githubusercontent.com/jupyter/docker-stacks/master/examples/source-to-image \
--context-dir docs/source/examples/Notebook \
@@ -76,7 +76,7 @@ The supplied ``assemble`` script performs a few key steps.
The first steps copy files into the location they need to be when the image is run, from the directory where they are initially placed by the ``s2i`` command.
```
```bash
cp -Rf /tmp/src/. /home/$NB_USER
rm -rf /tmp/src
@@ -84,7 +84,7 @@ rm -rf /tmp/src
The next steps are:
```
```bash
if [ -f /home/$NB_USER/environment.yml ]; then
conda env update --name root --file /home/$NB_USER/environment.yml
conda clean --all -f -y
@@ -101,7 +101,7 @@ This means that so long as a set of notebook files provides one of these files l
A final step is:
```
```bash
fix-permissions $CONDA_DIR
fix-permissions /home/$NB_USER
```
@@ -112,7 +112,7 @@ As long as you preserve the first and last set of steps, you can do whatever you
The ``run`` script in this directory is very simple and just runs the notebook application.
```
```bash
exec start-notebook.sh "$@"
```
@@ -121,13 +121,13 @@ Integration with OpenShift
The OpenShift platform provides integrated support for S2I type builds. Templates are provided for using the S2I build mechanism with the scripts in this directory. To load the templates run:
```
```bash
oc create -f https://raw.githubusercontent.com/jupyter/docker-stacks/master/examples/source-to-image/templates.json
```
This will create the templates:
```
```bash
jupyter-notebook-builder
jupyter-notebook-quickstart
```
@@ -136,7 +136,7 @@ The templates can be used from the OpenShift web console or command line. This `
To use the OpenShift command line to build into an image, and deploy, the set of notebooks used above, run:
```
```bash
oc new-app --template jupyter-notebook-quickstart \
--param APPLICATION_NAME=notebook-examples \
--param GIT_REPOSITORY_URL=https://github.com/jupyter/notebook \