Merge branch 'master' into asalikhov/ubuntu_focal

This commit is contained in:
Peter Parente
2020-05-29 09:12:34 -04:00
committed by GitHub
18 changed files with 1267 additions and 650 deletions


@@ -0,0 +1,60 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "Error",
"evalue": "Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"traceback": [
"Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"at b.startServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:270430)",
"at async b.createServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:269873)",
"at async connect (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:397876)",
"at async w.ensureConnectionAndNotebookImpl (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556625)",
"at async w.ensureConnectionAndNotebook (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556303)",
"at async w.clearResult (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:552346)",
"at async w.reexecuteCell (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:540374)",
"at async w.reexecuteCells (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:537541)"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Spark session & context\n",
"spark = SparkSession.builder.master('local').getOrCreate()\n",
"sc = spark.sparkContext\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"rdd = sc.parallelize(range(100 + 1))\n",
"rdd.sum()\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(SparkR)\n",
"\n",
"# Spark session & context\n",
"sc <- sparkR.session(\"local\")\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf <- createDataFrame(list(1:100))\n",
"dapplyCollect(sdf,\n",
" function(x) \n",
" { x <- sum(x)}\n",
" )\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(sparklyr)\n",
"\n",
"# get the default config\n",
"conf <- spark_config()\n",
"# Set the catalog implementation in-memory\n",
"conf$spark.sql.catalogImplementation <- \"in-memory\"\n",
"\n",
"# Spark session & context\n",
"sc <- spark_connect(master = \"local\", config = conf)\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf_len(sc, 100, repartition = 1) %>% \n",
" spark_apply(function(e) sum(e))\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,63 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%%init_spark\n",
"# Spark session & context\n",
"launcher.master = \"local\"\n",
"launcher.conf.spark.executor.cores = 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:28\n",
"res4: Double = 5050.0\n"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,89 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"spark://master:7077\n"
]
}
],
"source": [
"// should print the value of --master in the kernel spec\n",
"println(sc.master)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"rdd = ParallelCollectionRDD[0] at parallelize at <console>:28\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5050.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,35 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
import logging
import os

import pytest

LOGGER = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "test_file",
    # TODO: add local_sparklyr
    ["local_pyspark", "local_spylon", "local_toree", "local_sparkR"],
)
def test_nbconvert(container, test_file):
    """Check if Spark notebooks can be executed"""
    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    cont_data_dir = "/home/jovyan/data"
    output_dir = "/tmp"
    # nbconvert cell execution timeout, in seconds
    timeout_s = 600
    LOGGER.info(f"Test that {test_file} notebook can be executed ...")
    command = f"jupyter nbconvert --to markdown --ExecutePreprocessor.timeout={timeout_s} --output-dir {output_dir} --execute {cont_data_dir}/{test_file}.ipynb"
    # Mount the notebooks read-only and execute them inside the container
    c = container.run(
        volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
    rv = c.wait(timeout=timeout_s / 10 + 10)
    assert rv == 0 or rv["StatusCode"] == 0, f"Command {command} failed"
    logs = c.logs(stdout=True).decode("utf-8")
    LOGGER.debug(logs)
    # nbconvert logs the path of the markdown file it writes; check it was generated
    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"
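A hedged way to exercise this test locally, following the `make` convention described in the contributing docs below (the `all-spark-notebook` target is an assumption here):

```bash
# Build the image, then run its test suite (which includes this notebook check)
make build/all-spark-notebook
make test/all-spark-notebook
```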


@@ -25,9 +25,9 @@ If there's agreement that the feature belongs in one or more of the core stacks:
1. Implement the feature in a local clone of the `jupyter/docker-stacks` project.
2. Please build the image locally before submitting a pull request. Building the image locally shortens the debugging cycle by taking some load off [Travis CI](http://travis-ci.org/), which graciously provides free build services for open source projects like this one. If you use `make`, call:
```
make build/somestack-notebook
```
```bash
make build/somestack-notebook
```
3. [Submit a pull request](https://github.com/PointCloudLibrary/pcl/wiki/A-step-by-step-guide-on-preparing-and-submitting-a-pull-request) (PR) with your changes.
4. Watch for Travis to report a build success or failure for your PR on GitHub.
5. Discuss changes with the maintainers and address any build issues.


@@ -7,9 +7,9 @@ Please follow the process below to update a package version:
1. Locate the Dockerfile containing the library you wish to update (e.g., [base-notebook/Dockerfile](https://github.com/jupyter/docker-stacks/blob/master/base-notebook/Dockerfile), [scipy-notebook/Dockerfile](https://github.com/jupyter/docker-stacks/blob/master/scipy-notebook/Dockerfile))
2. Adjust the version number for the package. We prefer to pin the major and minor version numbers of packages to minimize rebuild side effects when users submit pull requests (PRs). For example, you'll find the Jupyter Notebook package, `notebook`, installed using conda with `notebook=5.4.*` (see the sketch after this list).
3. Please build the image locally before submitting a pull request. Building the image locally shortens the debugging cycle by taking some load off [Travis CI](http://travis-ci.org/), which graciously provides free build services for open source projects like this one. If you use `make`, call:
```
make build/somestack-notebook
```
```bash
make build/somestack-notebook
```
4. [Submit a pull request](https://github.com/PointCloudLibrary/pcl/wiki/A-step-by-step-guide-on-preparing-and-submitting-a-pull-request) (PR) with your changes.
5. Watch for Travis to report a build success or failure for your PR on GitHub.
6. Discuss changes with the maintainers and address any build issues. Version conflicts are the most common problem. You may need to upgrade additional packages to fix build failures.
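As an illustration of the pinning convention in step 2, the conda install line might look like the following sketch (the package name and version are placeholders):

```bash
# Pin major.minor; let the patch version float to pick up fixes
conda install --quiet --yes 'notebook=5.4.*'
```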


@@ -13,13 +13,13 @@ This approach mirrors how we build and share the core stack images. Feel free to
First, install [cookiecutter](https://github.com/audreyr/cookiecutter) using pip or conda:
```
```bash
pip install cookiecutter # or conda install cookiecutter
```
Run the cookiecutter command pointing to the [jupyter/cookiecutter-docker-stacks](https://github.com/jupyter/cookiecutter-docker-stacks) project on GitHub.
```
```bash
cookiecutter https://github.com/jupyter/cookiecutter-docker-stacks.git
```


@@ -13,10 +13,10 @@ Please follow the process below to add new tests:
1. If the test should run against every image built, add your test code to one of the modules in [test/](https://github.com/jupyter/docker-stacks/tree/master/test) or create a new module.
2. If your test should run against a single image, add your test code to one of the modules in `some-notebook/test/` or create a new module.
3. Build one or more images you intend to test and run the tests locally. If you use `make`, call:
```
make build/somestack-notebook
make test/somestack-notebook
```
```bash
make build/somestack-notebook
make test/somestack-notebook
```
4. [Submit a pull request](https://github.com/PointCloudLibrary/pcl/wiki/A-step-by-step-guide-on-preparing-and-submitting-a-pull-request) (PR) with your changes.
5. Watch for Travis to report a build success or failure for your PR on GitHub.
6. Discuss changes with the maintainers and address any issues running the tests on Travis.

File diff suppressed because it is too large.


@@ -8,13 +8,13 @@ This page describes the options supported by the startup script as well as how t
You can pass [Jupyter command line options](https://jupyter.readthedocs.io/en/latest/projects/jupyter-command.html) to the `start-notebook.sh` script when launching the container. For example, to secure the Notebook server with a custom password hashed using `IPython.lib.passwd()` instead of the default token, you can run the following:
```
```bash
docker run -d -p 8888:8888 jupyter/base-notebook start-notebook.sh --NotebookApp.password='sha1:74ba40f8a388:c913541b7ee99d15d5ed31d4226bf7838f83a50e'
```
For example, to set the base URL of the notebook server, you can run the following:
```
```bash
docker run -d -p 8888:8888 jupyter/base-notebook start-notebook.sh --NotebookApp.base_url=/some/path
```
@@ -54,7 +54,7 @@ script for execution details.
You may mount SSL key and certificate files into a container and configure Jupyter Notebook to use them to accept HTTPS connections. For example, to mount a host folder containing a `notebook.key` and `notebook.crt` and use them, you might run the following:
```
```bash
docker run -d -p 8888:8888 \
-v /some/host/folder:/etc/ssl/notebook \
jupyter/base-notebook start-notebook.sh \
@@ -64,7 +64,7 @@ docker run -d -p 8888:8888 \
Alternatively, you may mount a single PEM file containing both the key and certificate. For example:
```
```bash
docker run -d -p 8888:8888 \
-v /some/host/folder/notebook.pem:/etc/ssl/notebook.pem \
jupyter/base-notebook start-notebook.sh \
@@ -85,13 +85,13 @@ For additional information about using SSL, see the following:
The `start-notebook.sh` script actually inherits most of its option handling capability from a more generic `start.sh` script. The `start.sh` script supports all of the features described above, but allows you to specify an arbitrary command to execute. For example, to run the text-based `ipython` console in a container, do the following:
```
```bash
docker run -it --rm jupyter/base-notebook start.sh ipython
```
Or, to run JupyterLab instead of the classic notebook, run the following:
```
```bash
docker run -it --rm -p 8888:8888 jupyter/base-notebook start.sh jupyter lab
```
@@ -107,7 +107,7 @@ The default Python 3.x [Conda environment](http://conda.pydata.org/docs/using/en
The `jovyan` user has full read/write access to the `/opt/conda` directory. You can use either `conda` or `pip` to install new packages without any additional permissions.
```
```bash
# install a package into the default (python 3.x) environment
pip install some-package
conda install some-package


@@ -17,7 +17,7 @@ orchestrator config.
For example:
```
```bash
docker run -it -e GRANT_SUDO=yes --user root jupyter/minimal-notebook
```
@@ -75,7 +75,7 @@ Python 2.x was removed from all images on August 10th, 2017, starting in tag `cc
add a Python 2.x environment by defining your own Dockerfile inheriting from one of the images like
so:
```
```dockerfile
# Choose your desired base image
FROM jupyter/scipy-notebook:latest
@@ -103,7 +103,7 @@ Ref:
The default version of Python that ships with conda/ubuntu may not be the version you want.
To add a conda environment with a different version and make it accessible to Jupyter, the instructions are very similar to Python 2.x but are slightly simpler (no need to switch to `root`):
```
```dockerfile
# Choose your desired base image
FROM jupyter/minimal-notebook:latest
@@ -168,12 +168,12 @@ ENTRYPOINT ["jupyter", "lab", "--ip=0.0.0.0", "--allow-root"]
```
And build the image as:
```
```bash
docker build -t jupyter/scipy-dasklabextension:latest .
```
Once built, run using the command:
```
```bash
docker run -it --rm -p 8888:8888 -p 8787:8787 jupyter/scipy-dasklabextension:latest
```
@@ -194,7 +194,7 @@ Ref:
[RISE](https://github.com/damianavila/RISE) lets you create live slideshows of your
notebooks, with no conversion, by adding the Reveal.js JavaScript library:
```
```bash
# Add Live slideshows with RISE
RUN conda install -c damianavila82 rise
```
@@ -207,7 +207,7 @@ Credit: [Paolo D.](https://github.com/pdonorio) based on
You need to install conda's gcc for Python xgboost to work properly. Otherwise, you'll get an
exception about libgomp.so.1 missing GOMP_4.0.
```
```bash
%%bash
conda install -y gcc
pip install xgboost
@@ -312,8 +312,8 @@ Credit: [Justin Tyberg](https://github.com/jtyberg), [quanghoc](https://github.c
To use a specific version of JupyterHub, the version of `jupyterhub` in your image should match the
version in the Hub itself.
```
FROM jupyter/base-notebook:5ded1de07260
```dockerfile
FROM jupyter/base-notebook:5ded1de07260
RUN pip install jupyterhub==0.8.0b1
```
@@ -375,7 +375,7 @@ Ref:
### Using Local Spark JARs
```
```python
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/jovyan/spark-streaming-kafka-assembly_2.10-1.6.1.jar pyspark-shell'
import pyspark
@@ -404,7 +404,7 @@ Ref:
### Use jupyter/all-spark-notebooks with an existing Spark/YARN cluster
```
```dockerfile
FROM jupyter/all-spark-notebook
# Set env vars for pydoop
@@ -480,13 +480,13 @@ convenient to launch the server without a password or token. In this case, you s
For jupyterlab:
```
```bash
docker run jupyter/base-notebook:6d2a05346196 start.sh jupyter lab --LabApp.token=''
```
For jupyter classic:
```
```bash
docker run jupyter/base-notebook:6d2a05346196 start.sh jupyter notebook --NotebookApp.token=''
```
@@ -494,7 +494,7 @@ docker run jupyter/base-notebook:6d2a05346196 start.sh jupyter notebook --Notebo
NB: this works for classic notebooks only
```
```dockerfile
# Update with your base image of choice
FROM jupyter/minimal-notebook:latest
@@ -513,7 +513,7 @@ Ref:
Using `auto-sklearn` requires `swig`, which the other notebook images lack, so it can't be experimented with. Also, there is no Conda package for `auto-sklearn`.
```
```dockerfile
ARG BASE_CONTAINER=jupyter/scipy-notebook
FROM jupyter/scipy-notebook:latest


@@ -5,7 +5,8 @@ This page provides details about features specific to one or more images.
## Apache Spark
**Specific Docker Image Options**
* `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at default port `4040`, this option map `4040` port inside docker container to `4040` port on host machine . Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`
* `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at default port `4040`; this option maps port `4040` inside the docker container to port `4040` on the host machine. Note that every new Spark context is put onto an incrementing port (i.e. 4040, 4041, 4042, etc.), so it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`.
**Usage Examples**
@@ -13,30 +14,66 @@ The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images support t
### Using Spark Local Mode
Spark local mode is useful for experimentation on small data when you do not have a Spark cluster available.
Spark **local mode** is useful for experimentation on small data when you do not have a Spark cluster available.
#### In a Python Notebook
#### In Python
In a Python notebook.
```python
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
# do something to prove it works
spark.sql('SELECT "Test" as c1').show()
# Spark session & context
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext
# Sum of the first 100 whole numbers
rdd = sc.parallelize(range(100 + 1))
rdd.sum()
# 5050
```
#### In a R Notebook
#### In R
```r
In an R notebook with [SparkR][sparkr].
```R
library(SparkR)
as <- sparkR.session("local[*]")
# Spark session & context
sc <- sparkR.session("local")
# do something to prove it works
df <- as.DataFrame(iris)
head(filter(df, df$Petal_Width > 0.2))
# Sum of the first 100 whole numbers
sdf <- createDataFrame(list(1:100))
dapplyCollect(sdf,
function(x)
{ x <- sum(x)}
)
# 5050
```
#### In a Spylon Kernel Scala Notebook
In an R notebook with [sparklyr][sparklyr].
```R
library(sparklyr)
# Spark configuration
conf <- spark_config()
# Set the catalog implementation in-memory
conf$spark.sql.catalogImplementation <- "in-memory"
# Spark session & context
sc <- spark_connect(master = "local", config = conf)
# Sum of the first 100 whole numbers
sdf_len(sc, 100, repartition = 1) %>%
spark_apply(function(e) sum(e))
# 5050
```
#### In Scala
##### In a Spylon Kernel
Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
options in a `%%init_spark` magic cell.
@@ -44,27 +81,30 @@ options in a `%%init_spark` magic cell.
```python
%%init_spark
# Configure Spark to use a local master
launcher.master = "local[*]"
launcher.master = "local"
```
```scala
// Now run Scala code that uses the initialized SparkContext in sc
val rdd = sc.parallelize(0 to 999)
rdd.takeSample(false, 5)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
#### In an Apache Toree Scala Notebook
##### In an Apache Toree Kernel
Apache Toree instantiates a local `SparkContext` for you in variable `sc` when the kernel starts.
```scala
val rdd = sc.parallelize(0 to 999)
rdd.takeSample(false, 5)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
### Connecting to a Spark Cluster in Standalone Mode
Connection to Spark Cluster on Standalone Mode requires the following set of steps:
Connection to a Spark Cluster in **[Standalone Mode](https://spark.apache.org/docs/latest/spark-standalone.html)** requires the following set of steps:
0. Verify that the docker image (check the Dockerfile) and the Spark cluster being
deployed run the same version of Spark.
@@ -72,98 +112,107 @@ Connection to Spark Cluster on Standalone Mode requires the following set of ste
2. Run the Docker container with `--net=host` in a location that is network addressable by all of
your Spark workers. (This is a [Spark networking
requirement](http://spark.apache.org/docs/latest/cluster-overview.html#components).)
* NOTE: When using `--net=host`, you must also use the flags `--pid=host -e
TINI_SUBREAPER=true`. See https://github.com/jupyter/docker-stacks/issues/64 for details.
* NOTE: When using `--net=host`, you must also use the flags `--pid=host -e
TINI_SUBREAPER=true`. See https://github.com/jupyter/docker-stacks/issues/64 for details.
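Combining step 2 with the note above, a minimal launch command might look like the following sketch (the `jupyter/pyspark-notebook` image is an assumption here; add `-p` port mappings as needed):

```bash
# Host networking plus the tini flags required alongside --net=host
docker run -d --net=host --pid=host -e TINI_SUBREAPER=true jupyter/pyspark-notebook
```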
#### In a Python Notebook
**Note**: The following examples use the Spark master URL `spark://master:7077`, which should be replaced by the URL of your Spark master.
#### In Python
The **same Python version** needs to be used in the notebook (where the driver is located) and on the Spark workers.
The Python version used on the driver and worker sides can be adjusted by setting the environment variables `PYSPARK_PYTHON` and/or `PYSPARK_DRIVER_PYTHON`; see [Spark Configuration][spark-conf] for more information.
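For instance, a hedged sketch of aligning the two (the interpreter path below is illustrative, not prescribed by these images):

```bash
# Illustrative only: point both the workers and the driver at one interpreter
export PYSPARK_PYTHON=/opt/conda/bin/python
export PYSPARK_DRIVER_PYTHON=/opt/conda/bin/python
```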
```python
import os
# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
from pyspark.sql import SparkSession
import pyspark
conf = pyspark.SparkConf()
# Spark session & context
spark = SparkSession.builder.master('spark://master:7077').getOrCreate()
sc = spark.sparkContext
# Point to spark master
conf.setMaster("spark://10.10.10.10:7070")
# point to spark binary package in HDFS or on local filesystem on all slave
# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz)
conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.2.0-bin-hadoop2.7.tgz")
# set other options as desired
conf.set("spark.executor.memory", "8g")
conf.set("spark.core.connection.ack.wait.timeout", "1200")
# create the context
sc = pyspark.SparkContext(conf=conf)
# do something to prove it works
rdd = sc.parallelize(range(100000000))
rdd.sumApprox(3)
# Sum of the first 100 whole numbers
rdd = sc.parallelize(range(100 + 1))
rdd.sum()
# 5050
```
#### In a R Notebook
#### In R
```r
In an R notebook with [SparkR][sparkr].
```R
library(SparkR)
# Point to spark master
# Point to spark binary package in HDFS or on local filesystem on all worker
# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz) in sparkEnvir
# Set other options in sparkEnvir
sc <- sparkR.session("spark://10.10.10.10:7070", sparkEnvir=list(
spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz",
spark.executor.memory="8g"
)
)
# Spark session & context
sc <- sparkR.session("spark://master:7077")
# do something to prove it works
data(iris)
df <- as.DataFrame(iris)
head(filter(df, df$Petal_Width > 0.2))
# Sum of the first 100 whole numbers
sdf <- createDataFrame(list(1:100))
dapplyCollect(sdf,
function(x)
{ x <- sum(x)}
)
# 5050
```
#### In a Spylon Kernel Scala Notebook
In an R notebook with [sparklyr][sparklyr].
```R
library(sparklyr)
# Spark session & context
# Spark configuration
conf <- spark_config()
# Set the catalog implementation in-memory
conf$spark.sql.catalogImplementation <- "in-memory"
sc <- spark_connect(master = "spark://master:7077", config = conf)
# Sum of the first 100 whole numbers
sdf_len(sc, 100, repartition = 1) %>%
spark_apply(function(e) sum(e))
# 5050
```
#### In Scala
##### In a Spylon Kernel
Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
options in a `%%init_spark` magic cell.
```python
%%init_spark
# Point to spark master
launcher.master = "spark://10.10.10.10:7070"
launcher.conf.spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz
# Point Spark at the standalone cluster master
launcher.master = "spark://master:7077"
```
```scala
// Now run Scala code that uses the initialized SparkContext in sc
val rdd = sc.parallelize(0 to 999)
rdd.takeSample(false, 5)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
#### In an Apache Toree Scala Notebook
##### In an Apache Toree Kernel
The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration
information from its command line arguments and environment variables. You can pass information
about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.
The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration information from its command line arguments and environment variables. You can pass information about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.
For instance, to pass information about a standalone Spark master, Spark binary location in HDFS,
and an executor options, you could start the container like so:
For instance, to pass information about a standalone Spark master, you could start the container like so:
```
docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://10.10.10.10:7070 \
--spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz \
--spark.executor.memory=8g' jupyter/all-spark-notebook
```bash
docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://master:7077' \
jupyter/all-spark-notebook
```
Note that this is the same information expressed in a notebook in the Python case above. Once the
kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like
so:
Note that this is the same information expressed in a notebook in the Python case above. Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so:
```scala
// should print the value of --master in the kernel spec
println(sc.master)
// do something to prove it works
val rdd = sc.parallelize(0 to 99999999)
// Sum of the first 100 whole numbers
val rdd = sc.parallelize(0 to 100)
rdd.sum()
// 5050
```
## Tensorflow
@@ -199,3 +248,7 @@ init = tf.global_variables_initializer()
sess.run(init)
sess.run(hello)
```
[sparkr]: https://spark.apache.org/docs/latest/sparkr.html
[sparklyr]: https://spark.rstudio.com/
[spark-conf]: https://spark.apache.org/docs/latest/configuration.html


@@ -12,7 +12,7 @@ See the [installation instructions](https://docs.docker.com/engine/installation/
Build and run a `jupyter/minimal-notebook` container on a VirtualBox VM on local desktop.
```
```bash
# create a Docker Machine-controlled VirtualBox VM
bin/vbox.sh mymachine
@@ -28,7 +28,7 @@ notebook/up.sh
To stop and remove the container:
```
```bash
notebook/down.sh
```
@@ -39,14 +39,14 @@ notebook/down.sh
You can customize the docker-stack notebook image to deploy by modifying the `notebook/Dockerfile`. For example, you can build and deploy a `jupyter/all-spark-notebook` by modifying the Dockerfile like so:
```
```dockerfile
FROM jupyter/all-spark-notebook:55d5ca6be183
...
```
Once you modify the Dockerfile, don't forget to rebuild the image.
```
```bash
# activate the docker machine
eval "$(docker-machine env mymachine)"
@@ -57,14 +57,14 @@ notebook/build.sh
Yes. Set environment variables to specify unique names and ports when running the `up.sh` command.
```
```bash
NAME=my-notebook PORT=9000 notebook/up.sh
NAME=your-notebook PORT=9001 notebook/up.sh
```
To stop and remove the containers:
```
```bash
NAME=my-notebook notebook/down.sh
NAME=your-notebook notebook/down.sh
```
@@ -78,7 +78,7 @@ The `up.sh` creates a Docker volume named after the notebook container with a `-
Yes. Set the `WORK_VOLUME` environment variable to the same value for each notebook.
```
```bash
NAME=my-notebook PORT=9000 WORK_VOLUME=our-work notebook/up.sh
NAME=your-notebook PORT=9001 WORK_VOLUME=our-work notebook/up.sh
```
@@ -87,7 +87,7 @@ NAME=your-notebook PORT=9001 WORK_VOLUME=our-work notebook/up.sh
To run the notebook server with a self-signed certificate, pass the `--secure` option to the `up.sh` script. You must also provide a password, which will be used to secure the notebook server. You can specify the password by setting the `PASSWORD` environment variable, or by passing it to the `up.sh` script.
```
```bash
PASSWORD=a_secret notebook/up.sh --secure
# or
@@ -103,7 +103,7 @@ This example includes the `bin/letsencrypt.sh` script, which runs the `letsencry
The following command will create a certificate chain and store it in a Docker volume named `mydomain-secrets`.
```
```bash
FQDN=host.mydomain.com EMAIL=myemail@somewhere.com \
SECRETS_VOLUME=mydomain-secrets \
bin/letsencrypt.sh
@@ -111,7 +111,7 @@ FQDN=host.mydomain.com EMAIL=myemail@somewhere.com \
Now run `up.sh` with the `--letsencrypt` option. You must also provide the name of the secrets volume and a password.
```
```bash
PASSWORD=a_secret SECRETS_VOLUME=mydomain-secrets notebook/up.sh --letsencrypt
# or
@@ -120,7 +120,7 @@ notebook/up.sh --letsencrypt --password a_secret --secrets mydomain-secrets
Be aware that Let's Encrypt has a pretty [low rate limit per domain](https://community.letsencrypt.org/t/public-beta-rate-limits/4772/3) at the moment. You can avoid exhausting your limit by testing against the Let's Encrypt staging servers. To hit their staging servers, set the environment variable `CERT_SERVER=--staging`.
```
```bash
FQDN=host.mydomain.com EMAIL=myemail@somewhere.com \
CERT_SERVER=--staging \
bin/letsencrypt.sh
@@ -134,13 +134,13 @@ Yes, you should be able to deploy to any Docker Machine-controlled host. To mak
To create a Docker machine using a VirtualBox VM on local desktop:
```
```bash
bin/vbox.sh mymachine
```
To create a Docker machine using a virtual device on IBM SoftLayer:
```
```bash
export SOFTLAYER_USER=my_softlayer_username
export SOFTLAYER_API_KEY=my_softlayer_api_key
export SOFTLAYER_DOMAIN=my.domain


@@ -11,7 +11,7 @@ This folder contains a Makefile and a set of supporting files demonstrating how
To show what's possible, here's how to run the `jupyter/minimal-notebook` on a brand new local VirtualBox VM.
```
```bash
# create a new VM
make virtualbox-vm NAME=dev
# make the new VM the active docker machine
@@ -30,7 +30,7 @@ The last command will log the IP address and port to visit in your browser.
Yes. Specify a unique name and port on the `make notebook` command.
```
```bash
make notebook NAME=my-notebook PORT=9000
make notebook NAME=your-notebook PORT=9001
```
@@ -39,7 +39,7 @@ make notebook NAME=your-notebook PORT=9001
Yes.
```
```bash
make notebook NAME=my-notebook PORT=9000 WORK_VOLUME=our-work
make notebook NAME=your-notebook PORT=9001 WORK_VOLUME=our-work
```
@@ -52,7 +52,7 @@ Instead of `make notebook`, run `make self-signed-notebook PASSWORD=your_desired
Yes. Please.
```
```bash
make letsencrypt FQDN=host.mydomain.com EMAIL=myemail@somewhere.com
make letsencrypt-notebook
```
@@ -61,7 +61,7 @@ The first command creates a Docker volume named after the notebook container wit
Be aware: Let's Encrypt has a pretty [low rate limit per domain](https://community.letsencrypt.org/t/public-beta-rate-limits/4772/3) at the moment. You can avoid exhausting your limit by testing against the Let's Encrypt staging servers. To hit their staging servers, set the environment variable `CERT_SERVER=--staging`.
```
```bash
make letsencrypt FQDN=host.mydomain.com EMAIL=myemail@somewhere.com CERT_SERVER=--staging
```
@@ -69,7 +69,7 @@ Also, keep in mind Let's Encrypt certificates are short lived: 90 days at the mo
### My pip/conda/apt-get installs disappear every time I restart the container. Can I make them permanent?
```
```bash
# add your pip, conda, apt-get, etc. permanent features to the Dockerfile where
# indicated by the comments in the Dockerfile
vi Dockerfile
@@ -79,7 +79,7 @@ make notebook
### How do I upgrade my Docker container?
```
```bash
make image DOCKER_ARGS=--pull
make notebook
```
@@ -90,7 +90,7 @@ The first line pulls the latest version of the Docker image used in the local Do
Yes. As an example, there's a `softlayer.makefile` included in this repo as an example. You would use it like so:
```
```bash
make softlayer-vm NAME=myhost \
SOFTLAYER_DOMAIN=your_desired_domain \
SOFTLAYER_USER=your_user_id \


@@ -16,7 +16,7 @@ Loading the Templates
To load the templates, login to OpenShift from the command line and run:
```
```bash
oc create -f https://raw.githubusercontent.com/jupyter-on-openshift/docker-stacks/master/examples/openshift/templates.json
```
@@ -33,7 +33,7 @@ Deploying a Notebook
To deploy a notebook from the command line using the template, run:
```
```bash
oc new-app --template jupyter-notebook
```
@@ -71,7 +71,7 @@ A password you can use when accessing the notebook will be auto generated and is
To see the hostname for accessing the notebook run:
```
```bash
oc get routes
```
@@ -95,7 +95,7 @@ Passing Template Parameters
To override the name for the notebook, the image used, and the password, you can pass template parameters using the ``--param`` option.
```
```bash
oc new-app --template jupyter-notebook \
--param APPLICATION_NAME=mynotebook \
--param NOTEBOOK_IMAGE=jupyter/scipy-notebook:latest \
@@ -120,7 +120,7 @@ Deleting the Notebook Instance
To delete the notebook instance, run ``oc delete`` using a label selector for the application name.
```
```bash
oc delete all,configmap --selector app=mynotebook
```
@@ -129,7 +129,7 @@ Enabling Jupyter Lab Interface
To enable the Jupyter Lab interface for a deployed notebook set the ``JUPYTER_ENABLE_LAB`` environment variable.
```
```bash
oc set env dc/mynotebook JUPYTER_ENABLE_LAB=true
```
@@ -140,7 +140,7 @@ Adding Persistent Storage
You can upload notebooks and other files using the web interface of the notebook. Any uploaded files or changes you make to them will be lost when the notebook instance is restarted. If you want to save your work, you need to add persistent storage to the notebook. To add persistent storage run:
```
```bash
oc set volume dc/mynotebook --add \
--type=pvc --claim-size=1Gi --claim-mode=ReadWriteOnce \
--claim-name mynotebook-data --name data \
@@ -149,7 +149,7 @@ oc set volume dc/mynotebook --add \
When you have deleted the notebook instance, if using a persistent volume, you will need to delete it in a separate step.
```
```bash
oc delete pvc/mynotebook-data
```
@@ -158,7 +158,7 @@ Customizing the Configuration
If you want to set any custom configuration for the notebook, you can edit the config map created by the template.
```
```bash
oc edit configmap/mynotebook-cfg
```
@@ -176,19 +176,19 @@ Because the configuration is Python code, ensure any indenting is correct. Any e
If the error is in the config map, edit it again to fix it and trigger a new deployment if necessary by running:
```
```bash
oc rollout latest dc/mynotebook
```
If you make an error in the configuration file stored in the persistent volume, you will need to scale down the notebook so it isn't running.
```
```bash
oc scale dc/mynotebook --replicas 0
```
Then run:
```
```bash
oc debug dc/mynotebook
```
@@ -196,7 +196,7 @@ to run the notebook in debug mode. This will provide you with an interactive ter
Start up the notebook again.
```
```bash
oc scale dc/mynotebook --replicas 1
```
@@ -207,7 +207,7 @@ The password for the notebook is supplied as a template parameter, or if not sup
If you want to change the password, you can do so by editing the environment variable on the deployment configuration.
```
```bash
oc set env dc/mynotebook JUPYTER_NOTEBOOK_PASSWORD=mypassword
```
@@ -232,13 +232,13 @@ If the image is in your OpenShift project, because you imported the image into O
This can be illustrated by first importing an image into the OpenShift project.
```
```bash
oc import-image jupyter/datascience-notebook:latest --confirm
```
Then deploy it using the name of the image stream created.
```
```bash
oc new-app --template jupyter-notebook \
--param APPLICATION_NAME=mynotebook \
--param NOTEBOOK_IMAGE=datascience-notebook \


@@ -22,7 +22,7 @@ Getting Started with S2I
As an example of how S2I can be used to create a custom image with a bundled set of notebooks, run:
```
```bash
s2i build \
--scripts-url https://raw.githubusercontent.com/jupyter/docker-stacks/master/examples/source-to-image \
--context-dir docs/source/examples/Notebook \
@@ -76,7 +76,7 @@ The supplied ``assemble`` script performs a few key steps.
The first steps copy files into the location they need to be when the image is run, from the directory where they are initially placed by the ``s2i`` command.
```
```bash
cp -Rf /tmp/src/. /home/$NB_USER
rm -rf /tmp/src
@@ -84,7 +84,7 @@ rm -rf /tmp/src
The next steps are:
```
```bash
if [ -f /home/$NB_USER/environment.yml ]; then
conda env update --name root --file /home/$NB_USER/environment.yml
conda clean --all -f -y
@@ -101,7 +101,7 @@ This means that so long as a set of notebook files provides one of these files l
A final step is:
```
```bash
fix-permissions $CONDA_DIR
fix-permissions /home/$NB_USER
```
@@ -112,7 +112,7 @@ As long as you preserve the first and last set of steps, you can do whatever you
The ``run`` script in this directory is very simple and just runs the notebook application.
```
```bash
exec start-notebook.sh "$@"
```
@@ -121,13 +121,13 @@ Integration with OpenShift
The OpenShift platform provides integrated support for S2I type builds. Templates are provided for using the S2I build mechanism with the scripts in this directory. To load the templates run:
```
```bash
oc create -f https://raw.githubusercontent.com/jupyter/docker-stacks/master/examples/source-to-image/templates.json
```
This will create the templates:
```
```bash
jupyter-notebook-builder
jupyter-notebook-quickstart
```
@@ -136,7 +136,7 @@ The templates can be used from the OpenShift web console or command line. This `
To use the OpenShift command line to build into an image, and deploy, the set of notebooks used above, run:
```
```bash
oc new-app --template jupyter-notebook-quickstart \
--param APPLICATION_NAME=notebook-examples \
--param GIT_REPOSITORY_URL=https://github.com/jupyter/notebook \