From 1339b518a9d5033fec27db34750501de8942bdca Mon Sep 17 00:00:00 2001 From: Peter Parente Date: Wed, 17 Aug 2016 16:16:29 -0500 Subject: [PATCH 1/6] Upgrade Spark to 2.0 --- pyspark-notebook/Dockerfile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile index 4d81ef33..1ac9e8f7 100644 --- a/pyspark-notebook/Dockerfile +++ b/pyspark-notebook/Dockerfile @@ -7,14 +7,18 @@ MAINTAINER Jupyter Project USER root # Spark dependencies -ENV APACHE_SPARK_VERSION 1.6.1 -RUN apt-get -y update && \ - apt-get install -y --no-install-recommends openjdk-7-jre-headless && \ +ENV APACHE_SPARK_VERSION 2.0.0 + +# Temporarily add jessie backports to get openjdk 8, but then remove that source +RUN echo 'deb http://ftp.debian.org/debian jessie-backports main' > /etc/apt/sources.list.d/jessie-backports.list && \ + apt-get -y update && \ + apt-get install -y --no-install-recommends openjdk-8-jre-headless && \ + rm /etc/apt/sources.list.d/jessie-backports.list && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN cd /tmp && \ wget -q http://d3kbcqa49mib13.cloudfront.net/spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz && \ - echo "09f3b50676abc9b3d1895773d18976953ee76945afa72fa57e6473ce4e215970 *spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz" | sha256sum -c - && \ + echo "e17d9da4b3ac463ea3ce42289f2a71cefb479d154b1ffd00310c7d7ab207aa2c *spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz" | sha256sum -c - && \ tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz -C /usr/local && \ rm spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6 spark @@ -35,7 +39,7 @@ RUN apt-key adv --keyserver keyserver.ubuntu.com --recv E56151BF && \ # Spark and Mesos config ENV SPARK_HOME /usr/local/spark -ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.9-src.zip +ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.1-src.zip ENV MESOS_NATIVE_LIBRARY /usr/local/lib/libmesos.so ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info From 52bce8f002a7f1f996e669c7cf4a21ddbb9028b1 Mon Sep 17 00:00:00 2001 From: Peter Parente Date: Sun, 4 Dec 2016 14:45:11 -0500 Subject: [PATCH 2/6] Use Apache Toree 0.2.0dev1 for Spark 2.0.2 --- all-spark-notebook/Dockerfile | 2 +- all-spark-notebook/README.md | 14 +++++++------- pyspark-notebook/Dockerfile | 25 +++++++++++-------------- pyspark-notebook/README.md | 8 ++++---- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/all-spark-notebook/Dockerfile b/all-spark-notebook/Dockerfile index 82424459..94fc8de2 100644 --- a/all-spark-notebook/Dockerfile +++ b/all-spark-notebook/Dockerfile @@ -29,5 +29,5 @@ RUN conda config --add channels r && \ 'r-rcurl=1.95*' && conda clean -tipsy # Apache Toree kernel -RUN pip --no-cache-dir install toree==0.1.0.dev7 +RUN pip --no-cache-dir install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz RUN jupyter toree install --user diff --git a/all-spark-notebook/README.md b/all-spark-notebook/README.md index 3a49922c..62757581 100644 --- a/all-spark-notebook/README.md +++ b/all-spark-notebook/README.md @@ -11,8 +11,8 @@ * Scala 2.10.x * pyspark, pandas, matplotlib, scipy, seaborn, scikit-learn pre-installed for Python * ggplot2, rcurl preinstalled for R -* Spark 1.6.0 for use in local mode or to connect to a cluster of Spark workers -* Mesos client 0.22 
binary that can communicate with a Mesos master +* Spark 2.0.2 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers +* Mesos client 0.25 binary that can communicate with a Mesos master * Unprivileged user `jovyan` (uid=1000, configurable, see options) in group `users` (gid=100) with ownership over `/home/jovyan` and `/opt/conda` * [tini](https://github.com/krallin/tini) as the container entrypoint and [start-notebook.sh](../base-notebook/start-notebook.sh) as the default command * A [start-singleuser.sh](../base-notebook/start-singleuser.sh) script useful for running a single-user instance of the Notebook server, as required by JupyterHub @@ -112,8 +112,8 @@ conf = pyspark.SparkConf() # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos) conf.setMaster("mesos://10.10.10.10:5050") # point to spark binary package in HDFS or on local filesystem on all slave -# nodes (e.g., file:///opt/spark/spark-1.6.0-bin-hadoop2.6.tgz) -conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-1.6.0-bin-hadoop2.6.tgz") +# nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz) +conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz") # set other options as desired conf.set("spark.executor.memory", "8g") conf.set("spark.core.connection.ack.wait.timeout", "1200") @@ -145,10 +145,10 @@ library(SparkR) # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)\ # as the first argument # point to spark binary package in HDFS or on local filesystem on all slave -# nodes (e.g., file:///opt/spark/spark-1.6.0-bin-hadoop2.6.tgz) in sparkEnvir +# nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz) in sparkEnvir # set other options in sparkEnvir sc <- sparkR.init("mesos://10.10.10.10:5050", sparkEnvir=list( - spark.executor.uri="hdfs://10.10.10.10/spark/spark-1.6.0-bin-hadoop2.6.tgz", + spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz", spark.executor.memory="8g" ) ) @@ -172,7 +172,7 @@ The Apache Toree kernel automatically creates a `SparkContext` when it starts ba For instance, to pass information about a Mesos master, Spark binary location in HDFS, and an executor options, you could start the container like so: `docker run -d -p 8888:8888 -e SPARK_OPTS '--master=mesos://10.10.10.10:5050 \ - --spark.executor.uri=hdfs://10.10.10.10/spark/spark-1.6.0-bin-hadoop2.6.tgz \ + --spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz \ --spark.executor.memory=8g' jupyter/all-spark-notebook` Note that this is the same information expressed in a notebook in the Python case above. 
Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so: diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile index 1ac9e8f7..9b3f20b9 100644 --- a/pyspark-notebook/Dockerfile +++ b/pyspark-notebook/Dockerfile @@ -7,39 +7,36 @@ MAINTAINER Jupyter Project USER root # Spark dependencies -ENV APACHE_SPARK_VERSION 2.0.0 +ENV APACHE_SPARK_VERSION 2.0.2 +ENV HADOOP_VERSION 2.7 # Temporarily add jessie backports to get openjdk 8, but then remove that source -RUN echo 'deb http://ftp.debian.org/debian jessie-backports main' > /etc/apt/sources.list.d/jessie-backports.list && \ +RUN echo 'deb http://cdn-fastly.deb.debian.org/debian jessie-backports main' > /etc/apt/sources.list.d/jessie-backports.list && \ apt-get -y update && \ apt-get install -y --no-install-recommends openjdk-8-jre-headless && \ rm /etc/apt/sources.list.d/jessie-backports.list && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* RUN cd /tmp && \ - wget -q http://d3kbcqa49mib13.cloudfront.net/spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz && \ - echo "e17d9da4b3ac463ea3ce42289f2a71cefb479d154b1ffd00310c7d7ab207aa2c *spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz" | sha256sum -c - && \ - tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz -C /usr/local && \ - rm spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz -RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6 spark + wget -q http://d3kbcqa49mib13.cloudfront.net/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \ + echo "e6349dd38ded84831e3ff7d391ae7f2525c359fb452b0fc32ee2ab637673552a *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha256sum -c - && \ + tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local && \ + rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz +RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark # Mesos dependencies -# Currently, Mesos is not available from Debian Jessie. -# So, we are installing it from Debian Wheezy. Once it -# becomes available for Debian Jessie. We should switch -# over to using that instead. 
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv E56151BF && \ DISTRO=debian && \ - CODENAME=wheezy && \ + CODENAME=jessie && \ echo "deb http://repos.mesosphere.io/${DISTRO} ${CODENAME} main" > /etc/apt/sources.list.d/mesosphere.list && \ apt-get -y update && \ - apt-get --no-install-recommends -y --force-yes install mesos=0.22.1-1.0.debian78 && \ + apt-get --no-install-recommends -y --force-yes install mesos=0.25.0-0.2.70.debian81 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* # Spark and Mesos config ENV SPARK_HOME /usr/local/spark -ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.1-src.zip +ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.3-src.zip ENV MESOS_NATIVE_LIBRARY /usr/local/lib/libmesos.so ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info diff --git a/pyspark-notebook/README.md b/pyspark-notebook/README.md index b4090e39..9c08a689 100644 --- a/pyspark-notebook/README.md +++ b/pyspark-notebook/README.md @@ -7,8 +7,8 @@ * Jupyter Notebook 4.2.x * Conda Python 3.x and Python 2.7.x environments * pyspark, pandas, matplotlib, scipy, seaborn, scikit-learn pre-installed -* Spark 1.6.0 for use in local mode or to connect to a cluster of Spark workers -* Mesos client 0.22 binary that can communicate with a Mesos master +* Spark 2.0.2 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers +* Mesos client 0.25 binary that can communicate with a Mesos master * Unprivileged user `jovyan` (uid=1000, configurable, see options) in group `users` (gid=100) with ownership over `/home/jovyan` and `/opt/conda` * [tini](https://github.com/krallin/tini) as the container entrypoint and [start-notebook.sh](../base-notebook/start-notebook.sh) as the default command * A [start-singleuser.sh](../base-notebook/start-singleuser.sh) script useful for running a single-user instance of the Notebook server, as required by JupyterHub @@ -68,8 +68,8 @@ conf = pyspark.SparkConf() # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos) conf.setMaster("mesos://10.10.10.10:5050") # point to spark binary package in HDFS or on local filesystem on all slave -# nodes (e.g., file:///opt/spark/spark-1.6.0-bin-hadoop2.6.tgz) -conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-1.6.0-bin-hadoop2.6.tgz") +# nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz) +conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-2.0.2-bin-hadoop2.7.tgz") # set other options as desired conf.set("spark.executor.memory", "8g") conf.set("spark.core.connection.ack.wait.timeout", "1200") From 0afcc8d1337ab504fc44c6c73981a8af8d99cca6 Mon Sep 17 00:00:00 2001 From: Peter Parente Date: Sun, 4 Dec 2016 16:46:48 -0500 Subject: [PATCH 3/6] Activate python2 conda environment in kernelspec Fixes issues with python version mismatches when using Spark --- scipy-notebook/Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scipy-notebook/Dockerfile b/scipy-notebook/Dockerfile index f2efdbd2..bebd1ae7 100644 --- a/scipy-notebook/Dockerfile +++ b/scipy-notebook/Dockerfile @@ -93,7 +93,12 @@ COPY mplimporthook.py $HOME/.ipython/profile_default/startup/ USER root # Install Python 2 kernel spec globally to avoid permission problems when NB_UID -# switching at runtime. 
-RUN $CONDA_DIR/envs/python2/bin/python -m ipykernel install +# switching at runtime and to allow the notebook server running out of the root +# environment to find it. Also, activate the python2 environment upon kernel +# launch. +RUN pip install kernda --no-cache && \ + $CONDA_DIR/envs/python2/bin/python -m ipykernel install && \ + kernda -o -y /usr/local/share/jupyter/kernels/python2/kernel.json && \ + pip uninstall kernda -y USER $NB_USER From 6b56894aa334b298844c428cb9f3894216d4b8d9 Mon Sep 17 00:00:00 2001 From: Peter Parente Date: Sun, 4 Dec 2016 16:47:07 -0500 Subject: [PATCH 4/6] Doc updates for Spark 2 --- all-spark-notebook/README.md | 26 +++++++++++--------------- pyspark-notebook/README.md | 2 +- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/all-spark-notebook/README.md b/all-spark-notebook/README.md index 62757581..546d2ef9 100644 --- a/all-spark-notebook/README.md +++ b/all-spark-notebook/README.md @@ -53,27 +53,24 @@ rdd.takeSample(False, 5) 0. Run the container as shown above. 1. Open a R notebook. -2. Initialize `sparkR` for local mode. -3. Initialize `sparkRSQL`. +2. Initialize a `sparkR` session for local mode. For example, the first few cells in a R notebook might read: ``` library(SparkR) -sc <- sparkR.init("local[*]") -sqlContext <- sparkRSQL.init(sc) +as <- sparkR.session("local[*]") # do something to prove it works -data(iris) -df <- createDataFrame(sqlContext, iris) +df <- as.DataFrame(iris) head(filter(df, df$Petal_Width > 0.2)) ``` -### In an Apache Toree (Scala) Notebook +### In an Apache Toree - Scala Notebook 0. Run the container as shown above. -1. Open an Apache Toree (Scala) notebook. +1. Open an Apache Toree - Scala notebook. 2. Use the pre-configured `SparkContext` in variable `sc`. For example: @@ -147,25 +144,24 @@ library(SparkR) # point to spark binary package in HDFS or on local filesystem on all slave # nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz) in sparkEnvir # set other options in sparkEnvir -sc <- sparkR.init("mesos://10.10.10.10:5050", sparkEnvir=list( +sc <- sparkR.session("mesos://10.10.10.10:5050", sparkEnvir=list( spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz", spark.executor.memory="8g" ) ) -sqlContext <- sparkRSQL.init(sc) # do something to prove it works data(iris) -df <- createDataFrame(sqlContext, iris) +df <- as.DataFrame(iris) head(filter(df, df$Petal_Width > 0.2)) ``` -### In an Apache Toree (Scala) Notebook +### In an Apache Toree - Scala Notebook 0. Open a terminal via *New -> Terminal* in the notebook interface. 1. Add information about your cluster to the `SPARK_OPTS` environment variable when running the container. -2. Open an Apache Toree (Scala) notebook. -3. Use the pre-configured `SparkContext` in variable `sc`. +2. Open an Apache Toree - Scala notebook. +3. Use the pre-configured `SparkContext` in variable `sc` or `SparkSession` in variable `spark`. The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration information from its command line arguments and environment variables. You can pass information about your Mesos cluster via the `SPARK_OPTS` environment variable when you spawn a container. @@ -223,7 +219,7 @@ You may customize the execution of the Docker container and the Notebook server * `-e GRANT_SUDO=yes` - Gives the `jovyan` user passwordless `sudo` capability. Useful for installing OS packages. For this option to take effect, you must run the container with `--user root`. 
(The `start-notebook.sh` script will `su jovyan` after adding `jovyan` to sudoers.) **You should only enable `sudo` if you trust the user or if the container is running on an isolated host.** * `-v /some/host/folder/for/work:/home/jovyan/work` - Host mounts the default working directory on the host to preserve work even when the container is destroyed and recreated (e.g., during an upgrade). * `-v /some/host/folder/for/server.pem:/home/jovyan/.local/share/jupyter/notebook.pem` - Mounts a SSL certificate plus key for `USE_HTTPS`. Useful if you have a real certificate for the domain under which you are running the Notebook server. -* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/all-spark-notebook` +* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new Spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/all-spark-notebook` ## SSL Certificates diff --git a/pyspark-notebook/README.md b/pyspark-notebook/README.md index 9c08a689..ecffdd05 100644 --- a/pyspark-notebook/README.md +++ b/pyspark-notebook/README.md @@ -124,7 +124,7 @@ You may customize the execution of the Docker container and the Notebook server * `-e GRANT_SUDO=yes` - Gives the `jovyan` user passwordless `sudo` capability. Useful for installing OS packages. For this option to take effect, you must run the container with `--user root`. (The `start-notebook.sh` script will `su jovyan` after adding `jovyan` to sudoers.) **You should only enable `sudo` if you trust the user or if the container is running on an isolated host.** * `-v /some/host/folder/for/work:/home/jovyan/work` - Host mounts the default working directory on the host to preserve work even when the container is destroyed and recreated (e.g., during an upgrade). * `-v /some/host/folder/for/server.pem:/home/jovyan/.local/share/jupyter/notebook.pem` - Mounts a SSL certificate plus key for `USE_HTTPS`. Useful if you have a real certificate for the domain under which you are running the Notebook server. -* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook` +* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new Spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. 
`docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`

## SSL Certificates

From f4d6934962f5c8705a2ac28784b7492c529948c6 Mon Sep 17 00:00:00 2001 From: Peter Parente Date: Sat, 10 Dec 2016 13:43:50 -0500 Subject: [PATCH 5/6] Add note about spark web ui port --- all-spark-notebook/README.md | 3 ++- pyspark-notebook/README.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/all-spark-notebook/README.md b/all-spark-notebook/README.md index 546d2ef9..99ab4e11 100644 --- a/all-spark-notebook/README.md +++ b/all-spark-notebook/README.md @@ -219,7 +219,8 @@ You may customize the execution of the Docker container and the Notebook server * `-e GRANT_SUDO=yes` - Gives the `jovyan` user passwordless `sudo` capability. Useful for installing OS packages. For this option to take effect, you must run the container with `--user root`. (The `start-notebook.sh` script will `su jovyan` after adding `jovyan` to sudoers.) **You should only enable `sudo` if you trust the user or if the container is running on an isolated host.** * `-v /some/host/folder/for/work:/home/jovyan/work` - Host mounts the default working directory on the host to preserve work even when the container is destroyed and recreated (e.g., during an upgrade). * `-v /some/host/folder/for/server.pem:/home/jovyan/.local/share/jupyter/notebook.pem` - Mounts a SSL certificate plus key for `USE_HTTPS`. Useful if you have a real certificate for the domain under which you are running the Notebook server. -* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new Spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/all-spark-notebook` +* `-p 4040:4040` - Opens the default port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new Spark context that is created is put onto an incrementing port (i.e., 4040, 4041, 4042, etc.) by default, and it might be necessary to open multiple ports using a command like `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/all-spark-notebook`. You can also control the port Spark uses for its web UI with the `spark.ui.port` config option.
+

## SSL Certificates

diff --git a/pyspark-notebook/README.md b/pyspark-notebook/README.md index ecffdd05..9afe8c32 100644 --- a/pyspark-notebook/README.md +++ b/pyspark-notebook/README.md @@ -124,7 +124,7 @@ You may customize the execution of the Docker container and the Notebook server * `-e GRANT_SUDO=yes` - Gives the `jovyan` user passwordless `sudo` capability. Useful for installing OS packages. For this option to take effect, you must run the container with `--user root`. (The `start-notebook.sh` script will `su jovyan` after adding `jovyan` to sudoers.) **You should only enable `sudo` if you trust the user or if the container is running on an isolated host.** * `-v /some/host/folder/for/work:/home/jovyan/work` - Host mounts the default working directory on the host to preserve work even when the container is destroyed and recreated (e.g., during an upgrade). * `-v /some/host/folder/for/server.pem:/home/jovyan/.local/share/jupyter/notebook.pem` - Mounts a SSL certificate plus key for `USE_HTTPS`. Useful if you have a real certificate for the domain under which you are running the Notebook server.
-* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new Spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`
+* `-p 4040:4040` - Opens the default port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new Spark context that is created is put onto an incrementing port (i.e., 4040, 4041, 4042, etc.) by default, and it might be necessary to open multiple ports using a command like `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`. You can also control the port Spark uses for its web UI with the `spark.ui.port` config option.

## SSL Certificates

From 0dcccdd2a399a9d632eaa92c319b39469ff74dab Mon Sep 17 00:00:00 2001 From: Peter Parente Date: Sat, 10 Dec 2016 13:44:08 -0500 Subject: [PATCH 6/6] Ignore generated orig files --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 64adc198..17dcb67c 100644 --- a/.gitignore +++ b/.gitignore @@ -63,4 +63,5 @@ target/ .DS_Store dockerspawner -dockerspawner.tar.gz \ No newline at end of file +dockerspawner.tar.gz +*.orig
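
As a quick, illustrative check of the Spark 2.0.x setup that the patches above introduce (this snippet is not part of the patch series), the following can be run in a Python notebook or shell inside the rebuilt `jupyter/pyspark-notebook` image. The image tag, local-mode master, and toy DataFrame are assumptions made for the example; the `spark.ui.port` setting mirrors the `-p 4040:4040` note in the READMEs.

```
from pyspark.sql import SparkSession

# Build a local-mode SparkSession (the Spark 2.x entry point). Pinning spark.ui.port
# keeps the web UI on the port published via `docker run ... -p 4040:4040`.
spark = (SparkSession.builder
         .master("local[*]")
         .appName("spark-2.0-upgrade-smoke-test")
         .config("spark.ui.port", "4040")
         .getOrCreate())

print(spark.version)  # expect a 2.0.x version string

# A tiny DataFrame exercises the Python worker wiring (py4j path set in the Dockerfile).
df = spark.createDataFrame([(i, i * i) for i in range(10)], ["n", "n_squared"])
print(df.count())  # expect 10
df.show(3)

spark.stop()
```

In the all-spark image, the pre-configured `sc` and `spark` variables in an Apache Toree - Scala notebook can be used for an equivalent check, as described in the README sections above.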