Merge pull request #263 from parente/upgrade-spark-2.0

Upgrade Spark to 2.0.2
Peter Parente authored on 2016-12-10 15:54:56 -05:00, committed by GitHub
6 changed files with 50 additions and 46 deletions

.gitignore

@@ -63,4 +63,5 @@ target/
.DS_Store
dockerspawner
-dockerspawner.tar.gz
+dockerspawner.tar.gz
+*.orig


@@ -29,5 +29,5 @@ RUN conda config --add channels r && \
'r-rcurl=1.95*' && conda clean -tipsy
# Apache Toree kernel
-RUN pip --no-cache-dir install toree==0.1.0.dev7
+RUN pip --no-cache-dir install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz
RUN jupyter toree install --user


@@ -11,8 +11,8 @@
* Scala 2.10.x
* pyspark, pandas, matplotlib, scipy, seaborn, scikit-learn pre-installed for Python
* ggplot2, rcurl preinstalled for R
-* Spark 1.6.0 for use in local mode or to connect to a cluster of Spark workers
-* Mesos client 0.22 binary that can communicate with a Mesos master
+* Spark 2.0.2 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers
+* Mesos client 0.25 binary that can communicate with a Mesos master
* Unprivileged user `jovyan` (uid=1000, configurable, see options) in group `users` (gid=100) with ownership over `/home/jovyan` and `/opt/conda`
* [tini](https://github.com/krallin/tini) as the container entrypoint and [start-notebook.sh](../base-notebook/start-notebook.sh) as the default command
* A [start-singleuser.sh](../base-notebook/start-singleuser.sh) script useful for running a single-user instance of the Notebook server, as required by JupyterHub
@@ -53,27 +53,24 @@ rdd.takeSample(False, 5)
0. Run the container as shown above.
1. Open an R notebook.
-2. Initialize `sparkR` for local mode.
-3. Initialize `sparkRSQL`.
+2. Initialize a `sparkR` session for local mode.
For example, the first few cells in an R notebook might read:
```
library(SparkR)
-sc <- sparkR.init("local[*]")
-sqlContext <- sparkRSQL.init(sc)
+as <- sparkR.session("local[*]")
# do something to prove it works
data(iris)
-df <- createDataFrame(sqlContext, iris)
+df <- as.DataFrame(iris)
head(filter(df, df$Petal_Width > 0.2))
```
-### In an Apache Toree (Scala) Notebook
+### In an Apache Toree - Scala Notebook
0. Run the container as shown above.
-1. Open an Apache Toree (Scala) notebook.
+1. Open an Apache Toree - Scala notebook.
2. Use the pre-configured `SparkContext` in variable `sc`.
For example:
@@ -112,8 +109,8 @@ conf = pyspark.SparkConf()
# point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
conf.setMaster("mesos://10.10.10.10:5050")
# point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-1.6.0-bin-hadoop2.6.tgz)
-conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-1.6.0-bin-hadoop2.6.tgz")
+# nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz)
+conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz")
# set other options as desired
conf.set("spark.executor.memory", "8g")
conf.set("spark.core.connection.ack.wait.timeout", "1200")
@@ -145,34 +142,33 @@ library(SparkR)
# point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)\
# as the first argument
# point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-1.6.0-bin-hadoop2.6.tgz) in sparkEnvir
+# nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz) in sparkEnvir
# set other options in sparkEnvir
sc <- sparkR.init("mesos://10.10.10.10:5050", sparkEnvir=list(
spark.executor.uri="hdfs://10.10.10.10/spark/spark-1.6.0-bin-hadoop2.6.tgz",
sc <- sparkR.session("mesos://10.10.10.10:5050", sparkEnvir=list(
spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz",
spark.executor.memory="8g"
)
)
-sqlContext <- sparkRSQL.init(sc)
# do something to prove it works
data(iris)
-df <- createDataFrame(sqlContext, iris)
+df <- as.DataFrame(iris)
head(filter(df, df$Petal_Width > 0.2))
```
-### In an Apache Toree (Scala) Notebook
+### In an Apache Toree - Scala Notebook
0. Open a terminal via *New -> Terminal* in the notebook interface.
1. Add information about your cluster to the `SPARK_OPTS` environment variable when running the container.
-2. Open an Apache Toree (Scala) notebook.
-3. Use the pre-configured `SparkContext` in variable `sc`.
+2. Open an Apache Toree - Scala notebook.
+3. Use the pre-configured `SparkContext` in variable `sc` or `SparkSession` in variable `spark`.
The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration information from its command line arguments and environment variables. You can pass information about your Mesos cluster via the `SPARK_OPTS` environment variable when you spawn a container.
For instance, to pass information about a Mesos master, the Spark binary location in HDFS, and executor options, you could start the container like so:
`docker run -d -p 8888:8888 -e SPARK_OPTS='--master=mesos://10.10.10.10:5050 \
-    --spark.executor.uri=hdfs://10.10.10.10/spark/spark-1.6.0-bin-hadoop2.6.tgz \
+    --spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.0.2-bin-hadoop2.7.tgz \
    --spark.executor.memory=8g' jupyter/all-spark-notebook`
Note that this is the same information expressed in the Python notebook example above. Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so:
@@ -223,7 +219,8 @@ You may customize the execution of the Docker container and the Notebook server
* `-e GRANT_SUDO=yes` - Gives the `jovyan` user passwordless `sudo` capability. Useful for installing OS packages. For this option to take effect, you must run the container with `--user root`. (The `start-notebook.sh` script will `su jovyan` after adding `jovyan` to sudoers.) **You should only enable `sudo` if you trust the user or if the container is running on an isolated host.**
* `-v /some/host/folder/for/work:/home/jovyan/work` - Host mounts the default working directory on the host to preserve work even when the container is destroyed and recreated (e.g., during an upgrade).
* `-v /some/host/folder/for/server.pem:/home/jovyan/.local/share/jupyter/notebook.pem` - Mounts an SSL certificate plus key for `USE_HTTPS`. Useful if you have a real certificate for the domain under which you are running the Notebook server.
-* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/all-spark-notebook`
+* `-p 4040:4040` - Opens the default port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note that every new Spark context is put onto an incrementing port (i.e., 4040, 4041, 4042, etc.) by default, so it might be necessary to open multiple ports with a command like `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/all-spark-notebook`. You can also control the port Spark uses for its web UI with the `spark.ui.port` config option, as in the sketch below.
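
As a minimal sketch of the `spark.ui.port` option mentioned above (not part of the diff; assumes a local-mode PySpark session):
```
import pyspark

# illustrative only: pin the driver web UI to a fixed port instead of the 4040, 4041, ... default
conf = pyspark.SparkConf()
conf.set("spark.ui.port", "4040")
sc = pyspark.SparkContext(master="local[*]", conf=conf)
```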
## SSL Certificates


@@ -7,35 +7,36 @@ MAINTAINER Jupyter Project <jupyter@googlegroups.com>
USER root
# Spark dependencies
-ENV APACHE_SPARK_VERSION 1.6.1
-RUN apt-get -y update && \
-    apt-get install -y --no-install-recommends openjdk-7-jre-headless && \
+ENV APACHE_SPARK_VERSION 2.0.2
+ENV HADOOP_VERSION 2.7
+# Temporarily add jessie backports to get openjdk 8, but then remove that source
+RUN echo 'deb http://cdn-fastly.deb.debian.org/debian jessie-backports main' > /etc/apt/sources.list.d/jessie-backports.list && \
+    apt-get -y update && \
+    apt-get install -y --no-install-recommends openjdk-8-jre-headless && \
+    rm /etc/apt/sources.list.d/jessie-backports.list && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
RUN cd /tmp && \
-    wget -q http://d3kbcqa49mib13.cloudfront.net/spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz && \
-    echo "09f3b50676abc9b3d1895773d18976953ee76945afa72fa57e6473ce4e215970 *spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz" | sha256sum -c - && \
-    tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz -C /usr/local && \
-    rm spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6.tgz
-RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6 spark
+    wget -q http://d3kbcqa49mib13.cloudfront.net/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
+    echo "e6349dd38ded84831e3ff7d391ae7f2525c359fb452b0fc32ee2ab637673552a *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha256sum -c - && \
+    tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local && \
+    rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
+RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
# Mesos dependencies
# Currently, Mesos is not available from Debian Jessie.
# So, we are installing it from Debian Wheezy. Once it
# becomes available for Debian Jessie. We should switch
# over to using that instead.
RUN apt-key adv --keyserver keyserver.ubuntu.com --recv E56151BF && \
    DISTRO=debian && \
-    CODENAME=wheezy && \
+    CODENAME=jessie && \
    echo "deb http://repos.mesosphere.io/${DISTRO} ${CODENAME} main" > /etc/apt/sources.list.d/mesosphere.list && \
    apt-get -y update && \
-    apt-get --no-install-recommends -y --force-yes install mesos=0.22.1-1.0.debian78 && \
+    apt-get --no-install-recommends -y --force-yes install mesos=0.25.0-0.2.70.debian81 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Spark and Mesos config
ENV SPARK_HOME /usr/local/spark
-ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.9-src.zip
+ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.3-src.zip
ENV MESOS_NATIVE_LIBRARY /usr/local/lib/libmesos.so
ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info
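
The py4j zip named in `PYTHONPATH` has to match the one that actually ships inside the Spark 2.0.2 tarball; a quick, hypothetical sanity check from a Python prompt inside a built container (not part of the diff) might be:
```
import glob
import os

# the PYTHONPATH entry baked into the image should point at a file that actually exists
spark_home = os.environ.get("SPARK_HOME", "/usr/local/spark")
print(glob.glob(os.path.join(spark_home, "python", "lib", "py4j-*-src.zip")))
print(os.environ.get("PYTHONPATH"))
```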


@@ -7,8 +7,8 @@
* Jupyter Notebook 4.2.x
* Conda Python 3.x and Python 2.7.x environments
* pyspark, pandas, matplotlib, scipy, seaborn, scikit-learn pre-installed
-* Spark 1.6.0 for use in local mode or to connect to a cluster of Spark workers
-* Mesos client 0.22 binary that can communicate with a Mesos master
+* Spark 2.0.2 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers
+* Mesos client 0.25 binary that can communicate with a Mesos master
* Unprivileged user `jovyan` (uid=1000, configurable, see options) in group `users` (gid=100) with ownership over `/home/jovyan` and `/opt/conda`
* [tini](https://github.com/krallin/tini) as the container entrypoint and [start-notebook.sh](../base-notebook/start-notebook.sh) as the default command
* A [start-singleuser.sh](../base-notebook/start-singleuser.sh) script useful for running a single-user instance of the Notebook server, as required by JupyterHub
@@ -68,8 +68,8 @@ conf = pyspark.SparkConf()
# point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
conf.setMaster("mesos://10.10.10.10:5050")
# point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-1.6.0-bin-hadoop2.6.tgz)
-conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-1.6.0-bin-hadoop2.6.tgz")
+# nodes (e.g., file:///opt/spark/spark-2.0.2-bin-hadoop2.7.tgz)
+conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-2.0.2-bin-hadoop2.7.tgz")
# set other options as desired
conf.set("spark.executor.memory", "8g")
conf.set("spark.core.connection.ack.wait.timeout", "1200")
@@ -124,7 +124,7 @@ You may customize the execution of the Docker container and the Notebook server
* `-e GRANT_SUDO=yes` - Gives the `jovyan` user passwordless `sudo` capability. Useful for installing OS packages. For this option to take effect, you must run the container with `--user root`. (The `start-notebook.sh` script will `su jovyan` after adding `jovyan` to sudoers.) **You should only enable `sudo` if you trust the user or if the container is running on an isolated host.**
* `-v /some/host/folder/for/work:/home/jovyan/work` - Host mounts the default working directory on the host to preserve work even when the container is destroyed and recreated (e.g., during an upgrade).
* `-v /some/host/folder/for/server.pem:/home/jovyan/.local/share/jupyter/notebook.pem` - Mounts an SSL certificate plus key for `USE_HTTPS`. Useful if you have a real certificate for the domain under which you are running the Notebook server.
-* `-p 4040:4040` - Opens the port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`
+* `-p 4040:4040` - Opens the default port for the [Spark Monitoring and Instrumentation UI](http://spark.apache.org/docs/latest/monitoring.html). Note that every new Spark context is put onto an incrementing port (i.e., 4040, 4041, 4042, etc.) by default, so it might be necessary to open multiple ports with a command like `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`. You can also control the port Spark uses for its web UI with the `spark.ui.port` config option.
## SSL Certificates


@@ -93,7 +93,12 @@ COPY mplimporthook.py $HOME/.ipython/profile_default/startup/
USER root
# Install Python 2 kernel spec globally to avoid permission problems when NB_UID
-# switching at runtime.
-RUN $CONDA_DIR/envs/python2/bin/python -m ipykernel install
+# switching at runtime and to allow the notebook server running out of the root
+# environment to find it. Also, activate the python2 environment upon kernel
+# launch.
+RUN pip install kernda --no-cache && \
+    $CONDA_DIR/envs/python2/bin/python -m ipykernel install && \
+    kernda -o -y /usr/local/share/jupyter/kernels/python2/kernel.json && \
+    pip uninstall kernda -y
USER $NB_USER
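
To see what the `kernda` step accomplishes, a small, hypothetical check inside the built image (not part of the diff) can inspect the rewritten kernel spec; after `kernda -o`, the `argv` should activate the `python2` conda environment before starting the kernel:
```
import json

# path comes from the RUN instruction above
with open("/usr/local/share/jupyter/kernels/python2/kernel.json") as f:
    spec = json.load(f)

# expect an activation wrapper (e.g., a shell command that activates python2, then execs the kernel)
print(spec["argv"])
```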