From c740fbb1ca63db5856e004d29dd08d11fb4f91f8 Mon Sep 17 00:00:00 2001
From: uodna
Date: Tue, 18 Jul 2017 23:31:44 +0900
Subject: [PATCH] Upgrade Spark to 2.2.0

---
 all-spark-notebook/README.md | 14 +++++++-------
 pyspark-notebook/Dockerfile  |  4 ++--
 pyspark-notebook/README.md   |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/all-spark-notebook/README.md b/all-spark-notebook/README.md
index fcb73ac0..344d03cc 100644
--- a/all-spark-notebook/README.md
+++ b/all-spark-notebook/README.md
@@ -11,7 +11,7 @@
 * Scala 2.11.x
 * pyspark, pandas, matplotlib, scipy, seaborn, scikit-learn pre-installed for Python
 * ggplot2, rcurl preinstalled for R
-* Spark 2.1.1 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers
+* Spark 2.2.0 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers
 * Mesos client 1.2 binary that can communicate with a Mesos master
 * spylon-kernel
 * Unprivileged user `jovyan` (uid=1000, configurable, see options) in group `users` (gid=100) with ownership over `/home/jovyan` and `/opt/conda`
@@ -124,8 +124,8 @@ conf = pyspark.SparkConf()
 # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
 conf.setMaster("mesos://10.10.10.10:5050")
 # point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-2.1.1-bin-hadoop2.7.tgz)
-conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.1.1-bin-hadoop2.7.tgz")
+# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz)
+conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.2.0-bin-hadoop2.7.tgz")
 # set other options as desired
 conf.set("spark.executor.memory", "8g")
 conf.set("spark.core.connection.ack.wait.timeout", "1200")
@@ -157,10 +157,10 @@ library(SparkR)
 # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)\
 # as the first argument
 # point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-2.1.1-bin-hadoop2.7.tgz) in sparkEnvir
+# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz) in sparkEnvir
 # set other options in sparkEnvir
 sc <- sparkR.session("mesos://10.10.10.10:5050", sparkEnvir=list(
-  spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.1.1-bin-hadoop2.7.tgz",
+  spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.2.0-bin-hadoop2.7.tgz",
   spark.executor.memory="8g"
   )
 )
@@ -183,7 +183,7 @@ The Apache Toree kernel automatically creates a `SparkContext` when it starts ba
 For instance, to pass information about a Mesos master, Spark binary location in HDFS, and an executor options, you could start the container like so:
 
 `docker run -d -p 8888:8888 -e SPARK_OPTS '--master=mesos://10.10.10.10:5050 \
---spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.1.1-bin-hadoop2.7.tgz \
+--spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.2.0-bin-hadoop2.7.tgz \
 --spark.executor.memory=8g' jupyter/all-spark-notebook`
 
 Note that this is the same information expressed in a notebook in the Python case above. Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so:
@@ -309,7 +309,7 @@ c.DockerSpawner.container_image = 'jupyter/all-spark-notebook'
 
 # Have the Spawner override the Docker run command
 c.DockerSpawner.extra_create_kwargs.update({
-    'command': '/usr/local/bin/start-singleuser.sh'
+    'command': '/usr/local/bin/start-singleuser.sh'
 })
 ```
diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile
index 394da2a4..99d06144 100644
--- a/pyspark-notebook/Dockerfile
+++ b/pyspark-notebook/Dockerfile
@@ -7,7 +7,7 @@ MAINTAINER Jupyter Project
 USER root
 
 # Spark dependencies
-ENV APACHE_SPARK_VERSION 2.1.1
+ENV APACHE_SPARK_VERSION 2.2.0
 ENV HADOOP_VERSION 2.7
 
 # Temporarily add jessie backports to get openjdk 8, but then remove that source
@@ -19,7 +19,7 @@ RUN echo 'deb http://cdn-fastly.deb.debian.org/debian jessie-backports main' > /
         rm -rf /var/lib/apt/lists/*
 RUN cd /tmp && \
         wget -q http://d3kbcqa49mib13.cloudfront.net/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
-        echo "4b6427ca6dc6f888b21bff9f9a354260af4a0699a1f43caabf58ae6030951ee5fa8b976497aa33de7e4ae55609d47a80bfe66dfc48c79ea28e3e5b03bdaaba11 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
+        echo "7a186a2a007b2dfd880571f7214a7d329c972510a460a8bdbef9f7f2a891019343c020f74b496a61e5aa42bc9e9a79cc99defe5cb3bf8b6f49c07e01b259bc6b *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
         tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local && \
         rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
 RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
diff --git a/pyspark-notebook/README.md b/pyspark-notebook/README.md
index 4c4d783d..a6429358 100644
--- a/pyspark-notebook/README.md
+++ b/pyspark-notebook/README.md
@@ -7,7 +7,7 @@
 * Jupyter Notebook 5.0.x
 * Conda Python 3.x and Python 2.7.x environments
 * pyspark, pandas, matplotlib, scipy, seaborn, scikit-learn pre-installed
-* Spark 2.1.1 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers
+* Spark 2.2.0 with Hadoop 2.7 for use in local mode or to connect to a cluster of Spark workers
 * Mesos client 1.2 binary that can communicate with a Mesos master
 * Unprivileged user `jovyan` (uid=1000, configurable, see options) in group `users` (gid=100) with ownership over `/home/jovyan` and `/opt/conda`
 * [tini](https://github.com/krallin/tini) as the container entrypoint and [start-notebook.sh](../base-notebook/start-notebook.sh) as the default command
@@ -70,8 +70,8 @@ conf = pyspark.SparkConf()
 # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
 conf.setMaster("mesos://10.10.10.10:5050")
 # point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-2.1.1-bin-hadoop2.7.tgz)
-conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-2.1.1-bin-hadoop2.7.tgz")
+# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz)
+conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-2.2.0-bin-hadoop2.7.tgz")
 # set other options as desired
 conf.set("spark.executor.memory", "8g")
 conf.set("spark.core.connection.ack.wait.timeout", "1200")
@@ -202,7 +202,7 @@ c.DockerSpawner.container_image = 'jupyter/pyspark-notebook'
 
 # Have the Spawner override the Docker run command
 c.DockerSpawner.extra_create_kwargs.update({
-    'command': '/usr/local/bin/start-singleuser.sh'
+    'command': '/usr/local/bin/start-singleuser.sh'
 })
 ```
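
Not applied by this patch: a minimal smoke-test sketch, assuming the rebuilt jupyter/pyspark-notebook (or all-spark-notebook) image and a hypothetical script name, that checks the upgraded Spark version in local mode from the image's Python environment.

```python
# smoke_test.py -- hypothetical helper, not part of this patch.
# Assumes it runs inside the rebuilt container, where pyspark and the
# Spark 2.2.0 distribution under /usr/local/spark are already installed.
import pyspark

# Local mode only; no Mesos master or spark.executor.uri is needed for this check.
sc = pyspark.SparkContext(master="local[*]", appName="spark-upgrade-smoke-test")

# The driver-reported version should match APACHE_SPARK_VERSION in the Dockerfile.
assert sc.version.startswith("2.2"), "unexpected Spark version: %s" % sc.version

# Run one trivial job to confirm executors start and tasks complete.
total = sc.parallelize(range(1000)).sum()
print("Spark", sc.version, "sum(0..999) =", total)  # expected sum: 499500

sc.stop()
```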