diff --git a/all-spark-notebook/Dockerfile b/all-spark-notebook/Dockerfile
index 8ce0bbe0..74343920 100644
--- a/all-spark-notebook/Dockerfile
+++ b/all-spark-notebook/Dockerfile
@@ -6,6 +6,9 @@ MAINTAINER Jupyter Project
 
 USER root
 
+# Util to help with kernel spec later
+RUN apt-get -y update && apt-get -y install jq
+
 # Spark dependencies
 ENV APACHE_SPARK_VERSION 1.5.1
 RUN apt-get -y update && \
@@ -90,12 +93,13 @@ RUN conda install --yes \
 RUN mkdir -p /opt/conda/share/jupyter/kernels/scala
 COPY kernel.json /opt/conda/share/jupyter/kernels/scala/
 
-USER root
-
-# Install Python 2 kernel spec globally to avoid permission problems when NB_UID
-# switching at runtime.
-RUN $CONDA_DIR/envs/python2/bin/python \
-    $CONDA_DIR/envs/python2/bin/ipython \
-    kernelspec install-self
-
-USER jovyan
+# Install Python 2 kernel spec into the Python 3 conda environment which
+# runs the notebook server
+RUN bash -c '. activate python2 && \
+    python -m ipykernel.kernelspec --prefix=$CONDA_DIR && \
+    . deactivate'
+# Set PYSPARK_PYTHON in the python2 kernel spec
+RUN jq --arg v "$CONDA_DIR/envs/python2/bin/python" \
+    '.["env"]["PYSPARK_PYTHON"]=$v' \
+    $CONDA_DIR/share/jupyter/kernels/python2/kernel.json > /tmp/kernel.json && \
+    mv /tmp/kernel.json $CONDA_DIR/share/jupyter/kernels/python2/kernel.json
diff --git a/all-spark-notebook/README.md b/all-spark-notebook/README.md
index bd5f9896..9698c9d6 100644
--- a/all-spark-notebook/README.md
+++ b/all-spark-notebook/README.md
@@ -32,7 +32,7 @@ This configuration is nice for using Spark on small, local data.
 1. Open a Python 2 or 3 notebook.
 2. Create a `SparkContext` configured for local mode.
 
-For example, the first few cells in a Python 3 notebook might read:
+For example, the first few cells in a notebook might read:
 
 ```python
 import pyspark
@@ -43,15 +43,6 @@ rdd = sc.parallelize(range(1000))
 rdd.takeSample(False, 5)
 ```
 
-In a Python 2 notebook, prefix the above with the following code to ensure the local workers use Python 2 as well.
-
-```python
-import os
-os.environ['PYSPARK_PYTHON'] = 'python2'
-
-# include pyspark cells from above here ...
-```
-
 ### In a R Notebook
 
 0. Run the container as shown above.
@@ -100,7 +91,7 @@ This configuration allows your compute cluster to scale with your data.
 
 0. Open a Python 2 or 3 notebook.
 1. Create a `SparkConf` instance in a new notebook pointing to your Mesos master node (or Zookeeper instance) and Spark binary package location.
-2. Create a `SparkContext` using this configuration. 
+2. Create a `SparkContext` using this configuration.
 
 For example, the first few cells in a Python 3 notebook might read:
 
@@ -115,7 +106,7 @@ conf = pyspark.SparkConf()
 # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
 conf.setMaster("mesos://10.10.10.10:5050")
 # point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-1.5.1-bin-hadoop2.6.tgz) 
+# nodes (e.g., file:///opt/spark/spark-1.5.1-bin-hadoop2.6.tgz)
 conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-1.5.1-bin-hadoop2.6.tgz")
 # set other options as desired
 conf.set("spark.executor.memory", "8g")
diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile
index 25569738..1370477d 100644
--- a/pyspark-notebook/Dockerfile
+++ b/pyspark-notebook/Dockerfile
@@ -6,6 +6,9 @@ MAINTAINER Jupyter Project
 
 USER root
 
+# Util to help with kernel spec later
+RUN apt-get -y update && apt-get -y install jq
+
 # Spark dependencies
 ENV APACHE_SPARK_VERSION 1.5.1
 RUN apt-get -y update && \
@@ -52,13 +55,13 @@ RUN conda create -p $CONDA_DIR/envs/python2 python=2.7 \
     pyzmq \
     && conda clean -yt
 
-USER root
-
-# Install Python 2 kernel spec globally to avoid permission problems when NB_UID
-# switching at runtime.
-RUN $CONDA_DIR/envs/python2/bin/python \
-    $CONDA_DIR/envs/python2/bin/ipython \
-    kernelspec install-self
-
-USER jovyan
-
+# Install Python 2 kernel spec into the Python 3 conda environment which
+# runs the notebook server
+RUN bash -c '. activate python2 && \
+    python -m ipykernel.kernelspec --prefix=$CONDA_DIR && \
+    . deactivate'
+# Set PYSPARK_PYTHON in the python2 kernel spec
+RUN jq --arg v "$CONDA_DIR/envs/python2/bin/python" \
+    '.["env"]["PYSPARK_PYTHON"]=$v' \
+    $CONDA_DIR/share/jupyter/kernels/python2/kernel.json > /tmp/kernel.json && \
+    mv /tmp/kernel.json $CONDA_DIR/share/jupyter/kernels/python2/kernel.json
diff --git a/pyspark-notebook/README.md b/pyspark-notebook/README.md
index 4e716ffd..50eef170 100644
--- a/pyspark-notebook/README.md
+++ b/pyspark-notebook/README.md
@@ -27,7 +27,7 @@ This configuration is nice for using Spark on small, local data.
 2. Open a Python 2 or 3 notebook.
 3. Create a `SparkContext` configured for local mode.
 
-For example, the first few cells in a Python 3 notebook might read:
+For example, the first few cells in the notebook might read:
 
 ```python
 import pyspark
@@ -38,15 +38,6 @@ rdd = sc.parallelize(range(1000))
 rdd.takeSample(False, 5)
 ```
 
-In a Python 2 notebook, prefix the above with the following code to ensure the local workers use Python 2 as well.
-
-```python
-import os
-os.environ['PYSPARK_PYTHON'] = 'python2'
-
-# include pyspark cells from above here ...
-```
-
 ## Connecting to a Spark Cluster on Mesos
 
 This configuration allows your compute cluster to scale with your data.
@@ -58,7 +49,7 @@ This configuration allows your compute cluster to scale with your data.
 * NOTE: When using `--net=host`, you must also use the flags `--pid=host -e TINI_SUBREAPER=true`. See https://github.com/jupyter/docker-stacks/issues/64 for details.
 4. Open a Python 2 or 3 notebook.
 5. Create a `SparkConf` instance in a new notebook pointing to your Mesos master node (or Zookeeper instance) and Spark binary package location.
-6. Create a `SparkContext` using this configuration. 
+6. Create a `SparkContext` using this configuration.
 
 For example, the first few cells in a Python 3 notebook might read:
 
@@ -73,7 +64,7 @@ conf = pyspark.SparkConf()
 # point to mesos master or zookeeper entry (e.g., zk://10.10.10.10:2181/mesos)
 conf.setMaster("mesos://10.10.10.10:5050")
 # point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-1.5.1-bin-hadoop2.6.tgz) 
+# nodes (e.g., file:///opt/spark/spark-1.5.1-bin-hadoop2.6.tgz)
 conf.set("spark.executor.uri", "hdfs://10.122.193.209/spark/spark-1.5.1-bin-hadoop2.6.tgz")
 # set other options as desired
 conf.set("spark.executor.memory", "8g")
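With `PYSPARK_PYTHON` baked into the python2 kernel spec by the `jq` step above, a Python 2 notebook no longer needs the removed `os.environ` prefix before building a `SparkContext`. A minimal sanity-check sketch, assuming the images' usual `CONDA_DIR` of `/opt/conda` and reusing the local-mode example from the READMEs:

```python
import os
import pyspark

# The python2 kernel spec now exports PYSPARK_PYTHON, so the local workers use
# the same interpreter as the kernel without any os.environ override in the notebook.
print(os.environ.get('PYSPARK_PYTHON'))  # e.g. /opt/conda/envs/python2/bin/python

# Same local-mode example as in the READMEs above.
sc = pyspark.SparkContext('local[*]')
rdd = sc.parallelize(range(1000))
print(rdd.takeSample(False, 5))
```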