jupyter/docker-stacks: Merge branch 'master' into asalikhov/py_codestyle
@@ -25,6 +25,8 @@ repos:
         name: Hadolint linter
         description: Runs Hadolint to check for Dockerfile best practices
         language: system
+        # Exclude ppc64le Dockerfile since cannot check their build easily
+        exclude: ppc64
         types:
           - dockerfile
         entry: hadolint
@@ -4,7 +4,7 @@
 # Ubuntu 20.04 (focal)
 # https://hub.docker.com/_/ubuntu/?tab=tags&name=focal
 # OS/ARCH: linux/amd64
-ARG ROOT_CONTAINER=ubuntu:focal-20200703@sha256:d5a6519d9f048100123c568eb83f7ef5bfcad69b01424f420f17c932b00dea76
+ARG ROOT_CONTAINER=ubuntu:focal-20200916@sha256:028d7303257c7f36c721b40099bf5004a41f666a54c0896d5f229f1c0fd99993

 ARG BASE_CONTAINER=$ROOT_CONTAINER
 FROM $BASE_CONTAINER
@@ -19,6 +19,17 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]

 USER root

+# Miniconda installation
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+# Check https://repo.anaconda.com/miniconda/
+# Miniconda archive to install
+ARG miniconda_version="4.8.3"
+# Archive MD5 checksum
+ARG miniconda_checksum="d63adf39f2c220950a063e0529d4ff74"
+# Conda version that can be different from the archive
+ARG conda_version="4.8.5"
+
 # Install all OS dependencies for notebook server that starts but lacks all
 # features (e.g., download as all possible file formats)
 ENV DEBIAN_FRONTEND noninteractive
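For context, the build arguments introduced above are meant to be overridden with `--build-arg` when rebuilding the image. A minimal sketch, assuming the Dockerfile sits under `base-notebook/` and an arbitrary tag; the values shown simply repeat the defaults, and a different Miniconda version would need its matching MD5 checksum from https://repo.anaconda.com/miniconda/:

```bash
# Hypothetical example: rebuild the base image with explicit Miniconda/conda versions
docker build --rm --force-rm \
    -t jupyter/base-notebook:custom-conda ./base-notebook \
    --build-arg miniconda_version=4.8.3 \
    --build-arg miniconda_checksum=d63adf39f2c220950a063e0529d4ff74 \
    --build-arg conda_version=4.8.5
```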
@@ -53,9 +64,12 @@ COPY fix-permissions /usr/local/bin/fix-permissions
 RUN chmod a+rx /usr/local/bin/fix-permissions

 # Enable prompt color in the skeleton .bashrc before creating the default NB_USER
-RUN sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc
+# hadolint ignore=SC2016
+RUN sed -i 's/^#force_color_prompt=yes/force_color_prompt=yes/' /etc/skel/.bashrc && \
+    # Add call to conda init script see https://stackoverflow.com/a/58081608/4413446
+    echo 'eval "$(command conda shell.bash hook 2> /dev/null)"' >> /etc/skel/.bashrc

-# Create NB_USER wtih name jovyan user with UID=1000 and in the 'users' group
+# Create NB_USER with name jovyan user with UID=1000 and in the 'users' group
 # and make sure these dirs are writable by the `users` group.
 RUN echo "auth requisite pam_deny.so" >> /etc/pam.d/su && \
     sed -i.bak -e 's/^%admin/#%admin/' /etc/sudoers && \
@@ -76,15 +90,15 @@ RUN mkdir /home/$NB_USER/work && \
     fix-permissions /home/$NB_USER

 # Install conda as jovyan and check the md5 sum provided on the download site
-ENV MINICONDA_VERSION=4.8.3 \
-    MINICONDA_MD5=d63adf39f2c220950a063e0529d4ff74 \
-    CONDA_VERSION=4.8.3
+ENV MINICONDA_VERSION="${miniconda_version}" \
+    CONDA_VERSION="${conda_version}"

 WORKDIR /tmp
 RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh && \
-    echo "${MINICONDA_MD5} *Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh" | md5sum -c - && \
+    echo "${miniconda_checksum} *Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh" | md5sum -c - && \
     /bin/bash Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
     rm Miniconda3-py38_${MINICONDA_VERSION}-Linux-x86_64.sh && \
+    # Conda configuration see https://conda.io/projects/conda/en/latest/configuration.html
     echo "conda ${CONDA_VERSION}" >> $CONDA_DIR/conda-meta/pinned && \
     conda config --system --prepend channels conda-forge && \
     conda config --system --set auto_update_conda false && \
@@ -92,7 +106,7 @@ RUN wget --quiet https://repo.continuum.io/miniconda/Miniconda3-py38_${MINICONDA
     conda config --system --set channel_priority strict && \
     if [ ! $PYTHON_VERSION = 'default' ]; then conda install --yes python=$PYTHON_VERSION; fi && \
     conda list python | grep '^python ' | tr -s ' ' | cut -d '.' -f 1,2 | sed 's/$/.*/' >> $CONDA_DIR/conda-meta/pinned && \
-    conda install --quiet --yes conda && \
+    conda install --quiet --yes "conda=${CONDA_VERSION}" && \
     conda install --quiet --yes pip && \
     conda update --all --quiet --yes && \
     conda clean --all -f -y && \
@@ -114,9 +128,9 @@ RUN conda install --quiet --yes 'tini=0.18.0' && \
 # Do all this in a single RUN command to avoid duplicating all of the
 # files across image layers when the permissions change
 RUN conda install --quiet --yes \
-    'notebook=6.0.3' \
+    'notebook=6.1.4' \
     'jupyterhub=1.1.0' \
-    'jupyterlab=2.1.5' && \
+    'jupyterlab=2.2.8' && \
     conda clean --all -f -y && \
     npm cache clean --force && \
     jupyter notebook --generate-config && \
@@ -107,9 +107,9 @@ RUN conda install --quiet --yes 'tini=0.18.0' && \
 # Do all this in a single RUN command to avoid duplicating all of the
 # files across image layers when the permissions change
 RUN conda install --quiet --yes \
-    'notebook=6.0.3' \
+    'notebook=6.1.3' \
     'jupyterhub=1.1.0' \
-    'jupyterlab=2.1.1' && \
+    'jupyterlab=2.2.5' && \
     conda clean --all -f -y && \
     npm cache clean --force && \
     jupyter notebook --generate-config && \
@@ -80,7 +80,7 @@ if [ $(id -u) == 0 ] ; then
     if [ "$NB_UID" != $(id -u $NB_USER) ] || [ "$NB_GID" != $(id -g $NB_USER) ]; then
         echo "Set user $NB_USER UID:GID to: $NB_UID:$NB_GID"
         if [ "$NB_GID" != $(id -g $NB_USER) ]; then
-            groupadd -g $NB_GID -o ${NB_GROUP:-${NB_USER}}
+            groupadd -f -g $NB_GID -o ${NB_GROUP:-${NB_USER}}
         fi
         userdel $NB_USER
         useradd --home /home/$NB_USER -u $NB_UID -g $NB_GID -G 100 -l $NB_USER
@@ -11,11 +11,13 @@ LOGGER = logging.getLogger(__name__)
 def test_cli_args(container, http_client):
     """Container should respect notebook server command line args
     (e.g., disabling token security)"""
-    container.run(
-        command=['start-notebook.sh', '--NotebookApp.token=""']
+    c = container.run(
+        command=["start-notebook.sh", "--NotebookApp.token=''"]
     )
     resp = http_client.get('http://localhost:8888')
     resp.raise_for_status()
+    logs = c.logs(stdout=True).decode('utf-8')
+    LOGGER.debug(logs)
     assert 'login_submit' not in resp.text


@@ -14,6 +14,14 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]

 USER root

+# Julia installation
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+# Check https://julialang.org/downloads/
+ARG julia_version="1.5.1"
+# SHA256 checksum
+ARG julia_checksum="f5d37cb7fe40e3a730f721da8f7be40310f133220220949939d8f892ce2e86e3"
+
 # R pre-requisites
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -24,16 +32,16 @@ RUN apt-get update && \

 # Julia dependencies
 # install Julia packages in /opt/julia instead of $HOME
-ENV JULIA_DEPOT_PATH=/opt/julia
-ENV JULIA_PKGDIR=/opt/julia
-ENV JULIA_VERSION=1.4.1
+ENV JULIA_DEPOT_PATH=/opt/julia \
+    JULIA_PKGDIR=/opt/julia \
+    JULIA_VERSION="${julia_version}"

 WORKDIR /tmp

 # hadolint ignore=SC2046
 RUN mkdir "/opt/julia-${JULIA_VERSION}" && \
     wget -q https://julialang-s3.julialang.org/bin/linux/x64/$(echo "${JULIA_VERSION}" | cut -d. -f 1,2)"/julia-${JULIA_VERSION}-linux-x86_64.tar.gz" && \
-    echo "fd6d8cadaed678174c3caefb92207a3b0e8da9f926af6703fb4d1e4e4f50610a *julia-${JULIA_VERSION}-linux-x86_64.tar.gz" | sha256sum -c - && \
+    echo "${julia_checksum} *julia-${JULIA_VERSION}-linux-x86_64.tar.gz" | sha256sum -c - && \
     tar xzf "julia-${JULIA_VERSION}-linux-x86_64.tar.gz" -C "/opt/julia-${JULIA_VERSION}" --strip-components=1 && \
     rm "/tmp/julia-${JULIA_VERSION}-linux-x86_64.tar.gz"
 RUN ln -fs /opt/julia-*/bin/julia /usr/local/bin/julia
@@ -54,9 +62,9 @@ RUN conda install --quiet --yes \
     'r-caret=6.0*' \
     'r-crayon=1.3*' \
     'r-devtools=2.3*' \
-    'r-forecast=8.12*' \
+    'r-forecast=8.13*' \
     'r-hexbin=1.28*' \
-    'r-htmltools=0.4*' \
+    'r-htmltools=0.5*' \
     'r-htmlwidgets=1.5*' \
     'r-irkernel=1.1*' \
     'r-nycflights13=1.0*' \
@@ -64,11 +72,11 @@ RUN conda install --quiet --yes \
     'r-randomforest=4.6*' \
     'r-rcurl=1.98*' \
     'r-reshape2=1.4*' \
-    'r-rmarkdown=2.1*' \
+    'r-rmarkdown=2.3*' \
     'r-rsqlite=2.2*' \
-    'r-shiny=1.4*' \
+    'r-shiny=1.5*' \
     'r-tidyverse=1.3*' \
-    'rpy2=3.1*' \
+    'rpy2=3.3*' \
     && \
     conda clean --all -f -y && \
     fix-permissions "${CONDA_DIR}" && \
(File diff suppressed because it is too large.)
@@ -2,21 +2,81 @@

 This page provides details about features specific to one or more images.

-## Apache Spark
+## Apache Spark™

-**Specific Docker Image Options**
+### Specific Docker Image Options

 * `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at default port `4040`, this option map `4040` port inside docker container to `4040` port on host machine . Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`.

-**Usage Examples**
+### Build an Image with a Different Version of Spark

+You can build a `pyspark-notebook` image (and also the downstream `all-spark-notebook` image) with a different version of Spark by overriding the default value of the following arguments at build time.
+
+* Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum, see [Download Apache Spark](https://spark.apache.org/downloads.html) for more information. At this time the build will only work with the set of versions available on the Apache Spark download page, so it will not work with the archived versions.
+    * `spark_version`: The Spark version to install (`3.0.0`).
+    * `hadoop_version`: The Hadoop version (`3.2`).
+    * `spark_checksum`: The package checksum (`BFE4540...`).
+* Spark is shipped with a version of Py4J that has to be referenced in the `PYTHONPATH`.
+    * `py4j_version`: The Py4J version (`0.10.9`), see the tip below.
+* Spark can run with different OpenJDK versions.
+    * `openjdk_version`: The version of (JRE headless) the OpenJDK distribution (`11`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
+
+For example here is how to build a `pyspark-notebook` image with Spark `2.4.6`, Hadoop `2.7` and OpenJDK `8`.
+
+```bash
+# From the root of the project
+# Build the image with different arguments
+docker build --rm --force-rm \
+    -t jupyter/pyspark-notebook:spark-2.4.6 ./pyspark-notebook \
+    --build-arg spark_version=2.4.6 \
+    --build-arg hadoop_version=2.7 \
+    --build-arg spark_checksum=3A9F401EDA9B5749CDAFD246B1D14219229C26387017791C345A23A65782FB8B25A302BF4AC1ED7C16A1FE83108E94E55DAD9639A51C751D81C8C0534A4A9641 \
+    --build-arg openjdk_version=8 \
+    --build-arg py4j_version=0.10.7
+
+# Check the newly built image
+docker images jupyter/pyspark-notebook:spark-2.4.6
+
+# REPOSITORY                 TAG           IMAGE ID       CREATED         SIZE
+# jupyter/pyspark-notebook   spark-2.4.6   7ad7b5a9dbcd   4 minutes ago   3.44GB
+
+# Check the Spark version
+docker run -it --rm jupyter/pyspark-notebook:spark-2.4.6 pyspark --version
+
+# Welcome to
+#       ____              __
+#      / __/__  ___ _____/ /__
+#     _\ \/ _ \/ _ `/ __/ '_/
+#    /___/ .__/\_,_/_/ /_/\_\   version 2.4.6
+#       /_/
+#
+# Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_265
+```
+
+**Tip**: to get the version of Py4J shipped with Spark:
+
+* Build a first image without changing `py4j_version` (it will not prevent the image to build it will just prevent Python to find the `pyspark` module),
+* get the version (`ls /usr/local/spark/python/lib/`),
+* set the version `--build-arg py4j_version=0.10.7`.
+
+```bash
+docker run -it --rm jupyter/pyspark-notebook:spark-2.4.6 ls /usr/local/spark/python/lib/
+# py4j-0.10.7-src.zip PY4J_LICENSE.txt pyspark.zip
+# You can now set the build-arg
+# --build-arg py4j_version=
+```
+
+*Note: At the time of writing there is an issue preventing to use Spark `2.4.6` with Python `3.8`, see [this answer on SO](https://stackoverflow.com/a/62173969/4413446) for more information.*
+
+### Usage Examples
+
 The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images support the use of [Apache Spark](https://spark.apache.org/) in Python, R, and Scala notebooks. The following sections provide some examples of how to get started using them.

-### Using Spark Local Mode
+#### Using Spark Local Mode

 Spark **local mode** is useful for experimentation on small data when you do not have a Spark cluster available.

-#### In Python
+##### In Python

 In a Python notebook.

@@ -33,7 +93,7 @@ rdd.sum()
 # 5050
 ```

-#### In R
+##### In R

 In a R notebook with [SparkR][sparkr].

@@ -71,9 +131,7 @@ sdf_len(sc, 100, repartition = 1) %>%
 # 5050
 ```

-#### In Scala
+##### In Scala

-##### In a Spylon Kernel
-
 Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
 options in a `%%init_spark` magic cell.
@@ -91,18 +149,7 @@ rdd.sum()
 // 5050
 ```

-##### In an Apache Toree Kernel
-
-Apache Toree instantiates a local `SparkContext` for you in variable `sc` when the kernel starts.
-
-```scala
-// Sum of the first 100 whole numbers
-val rdd = sc.parallelize(0 to 100)
-rdd.sum()
-// 5050
-```
-
-### Connecting to a Spark Cluster in Standalone Mode
+#### Connecting to a Spark Cluster in Standalone Mode

 Connection to Spark Cluster on **[Standalone Mode](https://spark.apache.org/docs/latest/spark-standalone.html)** requires the following set of steps:

@@ -117,7 +164,7 @@ Connection to Spark Cluster on **[Standalone Mode](https://spark.apache.org/docs

 **Note**: In the following examples we are using the Spark master URL `spark://master:7077` that shall be replaced by the URL of the Spark master.

-#### In Python
+##### In Python

 The **same Python version** need to be used on the notebook (where the driver is located) and on the Spark workers.
 The python version used at driver and worker side can be adjusted by setting the environment variables `PYSPARK_PYTHON` and / or `PYSPARK_DRIVER_PYTHON`, see [Spark Configuration][spark-conf] for more information.
@@ -135,7 +182,7 @@ rdd.sum()
 # 5050
 ```

-#### In R
+##### In R

 In a R notebook with [SparkR][sparkr].

@@ -172,9 +219,7 @@ sdf_len(sc, 100, repartition = 1) %>%
 # 5050
 ```

-#### In Scala
+##### In Scala

-##### In a Spylon Kernel
-
 Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
 options in a `%%init_spark` magic cell.
@@ -192,29 +237,6 @@ rdd.sum()
 // 5050
 ```

-##### In an Apache Toree Scala Notebook
-
-The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration information from its command line arguments and environment variables. You can pass information about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.
-
-For instance, to pass information about a standalone Spark master, you could start the container like so:
-
-```bash
-docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://master:7077' \
-    jupyter/all-spark-notebook
-```
-
-Note that this is the same information expressed in a notebook in the Python case above. Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so:
-
-```scala
-// should print the value of --master in the kernel spec
-println(sc.master)
-
-// Sum of the first 100 whole numbers
-val rdd = sc.parallelize(0 to 100)
-rdd.sum()
-// 5050
-```
-
 ## Tensorflow

 The `jupyter/tensorflow-notebook` image supports the use of
@@ -4,12 +4,13 @@
 # Pick your favorite docker-stacks image
 FROM jupyter/minimal-notebook:55d5ca6be183

-USER jovyan
-
-# Add permanent pip/conda installs, data files, other user libs here
-# e.g., RUN pip install jupyter_dashboards
-
 USER root

 # Add permanent apt-get installs and other root commands here
 # e.g., RUN apt-get install npm nodejs
+
+USER $NB_UID
+
+# Switch back to jovyan to avoid accidental container runs as root
+# Add permanent pip/conda installs, data files, other user libs here
+# e.g., RUN pip install jupyter_dashboards
@@ -4,12 +4,13 @@
 # Pick your favorite docker-stacks image
 FROM jupyter/minimal-notebook:2d125a7161b5

-USER jovyan
-
-# Add permanent pip/conda installs, data files, other user libs here
-# e.g., RUN pip install jupyter_dashboards
-
 USER root

 # Add permanent apt-get installs and other root commands here
 # e.g., RUN apt-get install npm nodejs
+
+USER $NB_UID
+
+# Switch back to jovyan to avoid accidental container runs as root
+# Add permanent pip/conda installs, data files, other user libs here
+# e.g., RUN pip install jupyter_dashboards
@@ -11,20 +11,30 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 USER root

 # Spark dependencies
-ENV APACHE_SPARK_VERSION=3.0.0 \
-    HADOOP_VERSION=3.2
+# Default values can be overridden at build time
+# (ARGS are in lower case to distinguish them from ENV)
+ARG spark_version="3.0.1"
+ARG hadoop_version="3.2"
+ARG spark_checksum="E8B47C5B658E0FBC1E57EEA06262649D8418AE2B2765E44DA53AAF50094877D17297CC5F0B9B35DF2CEEF830F19AA31D7E56EAD950BBE7F8830D6874F88CFC3C"
+ARG py4j_version="0.10.9"
+ARG openjdk_version="11"
+
+ENV APACHE_SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}"

 RUN apt-get -y update && \
-    apt-get install --no-install-recommends -y openjdk-11-jre-headless ca-certificates-java && \
+    apt-get install --no-install-recommends -y \
+    "openjdk-${openjdk_version}-jre-headless" \
+    ca-certificates-java && \
     rm -rf /var/lib/apt/lists/*

-# Using the preferred mirror to download Spark
+# Spark installation
 WORKDIR /tmp
+# Using the preferred mirror to download Spark
 # hadolint ignore=SC2046
 RUN wget -q $(wget -qO- https://www.apache.org/dyn/closer.lua/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz\?as_json | \
     python -c "import sys, json; content=json.load(sys.stdin); print(content['preferred']+content['path_info'])") && \
-    echo "BFE45406C67CC4AE00411AD18CC438F51E7D4B6F14EB61E7BF6B5450897C2E8D3AB020152657C0239F253735C263512FFABF538AC5B9FFFA38B8295736A9C387 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
+    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
     tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
     rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"

@@ -33,16 +43,17 @@ RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark

 # Configure Spark
 ENV SPARK_HOME=/usr/local/spark
-ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip \
+ENV PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-${py4j_version}-src.zip" \
     SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
     PATH=$PATH:$SPARK_HOME/bin

 USER $NB_UID

 # Install pyarrow
-RUN conda install --quiet -y 'pyarrow' && \
+RUN conda install --quiet --yes --satisfied-skip-solve \
+    'pyarrow=1.0.*' && \
     conda clean --all -f -y && \
     fix-permissions "${CONDA_DIR}" && \
     fix-permissions "/home/${NB_USER}"

 WORKDIR $HOME
@@ -25,22 +25,22 @@ USER $NB_UID

 # R packages
 RUN conda install --quiet --yes \
-    'r-base=4.0.0' \
+    'r-base=4.0.2' \
     'r-caret=6.*' \
     'r-crayon=1.3*' \
     'r-devtools=2.3*' \
-    'r-forecast=8.12*' \
+    'r-forecast=8.13*' \
     'r-hexbin=1.28*' \
-    'r-htmltools=0.4*' \
+    'r-htmltools=0.5*' \
     'r-htmlwidgets=1.5*' \
     'r-irkernel=1.1*' \
     'r-nycflights13=1.0*' \
     'r-randomforest=4.6*' \
     'r-rcurl=1.98*' \
-    'r-rmarkdown=2.2*' \
+    'r-rmarkdown=2.3*' \
     'r-rodbc=1.3*' \
     'r-rsqlite=2.2*' \
-    'r-shiny=1.4*' \
+    'r-shiny=1.5*' \
     'r-tidyverse=1.3*' \
     'unixodbc=2.3.*' \
     'r-tidymodels=0.1*' \
@@ -7,9 +7,9 @@ LABEL maintainer="Jupyter Project <jupyter@googlegroups.com>"

 USER root

-# ffmpeg for matplotlib anim & dvipng for latex labels
+# ffmpeg for matplotlib anim & dvipng+cm-super for latex labels
 RUN apt-get update && \
-    apt-get install -y --no-install-recommends ffmpeg dvipng && \
+    apt-get install -y --no-install-recommends ffmpeg dvipng cm-super && \
     rm -rf /var/lib/apt/lists/*

 USER $NB_UID
@@ -18,19 +18,17 @@ USER $NB_UID
 RUN conda install --quiet --yes \
     'beautifulsoup4=4.9.*' \
     'conda-forge::blas=*=openblas' \
-    'bokeh=2.1.*' \
+    'bokeh=2.2.*' \
     'bottleneck=1.3.*' \
-    'cloudpickle=1.5.*' \
+    'cloudpickle=1.6.*' \
     'cython=0.29.*' \
-    'dask=2.20.*' \
+    'dask=2.25.*' \
     'dill=0.3.*' \
     'h5py=2.10.*' \
-    'hdf5=1.10.*' \
     'ipywidgets=7.5.*' \
     'ipympl=0.5.*'\
-    'matplotlib-base=3.2.*' \
-    # numba update to 0.49 fails resolving deps.
-    'numba=0.48.*' \
+    'matplotlib-base=3.3.*' \
+    'numba=0.51.*' \
     'numexpr=2.7.*' \
     'pandas=1.1.*' \
     'patsy=0.5.*' \
@@ -39,9 +37,9 @@ RUN conda install --quiet --yes \
     'scikit-image=0.17.*' \
     'scikit-learn=0.23.*' \
     'scipy=1.5.*' \
-    'seaborn=0.10.*' \
+    'seaborn=0.11.*' \
     'sqlalchemy=1.3.*' \
-    'statsmodels=0.11.*' \
+    'statsmodels=0.12.*' \
     'sympy=1.6.*' \
     'vincent=0.4.*' \
     'widgetsnbextension=3.5.*'\
scipy-notebook/test/data/matplotlib_fonts_1.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+# Matplotlib: Test tex fonts
+import matplotlib
+import matplotlib.pyplot as plt
+import os
+
+matplotlib.rcParams['pgf.texsystem'] = 'pdflatex'
+matplotlib.rcParams.update({'font.family': 'serif', 'font.size': 18,
+                            'axes.labelsize': 20, 'axes.titlesize': 24,
+                            'figure.titlesize': 28})
+matplotlib.rcParams['text.usetex'] = True
+
+fig, ax = plt.subplots(1, 1)
+x = [1, 2]
+y = [1, 2]
+ax.plot(x, y, label='a label')
+ax.legend(fontsize=15)
+
+file_path = os.path.join("/tmp", "test_fonts.png")
+fig.savefig(file_path)
+print(f"File {file_path} saved")
@@ -8,13 +8,24 @@ import os
 LOGGER = logging.getLogger(__name__)


-def test_matplotlib(container):
-    """Test that matplotlib is able to plot a graph and write it as an image"""
-    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
+@pytest.mark.parametrize("test_file,expected_file,description",
+                         [
+                             ("matplotlib_1.py", "test.png",
+                              "Test that matplotlib is able to plot a graph and write it as an image ..."),
+                             ("matplotlib_fonts_1.py", "test_fonts.png",
+                              "Test cm-super latex labels in matplotlib ...")
+                         ])
+def test_matplotlib(container, test_file, expected_file, description):
+    """Various tests performed on matplotlib
+
+    - Test that matplotlib is able to plot a graph and write it as an image
+    - Test matplotlib latex fonts, which depend on the cm-super package
+    """
+    host_data_dir = os.path.join(os.path.dirname(
+        os.path.realpath(__file__)), "data")
     cont_data_dir = "/home/jovyan/data"
-    test_file = "matplotlib_1.py"
     output_dir = "/tmp"
-    LOGGER.info("Test that matplotlib is able to plot a graph and write it as an image ...")
+    LOGGER.info(description)
     command = "sleep infinity"
     running_container = container.run(
         volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
@@ -27,8 +38,7 @@ def test_matplotlib(container):
     LOGGER.debug(cmd.output.decode("utf-8"))
     # Checking if the file is generated
     # https://stackoverflow.com/a/15895594/4413446
-    expected_file = f"{output_dir}/test.png"
-    command = f"test -s {expected_file}"
+    command = f"test -s {output_dir}/{expected_file}"
     cmd = running_container.exec_run(command)
     assert cmd.exit_code == 0, f"Command {command} failed"
     LOGGER.debug(cmd.output.decode("utf-8"))
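For context, each `(test_file, expected_file, description)` tuple above becomes its own test case. A minimal sketch of running just these cases, assuming the module lives at `scipy-notebook/test/test_matplotlib.py` and the project's test dependencies (pytest, docker) are installed:

```bash
# Hypothetical invocation: -k limits collection to the matplotlib tests,
# -v prints one line per parametrized case
pytest -v -k "matplotlib" scipy-notebook/test/test_matplotlib.py
```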
@@ -7,6 +7,6 @@ LABEL maintainer="Jupyter Project <jupyter@googlegroups.com>"

 # Install Tensorflow
 RUN pip install --quiet --no-cache-dir \
-    'tensorflow==2.2.0' && \
+    'tensorflow==2.3.0' && \
     fix-permissions "${CONDA_DIR}" && \
     fix-permissions "/home/${NB_USER}"