Merge pull request #1727 from Bidek56/master
@@ -41,37 +41,43 @@ ipython profile create
You can build a `pyspark-notebook` image (and also the downstream `all-spark-notebook` image) with a different version of Spark by overriding the default value of the following arguments at build time.

-- Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum,
+- Spark distribution is defined by the combination of Spark, Hadoop and Scala versions and verified by the package checksum,
  see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repo](https://archive.apache.org/dist/spark/) for more information.
-  - `spark_version`: The Spark version to install (`3.0.0`).
-  - `hadoop_version`: The Hadoop version (`3.2`).
-  - `spark_checksum`: The package checksum (`BFE4540...`).
-- Spark can run with different OpenJDK versions.
-  - `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`11`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
-
-For example, here is how to build a `pyspark-notebook` image with Spark `2.4.7`, Hadoop `2.7` and OpenJDK `8`.
+  - `spark_version`: The Spark version to install (`3.3.0`).
+  - `hadoop_version`: The Hadoop version (`3.2`).
+  - `scala_version`: The Scala version (`2.13`).
+  - `spark_checksum`: The package checksum (`BFE4540...`).
+  - `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17`).
+    - This version needs to match the version supported by the Spark distribution used above.
+    - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
+
+- Starting with _Spark >= 3.2_, the distribution file name contains the Scala version, so building older Spark versions will not work.
+- Building an older version requires modifying the Dockerfile or using an older version of the Dockerfile.
+
+For example, here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2` and OpenJDK `11`.

```bash
# From the root of the project
# Build the image with different arguments
docker build --rm --force-rm \
-    -t jupyter/pyspark-notebook:spark-2.4.7 ./pyspark-notebook \
-    --build-arg spark_version=2.4.7 \
-    --build-arg hadoop_version=2.7 \
-    --build-arg spark_checksum=0F5455672045F6110B030CE343C049855B7BA86C0ECB5E39A075FF9D093C7F648DA55DED12E72FFE65D84C32DCD5418A6D764F2D6295A3F894A4286CC80EF478 \
-    --build-arg openjdk_version=8
+    -t jupyter/pyspark-notebook:spark-3.2.0 ./pyspark-notebook \
+    --build-arg spark_version=3.2.0 \
+    --build-arg hadoop_version=3.2 \
+    --build-arg spark_checksum=707DDE035926A50B75E53FCA72CADA519F3239B14A96546911CB4916A58DCF69A1D2BFDD2C7DD5899324DBD82B6EEAB9797A7B4ABF86736FFCA4C26D0E0BF0EE \
+    --build-arg openjdk_version=11

# Check the newly built image
-docker run -it --rm jupyter/pyspark-notebook:spark-2.4.7 pyspark --version
+docker run -it --rm jupyter/pyspark-notebook:spark-3.2.0 pyspark --version

# Welcome to
#       ____              __
#      / __/__  ___ _____/ /__
#     _\ \/ _ \/ _ `/ __/  '_/
-#    /___/ .__/\_,_/_/ /_/\_\   version 2.4.7
+#    /___/ .__/\_,_/_/ /_/\_\   version 3.2.0
#       /_/
#
-# Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_275
+# Using Scala version 2.13.5, OpenJDK 64-Bit Server VM, 11.0.15
```
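An aside not in the diff itself: whichever versions you pick, `spark_checksum` has to match the exact tarball being downloaded. A minimal sketch of looking the value up, assuming the Apache archive (the same URL the Dockerfile below uses) keeps publishing a `.sha512` file next to each release tarball:

```bash
# Sketch only: look up or recompute the SHA-512 for a chosen Spark distribution.
SPARK=3.3.0
HADOOP=3
SCALA=2.13
DIST="spark-${SPARK}-bin-hadoop${HADOOP}-scala${SCALA}.tgz"

# Read the checksum published alongside the tarball (assumed .sha512 naming)
curl -fsSL "https://archive.apache.org/dist/spark/spark-${SPARK}/${DIST}.sha512"

# Or download the tarball and compute the checksum locally
curl -fsSLO "https://archive.apache.org/dist/spark/spark-${SPARK}/${DIST}"
sha512sum "${DIST}"  # pass the hex digest via --build-arg spark_checksum=...
```

The `sha512sum -c` step in the Dockerfile (second hunk below) fails the build if the supplied value does not match the downloaded file.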
### Usage Examples
@@ -15,10 +15,11 @@ USER root
# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
-ARG spark_version="3.2.1"
-ARG hadoop_version="3.2"
-ARG spark_checksum="145ADACF189FECF05FBA3A69841D2804DD66546B11D14FC181AC49D89F3CB5E4FECD9B25F56F0AF767155419CD430838FB651992AEB37D3A6F91E7E009D1F9AE"
-ARG openjdk_version="11"
+ARG spark_version="3.3.0"
+ARG hadoop_version="3"
+ARG scala_version="2.13"
+ARG spark_checksum="4c09dac70e22bf1d5b7b2cabc1dd92aba13237f52a5b682c67982266fc7a0f5e0f964edff9bc76adbd8cb444eb1a00fdc59516147f99e4e2ce068420ff4881f0"
+ARG openjdk_version="17"

ENV APACHE_SPARK_VERSION="${spark_version}" \
    HADOOP_VERSION="${hadoop_version}"
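For reference (not part of the commit), the new argument set maps to `--build-arg` flags the same way as before; a sketch that simply spells out the new defaults explicitly, including the added `scala_version`, and reuses the checksum from the ARG above:

```bash
# Sketch: rebuilding with the new defaults passed explicitly, including scala_version.
docker build --rm --force-rm \
    -t jupyter/pyspark-notebook:spark-3.3.0 ./pyspark-notebook \
    --build-arg spark_version=3.3.0 \
    --build-arg hadoop_version=3 \
    --build-arg scala_version=2.13 \
    --build-arg spark_checksum=4c09dac70e22bf1d5b7b2cabc1dd92aba13237f52a5b682c67982266fc7a0f5e0f964edff9bc76adbd8cb444eb1a00fdc59516147f99e4e2ce068420ff4881f0 \
    --build-arg openjdk_version=17
```

Any other version combination needs the matching checksum, as shown earlier.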
@@ -31,10 +32,10 @@ RUN apt-get update --yes && \
# Spark installation
WORKDIR /tmp
-RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
-    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
-    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
-    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
+RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz" && \
+    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz" | sha512sum -c - && \
+    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"

WORKDIR /usr/local
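To make the renamed artifact concrete (an illustration, not from the commit), with the default ARG values the updated download step resolves to a tarball whose name now carries the Scala version:

```bash
# Sketch: the URL the new RUN step downloads with the default ARG values above.
APACHE_SPARK_VERSION=3.3.0
HADOOP_VERSION=3
scala_version=2.13
echo "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"
# -> .../spark-3.3.0/spark-3.3.0-bin-hadoop3-scala2.13.tgz
```

Distributions older than 3.2 are published without the `-scala<version>` suffix, which is why the docs above note that older Spark versions cannot be built from this Dockerfile unchanged.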
@@ -43,17 +44,11 @@ ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
    PATH="${PATH}:${SPARK_HOME}/bin"

-RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark && \
+RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" spark && \
    # Add a link in the before_notebook hook in order to source automatically PYTHONPATH
    mkdir -p /usr/local/bin/before-notebook.d && \
    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

-# Fix Spark installation for Java 11 and Apache Arrow library
-# see: https://github.com/apache/spark/pull/27356, https://spark.apache.org/docs/latest/#downloading
-RUN cp -p "${SPARK_HOME}/conf/spark-defaults.conf.template" "${SPARK_HOME}/conf/spark-defaults.conf" && \
-    echo 'spark.driver.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' >> "${SPARK_HOME}/conf/spark-defaults.conf" && \
-    echo 'spark.executor.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' >> "${SPARK_HOME}/conf/spark-defaults.conf"
-
# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"
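A quick way to sanity-check the resulting layout (an illustration, not from the commit; the image tag is an example): the `spark` symlink should resolve to the Scala-suffixed directory, and wrapping ad-hoc commands in `start.sh` runs the `before-notebook.d` hook that puts PySpark on `PYTHONPATH`, just as the tests below do.

```bash
# Sketch: verify the spark symlink and the before-notebook hook in a built image.
docker run -it --rm jupyter/pyspark-notebook:spark-3.2.0 \
    bash -c 'readlink -f /usr/local/spark && ls /usr/local/bin/before-notebook.d/'
docker run -it --rm jupyter/pyspark-notebook:spark-3.2.0 \
    start.sh python -c "import pyspark; print(pyspark.__version__)"
```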
@@ -32,14 +32,10 @@ def test_nbconvert(container: TrackedContainer, test_file: str) -> None:
    )
    logs = container.run_and_wait(
        timeout=60,
-        no_warnings=False,
        volumes={str(host_data_dir): {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
-    warnings = TrackedContainer.get_warnings(logs)
-    # Some Spark warnings
-    assert len(warnings) == 5

    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"
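For context (not part of the change), the conversion this test automates can be reproduced by hand against the image; the mount path and notebook name below are placeholders:

```bash
# Sketch: run nbconvert inside the container against a read-only mounted notebook.
docker run -it --rm \
    -v "${PWD}/notebooks:/home/jovyan/data:ro" \
    jupyter/pyspark-notebook \
    start.sh jupyter nbconvert --to markdown --output-dir /tmp /home/jovyan/data/example.ipynb
```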
@@ -11,12 +11,8 @@ def test_spark_shell(container: TrackedContainer) -> None:
    """Checking if Spark (spark-shell) is running properly"""
    logs = container.run_and_wait(
        timeout=60,
-        no_warnings=False,
        tty=True,
        command=["start.sh", "bash", "-c", 'spark-shell <<< "1+1"'],
    )
-    warnings = TrackedContainer.get_warnings(logs)
-    # Some Spark warnings
-    assert len(warnings) == 5

    assert "res0: Int = 2" in logs, "spark-shell does not work"
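The same check the test performs can be run by hand (the untagged image name is an assumption; any locally built tag works):

```bash
# Sketch: manual spark-shell smoke test; the output should contain "res0: Int = 2".
docker run -it --rm jupyter/pyspark-notebook start.sh bash -c 'spark-shell <<< "1+1"'
```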