Mirror of https://github.com/jupyter/docker-stacks.git
Automatically install latest spark version (#2075)
* Automatically install latest pyspark version
* Better text
* Do not use shutil to keep behaviour
* Make setup_script cwd independent
* Use _get_program_version to calculate spark version
* Update setup_spark.py reqs
* Update setup_spark.py
* Add info about HADOOP_VERSION
* Add customization back
* Better text
* Specify build args when they are actually needed
* Better text
* Better code
* Better code
* Better text
* Get rid of warning
* Improve code
* Remove information about checksum
* Better text
@@ -41,10 +41,12 @@ repos:
         args: [--config, ./mypy.ini]
         additional_dependencies:
           [
+            "beautifulsoup4",
             "numpy",
             "pytest",
             "requests",
             "urllib3",
+            "types-beautifulsoup4",
             "types-requests",
             "types-tabulate",
             "types-urllib3",
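Note: the two entries added above, `beautifulsoup4` and `types-beautifulsoup4`, line up with the new `setup_spark.py` script introduced later in this commit, which imports `requests` and `bs4`; without them mypy cannot resolve those imports. A minimal sketch of the pattern being type-checked (it mirrors the script's `get_all_refs` helper):

```python
# Sketch of the code path the new mypy dependencies cover (mirrors get_all_refs below).
import requests
from bs4 import BeautifulSoup


def get_all_refs(url: str) -> list[str]:
    resp = requests.get(url)  # checked via types-requests
    soup = BeautifulSoup(resp.text, "html.parser")  # checked via beautifulsoup4 + types-beautifulsoup4
    return [a["href"] for a in soup.find_all("a", href=True)]
```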
@@ -42,18 +42,20 @@ ipython profile create
 You can build a `pyspark-notebook` image with a different `Spark` version by overriding the default value of the following arguments at build time.
 `all-spark-notebook` is inherited from `pyspark-notebook`, so you have to first build `pyspark-notebook` and then `all-spark-notebook` to get the same version in `all-spark-notebook`.

-- Spark distribution is defined by the combination of Spark, Hadoop, and Scala versions and verified by the package checksum,
+- Spark distribution is defined by the combination of Spark, Hadoop, and Scala versions,
   see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repo](https://archive.apache.org/dist/spark/) for more information.

-- `spark_version`: The Spark version to install (`3.3.0`).
-- `hadoop_version`: The Hadoop version (`3.2`).
-- `scala_version`: The Scala version (`2.13`, optional).
-- `spark_checksum`: The package checksum (`BFE4540...`).
-- `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17`).
+- `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17` by default).
   - This version needs to match the version supported by the Spark distribution used above.
   - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
-- Starting with _Spark >= 3.2_, the distribution file might contain the Scala version.
+- `spark_version` (optional): The Spark version to install, for example `3.5.0`.
+  If not specified (this is the default), latest stable Spark will be installed.
+- `hadoop_version`: The Hadoop version (`3` by default).
+  Note, that _Spark < 3.3_ require to specify `major.minor` Hadoop version (i.e. `3.2`).
+- `scala_version` (optional): The Scala version, for example `2.13` (not specified by default).
+  Starting with _Spark >= 3.2_, the distribution file might contain the Scala version.
+- `spark_download_url`: URL to use for Spark downloads.
+  You may need to use <https://archive.apache.org/dist/spark/> url if you want to download old Spark versions.

 For example, here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2`, and OpenJDK `11`.

@@ -65,14 +67,14 @@ This recipe is not tested and might be broken.
 # From the root of the project
 # Build the image with different arguments
 docker build --rm --force-rm \
-    -t jupyter/pyspark-notebook:spark-3.2.0 ./images/pyspark-notebook \
+    -t my-pyspark-notebook ./images/pyspark-notebook \
+    --build-arg openjdk_version=11 \
     --build-arg spark_version=3.2.0 \
     --build-arg hadoop_version=3.2 \
-    --build-arg spark_checksum=707DDE035926A50B75E53FCA72CADA519F3239B14A96546911CB4916A58DCF69A1D2BFDD2C7DD5899324DBD82B6EEAB9797A7B4ABF86736FFCA4C26D0E0BF0EE \
-    --build-arg openjdk_version=11
+    --build-arg spark_download_url="https://archive.apache.org/dist/spark/"

 # Check the newly built image
-docker run -it --rm quay.io/jupyter/pyspark-notebook:spark-3.2.0 pyspark --version
+docker run -it --rm my-pyspark-notebook pyspark --version

 # Welcome to
 #      ____              __
@@ -81,7 +83,12 @@ docker run -it --rm quay.io/jupyter/pyspark-notebook:spark-3.2.0 pyspark --versi
 #    /___/ .__/\_,_/_/ /_/\_\   version 3.2.0
 #      /_/

-# Using Scala version 2.13.5, OpenJDK 64-Bit Server VM, 11.0.15
+# Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.21
+# Branch HEAD
+# Compiled by user ubuntu on 2021-10-06T12:46:30Z
+# Revision 5d45a415f3a29898d92380380cfd82bfc7f579ea
+# Url https://github.com/apache/spark
+# Type --help for more information.
 ```

 ### Usage Examples
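Note: the arguments documented above map directly onto the Apache download URL that the build fetches. Here is a small, hedged sketch of that naming convention (the helper name is ours, not part of the image); it matches the `setup_spark.py` script added further down:

```python
# Sketch only: reproduces the distribution-name convention used by the image build.
# spark_download_url defaults to https://dlcdn.apache.org/spark/ and can be pointed
# at https://archive.apache.org/dist/spark/ for old releases.
def spark_archive_url(
    spark_version: str,
    hadoop_version: str = "3",
    scala_version: str = "",
    spark_download_url: str = "https://dlcdn.apache.org/spark/",
) -> str:
    dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        dir_name += f"-scala{scala_version}"
    return f"{spark_download_url.rstrip('/')}/spark-{spark_version}/{dir_name}.tgz"


print(spark_archive_url("3.2.0", hadoop_version="3.2",
                        spark_download_url="https://archive.apache.org/dist/spark/"))
# -> https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
```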
@@ -16,49 +16,38 @@ USER root
 # Spark dependencies
 # Default values can be overridden at build time
 # (ARGS are in lowercase to distinguish them from ENV)
-ARG spark_version="3.5.0"
-ARG hadoop_version="3"
-ARG scala_version
-ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
 ARG openjdk_version="17"

-ENV APACHE_SPARK_VERSION="${spark_version}" \
-    HADOOP_VERSION="${hadoop_version}"
-
 RUN apt-get update --yes && \
     apt-get install --yes --no-install-recommends \
     "openjdk-${openjdk_version}-jre-headless" \
     ca-certificates-java && \
     apt-get clean && rm -rf /var/lib/apt/lists/*

-# Spark installation
-WORKDIR /tmp
-# You need to use https://archive.apache.org/dist/ website if you want to download old Spark versions
+# If spark_version is not set, latest stable Spark will be installed
+ARG spark_version
+ARG hadoop_version="3"
+# If scala_version is not set, Spark without Scala will be installed
+ARG scala_version
+# URL to use for Spark downloads
+# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions
 # But it seems to be slower, that's why we use the recommended site for download
-RUN if [ -z "${scala_version}" ]; then \
-    curl --progress-bar --location --output "spark.tgz" \
-        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
-  else \
-    curl --progress-bar --location --output "spark.tgz" \
-        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
-  fi && \
-  echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
-  tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
-  rm "spark.tgz"
+ARG spark_download_url="https://dlcdn.apache.org/spark/"

 # Configure Spark
-ENV SPARK_HOME=/usr/local/spark
-ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
-    PATH="${PATH}:${SPARK_HOME}/bin"
+ENV SPARK_VERSION="${spark_version}" \
+    HADOOP_VERSION="${hadoop_version}" \
+    SCALA_VERSION="${scala_version}" \
+    SPARK_DOWNLOAD_URL="${spark_download_url}"

-RUN if [ -z "${scala_version}" ]; then \
-    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
-  else \
-    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
-  fi && \
-  # Add a link in the before_notebook hook in order to source automatically PYTHONPATH && \
-  ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/10spark-config.sh
+ENV SPARK_HOME=/usr/local/spark
+ENV PATH="${PATH}:${SPARK_HOME}/bin"
+ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
+
+COPY setup_spark.py /opt/setup-scripts/
+
+# Setup Spark
+RUN /opt/setup-scripts/setup_spark.py

 # Configure IPython system-wide
 COPY ipython_kernel_config.py "/etc/ipython/"
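One subtlety in the hunk above: `ENV SPARK_VERSION="${spark_version}"` always defines the variable, so when the `spark_version` build arg is left unset the script sees an empty string rather than a missing key. A tiny sketch of how the new `setup_spark.py` (next file) interprets that handoff:

```python
import os

# Sketch: the ARG -> ENV handoff from the Dockerfile above.
# An unset spark_version build arg still yields SPARK_VERSION="" inside the image,
# which is why the script below checks for a non-empty string rather than key presence.
if os.environ.get("SPARK_VERSION", ""):
    print("Build pinned a Spark version; install exactly that release.")
else:
    print("No version pinned; resolve the latest stable release from the Spark archive.")
```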
images/pyspark-notebook/setup_spark.py (new executable file, 107 lines)
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+# Requirements:
+# - Run as the root user
+# - Required env variables: SPARK_HOME, HADOOP_VERSION, SPARK_DOWNLOAD_URL
+# - Optional env variables: SPARK_VERSION, SCALA_VERSION
+
+import os
+import subprocess
+from pathlib import Path
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_all_refs(url: str) -> list[str]:
+    """
+    Get all the references for a given webpage
+    """
+    resp = requests.get(url)
+    soup = BeautifulSoup(resp.text, "html.parser")
+    return [a["href"] for a in soup.find_all("a", href=True)]
+
+
+def get_spark_version() -> str:
+    """
+    If ${SPARK_VERSION} env variable is non-empty, simply returns it
+    Otherwise, returns the last stable version of Spark using spark archive
+    """
+    if (version := os.environ["SPARK_VERSION"]) != "":
+        return version
+    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
+    stable_versions = [
+        ref.removeprefix("spark-").removesuffix("/")
+        for ref in all_refs
+        if ref.startswith("spark-") and "incubating" not in ref and "preview" not in ref
+    ]
+    # Compare versions semantically
+    return max(
+        stable_versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]
+    )
+
+
+def download_spark(
+    spark_version: str,
+    hadoop_version: str,
+    scala_version: str,
+    spark_download_url: Path,
+) -> str:
+    """
+    Downloads and unpacks spark
+    The resulting spark directory name is returned
+    """
+    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
+    if scala_version:
+        spark_dir_name += f"-scala{scala_version}"
+    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"
+
+    tmp_file = Path("/tmp/spark.tar.gz")
+    subprocess.check_call(
+        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
+    )
+    subprocess.check_call(
+        [
+            "tar",
+            "xzf",
+            tmp_file,
+            "-C",
+            "/usr/local",
+            "--owner",
+            "root",
+            "--group",
+            "root",
+            "--no-same-owner",
+        ]
+    )
+    tmp_file.unlink()
+    return spark_dir_name
+
+
+def prepare_spark(spark_dir_name: str, spark_home: Path) -> None:
+    """
+    Creates a ${SPARK_HOME} symlink to a versioned spark directory
+    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
+    """
+    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])
+
+    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
+    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
+    subprocess.check_call(
+        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
+    )
+
+
+if __name__ == "__main__":
+    spark_version = get_spark_version()
+    spark_dir_name = download_spark(
+        spark_version=spark_version,
+        hadoop_version=os.environ["HADOOP_VERSION"],
+        scala_version=os.environ["SCALA_VERSION"],
+        spark_download_url=Path(os.environ["SPARK_DOWNLOAD_URL"]),
+    )
+    prepare_spark(
+        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
+    )
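A side note on `get_spark_version()` above: the `key=` function is what makes `max()` compare release numbers numerically rather than lexicographically, which matters once a two-digit component shows up. A standalone illustration (the version list is made up):

```python
# Illustration of the semantic comparison used in get_spark_version().
versions = ["3.3.4", "3.4.2", "3.5.0", "3.10.1"]  # hypothetical archive listing

print(max(versions))
# -> "3.5.0": plain string comparison treats "3.10.1" as smaller than "3.5.0"

print(max(versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]))
# -> "3.10.1": splitting into integer components compares the versions semantically
```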
@@ -128,7 +128,12 @@ class JuliaVersionTagger(TaggerInterface):
 class SparkVersionTagger(TaggerInterface):
     @staticmethod
     def tag_value(container: Container) -> str:
-        return "spark-" + _get_env_variable(container, "APACHE_SPARK_VERSION")
+        SPARK_VERSION_LINE_PREFIX = r"   /___/ .__/\_,_/_/ /_/\_\   version"
+
+        spark_version = _get_program_version(container, "spark-submit")
+        version_line = spark_version.split("\n")[4]
+        assert version_line.startswith(SPARK_VERSION_LINE_PREFIX)
+        return "spark-" + version_line.split(" ")[-1]


 class HadoopVersionTagger(TaggerInterface):
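Since the Dockerfile no longer exports `APACHE_SPARK_VERSION`, the tagger above derives the tag from `spark-submit --version` output instead: line index 4 of the banner is the one ending in the version number, and the `assert` guards against the banner format changing. A hedged sketch with a hard-coded sample banner (in the real tagger the text comes from `_get_program_version(container, "spark-submit")`):

```python
# Sketch: pulling the version out of a spark-submit --version banner.
# SAMPLE_OUTPUT is a hand-written stand-in for the real program output.
SAMPLE_OUTPUT = """Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\\ \\/ _ \\/ _ `/ __/  '_/
   /___/ .__/\\_,_/_/ /_/\\_\\   version 3.2.0
      /_/
"""

version_line = SAMPLE_OUTPUT.split("\n")[4]  # the "/___/ ... version X.Y.Z" line
assert version_line.lstrip().startswith("/___/")
print("spark-" + version_line.split(" ")[-1])  # -> spark-3.2.0
```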