From 0fff54f3f2161a3079b06c3d1c7f8b3eb7f3606f Mon Sep 17 00:00:00 2001 From: Ayaz Salikhov Date: Sun, 2 May 2021 23:28:32 +0300 Subject: [PATCH 1/3] Install spark from archive.apache.org to be able to use old versions --- docs/using/specifics.md | 2 +- pyspark-notebook/Dockerfile | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/using/specifics.md b/docs/using/specifics.md index af6e4219..91a7dbc1 100644 --- a/docs/using/specifics.md +++ b/docs/using/specifics.md @@ -12,7 +12,7 @@ This page provides details about features specific to one or more images. You can build a `pyspark-notebook` image (and also the downstream `all-spark-notebook` image) with a different version of Spark by overriding the default value of the following arguments at build time. -* Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum, see [Download Apache Spark](https://spark.apache.org/downloads.html) for more information. At this time the build will only work with the set of versions available on the Apache Spark download page, so it will not work with the archived versions. +* Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum, see [Download Apache Spark](https://spark.apache.org/downloads.html) for more information. * `spark_version`: The Spark version to install (`3.0.0`). * `hadoop_version`: The Hadoop version (`3.2`). * `spark_checksum`: The package checksum (`BFE4540...`). 
diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile index bf2f601b..435feb5e 100644 --- a/pyspark-notebook/Dockerfile +++ b/pyspark-notebook/Dockerfile @@ -29,10 +29,8 @@ RUN apt-get -y update && \ # Spark installation WORKDIR /tmp -# Using the preferred mirror to download Spark # hadolint ignore=SC2046 -RUN wget -q $(wget -qO- https://www.apache.org/dyn/closer.lua/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz\?as_json | \ - python -c "import sys, json; content=json.load(sys.stdin); print(content['preferred']+content['path_info'])") && \ +RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \ tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \ rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" From b5d6f04e4300f5997530a3881b5dda7713dab379 Mon Sep 17 00:00:00 2001 From: Ayaz Salikhov Date: Sun, 2 May 2021 23:28:53 +0300 Subject: [PATCH 2/3] Delete unused hadolint ignore --- pyspark-notebook/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile index 435feb5e..9f09e161 100644 --- a/pyspark-notebook/Dockerfile +++ b/pyspark-notebook/Dockerfile @@ -29,7 +29,6 @@ RUN apt-get -y update && \ # Spark installation WORKDIR /tmp -# hadolint ignore=SC2046 RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \ tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \ From 
ef470a62da2e9a4b24d6372d0509c7b122fb5a51 Mon Sep 17 00:00:00 2001 From: Romain Date: Tue, 4 May 2021 07:42:26 +0200 Subject: [PATCH 3/3] Update docs/using/specifics.md Add explicit link to archive repo --- docs/using/specifics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/using/specifics.md b/docs/using/specifics.md index 91a7dbc1..7536f692 100644 --- a/docs/using/specifics.md +++ b/docs/using/specifics.md @@ -12,7 +12,7 @@ This page provides details about features specific to one or more images. You can build a `pyspark-notebook` image (and also the downstream `all-spark-notebook` image) with a different version of Spark by overriding the default value of the following arguments at build time. -* Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum, see [Download Apache Spark](https://spark.apache.org/downloads.html) for more information. +* Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum; see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repository](https://archive.apache.org/dist/spark/) for more information. * `spark_version`: The Spark version to install (`3.0.0`). * `hadoop_version`: The Hadoop version (`3.2`). * `spark_checksum`: The package checksum (`BFE4540...`).