From aeb940220fe063e36467fd7dbda835e9dbdd9054 Mon Sep 17 00:00:00 2001
From: Darek
Date: Thu, 16 Jun 2022 14:53:37 -0400
Subject: [PATCH 01/10] Spark->3.3,Hadoop->3,Scala->2.13,Java->17

---
 pyspark-notebook/Dockerfile          | 19 ++++++++++---------
 .../test_spark_notebooks.py          |  4 ++--
 tests/pyspark-notebook/test_spark.py |  5 +++--
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile
index 9a3f5417..85612f94 100644
--- a/pyspark-notebook/Dockerfile
+++ b/pyspark-notebook/Dockerfile
@@ -15,10 +15,11 @@ USER root
 # Spark dependencies
 # Default values can be overridden at build time
 # (ARGS are in lower case to distinguish them from ENV)
-ARG spark_version="3.2.1"
-ARG hadoop_version="3.2"
-ARG spark_checksum="145ADACF189FECF05FBA3A69841D2804DD66546B11D14FC181AC49D89F3CB5E4FECD9B25F56F0AF767155419CD430838FB651992AEB37D3A6F91E7E009D1F9AE"
-ARG openjdk_version="11"
+ARG spark_version="3.3.0"
+ARG hadoop_version="3"
+ARG scala_version="2.13"
+ARG spark_checksum="4c09dac70e22bf1d5b7b2cabc1dd92aba13237f52a5b682c67982266fc7a0f5e0f964edff9bc76adbd8cb444eb1a00fdc59516147f99e4e2ce068420ff4881f0"
+ARG openjdk_version="17"
 
 ENV APACHE_SPARK_VERSION="${spark_version}" \
     HADOOP_VERSION="${hadoop_version}"
@@ -31,10 +32,10 @@ RUN apt-get update --yes && \
 
 # Spark installation
 WORKDIR /tmp
-RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
-    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
-    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
-    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
+RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz" && \
+    echo "${spark_checksum} *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz" | sha512sum -c - && \
+    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz" -C /usr/local --owner root --group root --no-same-owner && \
+    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"
 
 WORKDIR /usr/local
 
@@ -43,7 +44,7 @@ ENV SPARK_HOME=/usr/local/spark
 ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
     PATH="${PATH}:${SPARK_HOME}/bin"
 
-RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark && \
+RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" spark && \
     # Add a link in the before_notebook hook in order to source automatically PYTHONPATH
     mkdir -p /usr/local/bin/before-notebook.d && \
     ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
diff --git a/tests/all-spark-notebook/test_spark_notebooks.py b/tests/all-spark-notebook/test_spark_notebooks.py
index fb4a6cbc..791b8dd4 100644
--- a/tests/all-spark-notebook/test_spark_notebooks.py
+++ b/tests/all-spark-notebook/test_spark_notebooks.py
@@ -38,8 +38,8 @@ def test_nbconvert(container: TrackedContainer, test_file: str) -> None:
         command=["start.sh", "bash", "-c", command],
     )
     warnings = TrackedContainer.get_warnings(logs)
-    # Some Spark warnings
-    assert len(warnings) == 5
+    # No Spark warnings
+    assert len(warnings) == 0
 
     expected_file = f"{output_dir}/{test_file}.md"
     assert expected_file in logs, f"Expected file {expected_file} not generated"
diff --git a/tests/pyspark-notebook/test_spark.py b/tests/pyspark-notebook/test_spark.py
index da47e19f..fda19177 100644
--- a/tests/pyspark-notebook/test_spark.py
+++ b/tests/pyspark-notebook/test_spark.py
@@ -16,7 +16,8 @@ def test_spark_shell(container: TrackedContainer) -> None:
         command=["start.sh", "bash", "-c", 'spark-shell <<< "1+1"'],
     )
     warnings = TrackedContainer.get_warnings(logs)
-    # Some Spark warnings
-    assert len(warnings) == 5
+
+    # Should not see any Spark warnings
+    assert len(warnings) == 0
     assert "res0: Int = 2" in logs, "spark-shell does not work"

From 3b1ddfa670ff4869b9a43fd84a6de10451d1de9b Mon Sep 17 00:00:00 2001
From: Darek
Date: Tue, 21 Jun 2022 15:18:33 -0400
Subject: [PATCH 02/10] Removing no_warnings

---
 tests/all-spark-notebook/test_spark_notebooks.py | 4 ----
 tests/pyspark-notebook/test_spark.py             | 5 -----
 2 files changed, 9 deletions(-)

diff --git a/tests/all-spark-notebook/test_spark_notebooks.py b/tests/all-spark-notebook/test_spark_notebooks.py
index 791b8dd4..b0c65efe 100644
--- a/tests/all-spark-notebook/test_spark_notebooks.py
+++ b/tests/all-spark-notebook/test_spark_notebooks.py
@@ -32,14 +32,10 @@ def test_nbconvert(container: TrackedContainer, test_file: str) -> None:
     )
     logs = container.run_and_wait(
         timeout=60,
-        no_warnings=False,
         volumes={str(host_data_dir): {"bind": cont_data_dir, "mode": "ro"}},
         tty=True,
         command=["start.sh", "bash", "-c", command],
     )
-    warnings = TrackedContainer.get_warnings(logs)
-    # No Spark warnings
-    assert len(warnings) == 0
 
     expected_file = f"{output_dir}/{test_file}.md"
     assert expected_file in logs, f"Expected file {expected_file} not generated"
diff --git a/tests/pyspark-notebook/test_spark.py b/tests/pyspark-notebook/test_spark.py
index fda19177..eb721bc1 100644
--- a/tests/pyspark-notebook/test_spark.py
+++ b/tests/pyspark-notebook/test_spark.py
@@ -11,13 +11,8 @@ def test_spark_shell(container: TrackedContainer) -> None:
     """Checking if Spark (spark-shell) is running properly"""
     logs = container.run_and_wait(
         timeout=60,
-        no_warnings=False,
         tty=True,
         command=["start.sh", "bash", "-c", 'spark-shell <<< "1+1"'],
     )
-    warnings = TrackedContainer.get_warnings(logs)
-
-    # Should not see any Spark warnings
-    assert len(warnings) == 0
     assert "res0: Int = 2" in logs, "spark-shell does not work"

From 4fe2609e12346d312365b70d3b424dbe66466f13 Mon Sep 17 00:00:00 2001
From: Darek
Date: Mon, 4 Jul 2022 07:09:23 -0400
Subject: [PATCH 03/10] Updating docs on building Spark with arguments

---
 docs/using/specifics.md | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index 04eef9cf..7d0d06cb 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -41,37 +41,41 @@ ipython profile create
 You can build a `pyspark-notebook` image (and also the downstream `all-spark-notebook` image) with a different version of Spark by overriding the default value of the following arguments at build time.
 
-- Spark distribution is defined by the combination of the Spark and the Hadoop version and verified by the package checksum,
+- Spark distribution is defined by the combination of Spark, Hadoop and Scala version and verified by the package checksum,
   see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repo](https://archive.apache.org/dist/spark/) for more information.
-  - `spark_version`: The Spark version to install (`3.0.0`).
+  - `spark_version`: The Spark version to install (`3.3.0`).
   - `hadoop_version`: The Hadoop version (`3.2`).
+  - `scala_version`: The Scala version (`2.13`).
   - `spark_checksum`: The package checksum (`BFE4540...`).
-- Spark can run with different OpenJDK versions.
-  - `openjdk_version`: The version of (JRE headless) the OpenJDK distribution (`11`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
+- Spark can run with different OpenJDK versions and those version need to match else Spark will not work.
+  - `openjdk_version`: The version of (JRE headless) the OpenJDK distribution (`17`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 
-For example here is how to build a `pyspark-notebook` image with Spark `2.4.7`, Hadoop `2.7` and OpenJDK `8`.
+- Starting Spark 3.2 the distribution file contains Scala version, hence building different version will only work for Spark >= 3.2
+- Building Spark < 3.2 requires modification to the Docker file or using an older version of the Dockerfile
+
+For example here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2` and OpenJDK `11`.
 
 ```bash
 # From the root of the project
 # Build the image with different arguments
 docker build --rm --force-rm \
-    -t jupyter/pyspark-notebook:spark-2.4.7 ./pyspark-notebook \
-    --build-arg spark_version=2.4.7 \
-    --build-arg hadoop_version=2.7 \
-    --build-arg spark_checksum=0F5455672045F6110B030CE343C049855B7BA86C0ECB5E39A075FF9D093C7F648DA55DED12E72FFE65D84C32DCD5418A6D764F2D6295A3F894A4286CC80EF478 \
-    --build-arg openjdk_version=8
+    -t jupyter/pyspark-notebook:spark-3.2.0 ./pyspark-notebook \
+    --build-arg spark_version=3.2.0 \
+    --build-arg hadoop_version=3.2 \
+    --build-arg spark_checksum=707DDE035926A50B75E53FCA72CADA519F3239B14A96546911CB4916A58DCF69A1D2BFDD2C7DD5899324DBD82B6EEAB9797A7B4ABF86736FFCA4C26D0E0BF0EE \
+    --build-arg openjdk_version=11
 
 # Check the newly built image
-docker run -it --rm jupyter/pyspark-notebook:spark-2.4.7 pyspark --version
+docker run -it --rm jupyter/pyspark-notebook:spark-3.2.0 pyspark --version
 # Welcome to
 #       ____              __
 #      / __/__  ___ _____/ /__
 #     _\ \/ _ \/ _ `/ __/ '_/
-#    /___/ .__/\_,_/_/ /_/\_\   version 2.4.7
+#    /___/ .__/\_,_/_/ /_/\_\   version 3.2.0
 #       /_/
-#
-# Using Scala version 2.11.12, OpenJDK 64-Bit Server VM, 1.8.0_275
+
+# Using Scala version 2.13.5, OpenJDK 64-Bit Server VM, 11.0.15
 ```
 
 ### Usage Examples

From b701723184f04fca5a29eb740e99a03e7d8fd7d1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 4 Jul 2022 11:09:46 +0000
Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/using/specifics.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index 7d0d06cb..35cacf24 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -48,6 +48,7 @@ You can build a `pyspark-notebook` image (and also the downstream `all-spark-not
   - `scala_version`: The Scala version (`2.13`).
   - `spark_checksum`: The package checksum (`BFE4540...`).
 - Spark can run with different OpenJDK versions and those version need to match else Spark will not work.
+
   - `openjdk_version`: The version of (JRE headless) the OpenJDK distribution (`17`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 
 - Starting Spark 3.2 the distribution file contains Scala version, hence building different version will only work for Spark >= 3.2
@@ -74,7 +75,7 @@ docker run -it --rm jupyter/pyspark-notebook:spark-3.2.0 pyspark --version
 #     _\ \/ _ \/ _ `/ __/ '_/
 #    /___/ .__/\_,_/_/ /_/\_\   version 3.2.0
 #       /_/
-
+
 # Using Scala version 2.13.5, OpenJDK 64-Bit Server VM, 11.0.15
 ```

From b319f66084126907750596d9abd78028b707afec Mon Sep 17 00:00:00 2001
From: Darek
Date: Tue, 5 Jul 2022 10:13:18 -0400
Subject: [PATCH 05/10] Removing Spark config specific to Apache Arrow and Java 11

---
 pyspark-notebook/Dockerfile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile
index 85612f94..0b082df8 100644
--- a/pyspark-notebook/Dockerfile
+++ b/pyspark-notebook/Dockerfile
@@ -49,12 +49,6 @@ RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scal
     mkdir -p /usr/local/bin/before-notebook.d && \
     ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh
 
-# Fix Spark installation for Java 11 and Apache Arrow library
-# see: https://github.com/apache/spark/pull/27356, https://spark.apache.org/docs/latest/#downloading
-RUN cp -p "${SPARK_HOME}/conf/spark-defaults.conf.template" "${SPARK_HOME}/conf/spark-defaults.conf" && \
-    echo 'spark.driver.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' >> "${SPARK_HOME}/conf/spark-defaults.conf" && \
-    echo 'spark.executor.extraJavaOptions -Dio.netty.tryReflectionSetAccessible=true' >> "${SPARK_HOME}/conf/spark-defaults.conf"
-
 # Configure IPython system-wide
 COPY ipython_kernel_config.py "/etc/ipython/"
 RUN fix-permissions "/etc/ipython/"

From 452ccd88fde1c1be1a7398f04536dde9a916a300 Mon Sep 17 00:00:00 2001
From: Darek
Date: Tue, 5 Jul 2022 15:23:14 -0400
Subject: [PATCH 06/10] Update docs/using/specifics.md

Co-authored-by: Ayaz Salikhov
---
 docs/using/specifics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index 35cacf24..4ec07a1b 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -41,7 +41,7 @@ ipython profile create
 You can build a `pyspark-notebook` image (and also the downstream `all-spark-notebook` image) with a different version of Spark by overriding the default value of the following arguments at build time.
 
-- Spark distribution is defined by the combination of Spark, Hadoop and Scala version and verified by the package checksum,
+- Spark distribution is defined by the combination of Spark, Hadoop and Scala versions and verified by the package checksum,
   see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repo](https://archive.apache.org/dist/spark/) for more information.
   - `spark_version`: The Spark version to install (`3.3.0`).
   - `hadoop_version`: The Hadoop version (`3.2`).
From ca96253c4a81eb81b5fa6e4e7da7c605dd19a9fe Mon Sep 17 00:00:00 2001
From: Darek
Date: Tue, 5 Jul 2022 15:24:15 -0400
Subject: [PATCH 07/10] Update docs/using/specifics.md

Co-authored-by: Ayaz Salikhov
---
 docs/using/specifics.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index 4ec07a1b..6dddc7d0 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -51,8 +51,8 @@ You can build a `pyspark-notebook` image (and also the downstream `all-spark-not
   - `openjdk_version`: The version of (JRE headless) the OpenJDK distribution (`17`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 
-- Starting Spark 3.2 the distribution file contains Scala version, hence building different version will only work for Spark >= 3.2
-- Building Spark < 3.2 requires modification to the Docker file or using an older version of the Dockerfile
+- Starting with _Spark >= 3.2_ the distribution file contains Scala version, hence building older Spark will not work.
+- Building older version requires modification to the Dockerfile or using it's older version
 
 For example here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2` and OpenJDK `11`.

From b96757876e5b8fafce4adb11a456c5addf6bf7f5 Mon Sep 17 00:00:00 2001
From: Darek
Date: Tue, 5 Jul 2022 15:41:31 -0400
Subject: [PATCH 08/10] Updates to the OpenJDK comments

Clarifying OpenJDK comments
---
 docs/using/specifics.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index 6dddc7d0..c34cba05 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -47,12 +47,12 @@ You can build a `pyspark-notebook` image (and also the downstream `all-spark-not
   - `hadoop_version`: The Hadoop version (`3.2`).
   - `scala_version`: The Scala version (`2.13`).
   - `spark_checksum`: The package checksum (`BFE4540...`).
-- Spark can run with different OpenJDK versions and those version need to match else Spark will not work.
-
-  - `openjdk_version`: The version of (JRE headless) the OpenJDK distribution (`17`), see [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
+  - `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17`).
+  - This version needs to match the version supported by the Spark distribution used above.
+  - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 
 - Starting with _Spark >= 3.2_ the distribution file contains Scala version, hence building older Spark will not work.
-- Building older version requires modification to the Dockerfile or using it's older version
+- Building older version requires modification to the Dockerfile or using it's older version of the Dockerfile
 
 For example here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2` and OpenJDK `11`.
From 079849388c4aa15350d30afa3103b542b8228c71 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 5 Jul 2022 19:42:18 +0000
Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/using/specifics.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index c34cba05..a7f1b6b2 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -43,13 +43,14 @@ You can build a `pyspark-notebook` image (and also the downstream `all-spark-not
 - Spark distribution is defined by the combination of Spark, Hadoop and Scala versions and verified by the package checksum,
   see [Download Apache Spark](https://spark.apache.org/downloads.html) and the [archive repo](https://archive.apache.org/dist/spark/) for more information.
+
   - `spark_version`: The Spark version to install (`3.3.0`).
   - `hadoop_version`: The Hadoop version (`3.2`).
   - `scala_version`: The Scala version (`2.13`).
   - `spark_checksum`: The package checksum (`BFE4540...`).
   - `openjdk_version`: The version of the OpenJDK (JRE headless) distribution (`17`).
-  - This version needs to match the version supported by the Spark distribution used above.
-  - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
+    - This version needs to match the version supported by the Spark distribution used above.
+    - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 
 - Starting with _Spark >= 3.2_ the distribution file contains Scala version, hence building older Spark will not work.
 - Building older version requires modification to the Dockerfile or using it's older version of the Dockerfile

From e862dc9005d77128d829f4b6ae27532d967222c3 Mon Sep 17 00:00:00 2001
From: Ayaz Salikhov
Date: Wed, 6 Jul 2022 00:00:15 +0400
Subject: [PATCH 10/10] Update specifics.md

---
 docs/using/specifics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index a7f1b6b2..626978a1 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -53,7 +53,7 @@ You can build a `pyspark-notebook` image (and also the downstream `all-spark-not
     - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 
 - Starting with _Spark >= 3.2_ the distribution file contains Scala version, hence building older Spark will not work.
-- Building older version requires modification to the Dockerfile or using it's older version of the Dockerfile
+- Building an older version requires modifying the Dockerfile or using an older version of the Dockerfile.
 
 For example here is how to build a `pyspark-notebook` image with Spark `3.2.0`, Hadoop `3.2` and OpenJDK `11`.
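As a quick, untested sketch of how the new defaults introduced by this series (Spark 3.3.0, Hadoop 3, Scala 2.13, OpenJDK 17) could be sanity-checked locally, the commands below mirror the build-and-verify example already shown in `docs/using/specifics.md`; the `spark-3.3.0` tag is only an illustrative name, not something produced by the build system.

```bash
# From the root of the project: build pyspark-notebook with the new defaults,
# so no --build-arg overrides are needed
docker build --rm --force-rm \
    -t jupyter/pyspark-notebook:spark-3.3.0 ./pyspark-notebook

# Confirm the Spark, Scala and Java versions baked into the image
docker run -it --rm jupyter/pyspark-notebook:spark-3.3.0 pyspark --version
```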