diff --git a/images/pyspark-notebook/Dockerfile b/images/pyspark-notebook/Dockerfile index 95633b19..a656d1fe 100644 --- a/images/pyspark-notebook/Dockerfile +++ b/images/pyspark-notebook/Dockerfile @@ -64,13 +64,12 @@ USER ${NB_UID} # NOTE: It's important to ensure compatibility between Pandas versions. # The pandas version in this Dockerfile should match the version # on which the Pandas API for Spark is built. -# To find the right version: -# 1. Check out the Spark branch you are on: -# 2. Find the pandas version in the file `dev/infra/Dockerfile`. +# To find the right version, check the pandas version being installed here: +# https://github.com/apache/spark/blob/master/dev/infra/Dockerfile RUN mamba install --yes \ 'grpcio-status' \ 'grpcio' \ - 'pandas=2.2.2' \ + 'pandas=2.2.3' \ 'pyarrow' && \ mamba clean --all -f -y && \ fix-permissions "${CONDA_DIR}" && \ diff --git a/images/pyspark-notebook/setup_spark.py b/images/pyspark-notebook/setup_spark.py index 9bc8ff80..7634d7cd 100755 --- a/images/pyspark-notebook/setup_spark.py +++ b/images/pyspark-notebook/setup_spark.py @@ -35,11 +35,8 @@ def get_latest_spark_version() -> str: LOGGER.info("Downloading Spark versions information") all_refs = get_all_refs("https://archive.apache.org/dist/spark/") LOGGER.info(f"All refs: {all_refs}") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if re.match(r"^spark-\d", ref) is not None and "incubating" not in ref - ] + pattern = re.compile(r"^spark-(\d+\.\d+\.\d+)/$") + versions = [match.group(1) for ref in all_refs if (match := pattern.match(ref))] LOGGER.info(f"Available versions: {versions}") # Compare versions semantically @@ -74,6 +71,7 @@ def download_spark( spark_dir_name += f"-scala{scala_version}" LOGGER.info(f"Spark directory name: {spark_dir_name}") spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + LOGGER.info(f"Spark download URL: {spark_url}") tmp_file = Path("/tmp/spark.tar.gz") 
subprocess.check_call( diff --git a/tests/by_image/pyspark-notebook/units/unit_pandas_version.py b/tests/by_image/pyspark-notebook/units/unit_pandas_version.py index 802a2192..d6ea4520 100644 --- a/tests/by_image/pyspark-notebook/units/unit_pandas_version.py +++ b/tests/by_image/pyspark-notebook/units/unit_pandas_version.py @@ -2,4 +2,4 @@ # Distributed under the terms of the Modified BSD License. import pandas -assert pandas.__version__ == "2.2.2" +assert pandas.__version__ == "2.2.3" diff --git a/tests/shared_checks/nbconvert_check.py b/tests/shared_checks/nbconvert_check.py index 9a1911e1..f3683493 100644 --- a/tests/shared_checks/nbconvert_check.py +++ b/tests/shared_checks/nbconvert_check.py @@ -17,7 +17,7 @@ def check_nbconvert( no_warnings: bool = True, ) -> str: """Check if nbconvert is able to convert a notebook file""" - cont_data_file = "/home/jovyan/data/" + host_file.name + cont_data_file = "/home/jovyan/" + host_file.name output_dir = "/tmp" LOGGER.info(