diff --git a/docs/using/specifics.md b/docs/using/specifics.md
index 4df770f1..0ed2f55d 100644
--- a/docs/using/specifics.md
+++ b/docs/using/specifics.md
@@ -49,7 +49,8 @@ You can build a `pyspark-notebook` image with a different `Spark` version by ove
   - This version needs to match the version supported by the Spark distribution used above.
   - See [Spark Overview](https://spark.apache.org/docs/latest/#downloading) and [Ubuntu packages](https://packages.ubuntu.com/search?keywords=openjdk).
 - `spark_version` (optional): The Spark version to install, for example `3.5.0`.
-  If not specified (this is the default), latest stable Spark will be installed.
+  If not specified (this is the default), latest Spark will be installed.
+  Note: to support Python 3.12, we currently install Spark v4 preview versions.
 - `hadoop_version`: The Hadoop version (`3` by default).
   Note, that _Spark < 3.3_ require to specify `major.minor` Hadoop version (i.e. `3.2`).
 - `scala_version` (optional): The Scala version, for example `2.13` (not specified by default).
diff --git a/images/pyspark-notebook/Dockerfile b/images/pyspark-notebook/Dockerfile
index 8585232c..be4bdfa4 100644
--- a/images/pyspark-notebook/Dockerfile
+++ b/images/pyspark-notebook/Dockerfile
@@ -24,7 +24,7 @@ RUN apt-get update --yes && \
     ca-certificates-java && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
-# If spark_version is not set, latest stable Spark will be installed
+# If spark_version is not set, latest Spark will be installed
 ARG spark_version
 ARG hadoop_version="3"
 # If scala_version is not set, Spark without Scala will be installed
diff --git a/images/pyspark-notebook/setup_spark.py b/images/pyspark-notebook/setup_spark.py
index 79e571af..c5b76433 100755
--- a/images/pyspark-notebook/setup_spark.py
+++ b/images/pyspark-notebook/setup_spark.py
@@ -29,11 +29,11 @@ def get_all_refs(url: str) -> list[str]:
 
 def get_latest_spark_version() -> str:
     """
-    Returns the last stable version of Spark using spark archive
+    Returns the last version of Spark using spark archive
     """
     LOGGER.info("Downloading Spark versions information")
     all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
-    stable_versions = [
+    versions = [
         ref.removeprefix("spark-").removesuffix("/")
         for ref in all_refs
         if ref.startswith("spark-") and "incubating" not in ref
@@ -49,7 +49,7 @@ def get_latest_spark_version() -> str:
         patch, _, preview = arr[2].partition("-")
         return (major, minor, int(patch), preview)
 
-    latest_version = max(stable_versions, key=lambda ver: version_array(ver))
+    latest_version = max(versions, key=lambda ver: version_array(ver))
     LOGGER.info(f"Latest version: {latest_version}")
     return latest_version
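
A minimal runnable sketch (not part of the diff) of what the `setup_spark.py` change amounts to: with the stable-only filtering and naming gone, the `version_array` key alone decides which archive entry counts as latest, which lets Spark v4 preview releases win. The `all_refs` list below is a stubbed stand-in for the output of `get_all_refs("https://archive.apache.org/dist/spark/")`, and the `arr`/`major`/`minor` lines of `version_array` are assumed from the `return` statement visible in the hunk context.

```python
# Stubbed stand-in for get_all_refs("https://archive.apache.org/dist/spark/")
all_refs = ["KEYS", "spark-3.4.3/", "spark-3.5.1/", "spark-4.0.0-preview1/"]

# Same comprehension as in the diff: keep "spark-*" refs, skip "incubating" ones.
versions = [
    ref.removeprefix("spark-").removesuffix("/")
    for ref in all_refs
    if ref.startswith("spark-") and "incubating" not in ref
]


def version_array(ver: str) -> tuple[int, int, int, str]:
    # "3.5.1" -> (3, 5, 1, ""); "4.0.0-preview1" -> (4, 0, 0, "preview1")
    # The first three lines here are assumptions; only the last two
    # appear in the hunk context above.
    arr = ver.split(".")
    major = int(arr[0])
    minor = int(arr[1])
    patch, _, preview = arr[2].partition("-")
    return (major, minor, int(patch), preview)


print(max(versions, key=version_array))  # -> 4.0.0-preview1
```

One quirk worth noting: because a non-empty preview tag compares greater than the empty string in the tuple's last slot, `4.0.0-preview1` would also outrank a plain `4.0.0` under this key. This matches the docs change above: a build without an explicit `--build-arg spark_version=...` currently ends up with a Spark v4 preview, which is what adds Python 3.12 support.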