Install Spark 4 release version (#2300)

Ayaz Salikhov
2025-05-30 09:11:31 +01:00
committed by GitHub
parent 08cc9f7d17
commit f9a75e58c5
4 changed files with 8 additions and 11 deletions

View File

@@ -64,13 +64,12 @@ USER ${NB_UID}
 # NOTE: It's important to ensure compatibility between Pandas versions.
 # The pandas version in this Dockerfile should match the version
 # on which the Pandas API for Spark is built.
-# To find the right version:
-# 1. Check out the Spark branch you are on: <https://github.com/apache/spark>
-# 2. Find the pandas version in the file `dev/infra/Dockerfile`.
+# To find the right version, check the pandas version being installed here:
+# https://github.com/apache/spark/blob/<SPARK_VERSION>/dev/infra/Dockerfile
 RUN mamba install --yes \
     'grpcio-status' \
     'grpcio' \
-    'pandas=2.2.2' \
+    'pandas=2.2.3' \
     'pyarrow' && \
     mamba clean --all -f -y && \
     fix-permissions "${CONDA_DIR}" && \
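
The updated comment points at Spark's own infra Dockerfile as the authoritative source for the pandas pin. As a rough illustration, that lookup could be scripted along the lines of the sketch below; the "v4.0.0" tag name and the "pandas==X.Y.Z" pin format are assumptions about that file, not facts taken from this diff.

import re
import urllib.request

# Sketch: fetch Spark's dev/infra/Dockerfile and extract the pandas pin.
# The tag name and the pin pattern are assumptions, not verified here.
spark_tag = "v4.0.0"
url = (
    "https://raw.githubusercontent.com/apache/spark/"
    f"{spark_tag}/dev/infra/Dockerfile"
)
dockerfile = urllib.request.urlopen(url).read().decode()
match = re.search(r"pandas==(\d+\.\d+\.\d+)", dockerfile)
print(match.group(1) if match else "no pandas pin found")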

View File

@@ -35,11 +35,8 @@ def get_latest_spark_version() -> str:
LOGGER.info("Downloading Spark versions information") LOGGER.info("Downloading Spark versions information")
all_refs = get_all_refs("https://archive.apache.org/dist/spark/") all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
LOGGER.info(f"All refs: {all_refs}") LOGGER.info(f"All refs: {all_refs}")
versions = [ pattern = re.compile(r"^spark-(\d+\.\d+\.\d+)/$")
ref.removeprefix("spark-").removesuffix("/") versions = [match.group(1) for ref in all_refs if (match := pattern.match(ref))]
for ref in all_refs
if re.match(r"^spark-\d", ref) is not None and "incubating" not in ref
]
LOGGER.info(f"Available versions: {versions}") LOGGER.info(f"Available versions: {versions}")
# Compare versions semantically # Compare versions semantically
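
The anchored pattern now does on its own what the old removeprefix/removesuffix pipeline plus the explicit "incubating" check did together: only refs shaped exactly like a release directory survive. A minimal sketch shows the effect; the sample refs below are illustrative, not a real archive listing.

import re

pattern = re.compile(r"^spark-(\d+\.\d+\.\d+)/$")
sample_refs = [  # illustrative entries, not a real listing
    "spark-4.0.0/",             # matches -> "4.0.0"
    "spark-3.5.6/",             # matches -> "3.5.6"
    "spark-4.0.0-preview2/",    # rejected: suffix after the patch number
    "spark-0.8.0-incubating/",  # rejected, so the "incubating" check is redundant
    "KEYS",                     # rejected: not a spark-* directory
]
versions = [m.group(1) for ref in sample_refs if (m := pattern.match(ref))]
print(versions)  # ['4.0.0', '3.5.6']
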
@@ -74,6 +71,7 @@ def download_spark(
         spark_dir_name += f"-scala{scala_version}"
     LOGGER.info(f"Spark directory name: {spark_dir_name}")
     spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"
+    LOGGER.info(f"Spark download URL: {spark_url}")
 
     tmp_file = Path("/tmp/spark.tar.gz")
     subprocess.check_call(
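
The new log line surfaces the final download URL before the download runs. For a 4.0.0 release it would print something like the sketch below; the "spark-<version>-bin-hadoop<N>" directory naming and the hadoop_version value are assumptions about how spark_dir_name is built earlier in the script, which this hunk does not show.

# Sketch of the URL the new log line would print (naming assumed).
spark_version = "4.0.0"
hadoop_version = "3"  # illustrative value
scala_version = ""    # empty: no "-scala<version>" suffix

spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
if scala_version:
    spark_dir_name += f"-scala{scala_version}"
spark_url = f"https://archive.apache.org/dist/spark/spark-{spark_version}/{spark_dir_name}.tgz"
print(spark_url)
# https://archive.apache.org/dist/spark/spark-4.0.0/spark-4.0.0-bin-hadoop3.tgz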

View File

@@ -2,4 +2,4 @@
 # Distributed under the terms of the Modified BSD License.
 import pandas
 
-assert pandas.__version__ == "2.2.2"
+assert pandas.__version__ == "2.2.3"

View File

@@ -17,7 +17,7 @@ def check_nbconvert(
     no_warnings: bool = True,
 ) -> str:
     """Check if nbconvert is able to convert a notebook file"""
-    cont_data_file = "/home/jovyan/data/" + host_file.name
+    cont_data_file = "/home/jovyan/" + host_file.name
     output_dir = "/tmp"
     LOGGER.info(
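
The container path for the notebook under test moves from /home/jovyan/data/ up to /home/jovyan/. The exact command this check runs is cut off in the diff; a plausible sketch of such an invocation, using nbconvert's standard CLI and a made-up notebook name, would be:

import subprocess

# Hypothetical reconstruction: execute a notebook inside the container
# and write the converted result to output_dir.
cont_data_file = "/home/jovyan/" + "notebook.ipynb"  # notebook name is made up
output_dir = "/tmp"
subprocess.check_call(
    [
        "jupyter", "nbconvert", "--to", "notebook", "--execute",
        f"--output-dir={output_dir}", cont_data_file,
    ]
)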