Mirror of https://github.com/jupyter/docker-stacks.git, synced 2025-10-10 11:32:57 +00:00
Automatically install latest spark version (#2075)
* Automatically install latest pyspark version
* Better text
* Do not use shutil to keep behaviour
* Make setup_script cwd independent
* Use _get_program_version to calculate spark version
* Update setup_spark.py reqs
* Update setup_spark.py
* Add info about HADOOP_VERSION
* Add customization back
* Better text
* Specify build args when they are actually needed
* Better text
* Better code
* Better code
* Better text
* Get rid of warning
* Improve code
* Remove information about checksum
* Better text
images/pyspark-notebook/setup_spark.py (new executable file, 107 additions)
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

# Requirements:
# - Run as the root user
# - Required env variables: SPARK_HOME, HADOOP_VERSION, SPARK_DOWNLOAD_URL
# - Optional env variables: SPARK_VERSION, SCALA_VERSION

import os
import subprocess
from pathlib import Path

import requests
from bs4 import BeautifulSoup


def get_all_refs(url: str) -> list[str]:
    """
    Get all the references for a given webpage
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    return [a["href"] for a in soup.find_all("a", href=True)]


def get_spark_version() -> str:
    """
    If ${SPARK_VERSION} env variable is non-empty, simply returns it
    Otherwise, returns the last stable version of Spark using spark archive
    """
    if (version := os.environ["SPARK_VERSION"]) != "":
        return version
    all_refs = get_all_refs("https://archive.apache.org/dist/spark/")
    stable_versions = [
        ref.removeprefix("spark-").removesuffix("/")
        for ref in all_refs
        if ref.startswith("spark-") and "incubating" not in ref and "preview" not in ref
    ]
    # Compare versions semantically
    return max(
        stable_versions, key=lambda ver: [int(sub_ver) for sub_ver in ver.split(".")]
    )


def download_spark(
    spark_version: str,
    hadoop_version: str,
    scala_version: str,
    spark_download_url: Path,
) -> str:
    """
    Downloads and unpacks spark
    The resulting spark directory name is returned
    """
    spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}"
    if scala_version:
        spark_dir_name += f"-scala{scala_version}"
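    # Note: pathlib collapses the "//" after the URL scheme to a single slash,
    # so the computed URL reads "https:/..."; curl's lenient URL parser
    # accepts that form, which is why the download still works.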
    spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz"

    tmp_file = Path("/tmp/spark.tar.gz")
    subprocess.check_call(
        ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url]
    )
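    # Extract as root-owned files: --no-same-owner tells tar (running as root)
    # to ignore the UIDs recorded in the archive instead of preserving them.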
    subprocess.check_call(
        [
            "tar",
            "xzf",
            tmp_file,
            "-C",
            "/usr/local",
            "--owner",
            "root",
            "--group",
            "root",
            "--no-same-owner",
        ]
    )
    tmp_file.unlink()
    return spark_dir_name


def prepare_spark(spark_dir_name: str, spark_home: Path) -> None:
    """
    Creates a ${SPARK_HOME} symlink to a versioned spark directory
    Creates a 10spark-config.sh symlink to source PYTHONPATH automatically
    """
    subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home])

    # Add a link in the before_notebook hook in order to source PYTHONPATH automatically
    CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh"
    subprocess.check_call(
        ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT]
    )


if __name__ == "__main__":
    spark_version = get_spark_version()
    spark_dir_name = download_spark(
        spark_version=spark_version,
        hadoop_version=os.environ["HADOOP_VERSION"],
        scala_version=os.environ["SCALA_VERSION"],
        spark_download_url=Path(os.environ["SPARK_DOWNLOAD_URL"]),
    )
    prepare_spark(
        spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"])
    )
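For context, a minimal sketch of the "latest stable" selection that get_spark_version performs above; the version strings here are hypothetical samples, not real archive contents:

    # Splitting "3.10.0" into [3, 10, 0] makes max() compare numerically.
    versions = ["3.2.4", "3.9.1", "3.10.0"]
    latest = max(versions, key=lambda ver: [int(part) for part in ver.split(".")])
    assert latest == "3.10.0"  # a plain string max() would wrongly pick "3.9.1"

This is why the script compares versions semantically rather than lexicographically: a string comparison mis-orders any minor or patch number that reaches two digits.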