Mirror of https://github.com/jupyter/docker-stacks.git (synced 2025-10-14 21:42:57 +00:00)

Commit: Merge branch 'master' into asalikhov/improve_docs
all-spark-notebook/test/data/local_pyspark.ipynb (new file, 60 lines)
@@ -0,0 +1,60 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "Error",
     "evalue": "Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
     "traceback": [
      "Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
      "at b.startServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:270430)",
      "at async b.createServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:269873)",
      "at async connect (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:397876)",
      "at async w.ensureConnectionAndNotebookImpl (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556625)",
      "at async w.ensureConnectionAndNotebook (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556303)",
      "at async w.clearResult (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:552346)",
      "at async w.reexecuteCell (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:540374)",
      "at async w.reexecuteCells (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:537541)"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "# Spark session & context\n",
    "spark = SparkSession.builder.master('local').getOrCreate()\n",
    "sc = spark.sparkContext\n",
    "\n",
    "# Sum of the first 100 whole numbers\n",
    "rdd = sc.parallelize(range(100 + 1))\n",
    "rdd.sum()\n",
    "# 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
all-spark-notebook/test/data/local_sparkR.ipynb (new file, 41 lines)
@@ -0,0 +1,41 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "library(SparkR)\n",
    "\n",
    "# Spark session & context\n",
    "sc <- sparkR.session(\"local\")\n",
    "\n",
    "# Sum of the first 100 whole numbers\n",
    "sdf <- createDataFrame(list(1:100))\n",
    "dapplyCollect(sdf,\n",
    "              function(x) \n",
    "              { x <- sum(x)}\n",
    "             )\n",
    "# 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
all-spark-notebook/test/data/local_sparklyr.ipynb (new file, 43 lines)
@@ -0,0 +1,43 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "library(sparklyr)\n",
    "\n",
    "# get the default config\n",
    "conf <- spark_config()\n",
    "# Set the catalog implementation in-memory\n",
    "conf$spark.sql.catalogImplementation <- \"in-memory\"\n",
    "\n",
    "# Spark session & context\n",
    "sc <- spark_connect(master = \"local\", config = conf)\n",
    "\n",
    "# Sum of the first 100 whole numbers\n",
    "sdf_len(sc, 100, repartition = 1) %>% \n",
    "    spark_apply(function(e) sum(e))\n",
    "# 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
all-spark-notebook/test/data/local_spylon.ipynb (new file, 63 lines)
@@ -0,0 +1,63 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%init_spark\n",
    "# Spark session & context\n",
    "launcher.master = \"local\"\n",
    "launcher.conf.spark.executor.cores = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:28\n",
       "res4: Double = 5050.0\n"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Sum of the first 100 whole numbers\n",
    "val rdd = sc.parallelize(0 to 100)\n",
    "rdd.sum()\n",
    "// 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "spylon-kernel",
   "language": "scala",
   "name": "spylon-kernel"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "help_links": [
    {
     "text": "MetaKernel Magics",
     "url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
    }
   ],
   "mimetype": "text/x-scala",
   "name": "scala",
   "pygments_lexer": "scala",
   "version": "0.4.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
all-spark-notebook/test/data/local_toree.ipynb (new file, 89 lines)
@@ -0,0 +1,89 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Waiting for a Spark session to start..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "spark://master:7077\n"
     ]
    }
   ],
   "source": [
    "// should print the value of --master in the kernel spec\n",
    "println(sc.master)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Waiting for a Spark session to start..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "rdd = ParallelCollectionRDD[0] at parallelize at <console>:28\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "5050.0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Sum of the first 100 whole numbers\n",
    "val rdd = sc.parallelize(0 to 100)\n",
    "rdd.sum()\n",
    "// 5050"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Apache Toree - Scala",
   "language": "scala",
   "name": "apache_toree_scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "mimetype": "text/x-scala",
   "name": "scala",
   "pygments_lexer": "scala",
   "version": "2.11.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
all-spark-notebook/test/test_spark_notebooks.py (new file, 35 lines)
@@ -0,0 +1,35 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.

import logging

import pytest
import os

LOGGER = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "test_file",
    # TODO: add local_sparklyr
    ["local_pyspark", "local_spylon", "local_toree", "local_sparkR"],
)
def test_nbconvert(container, test_file):
    """Check if Spark notebooks can be executed"""
    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    cont_data_dir = "/home/jovyan/data"
    output_dir = "/tmp"
    timeout_ms = 600
    LOGGER.info(f"Test that {test_file} notebook can be executed ...")
    command = f"jupyter nbconvert --to markdown --ExecutePreprocessor.timeout={timeout_ms} --output-dir {output_dir} --execute {cont_data_dir}/{test_file}.ipynb"
    c = container.run(
        volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
    rv = c.wait(timeout=timeout_ms / 10 + 10)
    assert rv == 0 or rv["StatusCode"] == 0, f"Command {command} failed"
    logs = c.logs(stdout=True).decode("utf-8")
    LOGGER.debug(logs)
    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"
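
A hedged sketch of how this test might be invoked locally (assuming the repository's pytest harness, which is expected to supply the `container` fixture, and a locally built `jupyter/all-spark-notebook` image):

```bash
# run from the repository root
pytest all-spark-notebook/test/test_spark_notebooks.py
```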
@@ -5,7 +5,8 @@ This page provides details about features specific to one or more images.
## Apache Spark

**Specific Docker Image Options**

-* `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at default port `4040`, this option map `4040` port inside docker container to `4040` port on host machine . Note every new spark context that is created is put onto an incrementing port (ie. 4040, 4041, 4042, etc.), and it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`
+* `-p 4040:4040` - The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images open [SparkUI (Spark Monitoring and Instrumentation UI)](http://spark.apache.org/docs/latest/monitoring.html) at the default port `4040`. This option maps port `4040` inside the container to port `4040` on the host machine. Note that every new Spark context is created on an incrementing port (i.e. 4040, 4041, 4042, etc.), so it might be necessary to open multiple ports. For example: `docker run -d -p 8888:8888 -p 4040:4040 -p 4041:4041 jupyter/pyspark-notebook`.

**Usage Examples**

@@ -13,30 +14,66 @@ The `jupyter/pyspark-notebook` and `jupyter/all-spark-notebook` images support t

### Using Spark Local Mode

-Spark local mode is useful for experimentation on small data when you do not have a Spark cluster available.
+Spark **local mode** is useful for experimentation on small data when you do not have a Spark cluster available.

-#### In a Python Notebook
+#### In Python
+
+In a Python notebook.

```python
from pyspark.sql import SparkSession
-spark = SparkSession.builder.appName("SimpleApp").getOrCreate()
-# do something to prove it works
-spark.sql('SELECT "Test" as c1').show()
+
+# Spark session & context
+spark = SparkSession.builder.master('local').getOrCreate()
+sc = spark.sparkContext
+
+# Sum of the first 100 whole numbers
+rdd = sc.parallelize(range(100 + 1))
+rdd.sum()
+# 5050
```

-#### In a R Notebook
+#### In R
+
+In an R notebook with [SparkR][sparkr].

-```r
+```R
library(SparkR)

-as <- sparkR.session("local[*]")
+# Spark session & context
+sc <- sparkR.session("local")

-# do something to prove it works
-df <- as.DataFrame(iris)
-head(filter(df, df$Petal_Width > 0.2))
+# Sum of the first 100 whole numbers
+sdf <- createDataFrame(list(1:100))
+dapplyCollect(sdf,
+              function(x)
+              { x <- sum(x)}
+             )
+# 5050
```

-#### In a Spylon Kernel Scala Notebook
+In an R notebook with [sparklyr][sparklyr].
+
+```R
+library(sparklyr)
+
+# Spark configuration
+conf <- spark_config()
+# Set the catalog implementation in-memory
+conf$spark.sql.catalogImplementation <- "in-memory"
+
+# Spark session & context
+sc <- spark_connect(master = "local", config = conf)
+
+# Sum of the first 100 whole numbers
+sdf_len(sc, 100, repartition = 1) %>%
+    spark_apply(function(e) sum(e))
+# 5050
+```
+
+#### In Scala
+
+##### In a Spylon Kernel

Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
options in a `%%init_spark` magic cell.
@@ -44,27 +81,30 @@ options in a `%%init_spark` magic cell.
```python
%%init_spark
# Configure Spark to use a local master
-launcher.master = "local[*]"
+launcher.master = "local"
```

```scala
-// Now run Scala code that uses the initialized SparkContext in sc
-val rdd = sc.parallelize(0 to 999)
-rdd.takeSample(false, 5)
+// Sum of the first 100 whole numbers
+val rdd = sc.parallelize(0 to 100)
+rdd.sum()
+// 5050
```

-#### In an Apache Toree Scala Notebook
+##### In an Apache Toree Kernel

Apache Toree instantiates a local `SparkContext` for you in variable `sc` when the kernel starts.

```scala
-val rdd = sc.parallelize(0 to 999)
-rdd.takeSample(false, 5)
+// Sum of the first 100 whole numbers
+val rdd = sc.parallelize(0 to 100)
+rdd.sum()
+// 5050
```

### Connecting to a Spark Cluster in Standalone Mode

-Connection to Spark Cluster on Standalone Mode requires the following set of steps:
+Connection to a Spark cluster in **[Standalone Mode](https://spark.apache.org/docs/latest/spark-standalone.html)** requires the following steps:

0. Verify that the docker image (check the Dockerfile) and the Spark Cluster which is being
deployed, run the same version of Spark.
@@ -75,97 +115,104 @@ Connection to Spark Cluster on Standalone Mode requires the following set of steps:
* NOTE: When using `--net=host`, you must also use the flags `--pid=host -e
  TINI_SUBREAPER=true`. See https://github.com/jupyter/docker-stacks/issues/64 for details.

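For illustration, a minimal sketch of the host-networking flags mentioned in the note above (the image tag and the omission of `-p` mappings are assumptions; with `--net=host` the container shares the host's network stack, so ports are exposed directly):

```bash
docker run -d --net=host --pid=host -e TINI_SUBREAPER=true jupyter/all-spark-notebook
```
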
-#### In a Python Notebook
+**Note**: the following examples use the Spark master URL `spark://master:7077`, which should be replaced by the URL of your Spark master.
+
+#### In Python
+
+The **same Python version** needs to be used on the notebook (where the driver is located) and on the Spark workers.
+The Python version used on the driver and worker side can be adjusted by setting the environment variables `PYSPARK_PYTHON` and/or `PYSPARK_DRIVER_PYTHON`; see [Spark Configuration][spark-conf] for more information (a small sketch follows the code block below).

```python
-import os
-# make sure pyspark tells workers to use python3 not 2 if both are installed
-os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
-
-import pyspark
-conf = pyspark.SparkConf()
-
-# Point to spark master
-conf.setMaster("spark://10.10.10.10:7070")
-# point to spark binary package in HDFS or on local filesystem on all slave
-# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz)
-conf.set("spark.executor.uri", "hdfs://10.10.10.10/spark/spark-2.2.0-bin-hadoop2.7.tgz")
-# set other options as desired
-conf.set("spark.executor.memory", "8g")
-conf.set("spark.core.connection.ack.wait.timeout", "1200")
-
-# create the context
-sc = pyspark.SparkContext(conf=conf)
-
-# do something to prove it works
-rdd = sc.parallelize(range(100000000))
-rdd.sumApprox(3)
+from pyspark.sql import SparkSession
+
+# Spark session & context
+spark = SparkSession.builder.master('spark://master:7077').getOrCreate()
+sc = spark.sparkContext
+
+# Sum of the first 100 whole numbers
+rdd = sc.parallelize(range(100 + 1))
+rdd.sum()
+# 5050
```
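
As an aside on the `PYSPARK_PYTHON` / `PYSPARK_DRIVER_PYTHON` note above, a minimal sketch of pinning the interpreter (the path `/usr/bin/python3` is an assumption; it must match the Python actually installed on the workers):

```python
import os
from pyspark.sql import SparkSession

# Hypothetical interpreter paths; they are read when the session is created,
# so set them before building the SparkSession.
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

spark = SparkSession.builder.master("spark://master:7077").getOrCreate()
```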

-#### In a R Notebook
+#### In R

-```r
+In an R notebook with [SparkR][sparkr].
+
+```R
library(SparkR)

-# Point to spark master
-# Point to spark binary package in HDFS or on local filesystem on all worker
-# nodes (e.g., file:///opt/spark/spark-2.2.0-bin-hadoop2.7.tgz) in sparkEnvir
-# Set other options in sparkEnvir
-sc <- sparkR.session(
-    "spark://10.10.10.10:7070",
-    sparkEnvir=list(
-        spark.executor.uri="hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz",
-        spark.executor.memory="8g"
-    )
-)
+# Spark session & context
+sc <- sparkR.session("spark://master:7077")

-# do something to prove it works
-data(iris)
-df <- as.DataFrame(iris)
-head(filter(df, df$Petal_Width > 0.2))
+# Sum of the first 100 whole numbers
+sdf <- createDataFrame(list(1:100))
+dapplyCollect(sdf,
+              function(x)
+              { x <- sum(x)}
+             )
+# 5050
```

-#### In a Spylon Kernel Scala Notebook
+In an R notebook with [sparklyr][sparklyr].
+
+```R
+library(sparklyr)
+
+# Spark session & context
+# Spark configuration
+conf <- spark_config()
+# Set the catalog implementation in-memory
+conf$spark.sql.catalogImplementation <- "in-memory"
+sc <- spark_connect(master = "spark://master:7077", config = conf)
+
+# Sum of the first 100 whole numbers
+sdf_len(sc, 100, repartition = 1) %>%
+    spark_apply(function(e) sum(e))
+# 5050
+```
+
+#### In Scala
+
+##### In a Spylon Kernel
+
+Spylon kernel instantiates a `SparkContext` for you in variable `sc` after you configure Spark
+options in a `%%init_spark` magic cell.

```python
%%init_spark
-# Point to spark master
-launcher.master = "spark://10.10.10.10:7070"
-launcher.conf.spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz
+# Point to the Spark master
+launcher.master = "spark://master:7077"
```

```scala
-// Now run Scala code that uses the initialized SparkContext in sc
-val rdd = sc.parallelize(0 to 999)
-rdd.takeSample(false, 5)
+// Sum of the first 100 whole numbers
+val rdd = sc.parallelize(0 to 100)
+rdd.sum()
+// 5050
```

-#### In an Apache Toree Scala Notebook
+##### In an Apache Toree Scala Notebook

-The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration
-information from its command line arguments and environment variables. You can pass information
-about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.
+The Apache Toree kernel automatically creates a `SparkContext` when it starts based on configuration information from its command line arguments and environment variables. You can pass information about your cluster via the `SPARK_OPTS` environment variable when you spawn a container.

-For instance, to pass information about a standalone Spark master, Spark binary location in HDFS,
-and an executor options, you could start the container like so:
+For instance, to pass information about a standalone Spark master, you could start the container like so:

```bash
-docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://10.10.10.10:7070 \
-    --spark.executor.uri=hdfs://10.10.10.10/spark/spark-2.4.3-bin-hadoop2.7.tgz \
-    --spark.executor.memory=8g' jupyter/all-spark-notebook
+docker run -d -p 8888:8888 -e SPARK_OPTS='--master=spark://master:7077' \
+    jupyter/all-spark-notebook
```

-Note that this is the same information expressed in a notebook in the Python case above. Once the
-kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like
-so:
+Note that this is the same information expressed in a notebook in the Python case above. Once the kernel spec has your cluster information, you can test your cluster in an Apache Toree notebook like so:

```scala
// should print the value of --master in the kernel spec
println(sc.master)

-// do something to prove it works
-val rdd = sc.parallelize(0 to 99999999)
+// Sum of the first 100 whole numbers
+val rdd = sc.parallelize(0 to 100)
rdd.sum()
+// 5050
```

## Tensorflow

@@ -201,3 +248,7 @@ init = tf.global_variables_initializer()
sess.run(init)
sess.run(hello)
```
+
+[sparkr]: https://spark.apache.org/docs/latest/sparkr.html
+[sparklyr]: https://spark.rstudio.com/
+[spark-conf]: https://spark.apache.org/docs/latest/configuration.html