Add Spark notebook tests and change examples

* Tests added for all kernels
* Same examples as those provided in the documentation (`specifics.md`)
* The same use case is used for all examples: the sum of the first 100 whole numbers (expected value checked below)
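
For reference, the examples parallelize `range(100 + 1)` (Python) or `0 to 100` (Scala), so the expected value is 0 + 1 + ... + 100 = 100 * 101 / 2 = 5050. A one-line check in R:

```R
# Quick check of the expected result used in every example
sum(0:100)
# [1] 5050
```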

Note: I have not tested `local_sparklyr.ipynb` automatically, since by default it creates the `metastore_db` directory and the `derby.log` file in the working directory. Because I mount that directory read-only (`RO`) in the test, the notebook fails, and I'm still struggling to set those paths elsewhere; a possible workaround is sketched below.
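
One untested possibility, assuming standard Derby behaviour, is to point Derby's home directory at a writable location through the sparklyr config, so that `metastore_db/` and `derby.log` are created outside the read-only mount. A minimal sketch (the `/tmp` target and the effect of the flag are assumptions, not verified here):

```R
library(sparklyr)

conf <- spark_config()
# Keep the catalog in-memory, as in the notebook
conf$spark.sql.catalogImplementation <- "in-memory"
# Assumption: -Dderby.system.home makes Derby create metastore_db/ and
# derby.log under /tmp instead of the (read-only) working directory
conf$`sparklyr.shell.driver-java-options` <- "-Dderby.system.home=/tmp"

# Spark session & context
sc <- spark_connect(master = "local", config = conf)
```

If that works, `local_sparklyr` could then be added to the test parametrization (see the TODO in the test file).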
Author: Romain
Date: 2020-05-29 06:54:46 +02:00
Parent: 36dc621f97
Commit: c83024c950

7 changed files with 378 additions and 30 deletions


@@ -0,0 +1,60 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "Error",
"evalue": "Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"traceback": [
"Error: Jupyter cannot be started. Error attempting to locate jupyter: Data Science libraries jupyter and notebook are not installed in interpreter Python 3.7.7 64-bit ('jupyter': conda).",
"at b.startServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:270430)",
"at async b.createServer (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:269873)",
"at async connect (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:92:397876)",
"at async w.ensureConnectionAndNotebookImpl (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556625)",
"at async w.ensureConnectionAndNotebook (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:556303)",
"at async w.clearResult (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:552346)",
"at async w.reexecuteCell (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:540374)",
"at async w.reexecuteCells (/Users/romain/.vscode/extensions/ms-python.python-2020.5.80290/out/client/extension.js:16:537541)"
]
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Spark session & context\n",
"spark = SparkSession.builder.master('local').getOrCreate()\n",
"sc = spark.sparkContext\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"rdd = sc.parallelize(range(100 + 1))\n",
"rdd.sum()\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,41 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(SparkR)\n",
"\n",
"# Spark session & context\n",
"sc <- sparkR.session(\"local\")\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf <- createDataFrame(list(1:100))\n",
"dapplyCollect(sdf,\n",
" function(x) \n",
" { x <- sum(x)}\n",
" )\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,43 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"library(sparklyr)\n",
"\n",
"# get the default config\n",
"conf <- spark_config()\n",
"# Set the catalog implementation in-memory\n",
"conf$spark.sql.catalogImplementation <- \"in-memory\"\n",
"\n",
"# Spark session & context\n",
"sc <- spark_connect(master = \"local\", config = conf)\n",
"\n",
"# Sum of the first 100 whole numbers\n",
"sdf_len(sc, 100, repartition = 1) %>% \n",
" spark_apply(function(e) sum(e))\n",
"# 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,63 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"%%init_spark\n",
"# Spark session & context\n",
"launcher.master = \"local\"\n",
"launcher.conf.spark.executor.cores = 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:28\n",
"res4: Double = 5050.0\n"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "spylon-kernel",
"language": "scala",
"name": "spylon-kernel"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"help_links": [
{
"text": "MetaKernel Magics",
"url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
}
],
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "0.4.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,89 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"spark://master:7077\n"
]
}
],
"source": [
"// should print the value of --master in the kernel spec\n",
"println(sc.master)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Waiting for a Spark session to start..."
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"rdd = ParallelCollectionRDD[0] at parallelize at <console>:28\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"5050.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Sum of the first 100 whole numbers\n",
"val rdd = sc.parallelize(0 to 100)\n",
"rdd.sum()\n",
"// 5050"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - Scala",
"language": "scala",
"name": "apache_toree_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala",
"version": "2.11.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -0,0 +1,35 @@
# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
import logging
import os

import pytest

LOGGER = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "test_file",
    # TODO: add local_sparklyr
    ["local_pyspark", "local_spylon", "local_toree", "local_sparkR"],
)
def test_nbconvert(container, test_file):
    """Check if Spark notebooks can be executed"""
    host_data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    cont_data_dir = "/home/jovyan/data"
    output_dir = "/tmp"
    # nbconvert execution timeout, in seconds
    timeout = 600
    LOGGER.info(f"Test that {test_file} notebook can be executed ...")
    command = f"jupyter nbconvert --to markdown --ExecutePreprocessor.timeout={timeout} --output-dir {output_dir} --execute {cont_data_dir}/{test_file}.ipynb"
    c = container.run(
        volumes={host_data_dir: {"bind": cont_data_dir, "mode": "ro"}},
        tty=True,
        command=["start.sh", "bash", "-c", command],
    )
    rv = c.wait(timeout=timeout / 10 + 10)
    assert rv == 0 or rv["StatusCode"] == 0, f"Command {command} failed"
    logs = c.logs(stdout=True).decode("utf-8")
    LOGGER.debug(logs)
    expected_file = f"{output_dir}/{test_file}.md"
    assert expected_file in logs, f"Expected file {expected_file} not generated"


@@ -27,9 +27,10 @@ from pyspark.sql import SparkSession
 spark = SparkSession.builder.master('local').getOrCreate()
 sc = spark.sparkContext
-# Do something to prove it works
-rdd = sc.parallelize(range(100))
+# Sum of the first 100 whole numbers
+rdd = sc.parallelize(range(100 + 1))
 rdd.sum()
+# 5050
 ```
 #### In R
@@ -42,26 +43,32 @@ library(SparkR)
 # Spark session & context
 sc <- sparkR.session("local")
-# Do something to prove it works
-data(iris)
-df <- as.DataFrame(iris)
-head(filter(df, df$Petal_Width > 0.2))
+# Sum of the first 100 whole numbers
+sdf <- createDataFrame(list(1:100))
+dapplyCollect(sdf,
+              function(x)
+              { x <- sum(x)}
+              )
+# 5050
 ```
 In a R notebook with [sparklyr][sparklyr].
 ```R
 library(sparklyr)
-library(dplyr)
+# Spark configuration
+conf <- spark_config()
+# Set the catalog implementation in-memory
+conf$spark.sql.catalogImplementation <- "in-memory"
 # Spark session & context
-sc <- spark_connect(master = "local")
+sc <- spark_connect(master = "local", config = conf)
-# Do something to prove it works
-iris_tbl <- copy_to(sc, iris)
-iris_tbl %>%
-  filter(Petal_Width > 0.2) %>%
-  head()
+# Sum of the first 100 whole numbers
+sdf_len(sc, 100, repartition = 1) %>%
+    spark_apply(function(e) sum(e))
+# 5050
 ```
 #### In Scala
@@ -78,9 +85,10 @@ launcher.master = "local"
 ```
 ```scala
-// Do something to prove it works
+// Sum of the first 100 whole numbers
 val rdd = sc.parallelize(0 to 100)
 rdd.sum()
+// 5050
 ```
 ##### In an Apache Toree Kernel
@@ -88,9 +96,10 @@ rdd.sum()
 Apache Toree instantiates a local `SparkContext` for you in variable `sc` when the kernel starts.
 ```scala
-// do something to prove it works
+// Sum of the first 100 whole numbers
 val rdd = sc.parallelize(0 to 100)
 rdd.sum()
+// 5050
 ```
 ### Connecting to a Spark Cluster in Standalone Mode
@@ -120,9 +129,10 @@ from pyspark.sql import SparkSession
 spark = SparkSession.builder.master('spark://master:7077').getOrCreate()
 sc = spark.sparkContext
-# Do something to prove it works
-rdd = sc.parallelize(range(100))
+# Sum of the first 100 whole numbers
+rdd = sc.parallelize(range(100 + 1))
 rdd.sum()
+# 5050
 ```
 #### In R
@@ -135,26 +145,31 @@ library(SparkR)
 # Spark session & context
 sc <- sparkR.session("spark://master:7077")
-# Do something to prove it works
-data(iris)
-df <- as.DataFrame(iris)
-head(filter(df, df$Petal_Width > 0.2))
+# Sum of the first 100 whole numbers
+sdf <- createDataFrame(list(1:100))
+dapplyCollect(sdf,
+              function(x)
+              { x <- sum(x)}
+              )
+# 5050
 ```
 In a R notebook with [sparklyr][sparklyr].
 ```R
 library(sparklyr)
-library(dplyr)
 # Spark session & context
-sc <- spark_connect(master = "spark://master:7077")
+# Spark configuration
+conf <- spark_config()
+# Set the catalog implementation in-memory
+conf$spark.sql.catalogImplementation <- "in-memory"
+sc <- spark_connect(master = "spark://master:7077", config = conf)
-# Do something to prove it works
-iris_tbl <- copy_to(sc, iris)
-iris_tbl %>%
-  filter(Petal_Width > 0.2) %>%
-  head()
+# Sum of the first 100 whole numbers
+sdf_len(sc, 100, repartition = 1) %>%
+    spark_apply(function(e) sum(e))
+# 5050
 ```
 #### In Scala
@@ -171,9 +186,10 @@ launcher.master = "spark://master:7077"
 ```
 ```scala
-// Do something to prove it works
+// Sum of the first 100 whole numbers
 val rdd = sc.parallelize(0 to 100)
 rdd.sum()
+// 5050
 ```
 ##### In an Apache Toree Scala Notebook
@@ -193,9 +209,10 @@ Note that this is the same information expressed in a notebook in the Python cas
 // should print the value of --master in the kernel spec
 println(sc.master)
-// do something to prove it works
+// Sum of the first 100 whole numbers
 val rdd = sc.parallelize(0 to 100)
 rdd.sum()
+// 5050
 ```
 ## Tensorflow