From a58e6afb9b6f507713598ddd899aa603bd1bea35 Mon Sep 17 00:00:00 2001
From: Giovanni Lanzani
Date: Mon, 12 Mar 2018 21:43:55 +0100
Subject: [PATCH] Add pyarrow installation

This way we can benefit from pandas UDFs
---
 pyspark-notebook/Dockerfile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pyspark-notebook/Dockerfile b/pyspark-notebook/Dockerfile
index 40a1ca59..8ae1b9c9 100644
--- a/pyspark-notebook/Dockerfile
+++ b/pyspark-notebook/Dockerfile
@@ -14,6 +14,12 @@ RUN apt-get -y update && \
     apt-get install --no-install-recommends -y openjdk-8-jre-headless ca-certificates-java && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+
+# Install pyarrow
+RUN conda install --quiet -y 'pyarrow' && \
+    fix-permissions $CONDA_DIR && \
+    fix-permissions /home/$NB_USER
+
 RUN cd /tmp && \
     wget -q http://apache.claz.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
     echo "258683885383480BA01485D6C6F7DC7CFD559C1584D6CEB7A3BBCF484287F7F57272278568F16227BE46B4F92591768BA3D164420D87014A136BF66280508B46 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
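
Not part of the patch: a minimal sketch of the kind of pandas UDF this change enables, assuming the image runs Spark >= 2.3 (where Arrow-backed pandas UDFs were introduced) with the pyarrow installed above. The session setup, the `plus_one` function, and the column names are illustrative only.

# Illustrative sketch, assuming Spark >= 2.3 and pyarrow available in the image.
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.appName("pandas-udf-check").getOrCreate()

# Scalar pandas UDF: receives a pandas Series per Arrow batch instead of
# processing rows one at a time through Python pickling.
@pandas_udf("long", PandasUDFType.SCALAR)
def plus_one(v):
    return v + 1

df = spark.range(5)
df.select(plus_one(df["id"]).alias("id_plus_one")).show()

Without pyarrow installed, calling a pandas UDF like this fails at runtime because Spark cannot perform the Arrow-based conversion between JVM rows and pandas Series.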