Commit a41be45b authored by Francesco Pace

Update Conda and Jupyter Lab
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
.idea/
state.zoe
/zoe*.conf
zoepass.csv
stages:
  - build
  - test
  - push
  - pull
  - deploy

variables:
  VERSION: $CI_PIPELINE_ID
  REPOSITORY: zapps

image: docker:latest

before_script:
  - mkdir -p $HOME/.docker
  - echo $DOCKER_AUTH_CONFIG > $HOME/.docker/config.json

build:spark2-jupyter-notebook:
  stage: build
  script:
    - docker build -t ${REPOSITORY}/spark2-jupyter-notebook-aml:${VERSION} docker/spark2-jupyter-notebook

build:spark2-master:
  stage: build
  script:
    - docker build -t ${REPOSITORY}/spark2-master-aml:${VERSION} docker/spark2-master

build:spark2-worker:
  stage: build
  script:
    - docker build -t ${REPOSITORY}/spark2-worker-aml:${VERSION} docker/spark2-worker

build:json:
  image: python:3-alpine
  stage: build
  script:
    - python gen_json.py
  artifacts:
    paths:
      - aml-lab-zapp.json

test:json:
  image: python:3-alpine
  stage: test
  before_script:
    - pip install requests
  script:
    - python /scripts/validate.py ${ZOE_VALIDATION_URL} aml-lab-zapp.json

push:all:
  stage: push
  script:
    - /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-master-aml:${VERSION}
    - /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-worker-aml:${VERSION}
    - /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-jupyter-notebook-aml:${VERSION}

push:hub:
  stage: push
  script:
    - docker push ${REPOSITORY}/spark2-master-aml:${VERSION}
    - docker push ${REPOSITORY}/spark2-worker-aml:${VERSION}
    - docker push ${REPOSITORY}/spark2-jupyter-notebook-aml:${VERSION}
  only:
    - master

deploy:json:
  stage: deploy
  script:
    - mkdir -p ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}
    - cp *.json ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}/
    - cp logo.png ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}/
    - cp README*.md ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}/
  artifacts:
    paths:
      - "*.json"
      - logo.png
      - "README*.md"
  only:
    - master
# Spark ZApp
URL: [https://gitlab.eurecom.fr/zoe-apps/zapp-spark](https://gitlab.eurecom.fr/zoe-apps/zapp-spark)
Combine the full power of a distributed [Apache Spark](http://spark.apache.org) cluster with Python Jupyter Notebooks.
The Spark shell can be used from the built-in terminal in the notebook ZApp.
Spark is configured in standalone, distributed mode. This ZApp contains Spark version 2.2.0.
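As a quick check from the notebook's built-in terminal, the command below submits an interactive shell to the standalone master. This is a minimal sketch: it assumes Zoe exports the master address as `SPARK_MASTER_IP` (as documented for the notebook image below) and that the master listens on the default port 7077; the memory fallback is illustrative only.

```bash
# SPARK_MASTER_IP and SPARK_EXECUTOR_RAM are assumed to be set in the container
# environment by Zoe; adjust the port and memory values for your deployment.
spark-shell \
  --master spark://${SPARK_MASTER_IP}:7077 \
  --executor-memory ${SPARK_EXECUTOR_RAM:-4g}
```

From the resulting Scala prompt, something like `sc.parallelize(1 to 1000).count()` confirms that executors are being scheduled on the workers.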
# Jupyter Notebook image
This image contains the Jupyter Notebook configured with Python and a Spark client. It is used by Zoe, the Container Analytics as a
Service system, to create on-demand notebooks connected to containerized Spark clusters.
Zoe can be found at: https://github.com/DistributedSystemsGroup/zoe
## Setup
The Dockerfile runs a start script that configures the Notebook using these environment variables (a usage sketch follows the list):
* SPARK\_MASTER\_IP: IP address of the Spark master this notebook should use for its kernel
* PROXY\_ID: string to use as a prefix for URL paths, for reverse proxying
* SPARK\_EXECUTOR\_RAM: How much RAM to use for each executor spawned by the notebook
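For local testing outside Zoe, the variables above can be passed directly to `docker run`. A minimal sketch, assuming the image name built by the CI pipeline (`zapps/spark2-jupyter-notebook-aml`); the master address, proxy prefix, memory value and image tag are illustrative only:

```bash
# The notebook listens on 8888 and the Spark UI on 4040, matching the ports
# exposed by the Dockerfile; all values below are placeholders.
docker run -it --rm \
  -e SPARK_MASTER_IP=10.0.0.10 \
  -e PROXY_ID=nb-demo \
  -e SPARK_EXECUTOR_RAM=4g \
  -p 8888:8888 -p 4040:4040 \
  zapps/spark2-jupyter-notebook-aml:latest
```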
# Spark Scala master image
This image contains the Spark master process. It is used by Zoe, the Container Analytics as a
Service system, to create on-demand Spark clusters in Spark standalone mode.
Zoe can be found at: https://github.com/DistributedSystemsGroup/zoe
## Setup
The Dockerfile automatically starts the Spark master process when the container is run.
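For a quick standalone test outside Zoe, the container can be started directly. A minimal sketch, assuming the image name built by the CI pipeline (`zapps/spark2-master-aml`) with an illustrative tag:

```bash
# Ports 7077 (cluster) and 8080 (web UI) match the defaults set in the Dockerfile.
docker run -d --name spark-master \
  -p 7077:7077 -p 8080:8080 \
  zapps/spark2-master-aml:latest
```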
# Spark worker image
This image contains the Spark worker process. It is used by Zoe, the Container Analytics as a
Service system, to create on-demand Spark clusters in standalone mode.
Zoe can be found at: https://github.com/DistributedSystemsGroup/zoe
## Setup
The Dockerfile starts the worker process when the container is run. The following options can be passed via environment variables (a usage sketch follows the list):
* SPARK\_MASTER\_IP: IP address of the Spark master this worker should connect to
* SPARK\_WORKER\_RAM: How much RAM the worker can use (default is 4g)
* SPARK\_WORKER\_CORES: How many cores can be used by the worker process (default is 4)
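For local testing outside Zoe, the same variables can be passed to `docker run`. A minimal sketch, assuming the image name built by the CI pipeline (`zapps/spark2-worker-aml`); the master address and image tag are illustrative, and the RAM/core values are the documented defaults:

```bash
# SPARK_MASTER_IP must point at a running standalone master.
docker run -d --name spark-worker \
  -e SPARK_MASTER_IP=10.0.0.10 \
  -e SPARK_WORKER_RAM=4g \
  -e SPARK_WORKER_CORES=4 \
  zapps/spark2-worker-aml:latest
```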
#!/usr/bin/env bash
set -e
if [ ! -d docker ]; then
    exit
fi
REPOSITORY=${REPOSITORY:-zapps}
DOCKER_REGISTRY=${DOCKER_REGISTRY:-docker-registry:5000}
VERSION=${VERSION:-`date +%Y%m%d%H%M%S`}
built_images=''
for d in `find docker -mindepth 1 -maxdepth 1 -type d -printf '%f '`; do
    pushd docker/${d}
    docker build -t ${DOCKER_REGISTRY}/${REPOSITORY}/${d}:${VERSION} .
    docker push ${DOCKER_REGISTRY}/${REPOSITORY}/${d}:${VERSION}
    popd
    built_images+="${DOCKER_REGISTRY}/${REPOSITORY}/${d}:${VERSION}\n"
done
echo "-------------END SCRIPT-----------------"
echo "Images built:"
printf "${built_images}"
echo
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java8-installer oracle-java8-set-default curl
ARG SPARK_VERSION
ENV SPARK_VERSION ${SPARK_VERSION:-2.2.0}
ARG HADOOP_VERSION
ENV HADOOP_VERSION ${HADOOP_VERSION:-hadoop2.6}
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle/
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xvz -C /opt/
WORKDIR /opt
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
wget \
build-essential \
python-dev \
ca-certificates \
bzip2 \
pandoc \
libopenblas-dev \
libjpeg-dev \
&& apt-get clean
RUN locale-gen en_US.UTF-8
# Configure environment
ENV CONDA_DIR /opt/conda
ENV HADOOP_HOME /opt/hadoop
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $HADOOP_HOME/bin:$CONDA_DIR/bin:$PATH
ENV SHELL /bin/bash
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip
ENV PYSPARK_PYTHON=/opt/conda/bin/python
RUN cd /tmp && \
mkdir -p $CONDA_DIR && \
wget http://repo.continuum.io/miniconda/Miniconda3-4.4.10-Linux-x86_64.sh && \
/bin/bash Miniconda3-4.4.10-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
rm Miniconda3-4.4.10-Linux-x86_64.sh && \
$CONDA_DIR/bin/conda install --yes conda==4.4.10
# Install Python 3 packages
RUN conda install --yes \
'pandas=0.17*' \
'matplotlib=1.4*' \
'scipy=0.16*' \
'seaborn=0.6*' \
'scikit-learn=0.16*' \
'statsmodels=0.6.1' \
&& conda clean -yt
ENV HADOOP_VERSION_DL 2.6.5
RUN curl http://apache.mirrors.ovh.net/ftp.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION_DL}/hadoop-${HADOOP_VERSION_DL}.tar.gz | tar -xz -C /opt/
RUN ln -s /opt/hadoop-${HADOOP_VERSION_DL} /opt/hadoop
# Install Tini
RUN wget --quiet https://github.com/krallin/tini/releases/download/v0.6.0/tini && \
echo "d5ed732199c36a1189320e6c4859f0169e950692f451c03e7854243b95f4234b *tini" | sha256sum -c - && \
mv tini /usr/local/bin/tini && \
chmod +x /usr/local/bin/tini
RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
git \
vim \
unzip \
libsm6 \
texlive-latex-base \
texlive-latex-extra \
texlive-fonts-extra \
texlive-fonts-recommended \
texlive-generic-recommended \
sudo \
locales \
libxrender1 \
&& apt-get clean
# Setup nbuser home directory
RUN ln -s /mnt/workspace /root/work && \
mkdir /root/.jupyter && \
mkdir /root/.local
RUN /opt/conda/bin/pip install jupyterlab widgetsnbextension && \
jupyter serverextension enable --py jupyterlab --sys-prefix
COPY files/spark-defaults.conf /opt/spark-defaults.conf
# Configure container startup as root
EXPOSE 4040 8888
WORKDIR /root/work
ENTRYPOINT ["tini", "--"]
CMD ["start-notebook.sh"]
# Add local files as late as possible to avoid cache busting
COPY files/start-notebook.sh /usr/local/bin/
RUN chmod 755 /usr/local/bin/start-notebook.sh
COPY files/jupyter_notebook_config.py /root/.jupyter/
RUN mkdir -p /root/.ipython/profile_default/startup/
COPY files/00-pyspark-setup.py /root/.ipython/profile_default/startup/
COPY files/core-site.xml /opt
COPY files/hdfs-site.xml /opt
# Configure the necessary Spark environment
import os
# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/opt/conda/bin/python'
if 'SPARK_MASTER' in os.environ:
    import pyspark
    conf = pyspark.SparkConf()
    # point to the standalone master (e.g., spark://10.10.10.10:7077)
    conf.setMaster(os.environ["SPARK_MASTER"])
    # set other options as desired
    conf.set("spark.executor.memory", os.environ["SPARK_EXECUTOR_RAM"])
    # create the context
    sc = pyspark.SparkContext(conf=conf)
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://XXX_NAMENODE_HOST:8020/</value>
  </property>
</configuration>
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///mnt/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file:///mnt/datanode</value>
  </property>
</configuration>
# Copyright (c) Jupyter Development Team.
from jupyter_core.paths import jupyter_data_dir
import subprocess
import os
import errno
import stat
PEM_FILE = os.path.join(jupyter_data_dir(), 'notebook.pem')
c = get_config()
c.NotebookApp.ip = '*'
c.NotebookApp.port = 8888
c.NotebookApp.open_browser = False
# Set a certificate if USE_HTTPS is set to any value
if 'USE_HTTPS' in os.environ:
    if not os.path.isfile(PEM_FILE):
        # Ensure PEM_FILE directory exists
        dir_name = os.path.dirname(PEM_FILE)
        try:
            os.makedirs(dir_name)
        except OSError as exc:  # Python >2.5
            if exc.errno == errno.EEXIST and os.path.isdir(dir_name):
                pass
            else:
                raise
        # Generate a certificate if one doesn't exist on disk
        subprocess.check_call(['openssl', 'req', '-new',
                               '-newkey', 'rsa:2048', '-days', '365', '-nodes', '-x509',
                               '-subj', '/C=XX/ST=XX/L=XX/O=generated/CN=generated',
                               '-keyout', PEM_FILE, '-out', PEM_FILE])
        # Restrict access to PEM_FILE
        os.chmod(PEM_FILE, stat.S_IRUSR | stat.S_IWUSR)
    c.NotebookApp.certfile = PEM_FILE
# Set a password if PASSWORD is set
if 'PASSWORD' in os.environ:
    from IPython.lib import passwd
    c.NotebookApp.password = passwd(os.environ['PASSWORD'])
    del os.environ['PASSWORD']
spark.jars /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar,/opt/spark/org.apache.commons_commons-csv-1.2.jar,/opt/spark/com.univocity_univocity-parsers-1.5.6.jar
spark.driver.memory XXX_DRIVER_MEMORY
spark.ui.reverseProxy true
#!/usr/bin/env bash
set -x
cat /opt/spark-defaults.conf | sed -e "s/XXX_DRIVER_MEMORY/$SPARK_DRIVER_RAM/" > ${SPARK_HOME}/conf/spark-defaults.conf
cat /opt/core-site.xml | sed -e "s/XXX_NAMENODE_HOST/$NAMENODE_HOST/" > ${HADOOP_HOME}/etc/hadoop/core-site.xml
cp /opt/hdfs-site.xml ${HADOOP_HOME}/etc/hadoop/
HADOOP_USER_NAME=root /opt/hadoop/bin/hdfs dfs -mkdir /user/$NB_USER
HADOOP_USER_NAME=root /opt/hadoop/bin/hdfs dfs -chown $NB_USER /user/$NB_USER
HADOOP_USER_NAME=root /opt/hadoop/bin/hdfs dfs -chmod 750 /user/$NB_USER
cd $HOME/work
exec jupyter lab --no-browser --allow-root --NotebookApp.token='' "$@"
exit
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java8-installer oracle-java8-set-default curl
ARG SPARK_VERSION
ENV SPARK_VERSION ${SPARK_VERSION:-2.2.0}
ARG HADOOP_VERSION
ENV HADOOP_VERSION ${HADOOP_VERSION:-hadoop2.6}
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle/
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xvz -C /opt/
WORKDIR /opt
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
COPY files/* /opt/
RUN chmod +x /opt/*.sh
EXPOSE 8080 7077
ENV SPARK_MASTER_PORT 7077
ENV SPARK_MASTER_WEBUI_PORT 8080
CMD /opt/start-master.sh
spark.jars /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar,/opt/spark/org.apache.commons_commons-csv-1.2.jar,/opt/spark/com.univocity_univocity-parsers-1.5.6.jar
spark.driver.memory XXX_DRIVER_MEMORY
spark.ui.reverseProxy true
#!/usr/bin/env bash
if [ -z "${SPARK_MASTER_IP}" ]; then
export SPARK_MASTER_IP=`awk 'NR==1 {print $1}' /etc/hosts`
fi
cat /opt/spark-defaults.conf | sed -e "s/XXX_DRIVER_MEMORY/$SPARK_DRIVER_RAM/" > ${SPARK_HOME}/conf/spark-defaults.conf
cd /opt/spark
./bin/spark-class org.apache.spark.deploy.master.Master --host $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT "$@"
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java8-installer oracle-java8-set-default curl
ARG SPARK_VERSION
ENV SPARK_VERSION ${SPARK_VERSION:-2.2.0}
ARG HADOOP_VERSION
ENV HADOOP_VERSION ${HADOOP_VERSION:-hadoop2.6}
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle/
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xvz -C /opt/
WORKDIR /opt
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
wget \
build-essential \
python-dev \
ca-certificates \
bzip2 \
pandoc \
libopenblas-dev \
libjpeg-dev \
&& apt-get clean
RUN locale-gen en_US.UTF-8
# Configure environment
ENV CONDA_DIR /opt/conda
ENV HADOOP_HOME /opt/hadoop
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $HADOOP_HOME/bin:$CONDA_DIR/bin:$PATH
ENV SHELL /bin/bash
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip
ENV PYSPARK_PYTHON=/opt/conda/bin/python
RUN cd /tmp && \
mkdir -p $CONDA_DIR && \
wget http://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh && \
/bin/bash Miniconda3-4.2.12-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
rm Miniconda3-4.2.12-Linux-x86_64.sh && \
$CONDA_DIR/bin/conda install --yes conda==4.2.12
# Install Python 3 packages
RUN conda install --yes \
'pandas=0.17*' \
'matplotlib=1.4*' \
'scipy=0.16*' \
'seaborn=0.6*' \
'scikit-learn=0.16*' \
&& conda clean -yt
COPY files/* /opt/
RUN chmod +x /opt/*.sh
EXPOSE 8888 8081
ENV SPARK_WORKER_PORT 8888
ENV SPARK_WORKER_WEBUI_PORT 8081
CMD /opt/start-worker.sh
spark.jars /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar,/opt/spark/org.apache.commons_commons-csv-1.2.jar,/opt/spark/com.univocity_univocity-parsers-1.5.6.jar
spark.ui.reverseProxy true