Commit f237c65e authored by Daniele Venzano

Update Spark ZApp to Spark 2.2.2 and to the Zoe base images

parent 866bcd5d
Pipeline #10380 passed with stages in 23 minutes
@@ -6,7 +6,7 @@ stages:
- deploy
variables:
VERSION: $CI_PIPELINE_ID
VERSION: 10378
REPOSITORY: zapps
image: docker:latest
@@ -15,34 +15,20 @@ before_script:
- mkdir -p $HOME/.docker
- echo $DOCKER_AUTH_CONFIG > $HOME/.docker/config.json
build:spark2-jupyter-notebook:
build:spark-jupyter-notebook:
stage: build
script:
- docker build -t ${REPOSITORY}/spark2-jupyter-notebook:${VERSION} docker/spark2-jupyter-notebook
- docker build --build-arg VERSION=${VERSION} -t ${REPOSITORY}/spark-jupyter-notebook:${VERSION} spark-jupyter-notebook
build:spark2-master:
build:spark-master:
stage: build
script:
- docker build -t ${REPOSITORY}/spark2-master:${VERSION} docker/spark2-master
- docker build --build-arg VERSION=${VERSION} -t ${REPOSITORY}/spark-master:${VERSION} spark-master
build:spark2-worker:
build:spark-worker:
stage: build
script:
- docker build -t ${REPOSITORY}/spark2-worker:${VERSION} docker/spark2-worker
build:spark2-submit:
stage: build
script:
- docker build -t ${REPOSITORY}/spark2-submit:${VERSION} docker/spark2-submit
build:json:
image: python:3-alpine
stage: build
script:
- python gen_json.py
artifacts:
paths:
- "spark*.json"
- docker build --build-arg VERSION=${VERSION} -t ${REPOSITORY}/spark-worker:${VERSION} spark-worker
test:json:
image: python:3-alpine
@@ -50,40 +36,22 @@ test:json:
before_script:
- pip install requests
script:
- python /scripts/validate.py ${ZOE_VALIDATION_URL} spark*.json
push:all:
stage: push
script:
- /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-submit:${VERSION}
- /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-master:${VERSION}
- /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-worker:${VERSION}
- /scripts/distribute_docker_image.sh ${REPOSITORY}/spark2-jupyter-notebook:${VERSION}
only:
- master
- python gen_json.py
- python /scripts/validate.py ${ZOE_VALIDATION_URL} spark.json
artifacts:
paths:
- spark.json
- manifest.json
- logo.png
- README.md
- images
push:hub:
stage: push
script:
- docker push ${REPOSITORY}/spark2-submit:${VERSION}
- docker push ${REPOSITORY}/spark2-master:${VERSION}
- docker push ${REPOSITORY}/spark2-worker:${VERSION}
- docker push ${REPOSITORY}/spark2-jupyter-notebook:${VERSION}
only:
- master
deploy:json:
stage: deploy
script:
- mkdir -p ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}
- cp *.json ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}/
- cp logo.png ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}/
- cp README*.md ${ZAPP_SHOP_BASE_PATH}/${CI_PROJECT_NAME}/
artifacts:
paths:
- "*.json"
- logo.png
- "README*.md"
- docker push ${REPOSITORY}/spark-master:${VERSION}
- docker push ${REPOSITORY}/spark-worker:${VERSION}
- docker push ${REPOSITORY}/spark-jupyter-notebook:${VERSION}
only:
- master
# Jupyter Notebook image
This image contains the Jupyter Notebook configured with Python and a Spark client. It is used by Zoe, the Container Analytics as a
Service system, to create on-demand notebooks connected to containerized Spark clusters.
Zoe can be found at: https://github.com/DistributedSystemsGroup/zoe
## Setup
The Dockerfile runs a start script that configures the Notebook using these environment variables (a sketch of how they could be consumed follows the list):
* SPARK\_MASTER\_IP: IP address of the Spark master this notebook should use for its kernel
* PROXY\_ID: string to use as a prefix for URL paths, for reverse proxying
* SPARK\_EXECUTOR\_RAM: How much RAM to use for each executor spawned by the notebook
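A minimal sketch of how a start script could consume these variables (hypothetical illustration only; the actual script shipped in the image may differ):

    # hypothetical: point the default Spark configuration at the master and size the executors
    echo "spark.master           spark://${SPARK_MASTER_IP}:7077" >> ${SPARK_HOME}/conf/spark-defaults.conf
    echo "spark.executor.memory  ${SPARK_EXECUTOR_RAM}"           >> ${SPARK_HOME}/conf/spark-defaults.conf
    # expose the notebook under the reverse-proxy prefix
    exec jupyter notebook --no-browser --ip=0.0.0.0 --NotebookApp.base_url="/${PROXY_ID}"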
# Spark Scala master image
This image contains the Spark master process (Scala). It is used by Zoe, the Container Analytics as a
Service system, to create on-demand Spark clusters in Spark standalone mode.
Zoe can be found at: https://github.com/DistributedSystemsGroup/zoe
## Setup
The Dockerfile automatically starts the Spark master process when the container is run.
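For reference, the master is started with a `spark-class` command equivalent to the one documented in the Spark ZApp README in this repository:

    # Spark standalone master
    ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.master.Master \
        --host ${SPARK_MASTER_IP} --port 7077 --webui-port 8080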
# Spark worker image
This image contains the Spark worker process. It is used by Zoe, the Container Analytics as a
Service system, to create on-demand Spark clusters in standalone mode.
Zoe can be found at: https://github.com/DistributedSystemsGroup/zoe
## Setup
The Dockerfile starts the Spark worker process when the container is run. The following options can be passed via environment variables (the resulting start command is sketched after the list):
* SPARK\_MASTER\_IP: IP address of the Spark master this worker should connect to
* SPARK\_WORKER\_RAM: How much RAM the worker can use (default is 4g)
* SPARK\_WORKER\_CORES: How many cores can be used by the worker process (default is 4)
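For reference, the worker is started with the same `spark-class` invocation documented in the Spark ZApp README in this repository (SPARK\_LOCAL\_IP is optional and defaults to 127.0.0.1):

    # Spark standalone worker, pointed at the master and sized via the variables above
    /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker \
        spark://${SPARK_MASTER_IP}:7077 \
        --cores ${SPARK_WORKER_CORES} --memory ${SPARK_WORKER_RAM} \
        -h ${SPARK_LOCAL_IP:-127.0.0.1}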
# Spark ZApp
URL: [https://gitlab.eurecom.fr/zoe-apps/zapp-spark](https://gitlab.eurecom.fr/zoe-apps/zapp-spark)
Combine the full power of a distributed [Apache Spark](http://spark.apache.org) cluster with Python Jupyter Notebooks.
The Spark shell can be used from the built-in terminal in the notebook ZApp.
Spark is configured in stand-alone, distributed mode. This ZApp contains Spark version 2.1.0.
# Spark ZApp
URL: [https://gitlab.eurecom.fr/zoe-apps/zapp-spark](https://gitlab.eurecom.fr/zoe-apps/zapp-spark)
Traditional Spark submit jobs. Use the command-line parameter to specify which Python or JAR file to execute from your workspace.
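For example, the default command line configured in `gen_json.py` runs a word-count job from the workspace:

    # default submit command line from gen_json.py
    wordcount.py hdfs://192.168.45.157/datasets/gutenberg_big_2x.txt hdfs://192.168.45.157/tmp/wcount-out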
# Spark ZApp
URL: [https://gitlab.eurecom.fr/zoe-apps/zapp-spark](https://gitlab.eurecom.fr/zoe-apps/zapp-spark)
Combine the full power of a distributed [Apache Spark](http://spark.apache.org) cluster with Python Jupyter Notebooks.
The Spark shell can be used from the built-in terminal in the notebook ZApp.
Spark is configured in stand-alone, distributed mode. This ZApp contains Spark version 2.2.2.
## Changing the default configuration
When you start a kernel with this ZApp, a SparkContext is already created for you with a default configuration.
You can modify the executor RAM limit or add other options and re-create the context with the following code:
    import os
    import pyspark
    # stop the SparkContext created at startup before building a new one
    sc.stop()
    conf = pyspark.SparkConf()
    # default executor RAM: worker RAM minus 1.5 GiB of overhead
    spark_executor_ram = int(os.environ["SPARK_WORKER_RAM"]) - (1024 ** 3) - (512 * 1024 ** 2)
    conf.set("spark.executor.memory", str(spark_executor_ram))
    # set other options as desired, then create the new context
    sc = pyspark.SparkContext(conf=conf)
## Customizing the ZApp
### Workers
To run your own script (for example to install additional libraries on the worker nodes) you can override the default command specified in the JSON file, in the service section corresponding to the workers.
To start the worker correctly, you will need to use this command line at the end of your script:
    /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker \
        spark://${SPARK_MASTER_IP}:7077 --cores ${SPARK_WORKER_CORES} --memory ${SPARK_WORKER_RAM} \
        -h ${SPARK_LOCAL_IP:-127.0.0.1}
### Master
To run your own script you can override the default command specified in the JSON file, in the service section corresponding to the master.
To start the master correctly, you will need to use this command line at the end of your script:
    ${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.master.Master --host ${SPARK_MASTER_IP} --port 7077 --webui-port 8080
### Notebook and Spark submit
You can customize the command run by the notebook service, to install additional libraries before starting the notebook, or to transform the ZApp into a batch job, by calling spark-submit instead of jupyter.
If you want to run the notebook, at the end of your script call `/opt/start_notebook.sh`.
If you want to run spark-submit, you need to use:
    /opt/spark/bin/spark-submit --master spark://${SPARK_MASTER_IP}:7077 <the rest of the options>
Where the rest of the options could be, for example:
    wordcount.py hdfs://192.168.45.157/datasets/gutenberg_big_2x.txt hdfs://192.168.45.157/tmp/wcount-out
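A minimal sketch of a custom notebook-service command that installs an extra library and then submits a batch job instead of starting Jupyter (the library name and job arguments are only examples):

    #!/usr/bin/env bash
    # hypothetical custom command: add a dependency, then submit a batch job
    pip install nltk    # example extra library, not part of the base image
    /opt/spark/bin/spark-submit --master spark://${SPARK_MASTER_IP}:7077 \
        wordcount.py hdfs://192.168.45.157/datasets/gutenberg_big_2x.txt \
        hdfs://192.168.45.157/tmp/wcount-out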
#!/usr/bin/env bash
set -e
if [ ! -d docker ]; then
exit
fi
REPOSITORY=${REPOSITORY:-zapps}
DOCKER_REGISTRY=${DOCKER_REGISTRY:-docker-registry:5000}
VERSION=${VERSION:-`date +%Y%m%d%H%M%S`}
built_images=''
for d in `find docker -mindepth 1 -maxdepth 1 -type d -printf '%f '`; do
pushd docker/${d}
docker build -t ${DOCKER_REGISTRY}/${REPOSITORY}/${d}:${VERSION} .
docker push ${DOCKER_REGISTRY}/${REPOSITORY}/${d}:${VERSION}
popd
built_images+="${DOCKER_REGISTRY}/${REPOSITORY}/${d}:${VERSION}\n"
done
echo "-------------END SCRIPT-----------------"
echo "Images built:"
printf '%b' "${built_images}"
echo
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://XXX_NAMENODE_HOST:8020/</value>
</property>
</configuration>
# Copyright (c) Jupyter Development Team.
from jupyter_core.paths import jupyter_data_dir
import subprocess
import os
import errno
import stat
PEM_FILE = os.path.join(jupyter_data_dir(), 'notebook.pem')
c = get_config()
c.NotebookApp.ip = '*'
c.NotebookApp.port = 8888
c.NotebookApp.open_browser = False
# Set a certificate if USE_HTTPS is set to any value
if 'USE_HTTPS' in os.environ:
if not os.path.isfile(PEM_FILE):
# Ensure PEM_FILE directory exists
dir_name = os.path.dirname(PEM_FILE)
try:
os.makedirs(dir_name)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(dir_name):
pass
else: raise
# Generate a certificate if one doesn't exist on disk
subprocess.check_call(['openssl', 'req', '-new',
'-newkey', 'rsa:2048', '-days', '365', '-nodes', '-x509',
'-subj', '/C=XX/ST=XX/L=XX/O=generated/CN=generated',
'-keyout', PEM_FILE, '-out', PEM_FILE])
# Restrict access to PEM_FILE
os.chmod(PEM_FILE, stat.S_IRUSR | stat.S_IWUSR)
c.NotebookApp.certfile = PEM_FILE
# Set a password if PASSWORD is set
if 'PASSWORD' in os.environ:
from IPython.lib import passwd
c.NotebookApp.password = passwd(os.environ['PASSWORD'])
del os.environ['PASSWORD']
#!/usr/bin/env bash
set -x
cat /opt/spark-defaults.conf | sed -e "s/XXX_DRIVER_MEMORY/$SPARK_DRIVER_RAM/" > ${SPARK_HOME}/conf/spark-defaults.conf
cat /opt/core-site.xml | sed -e "s/XXX_NAMENODE_HOST/$NAMENODE_HOST/" > ${HADOOP_HOME}/etc/hadoop/core-site.xml
cp /opt/hdfs-site.xml ${HADOOP_HOME}/etc/hadoop/
HADOOP_USER_NAME=root /opt/hadoop/bin/hdfs dfs -mkdir /user/$NB_USER
HADOOP_USER_NAME=root /opt/hadoop/bin/hdfs dfs -chown $NB_USER /user/$NB_USER
HADOOP_USER_NAME=root /opt/hadoop/bin/hdfs dfs -chmod 750 /user/$NB_USER
cd $HOME/work
exec jupyter notebook --NotebookApp.token='' "$@"
exit
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java8-installer oracle-java8-set-default curl
ARG SPARK_VERSION
ENV SPARK_VERSION ${SPARK_VERSION:-2.1.0}
ARG HADOOP_VERSION
ENV HADOOP_VERSION ${HADOOP_VERSION:-hadoop2.6}
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle/
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xvz -C /opt/
WORKDIR /opt
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
wget \
build-essential \
python-dev \
ca-certificates \
bzip2 \
pandoc \
libopenblas-dev \
libjpeg-dev \
&& apt-get clean
RUN locale-gen en_US.UTF-8
# Configure environment
ENV CONDA_DIR /opt/conda
ENV HADOOP_HOME /opt/hadoop
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $HADOOP_HOME/bin:$CONDA_DIR/bin:$PATH
ENV SHELL /bin/bash
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip
ENV PYSPARK_PYTHON=/opt/conda/bin/python
RUN cd /tmp && \
mkdir -p $CONDA_DIR && \
wget http://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh && \
/bin/bash Miniconda3-4.2.12-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
rm Miniconda3-4.2.12-Linux-x86_64.sh && \
$CONDA_DIR/bin/conda install --yes conda==4.2.12
# Install Python 3 packages
RUN conda install --yes \
'pandas=0.17*' \
'matplotlib=1.4*' \
'scipy=0.16*' \
'seaborn=0.6*' \
'scikit-learn=0.16*' \
'statsmodels=0.6.1' \
'pillow' \
'basemap' \
&& conda clean -yt
RUN /opt/conda/bin/pip install thunder-python showit
# Add Spark JARs
RUN curl http://central.maven.org/maven2/com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar -o /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar
RUN curl http://central.maven.org/maven2/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar -o /opt/spark/org.apache.commons_commons-csv-1.2.jar
RUN curl http://central.maven.org/maven2/com/univocity/univocity-parsers/1.5.6/univocity-parsers-1.5.6.jar -o /opt/spark/com.univocity_univocity-parsers-1.5.6.jar
ENV HADOOP_VERSION_DL 2.6.5
RUN curl http://apache.mirrors.ovh.net/ftp.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION_DL}/hadoop-${HADOOP_VERSION_DL}.tar.gz | tar -xz -C /opt/
RUN ln -s /opt/hadoop-${HADOOP_VERSION_DL} /opt/hadoop
COPY files/* /opt/
RUN chmod +x /opt/*.sh
EXPOSE 4040
ENV ZOE_WORKSPACE /mnt/workspace
ENV HADOOP_HOME /opt/hadoop
VOLUME /mnt/workspace
WORKDIR /mnt/workspace
ENTRYPOINT ["/opt/submit.sh"]
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///mnt/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///mnt/datanode</value>
</property>
</configuration>
spark.jars /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar,/opt/spark/org.apache.commons_commons-csv-1.2.jar,/opt/spark/com.univocity_univocity-parsers-1.5.6.jar
spark.driver.memory XXX_DRIVER_MEMORY
spark.ui.reverseProxy true
#!/usr/bin/env bash
set -e
cat /opt/spark-defaults.conf | sed -e "s/XXX_DRIVER_MEMORY/$SPARK_DRIVER_RAM/" > ${SPARK_HOME}/conf/spark-defaults.conf
cat /opt/core-site.xml | sed -e "s/XXX_NAMENODE_HOST/$NAMENODE_HOST/" > ${HADOOP_HOME}/etc/hadoop/core-site.xml
cp /opt/hdfs-site.xml ${HADOOP_HOME}/etc/hadoop/
cd $ZOE_WORKSPACE
echo 'Configuration done, starting Spark...'
/opt/spark/bin/spark-submit --master spark://${SPARK_MASTER_IP}:7077 --executor-memory=${SPARK_EXECUTOR_RAM} "$@"
@@ -46,24 +46,15 @@ options = {
'worker_count': {
'value': 2,
'description': 'Number of workers'
},
'hdfs_namenode': {
'value': 'hdfs-namenode.zoe',
'description': 'Namenode hostname'
},
'submit_command': {
'value': 'wordcount.py hdfs://192.168.45.157/datasets/gutenberg_big_2x.txt hdfs://192.168.45.157/tmp/wcount-out',
'description': 'Spark submit command line'
}
}
REPOSITORY = os.getenv("REPOSITORY", default="zapps")
VERSION = os.getenv("VERSION", default="latest")
MASTER_IMAGE = REPOSITORY + "/spark2-master:" + VERSION
WORKER_IMAGE = REPOSITORY + "/spark2-worker:" + VERSION
NOTEBOOK_IMAGE = REPOSITORY + "/spark2-jupyter-notebook:" + VERSION
SUBMIT_IMAGE = REPOSITORY + "/spark2-submit:" + VERSION
MASTER_IMAGE = REPOSITORY + "/spark-master:" + VERSION
WORKER_IMAGE = REPOSITORY + "/spark-worker:" + VERSION
NOTEBOOK_IMAGE = REPOSITORY + "/spark-jupyter-notebook:" + VERSION
def spark_master_service(mem_limit):
"""
@@ -103,7 +94,7 @@ def spark_master_service(mem_limit):
'essential_count': 1,
'startup_order': 0,
'replicas': 1,
'command': None
'command': '/opt/start_master.sh'
}
return service
@@ -149,19 +140,18 @@ def spark_worker_service(count, mem_limit, cores):
'essential_count': 1,
'startup_order': 1,
'replicas': 1,
'command': None
'command': '/opt/start-worker.sh'
}
return service
def spark_jupyter_notebook_service(mem_limit, worker_mem_limit, hdfs_namenode):
def spark_jupyter_notebook_service(mem_limit, worker_mem_limit):
"""
:type mem_limit: int
:type worker_mem_limit: int
:type hdfs_namenode: str
:rtype: dict
"""
executor_ram = worker_mem_limit - (1024 ** 3) - (512 * 1024 ** 2)
driver_ram = (2 * 1024 ** 3)
service = {
'name': "spark-jupyter",
@@ -187,61 +177,16 @@ def spark_jupyter_notebook_service(mem_limit, worker_mem_limit, hdfs_namenode):
],
'environment': [
["SPARK_MASTER", "spark://{dns_name#spark-master0}:7077"],
["SPARK_EXECUTOR_RAM", str(executor_ram)],
["SPARK_DRIVER_RAM", str(driver_ram)],
["HADOOP_USER_NAME", "{user_name}"],
["NB_USER", "{user_name}"],
["PYTHONHASHSEED", "42"],
['NAMENODE_HOST', hdfs_namenode]
["PYTHONHASHSEED", "42"]
],
'volumes': [],
'total_count': 1,
'essential_count': 1,
'startup_order': 0,
'replicas': 1,
'command': None
}
return service
def spark_submit_service(mem_limit, worker_mem_limit, hdfs_namenode, command):
"""
:type mem_limit: int
:type worker_mem_limit: int
:type hdfs_namenode: str
:rtype: dict
"""
executor_ram = worker_mem_limit - (1024 ** 3) - (512 * 1025 ** 2)
driver_ram = (2 * 1024 ** 3)
service = {
'name': "spark-submit",
'image': SUBMIT_IMAGE,
'monitor': True,
'resources': {
"memory": {
"min": mem_limit,
"max": mem_limit
},
"cores": {
'min': 2,
'max': 2
}
},
'ports': [],
'environment': [
["SPARK_MASTER_IP", "{dns_name#spark-master0}"],
["SPARK_EXECUTOR_RAM", str(executor_ram)],
["SPARK_DRIVER_RAM", str(driver_ram)],
["HADOOP_USER_NAME", "{user_name}"],
["PYTHONHASHSEED", "42"],
['NAMENODE_HOST', hdfs_namenode]
],
'volumes': [],
'total_count': 1,
'essential_count': 1,
'startup_order': 2,
'replicas': 1,
'command': command
'command': 'jupyter lab --no-browser --NotebookApp.token=\'\' --allow-root --ip=0.0.0.0'
}
return service
@@ -249,8 +194,7 @@ def spark_submit_service(mem_limit, worker_mem_limit, hdfs_namenode, command):
if __name__ == '__main__':
sp_master = spark_master_service(options['master_mem_limit']['value'])
sp_worker = spark_worker_service(options['worker_count']['value'], options['worker_mem_limit']['value'], options['worker_cores']['value'])
jupyter = spark_jupyter_notebook_service(options['notebook_mem_limit']['value'], options['worker_mem_limit']['value'], options['hdfs_namenode']['value'])
sp_submit = spark_submit_service(options['submit_mem_limit']['value'], options['worker_mem_limit']['value'], options['hdfs_namenode']['value'], options['submit_command']['value'])
jupyter = spark_jupyter_notebook_service(options['notebook_mem_limit']['value'], options['worker_mem_limit']['value'])
app = {
'name': APP_NAME,
@@ -264,21 +208,12 @@ if __name__ == '__main__':
]
}
json.dump(app, open("spark-jupyter.json", "w"), sort_keys=True, indent=4)
app = {
'name': APP_NAME,
'version': ZOE_APPLICATION_DESCRIPTION_VERSION,
'will_end': False,
'size': 512,
'services': [
sp_master,