Commit 654eb1d1 authored by Daniele Venzano's avatar Daniele Venzano

Add Spark submit service and generate the submit zapp

parent 1d957c01
Pipeline #4026 failed
@@ -19,7 +19,7 @@ images:
- python gen_json.py
artifacts:
paths:
- zapp.json
- "*.json"
only:
- master
# Spark Jupyter ZApp
# Spark ZApp
Jupyter Notebook configured to run PySpark. `spark-submit` and the Spark shell can be used from the built-in terminal.
\ No newline at end of file
This ZApp provides a PySpark Jupyter Notebook and traditional Spark submit jobs. The Spark shell can be used from the built-in terminal in the notebook ZApp.
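For illustration, here is a hedged sketch of submitting a job from that built-in terminal. The master hostname follows the `{dns_name#spark-master0}` placeholder used by the ZApp generator further down, and `hdfs-namenode.zoe` is the generator's default namenode; these, like the input and output paths, are assumptions to adapt to your deployment.

```bash
# Sketch only: run from the notebook ZApp's built-in terminal.
# The master hostname and the HDFS paths are illustrative assumptions.
spark-submit --master spark://spark-master0:7077 \
    wordcount.py \
    hdfs://hdfs-namenode.zoe/datasets/input.txt \
    hdfs://hdfs-namenode.zoe/tmp/wcount-out
```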
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java8-installer oracle-java8-set-default curl
ARG SPARK_VERSION
ENV SPARK_VERSION ${SPARK_VERSION:-2.1.0}
ARG HADOOP_VERSION
ENV HADOOP_VERSION ${HADOOP_VERSION:-hadoop2.6}
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle/
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xvz -C /opt/
WORKDIR /opt
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
RUN apt-get update && apt-get install -y --force-yes --no-install-recommends \
wget \
build-essential \
python-dev \
ca-certificates \
bzip2 \
pandoc \
libopenblas-dev \
libjpeg-dev \
&& apt-get clean
RUN locale-gen en_US.UTF-8
# Configure environment
ENV CONDA_DIR /opt/conda
ENV HADOOP_HOME /opt/hadoop
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $HADOOP_HOME/bin:$CONDA_DIR/bin:$PATH
ENV SHELL /bin/bash
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.4-src.zip
ENV PYSPARK_PYTHON=/opt/conda/bin/python
RUN cd /tmp && \
mkdir -p $CONDA_DIR && \
wget http://repo.continuum.io/miniconda/Miniconda3-4.2.12-Linux-x86_64.sh && \
/bin/bash Miniconda3-4.2.12-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
rm Miniconda3-4.2.12-Linux-x86_64.sh && \
$CONDA_DIR/bin/conda install --yes conda==4.2.12
# Install Python 3 packages
RUN conda install --yes \
'pandas=0.17*' \
'matplotlib=1.4*' \
'scipy=0.16*' \
'seaborn=0.6*' \
'scikit-learn=0.16*' \
'statsmodels=0.6.1' \
'pillow' \
'basemap' \
&& conda clean -yt
RUN /opt/conda/bin/pip install thunder-python showit
# Add Spark JARs
RUN curl http://central.maven.org/maven2/com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar -o /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar
RUN curl http://central.maven.org/maven2/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar -o /opt/spark/org.apache.commons_commons-csv-1.2.jar
RUN curl http://central.maven.org/maven2/com/univocity/univocity-parsers/1.5.6/univocity-parsers-1.5.6.jar -o /opt/spark/com.univocity_univocity-parsers-1.5.6.jar
ENV HADOOP_VERSION_DL 2.6.5
RUN curl http://apache.mirrors.ovh.net/ftp.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION_DL}/hadoop-${HADOOP_VERSION_DL}.tar.gz | tar -xz -C /opt/
RUN ln -s /opt/hadoop-${HADOOP_VERSION_DL} /opt/hadoop
COPY files/* /opt/
RUN chmod +x /opt/*.sh
EXPOSE 4040
ENV ZOE_WORKSPACE /mnt/workspace
ENV HADOOP_HOME /opt/hadoop
VOLUME /mnt/workspace
WORKDIR /mnt/workspace
ENTRYPOINT ["/opt/submit.sh"]
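As a reference point, a hedged sketch of building this image with the build arguments declared above. The `spark2-submit` tag mirrors the SUBMIT_IMAGE name used by gen_json.py; the registry and repository parts are placeholders, not values from this repository.

```bash
# Sketch only: build the submit image with the default Spark/Hadoop versions.
# Registry and repository names below are placeholders.
docker build \
    --build-arg SPARK_VERSION=2.1.0 \
    --build-arg HADOOP_VERSION=hadoop2.6 \
    -t my-registry/my-repo/spark2-submit:latest .
```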
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://XXX_NAMENODE_HOST:8020/</value>
</property>
</configuration>
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///mnt/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:///mnt/datanode</value>
</property>
</configuration>
spark.jars /opt/spark/com.databricks_spark-csv_2.10-1.3.0.jar,/opt/spark/org.apache.commons_commons-csv-1.2.jar,/opt/spark/com.univocity_univocity-parsers-1.5.6.jar
spark.driver.memory XXX_DRIVER_MEMORY
spark.ui.reverseProxy true
#!/usr/bin/env bash
set -e
cat /opt/spark-defaults.conf | sed -e "s/XXX_DRIVER_MEMORY/$SPARK_DRIVER_RAM/" > ${SPARK_HOME}/conf/spark-defaults.conf
cat /opt/core-site.xml | sed -e "s/XXX_NAMENODE_HOST/$NAMENODE_HOST/" > ${HADOOP_HOME}/etc/hadoop/core-site.xml
cp /opt/hdfs-site.xml ${HADOOP_HOME}/etc/hadoop/
cd $ZOE_WORKSPACE
echo 'Configuration done, starting Spark...'
/opt/spark/bin/spark-submit --master ${SPARK_MASTER} --executor-memory=${SPARK_EXECUTOR_RAM} "$@"
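The entrypoint takes all of its configuration from environment variables and forwards the container command verbatim to spark-submit through `"$@"`. A hedged sketch of the environment the generator below provides, with illustrative values:

```bash
# Sketch only: the environment the spark-submit service receives from Zoe
# (see the generator's 'environment' list below); all values are illustrative.
export SPARK_MASTER="spark://spark-master0:7077"
export SPARK_EXECUTOR_RAM=11274289152     # bytes, derived from the worker memory limit
export SPARK_DRIVER_RAM=2147483648        # bytes, substituted into spark-defaults.conf
export NAMENODE_HOST=hdfs-namenode.zoe    # substituted into core-site.xml
export HADOOP_USER_NAME=someuser
# Inside the container, the service 'command' becomes the entrypoint arguments:
/opt/submit.sh wordcount.py hdfs://hdfs-namenode.zoe/input.txt hdfs://hdfs-namenode.zoe/wcount-out
```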
@@ -35,6 +35,10 @@ options = {
'value': 12 * (1024**3),
'description': 'Notebook memory limit (bytes)'
},
'submit_mem_limit': {
'value': 12 * (1024**3),
'description': 'Spark submit memory limit (bytes)'
},
'worker_cores': {
'value': 6,
'description': 'Cores used by each worker'
@@ -46,6 +50,10 @@ options = {
'hdfs_namenode': {
'value': 'hdfs-namenode.zoe',
'description': 'Namenode hostname'
},
'submit_command': {
'value': 'wordcount.py hdfs://192.168.45.157/datasets/gutenberg_big_2x.txt hdfs://192.168.45.157/tmp/wcount-out',
'description': 'Spark submit command line'
}
}
@@ -56,6 +64,7 @@ VERSION = os.getenv("VERSION", default="latest")
MASTER_IMAGE = REGISTRY + "/" + REPOSITORY + "/spark2-master:" + VERSION
WORKER_IMAGE = REGISTRY + "/" + REPOSITORY + "/spark2-worker:" + VERSION
NOTEBOOK_IMAGE = REGISTRY + "/" + REPOSITORY + "/spark2-jupyter-notebook:" + VERSION
SUBMIT_IMAGE = REGISTRY + "/" + REPOSITORY + "/spark2-submit:" + VERSION
def spark_master_service(mem_limit):
"""
@@ -187,7 +196,49 @@ def spark_jupyter_notebook_service(mem_limit, worker_mem_limit, hdfs_namenode):
'volumes': [],
'total_count': 1,
'essential_count': 1,
'startup_order': 0
'startup_order': 0,
'command': None
}
return service
def spark_submit_service(mem_limit, worker_mem_limit, hdfs_namenode, command):
"""
:type mem_limit: int
:type worker_mem_limit: int
:type hdfs_namenode: str
:type command: str
:rtype: dict
"""
executor_ram = worker_mem_limit - (1024 ** 3) - (512 * 1024 ** 2)
driver_ram = (2 * 1024 ** 3)
service = {
'name': "spark-submit",
'image': SUBMIT_IMAGE,
'monitor': True,
'required_resources': {
"memory": {
"min": mem_limit,
"max": mem_limit
},
"cores": {
'min': None,
'max': None
}
},
'ports': [],
'environment': [
["SPARK_MASTER", "spark://{dns_name#spark-master0}:7077"],
["SPARK_EXECUTOR_RAM", str(executor_ram)],
["SPARK_DRIVER_RAM", str(driver_ram)],
["HADOOP_USER_NAME", "{user_name}"],
["PYTHONHASHSEED", "42"],
['NAMENODE_HOST', hdfs_namenode]
],
'volumes': [],
'total_count': 1,
'essential_count': 1,
'startup_order': 2,
'command': command
}
return service
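The executor memory is derived from the worker memory limit by reserving 1 GiB plus 512 MiB of headroom, while the driver gets a fixed 2 GiB. A hedged check of the arithmetic, assuming an illustrative 12 GiB worker limit:

```bash
# Sketch only: how the generator derives the Spark memory settings,
# assuming an illustrative 12 GiB worker memory limit.
worker_mem_limit=$(( 12 * 1024**3 ))                            # 12884901888 bytes
executor_ram=$(( worker_mem_limit - 1024**3 - 512 * 1024**2 ))  # 11274289152 bytes (~10.5 GiB)
driver_ram=$(( 2 * 1024**3 ))                                   # 2147483648 bytes (2 GiB)
echo "SPARK_EXECUTOR_RAM=$executor_ram SPARK_DRIVER_RAM=$driver_ram"
```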
@@ -196,6 +247,7 @@ if __name__ == '__main__':
sp_master = spark_master_service(options['master_mem_limit']['value'])
sp_worker = spark_worker_service(options['worker_count']['value'], options['worker_mem_limit']['value'], options['worker_cores']['value'])
jupyter = spark_jupyter_notebook_service(options['notebook_mem_limit']['value'], options['worker_mem_limit']['value'], options['hdfs_namenode']['value'])
sp_submit = spark_submit_service(options['submit_mem_limit']['value'], options['worker_mem_limit']['value'], options['hdfs_namenode']['value'], options['submit_command']['value'])
app = {
'name': APP_NAME,
@@ -209,6 +261,21 @@
]
}
json.dump(app, open("zapp.json", "w"), sort_keys=True, indent=4)
print("ZApp written to zapp.json")
json.dump(app, open("spark-jupyter.json", "w"), sort_keys=True, indent=4)
app = {
'name': APP_NAME,
'version': ZOE_APPLICATION_DESCRIPTION_VERSION,
'will_end': False,
'size': 512,
'services': [
sp_master,
sp_worker,
sp_submit
]
}
json.dump(app, open("spark-submit.json", "w"), sort_keys=True, indent=4)
print("Two ZApps written")