Commit 559e0ab8 authored by Daniele Venzano's avatar Daniele Venzano

First commit

parents
This diff is collapsed.
#!/bin/sh
# Build, push and pre-pull the Spark Docker images for every role
# (master, worker, shell, submit).
# Requires gen_dockerfiles.py in the cwd, a registry at $REGISTRY and a
# remote Docker daemon at $REMOTE_DOCKER.
set -e

SPARK_VER=1.4.1
HADOOP_VER=hadoop2.4
REGISTRY=10.0.0.2:5000
REMOTE_DOCKER=10.0.0.2:2380

# Regenerate the per-role Dockerfiles from the Jinja templates.
./gen_dockerfiles.py "$SPARK_VER" "$HADOOP_VER"

for d in master worker shell submit; do
  image="$REGISTRY/venza/spark-$d:$SPARK_VER"
  # Build inside the role directory (each holds its own Dockerfile and
  # files/); the subshell keeps the cwd unchanged and a failed cd aborts
  # instead of building the wrong context.
  ( cd "$d" && docker build -t "$image" . )
  docker push "$image"
  # Pre-pull on the remote daemon so containers start without delay.
  docker -H "$REMOTE_DOCKER" pull "$image"
done
#!/usr/bin/env bash
# Strip the first line of /etc/hosts (the container-name alias Docker
# adds); Spark otherwise resolves the hostname to that alias and binds
# to the wrong address. See https://issues.apache.org/jira/browse/SPARK-6680
set -e
tmp=$(mktemp)
sed '1d' /etc/hosts > "$tmp"
# /etc/hosts is bind-mounted inside the container: it cannot be replaced
# with mv, only overwritten in place.
cat "$tmp" > /etc/hosts
rm -f -- "$tmp"
# Pin every Spark service to a fixed port so the containers can be
# reached through a firewall / static port mapping (by default Spark
# picks ephemeral ports at startup).
spark.driver.port 7001
spark.fileserver.port 7002
spark.broadcast.port 7003
spark.replClassServer.port 7004
spark.blockManager.port 7005
spark.executor.port 7006
spark.ui.port 4040
# NOTE(review): Http broadcast presumably chosen so broadcast traffic
# uses the fixed spark.broadcast.port above — confirm against Spark docs.
spark.broadcast.factory org.apache.spark.broadcast.HttpBroadcastFactory
#!/usr/bin/env bash
# Launch the Spark standalone master, bound to the container's own
# address (the first entry in /etc/hosts). Extra args are forwarded.
set -e

# First field of the first /etc/hosts line is this container's IP.
SPARK_MASTER_IP=$(awk 'NR==1 {print $1}' /etc/hosts)
export SPARK_MASTER_IP
export SPARK_LOCAL_IP="$SPARK_MASTER_IP"
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=8080

cd /opt/spark
# exec replaces this wrapper so the JVM is PID 1 and receives signals.
exec ./bin/spark-class org.apache.spark.deploy.master.Master \
  --host "$SPARK_MASTER_IP" --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT" \
  "$@"
#!/usr/bin/env bash
# Interactive spark-shell against the standalone master.
# Required env: SPARK_MASTER_IP, SPARK_EXECUTOR_RAM.
set -e
: "${SPARK_MASTER_IP:?SPARK_MASTER_IP must be set}"
: "${SPARK_EXECUTOR_RAM:?SPARK_EXECUTOR_RAM must be set}"
cd /opt/spark/
# exec replaces the wrapper so the JVM receives signals directly.
exec ./bin/spark-shell --master "spark://${SPARK_MASTER_IP}:7077" --executor-memory "${SPARK_EXECUTOR_RAM}" "$@"
#!/usr/bin/env bash
# Launch a Spark standalone worker and register it with the master at
# $SPARK_MASTER_IP. Cores/RAM default to 4 / 4g.
set -e
: "${SPARK_MASTER_IP:?SPARK_MASTER_IP must be set}"
cd /opt/spark
# Bind to the container's own IP (first entry in /etc/hosts).
SPARK_LOCAL_IP=$(awk 'NR==1 {print $1}' /etc/hosts)
export SPARK_LOCAL_IP
# exec replaces this wrapper so the JVM is PID 1 and receives signals.
exec ./bin/spark-class org.apache.spark.deploy.worker.Worker \
  "spark://${SPARK_MASTER_IP}:7077" --cores "${SPARK_WORKER_CORES:-4}" --memory "${SPARK_WORKER_RAM:-4g}" \
  -h "$SPARK_LOCAL_IP" \
  "$@"
#!/usr/bin/env bash
# Submit a Spark application to the standalone master; all remaining
# arguments are passed through to spark-submit.
# Required env: SPARK_MASTER_IP, SPARK_EXECUTOR_RAM.
set -e
: "${SPARK_MASTER_IP:?SPARK_MASTER_IP must be set}"
: "${SPARK_EXECUTOR_RAM:?SPARK_EXECUTOR_RAM must be set}"
cd /opt/spark/
# Space-separated option value for consistency with start-shell.sh
# (the original used --executor-memory=VALUE).
exec ./bin/spark-submit --master "spark://${SPARK_MASTER_IP}:7077" --executor-memory "${SPARK_EXECUTOR_RAM}" "$@"
#!/usr/bin/python
"""Generate the per-role Dockerfiles (master, worker, shell, submit)
from the Jinja2 templates in templates/ and copy the helper scripts
each image needs into <role>/files/.

Usage: gen_dockerfiles.py <spark version> <hadoop version>
"""
import sys
import shutil
from jinja2 import Environment, FileSystemLoader

# Role directory -> helper scripts copied into <role>/files/.
# Each role also has a matching templates/<role>.tmpl.
ROLES = {
    "master": ["remove_alias.sh", "start-master.sh"],
    "worker": ["remove_alias.sh", "start-worker.sh"],
    "shell": ["remove_alias.sh", "start-shell.sh"],
    "submit": ["remove_alias.sh", "submit.sh"],
}


def copyfile(fname, dest):
    """Copy files/<fname> into <dest>/files/<fname>."""
    shutil.copy("files/" + fname, dest + "/files/" + fname)


def main():
    if len(sys.argv) < 3:
        print("Usage: {} <spark version> <hadoop version>".format(sys.argv[0]))
        sys.exit(1)

    ji_env = Environment(loader=FileSystemLoader('templates'))
    # The common head (base image, Java, Spark download) is shared by all
    # role images; each role template only contributes its EXPOSE/CMD tail.
    common_tmpl = ji_env.get_template('common.tmpl')
    common = common_tmpl.render(spark_version=sys.argv[1],
                                hadoop_version=sys.argv[2])

    for role, files in ROLES.items():
        role_tmpl = ji_env.get_template(role + '.tmpl')
        # Context manager guarantees the Dockerfile is flushed and closed
        # (the original left the file handle dangling).
        with open(role + "/Dockerfile", "w") as fp:
            fp.write(common + "\n" + role_tmpl.render())
        for f in files:
            copyfile(f, role)

    # Notebook image generation is not wired up yet.
    # nb_tmpl = ji_env.get_template('notebook.tmpl')
    # nb = nb_tmpl.render()
    # open("notebook/Dockerfile", "w").write(nb)


if __name__ == '__main__':
    main()
# GENERATED FILE: produced by gen_dockerfiles.py from templates/common.tmpl
# plus templates/master.tmpl — edit the templates, not this file.
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
# Spark/Hadoop build selected at image-generation time.
ENV SPARK_VERSION 1.4.1
ENV HADOOP_VERSION hadoop2.4
# Tooling needed to add the WebUpd8 Oracle Java PPA.
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
# Pre-accept the Oracle license so the installer runs non-interactively.
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
# NOTE(review): the WebUpd8 Oracle Java 7 PPA is unmaintained — verify the
# package still resolves before rebuilding.
RUN apt-get update && apt-get -y install oracle-java7-installer oracle-java7-set-default curl
# Stream the prebuilt Spark tarball straight into /opt (no temp file).
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xz -C /opt/
WORKDIR /opt
# Version-independent path: /opt/spark -> the unpacked distribution.
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
# Helper scripts (start-master.sh, remove_alias.sh) copied by the generator.
COPY files/* /opt/
# 8080 = master web UI, 7077 = cluster port workers connect to.
EXPOSE 8080 7077
CMD /opt/start-master.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
\ No newline at end of file
#!/usr/bin/env bash
# Strip the first line of /etc/hosts (the container-name alias Docker
# adds); Spark otherwise resolves the hostname to that alias and binds
# to the wrong address. See https://issues.apache.org/jira/browse/SPARK-6680
set -e
tmp=$(mktemp)
sed '1d' /etc/hosts > "$tmp"
# /etc/hosts is bind-mounted inside the container: it cannot be replaced
# with mv, only overwritten in place.
cat "$tmp" > /etc/hosts
rm -f -- "$tmp"
#!/usr/bin/env bash
# Interactive spark-shell for a container linked to the master; the
# master address comes from Docker's SPARK_MASTER_PORT_7077_TCP_ADDR
# link variable.
set -e
: "${SPARK_MASTER_PORT_7077_TCP_ADDR:?master link variable must be set}"
# Bind to the container's own IP (first entry in /etc/hosts).
SPARK_LOCAL_IP=$(awk 'NR==1 {print $1}' /etc/hosts)
export SPARK_LOCAL_IP
/remove_alias.sh # problems with hostname alias, see https://issues.apache.org/jira/browse/SPARK-6680
# BUG FIX(review): was `cd /opt/`, but Spark lives in /opt/spark
# (SPARK_HOME), so the relative ./bin/spark-shell path never resolved.
cd /opt/spark
# NOTE(review): `-i` is passed the local IP here; for the Scala REPL -i
# normally names a file to preload — confirm the intended flag.
exec ./bin/spark-shell \
  --master "spark://${SPARK_MASTER_PORT_7077_TCP_ADDR}:7077" \
  -i "${SPARK_LOCAL_IP}" \
  "$@"
#!/usr/bin/env bash
# Launch the Spark standalone master, bound to the container's own
# address (the first entry in /etc/hosts). Extra args are forwarded.
set -e

# First field of the first /etc/hosts line is this container's IP.
SPARK_MASTER_IP=$(awk 'NR==1 {print $1}' /etc/hosts)
export SPARK_MASTER_IP
export SPARK_LOCAL_IP="$SPARK_MASTER_IP"
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=8080

cd /opt/spark
# exec replaces this wrapper so the JVM is PID 1 and receives signals.
exec ./bin/spark-class org.apache.spark.deploy.master.Master \
  --host "$SPARK_MASTER_IP" --port "$SPARK_MASTER_PORT" --webui-port "$SPARK_MASTER_WEBUI_PORT" \
  "$@"
# GENERATED FILE: produced by gen_dockerfiles.py from templates/common.tmpl
# plus templates/shell.tmpl — edit the templates, not this file.
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
# Spark/Hadoop build selected at image-generation time.
ENV SPARK_VERSION 1.4.1
ENV HADOOP_VERSION hadoop2.4
# Tooling needed to add the WebUpd8 Oracle Java PPA.
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
# Pre-accept the Oracle license so the installer runs non-interactively.
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java7-installer oracle-java7-set-default curl
# Stream the prebuilt Spark tarball straight into /opt (no temp file).
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xz -C /opt/
WORKDIR /opt
# Version-independent path: /opt/spark -> the unpacked distribution.
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
# Helper scripts (start-shell.sh, remove_alias.sh) copied by the generator.
COPY files/* /opt/
CMD /opt/start-shell.sh
#ADD start-master.sh /start-master.sh
#ADD scripts/start-worker /start-worker.sh
#ADD scripts/spark-shell.sh /spark-shell.sh
#ADD spark-defaults.conf /spark-defaults.conf
#ADD remove_alias.sh /remove_alias.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
\ No newline at end of file
#!/usr/bin/env bash
# Strip the first line of /etc/hosts (the container-name alias Docker
# adds); Spark otherwise resolves the hostname to that alias and binds
# to the wrong address. See https://issues.apache.org/jira/browse/SPARK-6680
set -e
tmp=$(mktemp)
sed '1d' /etc/hosts > "$tmp"
# /etc/hosts is bind-mounted inside the container: it cannot be replaced
# with mv, only overwritten in place.
cat "$tmp" > /etc/hosts
rm -f -- "$tmp"
#!/usr/bin/env bash
# Interactive spark-shell against the standalone master.
# Required env: SPARK_MASTER_IP, SPARK_EXECUTOR_RAM.
set -e
: "${SPARK_MASTER_IP:?SPARK_MASTER_IP must be set}"
: "${SPARK_EXECUTOR_RAM:?SPARK_EXECUTOR_RAM must be set}"
cd /opt/spark/
# exec replaces the wrapper so the JVM receives signals directly.
exec ./bin/spark-shell --master "spark://${SPARK_MASTER_IP}:7077" --executor-memory "${SPARK_EXECUTOR_RAM}" "$@"
# GENERATED FILE: produced by gen_dockerfiles.py from templates/common.tmpl
# plus templates/submit.tmpl — edit the templates, not this file.
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
# Spark/Hadoop build selected at image-generation time.
ENV SPARK_VERSION 1.4.1
ENV HADOOP_VERSION hadoop2.4
# Tooling needed to add the WebUpd8 Oracle Java PPA.
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
# Pre-accept the Oracle license so the installer runs non-interactively.
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java7-installer oracle-java7-set-default curl
# Stream the prebuilt Spark tarball straight into /opt (no temp file).
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xz -C /opt/
WORKDIR /opt
# Version-independent path: /opt/spark -> the unpacked distribution.
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
# Helper scripts (submit.sh, remove_alias.sh) copied by the generator.
COPY files/* /opt/
CMD /opt/submit.sh
#ADD start-master.sh /start-master.sh
#ADD scripts/start-worker /start-worker.sh
#ADD scripts/spark-shell.sh /spark-shell.sh
#ADD spark-defaults.conf /spark-defaults.conf
#ADD remove_alias.sh /remove_alias.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
\ No newline at end of file
#!/usr/bin/env bash
# Strip the first line of /etc/hosts (the container-name alias Docker
# adds); Spark otherwise resolves the hostname to that alias and binds
# to the wrong address. See https://issues.apache.org/jira/browse/SPARK-6680
set -e
tmp=$(mktemp)
sed '1d' /etc/hosts > "$tmp"
# /etc/hosts is bind-mounted inside the container: it cannot be replaced
# with mv, only overwritten in place.
cat "$tmp" > /etc/hosts
rm -f -- "$tmp"
#!/usr/bin/env bash
# Submit a Spark application to the standalone master; all remaining
# arguments are passed through to spark-submit.
# Required env: SPARK_MASTER_IP, SPARK_EXECUTOR_RAM.
set -e
: "${SPARK_MASTER_IP:?SPARK_MASTER_IP must be set}"
: "${SPARK_EXECUTOR_RAM:?SPARK_EXECUTOR_RAM must be set}"
cd /opt/spark/
# Space-separated option value for consistency with start-shell.sh
# (the original used --executor-memory=VALUE).
exec ./bin/spark-submit --master "spark://${SPARK_MASTER_IP}:7077" --executor-memory "${SPARK_EXECUTOR_RAM}" "$@"
# templates/common.tmpl — shared Dockerfile head for every role image.
# Rendered by gen_dockerfiles.py with spark_version / hadoop_version.
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
ENV SPARK_VERSION {{ spark_version }}
ENV HADOOP_VERSION {{ hadoop_version }}
# Tooling needed to add the WebUpd8 Oracle Java PPA.
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
# Pre-accept the Oracle license so the installer runs non-interactively.
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
# NOTE(review): the WebUpd8 Oracle Java 7 PPA is unmaintained — verify the
# package still resolves before rebuilding.
RUN apt-get update && apt-get -y install oracle-java7-installer oracle-java7-set-default curl
# Stream the prebuilt Spark tarball straight into /opt (no temp file).
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xz -C /opt/
WORKDIR /opt
# Version-independent path: /opt/spark -> the unpacked distribution.
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
# templates/master.tmpl — master-role tail appended after common.tmpl.
# Helper scripts (start-master.sh, remove_alias.sh) from files/.
COPY files/* /opt/
# 8080 = master web UI, 7077 = cluster port workers connect to.
EXPOSE 8080 7077
CMD /opt/start-master.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
# templates/shell.tmpl — interactive-shell-role tail appended after
# common.tmpl. Helper scripts (start-shell.sh, remove_alias.sh) from files/.
COPY files/* /opt/
CMD /opt/start-shell.sh
#ADD start-master.sh /start-master.sh
#ADD scripts/start-worker /start-worker.sh
#ADD scripts/spark-shell.sh /spark-shell.sh
#ADD spark-defaults.conf /spark-defaults.conf
#ADD remove_alias.sh /remove_alias.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
# templates/submit.tmpl — job-submission-role tail appended after
# common.tmpl. Helper scripts (submit.sh, remove_alias.sh) from files/.
COPY files/* /opt/
CMD /opt/submit.sh
#ADD start-master.sh /start-master.sh
#ADD scripts/start-worker /start-worker.sh
#ADD scripts/spark-shell.sh /spark-shell.sh
#ADD spark-defaults.conf /spark-defaults.conf
#ADD remove_alias.sh /remove_alias.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
# templates/worker.tmpl — worker-role tail appended after common.tmpl.
# Helper scripts (start-worker.sh, remove_alias.sh) from files/.
COPY files/* /opt/
# 8081 = worker web UI, 4040 = application UI.
EXPOSE 8081 4040
CMD /opt/start-worker.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8888 8081 4040 7001 7002 7003 7004 7005 7006
# GENERATED FILE: produced by gen_dockerfiles.py from templates/common.tmpl
# plus templates/worker.tmpl — edit the templates, not this file.
FROM ubuntu:14.04
MAINTAINER Daniele Venzano <venza@brownhat.org>
# Spark/Hadoop build selected at image-generation time.
ENV SPARK_VERSION 1.4.1
ENV HADOOP_VERSION hadoop2.4
# Tooling needed to add the WebUpd8 Oracle Java PPA.
RUN apt-get update && apt-get install -y --force-yes software-properties-common python-software-properties
RUN apt-add-repository -y ppa:webupd8team/java
# Pre-accept the Oracle license so the installer runs non-interactively.
RUN /bin/echo debconf shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
RUN apt-get update && apt-get -y install oracle-java7-installer oracle-java7-set-default curl
# Stream the prebuilt Spark tarball straight into /opt (no temp file).
RUN curl -s http://mirrors.ircam.fr/pub/apache/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${HADOOP_VERSION}.tgz | tar -xz -C /opt/
WORKDIR /opt
# Version-independent path: /opt/spark -> the unpacked distribution.
RUN ln -s spark-${SPARK_VERSION}-bin-${HADOOP_VERSION} spark
ENV SPARK_HOME /opt/spark
ENV PATH /opt/spark/bin:/opt/spark/sbin:${PATH}
# Helper scripts (start-worker.sh, remove_alias.sh) copied by the generator.
COPY files/* /opt/
# 8081 = worker web UI, 4040 = application UI.
EXPOSE 8081 4040
CMD /opt/start-worker.sh
#ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
#ENV SPARK_MASTER_PORT 7077
#ENV SPARK_MASTER_WEBUI_PORT 8080
#ENV SPARK_WORKER_PORT 8888
#ENV SPARK_WORKER_WEBUI_PORT 8081
#EXPOSE 8888 8081 4040 7001 7002 7003 7004 7005 7006
\ No newline at end of file
#!/usr/bin/env bash
# Strip the first line of /etc/hosts (the container-name alias Docker
# adds); Spark otherwise resolves the hostname to that alias and binds
# to the wrong address. See https://issues.apache.org/jira/browse/SPARK-6680
set -e
tmp=$(mktemp)
sed '1d' /etc/hosts > "$tmp"
# /etc/hosts is bind-mounted inside the container: it cannot be replaced
# with mv, only overwritten in place.
cat "$tmp" > /etc/hosts
rm -f -- "$tmp"
#!/usr/bin/env bash
# Launch a Spark standalone worker and register it with the master at
# $SPARK_MASTER_IP. Cores/RAM default to 4 / 4g.
set -e
: "${SPARK_MASTER_IP:?SPARK_MASTER_IP must be set}"
cd /opt/spark
# Bind to the container's own IP (first entry in /etc/hosts).
SPARK_LOCAL_IP=$(awk 'NR==1 {print $1}' /etc/hosts)
export SPARK_LOCAL_IP
# exec replaces this wrapper so the JVM is PID 1 and receives signals.
exec ./bin/spark-class org.apache.spark.deploy.worker.Worker \
  "spark://${SPARK_MASTER_IP}:7077" --cores "${SPARK_WORKER_CORES:-4}" --memory "${SPARK_WORKER_RAM:-4g}" \
  -h "$SPARK_LOCAL_IP" \
  "$@"
#!/bin/sh
# Spin up a Spark standalone cluster on a Docker Swarm: one master plus
# $WORKER_COUNT workers, then optionally attach an interactive shell
# (--shell) or run a sample wordcount submission (--submit).
set -e

SWARM_MANAGER=10.0.0.2:2380
REGISTRY=10.0.0.2:5000
MASTER_IMAGE=$REGISTRY/venza/spark-master:1.4.1
WORKER_IMAGE=$REGISTRY/venza/spark-worker:1.4.1
SHELL_IMAGE=$REGISTRY/venza/spark-shell:1.4.1
SUBMIT_IMAGE=$REGISTRY/venza/spark-submit:1.4.1
WORKER_COUNT=3
WORKER_RAM=8g
WORKER_CORES=4

# Start the master detached and discover the IP the workers must join.
MASTER_ID=$(docker -H "$SWARM_MANAGER" run -d "$MASTER_IMAGE")
MASTER_IP=$(docker -H "$SWARM_MANAGER" inspect --format '{{ .NetworkSettings.IPAddress }}' "$MASTER_ID")
echo "Spark master is at $MASTER_IP"

for w in $(seq "$WORKER_COUNT"); do
  docker -H "$SWARM_MANAGER" run -e SPARK_MASTER_IP="$MASTER_IP" -e SPARK_WORKER_RAM="$WORKER_RAM" -e SPARK_WORKER_CORES="$WORKER_CORES" -d "$WORKER_IMAGE"
done

# BUG FIX(review): the original used `==` inside [ ], a bashism that
# /bin/sh (dash on Ubuntu) rejects; POSIX test uses a single `=`.
if [ "${1:-}" = "--shell" ]; then
  docker -H "$SWARM_MANAGER" run -i -t -e SPARK_MASTER_IP="$MASTER_IP" -e SPARK_EXECUTOR_RAM="$WORKER_RAM" "$SHELL_IMAGE"
fi
if [ "${1:-}" = "--submit" ]; then
  docker -H "$SWARM_MANAGER" run --rm -i -t -e SPARK_MASTER_IP="$MASTER_IP" -e SPARK_EXECUTOR_RAM="$WORKER_RAM" -v /mnt/cephfs/temp/spark-apps:/apps "$SUBMIT_IMAGE" /opt/submit.sh /apps/wordcount.py hdfs://192.168.45.157/datasets/gutenberg_big_2x.txt hdfs://192.168.45.157/tmp/cntwdc1
fi
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment