Commit 70c9822a authored by Daniele Venzano's avatar Daniele Venzano

Merge branch 'devel/master'

parents bc1d66b9 c0b0844d
......@@ -11,7 +11,7 @@ variables:
ZOE_COMMON_OPTIONS: --debug --deployment-name test${CI_BUILD_REF} --dbuser ${POSTGRES_USER} --dbhost postgres --dbport 5432 --dbname ${POSTGRES_DB} --dbpass ${POSTGRES_PASSWORD} --master-url tcp://localhost:4850 --auth-type text --listen-port 5100 --workspace-base-path /tmp
ZOE_COMMON_OPTIONS: --debug --deployment-name test${CI_BUILD_REF} --dbuser ${POSTGRES_USER} --dbhost postgres --dbport 5432 --dbname ${POSTGRES_DB} --dbpass ${POSTGRES_PASSWORD} --master-url tcp://localhost:4850 --listen-port 5100 --workspace-base-path /tmp --overlay-network-name bridge
# Jupyter Notebook with PyTorch
URL: []( and [](
* Jupyter Notebook 5.0.x
* Conda Python 3.x environment
* pandas, matplotlib, scipy, seaborn, scikit-learn, scikit-image, sympy, cython, patsy, statsmodel, cloudpickle, dill, numba, bokeh, vincent, beautifulsoup, xlrd pre-installed
* PyTorch
Please note that you need to retrieve the secret key from the service logs to be able to access the notebooks.
"name": "pytorch-notebook",
"services": [
"command": null,
"environment": [
"essential_count": 1,
"image": "zapps/pytorch:4761",
"monitor": true,
"name": "jupyter",
"ports": [
"name": "Jupyter Notebook interface",
"port_number": 8888,
"protocol": "tcp",
"url_template": "http://{ip_port}/"
"replicas": 1,
"resources": {
"cores": {
"max": 4,
"min": 4
"memory": {
"max": 4294967296,
"min": 4294967296
"startup_order": 0,
"total_count": 1,
"volumes": [],
"work_dir": "/mnt/workspace"
"size": 512,
"version": 3,
"will_end": false
\ No newline at end of file
"name": "tf-google-gpu",
"services": [
"command": null,
"environment": [
"essential_count": 1,
"image": "",
"labels": [
"monitor": true,
"name": "tf-jupyter",
"ports": [
"name": "Tensorboard web interface",
"port_number": 6006,
"protocol": "tcp",
"url_template": "http://{ip_port}/"
"name": "Notebook web interface",
"port_number": 8888,
"protocol": "tcp",
"url_template": "http://{ip_port}/"
"replicas": 1,
"resources": {
"cores": {
"max": 4,
"min": 4
"memory": {
"max": 34359738368,
"min": 34359738368
"startup_order": 0,
"total_count": 1,
"volumes": []
"size": 512,
"version": 3,
"will_end": false
\ No newline at end of file
......@@ -28,12 +28,12 @@ Workspaces:
* ``workspace-deployment-path`` : path appended to the ``workspace-base-path`` to distinguish this deployment. If left unspecified it is equal to the deployment name
* ``workspace-base-path = /mnt/zoe-workspaces`` : Base directory where user workspaces will be created. This directory should reside on a shared filesystem visible by all hosts where containers will be run.
* ``fs-group-id = 5001`` : Group ID to use for all Zoe users in workspace files
* ``influxdb-dbname = zoe`` : Name of the InfluxDB database to use for storing metrics
* ``influxdb-url = http://localhost:8086`` : URL of the InfluxDB service (ex. )
* ``influxdb-enable = False`` : Enable metric output toward influxDB
* ``kairosdb-enable = false`` : Enable gathering of usage metrics recorded in KairosDB
* ``kairosdb-url = http://localhost:8090`` : URL of KairosDB REST API
Service logs (see: :ref:`logging`):
......@@ -60,31 +60,24 @@ API options:
Master options:
* ``api-listen-uri = tcp://*:4850`` : ZeroMQ server connection string, used for the master listening endpoint
* ``kairosdb-enable = false`` : Enable gathering of usage metrics recorded in KairosDB
* ``kairosdb-url = http://localhost:8090`` : URL of KairosDB REST API
* ``overlay-network-name = zoe`` : name of the pre-configured Docker overlay network Zoe should use (Swarm backend)
* ``max-core-limit = 16`` : maximum amount of cores a user is able to reserve
* ``max-memory-limit = 64`` : maximum amount of memory a user is able to reserve
* ``no-user-edit-limits-web = False`` : if set to true, users are NOT allowed to modify ZApp reservations via the web interface
* ``additional-volumes = <none>`` : list of additional volumes to mount in every service, for every ZApp (ex. /mnt/data:data,/mnt/data_n:data_n)
* ``auth-type = text`` : Authentication type (text, ldap or ldapsasl)
* ``auth-file = zoepass.csv`` : Path to the CSV file containing user,pass,role lines for text authentication
* ``ldap-server-uri = ldap://localhost`` : LDAP server to use for user authentication
* ``ldap-bind-user = ou=something,dc=any,dc=local`` : LDAP user for binding to the server
* ``ldap-bind-password = mysecretpassword`` : Password for the bind user
* ``ldap-base-dn = ou=something,dc=any,dc=local`` : LDAP base DN for users
* ``ldap-admin-gid = 5000`` : LDAP group ID for admins
* ``ldap-user-gid = 5001`` : LDAP group ID for users
* ``ldap-guest-gid = 5002`` : LDAP group ID for guests
* ``ldap-group-name = gidNumber`` : LDAP user attribute that contains the group names/IDs
Scheduler options:
* ``scheduler-class = <ZoeElasticScheduler>`` : Scheduler class to use for scheduling ZApps (default: elastic scheduler)
* ``scheduler-policy = <FIFO | SIZE>`` : Scheduler policy to use for scheduling ZApps (default: FIFO)
* ``placement-policy = <waterfill | random | average>`` : how containers should be placed on hosts (default: average)
ZApp shop:
......@@ -92,15 +85,7 @@ ZApp shop:
Back-end choice:
* ``backend = <DockerEngine|Swarm|Kubernetes>`` : cluster back-end to use to run ZApps, default is DockerEngine
Swarm back-end options:
* ``backend-swarm-url = zk://zk1:2181,zk2:2181,zk3:2181`` : connection string to the Swarm API endpoint. Can be expressed by a plain http URL or as a zookeeper node list in case Swarm is configured for HA.
* ``backend-swarm-zk-path = /docker`` : ZooKeeper path used by Docker Swarm
* ``backend-swarm-tls-cert = cert.pem`` : Docker TLS certificate file
* ``backend-swarm-tls-key = key.pem`` : Docker TLS private key file
* ``backend-swarm-tls-ca = ca.pem`` : Docker TLS CA certificate file
* ``backend = <DockerEngine|Kubernetes>`` : cluster back-end to use to run ZApps, default is DockerEngine
Kubernetes back-end:
.. _contributing:
.. _contributing:
Contributing to Zoe
......@@ -38,7 +38,6 @@ Variables
To run the tests a number of variables need to be set from the GitLab interface:
* REGISTRY_PASSWORD: the password used for authenticating with the registry via docker login
* SSH_PRIVATE_KEY: private key to be used to deploy via rsync the staging build
* STAGING_IP: IP/hostname of the staging server
* WEB_STAGING_PATH: path for the web interface on the staging server
......@@ -7,51 +7,7 @@ As a developer you can:
- call Zoe from your own software: :ref:`Zoe REST API documentation <rest-api>`
- create or modify ZApps: :ref:`howto_zapp`
- contribute to Zoe: keep reading
Contributing to Zoe
Zoe is open source and all kinds of contributions are welcome.
Zoe is licensed under the terms of the Apache 2.0 license.
Bugs, issues and feature requests
`Zoe issue tracker <>`_
Testing beta code
The ``HEAD`` of the master branch represents the latest version of Zoe. Automatic tests are performed before code is merged into master, but human feedback is invaluable. Clone the repository and report on the `mailing list <>`_ or on the `issue tracker <>`_.
Code changes and pull requests
**When you contribute code, you affirm that the contribution is your original work and that you license the work to the project under the project’s open source license. Whether or not you state this explicitly, by submitting any copyrighted material via pull request, email, or other means you agree to license the material under the project’s open source license and warrant that you have the legal authority to do so.**
To contribute code and/or documentation you should follow this workflow:
1. announce your idea on the mailing list, to prevent duplicated work
2. fork the Zoe repository via GitHub (if you don't already have a fork)
3. ... develop and debug ...
4. when you are ready propose your changes with a pull request
Zoe maintainers will review your code, give constructive feedback and eventually accept the code and merge.
Contributors can setup their own CI pipeline following the quality guidelines (:ref:`quality`). At a bare minimum all code should be tested via the `` script available in the root of the repository. Accepted contributions will be run through the full Zoe CI pipeline before being merged in the public repository.
Repository contents
- `docs`: Sphinx documentation used to build these pages
- `scripts`: scripts for deployment and testing
- `zoe_api`: the front-end Zoe process that provides the REST API
- `zoe_cmd`: Command-line client
- `zoe_lib`: library, contains common modules needed by the api and the master processes
- `zoe_master`: the back-end Zoe process schedules and talks to the containerization system
- `contrib`: supervisord config files and sample ZApps
- contribute to Zoe: :ref:`contributing`
Internal module/class/method documentation
This diff is collapsed.
......@@ -50,10 +50,13 @@ Main documentation
:maxdepth: 1
Zoe applications
......@@ -61,7 +64,6 @@ Zoe applications
.. toctree::
:maxdepth: 1
......@@ -75,8 +77,6 @@ Development and contributing to the project
External resources
......@@ -99,21 +99,14 @@ Most of the ZApps expose a number of interfaces (web, REST and others) to the us
* use a proxy, like the one developed for Zoe: :ref:`proxy`
* use back-end network plugins to build custom topologies
Authentication back-end
Authentication back-ends
Zoe has a simple user model: users are authenticated against an external source of truth, that assigns also one of three roles:
Zoe supports multiple user authentication back-ends. Multiple back-ends can coexist at the same time.
* guest: cannot access the API (and the command-line tools) and can run one execution at a time
* user: can use the API and has no limits on executions
* admin: can operate on executions belonging to other users, can delete records of past executions
Check the :ref:`users` page for more details on the user model.
Zoe supports two authentication back-ends:
* LDAP and LDAP+SASL (``auth-type=ldap`` or ``auth-type=ldapsasl``)
* Text file (``auth-type=text``)
As most of Zoe, the authentication back-end is pluggable and others can be easily implemented.
Remember to disable or change the password of the default admin user.
.. _motivation:
The motivation behind Zoe
The fundamental idea of Zoe is that a user who wants to run data analytics applications should not be bothered by systems details, such as how to configure the amount of RAM a Spark Executor should use, how many cores are available in the system or even how many worker nodes should be used to meet an execution deadline.
Moreover final users require a lot of flexibility, they want to test new analytics systems and algorithms as soon as possible, without having to wait for some approval procedure to go through the IT department. Zoe proposes a flexible model for applications descriptions: their management can be left entirely to the final user, but they can also be prepared very quickly in all or in part by an IT department, who sometimes is more knowledgeable of resource limits and environment variables. We also plan to offer a number of building blocks (Zoe Frameworks) that can be composed to make Zoe Applications.
Finally we feel that there is a lack of solutions in the field of private clouds, where resources are not infinite and data layers (data-sets) may be shared between different users. All the current Open Source solutions we are aware of target the public cloud use case and try, more or less, to mimic what Amazon and other big names are doing in their data-centers.
Zoe strives to satisfy the following requirements:
* easy to use for the end-user
* easy to manage for the system administrator, easy to integrate in existing data-centers/clouds/VM deployments
* short (a few seconds) reaction times to user requests or other system events
* smart queuing and scheduling of applications when resources are critical
Kubernetes, OpenStack Sahara, Mesos and YARN are the projects that, each in its own way, come near Zoe, without solving the distributed analytics problem.
Kubernetes is a very complex system, both to deploy and to use. It takes some of the architectural principles from Google Borg and targets data centers with vast amounts of resources. We feel that while Kubernetes can certainly run analytic services in containers, it does so at a very high complexity cost for smaller setups. Moreover, certain scheduler choices in how preemption is managed do not apply well to environments with a limited set of users and compute resources, causing a less than optimal resource usage.
OpenStack Sahara
We know well `OpenStack Sahara <>`_, as we have been using it since 2013 and we contributed the Spark plugin. We feel that Sahara has limitations in:
* software support: Sahara plugins support a limited set of data-intensive frameworks, adding a new one means writing a new Sahara plugin and even adding support for a new version requires going through a one-two week (on average) review process.
* lack of scheduling: Sahara makes the assumption that you have infinite resources. When you try to launch a new cluster and there are not enough resources available, the request fails and the user is left doing application and resources scheduling by hand.
* usability: setting up everything that is needed to run an EDP job is cumbersome and error-prone. The user has to provide too many details in too many different places.
Moreover changes to Sahara need to go through a lengthy review process, that on one side tries to ensure high quality, but on the other side slows down development, especially of major architectural changes, like the ones needed to address the concerns listed above.
Mesos is marketing itself as a data-center operating system. Zoe has no such high profile objective: while Zoe schedules distributed applications, it has no knowledge of the applications it is scheduling and, even more importantly, does not require any change in the applications themselves to be run in Zoe.
Mesos requires that each application provides two Mesos-specific components: a scheduler and an executor. Zoe has no such requirements and runs applications unmodified.
YARN, from our point of view, has many similarities with Mesos. It requires application support. Moreover it is integrated in the Hadoop distribution and, while recent efforts are pushing toward making YARN stand up on its own, it is currently tailored for Hadoop applications.
.. _quotas:
Quotas enforce resource limits to users. A quota can be assigned to multiple users, but a user can have only one quota.
Quotas can be set on the following resources:
* concurrent_executions : maximum number of concurrent executions in an active state
* memory : maximum amount of memory a user can reserve in total, across all its active executions (not yet implemented)
* cores : maximum amount of cores a user can reserve in total, across all its active executions (not yet implemented)
A default quota is always available:
* name: default
* concurrent executions: 5
* memory: 32GB
* cores: 20
This default quota can be modified, but not deleted. More quotas can be created via the command.
.. _roles:
Roles in Zoe define the limits of what a user can do. A role can be assigned to multiple users, but a user can have only a single role.
The capabilities that can be turned on and off for a role are:
* can_see_status : can access the status page on the web interface
* can_change_config : can make changes to the configuration (add/delete/modify users, quotas and roles)
* can_operate_others : can operate on others' work (see and terminate other users' executions)
* can_delete_executions : can permanently delete executions and all the associated logs
* can_access_api : can access the REST API
* can_customize_resources : can use the web interface to modify resource reservations when starting ZApps from the shop
* can_access_full_zapp_shop : has access to all ZApps in the shop
By default three roles are created:
* admin : all capabilities are set
* superuser : has can_see_status, can_access_api, can_customize_resources and can_access_full_zapp_shop
* user : has no capabilities
Zoe will refuse to delete or modify the admin role.
.. _users:
Zoe has a flexible user management system. All users that need access to Zoe need to have an entry created in the Zoe user database through the command-line utility ( or the web interface.
When the entry is being created, the administrator can choose an authentication source, that can be different for each user. Currently the following sources are available:
* internal : the password is stored in Zoe
* LDAP(+SASL) : authentication is performed by contacting an external LDAP server
* textfile : the password is stored in a CSV file
* pam : authentication is performed by using the PAM subsystem of the operating system where the zoe-api process is running
More backends can be developed, the authentication system is designed to be pluggable.
Each user has a :ref:`roles` and a :ref:`quotas` associated.
By default Zoe has an admin user (password admin), created during the first startup. While deploying Zoe, this user must be disabled or its password changed. The default password is a security risk.
.. _vision:
The vision for Zoe Analytics
Zoe focus is data analytics. This focus helps defining a clear set of objectives and priorities for the project and avoid the risk of competing directly with generic infrastructure managers like Kubernetes or Swarm. Zoe instead sits on top of these "cloud managers" to provide a simpler interface to end users who have no interest in the intricacies of container infrastructures.
Data analytic applications do not work in isolation. They need data, that may be stored or streamed, and they generate logs, that may have to be analyzed for debugging or stored for auditing. Data layers, in turn, need health monitoring. All these tools, frameworks, distributed filesystems, object stores, form a galaxy that revolves around analytic applications. For simplicity we will call these "support applications".
Zoe does not focus on support applications. Managing a stable and fault tolerant HDFS cluster, for example, is a task for tools like Puppet, Chef or Ansible and is done by system administrators. Zoe, instead, targets data scientists, that need to use a cluster infrastructure, but do not usually have sysadmin skills.
Kinds of applications
See :ref:`zapp_classification`.
.. _zapp_classification:
Zoe runs processes inside containers and the Zoe application description is very generic, allowing any kind of application to be described in Zoe and submitted for execution. While the main focus of Zoe are so-called "analytic applications", there are many other tools that can be run on the same cluster, for monitoring, storage, log management, history servers, etc. These applications can be described in Zoe and executed, but they have quite different scheduling constraints. Zoe is not a generic application deployment solution and lacks, by design, features like automatic migration, rolling upgrades, etc.
Please note that in this context an "elastic" service is a service that "can be automatically killed or started".
- Long running: potentially will never terminate
- Non elastic
- Storage: need to have access to non-container storage (volumes or disk partitions)
- Cassandra
- ElasticSearch
- Interactive: need to expose web interfaces to the end user
- Jupyter
- Spark, Hadoop, Tensorflow, etc history servers
- Kibana
- Graylog (web interface only)
- Streaming:
- Logstash
- User access
- Proxies and SSH gateways
- Elastic (can be automatically resized)
- Streaming:
- Spark streaming user jobs
- Storm
- Flink streaming
- Kafka
- Ephemeral: will eventually finish by themselves
- Elastic:
- Spark classic batch jobs
- Hadoop MapReduce
- Flink
- Non elastic:
- Tensorflow
All the applications in the **long-running** category need to be deployed, managed, upgraded and monitored since they are part of the cluster infrastructure. The Jupyter notebook at first glance may seem out of place, but in fact it is an interface to access different computing systems and languages, sometimes integrated in Jupyter itself, but also distributed in other nodes, with Spark or Tensorflow backends. As an interface the user may expect for it to be always there, making it part of the infrastructure.
The **elastic, long-running** applications have a degree more of flexibility, that can be taken into account by Zoe. They all have the same needs as the non-elastic applications, but they can also be scaled according to many criteria (priority, latency, data volume).
The applications in the **ephemeral** category, instead, will eventually terminate by themselves: a batch job is a good example of such applications.
......@@ -10,10 +10,10 @@ TIMEOUT = 5
class TestZoeRestAuthFails:
"""Test case class."""
def test_userinfo(self, zoe_api_process):
"""Test userinfo api endpoint."""
print('Test userinfo api endpoint')
req = requests.get(ZOE_API_URI + 'userinfo', auth=WRONG_AUTH)
def test_user(self, zoe_api_process):
"""Test user api endpoint."""
print('Test user api endpoint')
req = requests.get(ZOE_API_URI + 'user', auth=WRONG_AUTH)
assert req.status_code == 401
def test_3_execution_details(self, zoe_api_process):
......@@ -21,10 +21,10 @@ class TestZoeRest:
req = requests.get(ZOE_API_URI + 'info', timeout=TIMEOUT)
assert req.status_code == 200
def test_userinfo(self, zoe_api_process):
"""Test userinfo api endpoint."""
print('Test userinfo api endpoint')
req = requests.get(ZOE_API_URI + 'userinfo', auth=ZOE_AUTH, timeout=TIMEOUT)
def test_user(self, zoe_api_process):
"""Test user api endpoint."""
print('Test user api endpoint')
req = requests.get(ZOE_API_URI + 'user', auth=ZOE_AUTH, timeout=TIMEOUT)
assert req.status_code == 200
def test_list_all_executions(self, zoe_api_process):
......@@ -48,6 +48,7 @@ class TestZoeRest:
with open('integration_tests/zapp.json', encoding='utf-8') as data_file:
data = json.loads(
time.sleep(10) # wait for test Zoe to start and load the docker status
req = + 'execution', auth=ZOE_AUTH, json={"application": data, "name": "requests"}, timeout=TIMEOUT)
assert req.status_code == 201
exec_id = str(req.json()['execution_id'])
......@@ -82,7 +83,9 @@ class TestZoeRest:
with open('integration_tests/zapp.json', encoding='utf-8') as data_file:
data = json.loads(
req = + 'zapp_validate', json={"application": data}, timeout=TIMEOUT)
req = + 'zapp_validate', json=data, timeout=TIMEOUT)
if req.status_code != 200:
print('Error reason: {}, {}'.format(req.reason, req.text))
assert req.status_code == 200
def test_zapp_validate_fail(self, zoe_api_process):
......@@ -92,5 +95,5 @@ class TestZoeRest:
data = json.loads(
del data['version']
req = + 'zapp_validate', json={"application": data}, timeout=TIMEOUT)
req = + 'zapp_validate', json=data, timeout=TIMEOUT)
assert req.status_code == 400
......@@ -14,7 +14,6 @@ ARGS = [
'--dbname', 'zoe',
'--dbpass', 'zoepass',
'--master-url', 'tcp://localhost:4850',
'--auth-type', 'text',
'--listen-port', '5100',
'--workspace-base-path', '/tmp',
'--workspace-deployment-path', 'integration_test',
......@@ -4,6 +4,6 @@ tls_cert: /certs/cert.pem
tls_key: /certs/key.pem
tls_ca: /certs/ca.pem
......@@ -3,7 +3,7 @@ requests>=2.9.1
......@@ -12,3 +12,5 @@ sphinx
......@@ -153,6 +153,10 @@
"minItems": 0,
"uniqueItems": true
"network": {
"type": "string",
"minLength": 1
"additionalProperties": false,
This diff is collapsed.
# Copyright (c) 2016, Daniele Venzano
# Copyright (c) 2018, Daniele Venzano
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -15,10 +15,59 @@