Commit 43b1eb39 authored by Daniele Venzano's avatar Daniele Venzano

New backend based on the new Python Docker library

parent e331f298
......@@ -83,7 +83,7 @@ class APIEndpoint:
if e.user_id != uid and role != 'admin':
raise zoe_api.exceptions.ZoeAuthException()
if e.is_active():
if e.is_active:
return self.master.execution_terminate(exec_id)
else:
raise zoe_api.exceptions.ZoeException('Execution is not running')
......@@ -98,7 +98,7 @@ class APIEndpoint:
if e.user_id != uid and role != 'admin':
raise zoe_api.exceptions.ZoeAuthException()
if e.is_active():
if e.is_active:
raise zoe_api.exceptions.ZoeException('Cannot delete an active execution')
status, message = self.master.execution_delete(exec_id)
......@@ -144,7 +144,7 @@ class APIEndpoint:
log.debug('Starting dead execution cleanup task')
all_execs = self.sql.execution_list()
for execution in all_execs:
if execution.is_running():
if execution.is_running:
for service in execution.services:
if service.description['monitor'] and service.is_dead():
log.info("Service {} ({}) of execution {} died, terminating execution".format(service.id, service.name, execution.id))
......
......@@ -32,11 +32,11 @@
<li class="container_name" id="{{ s['id'] }}">{{ s['name'] }}</li>
<ul>
<li>Zoe status: {{ s['status'] }}</li>
<li>Docker status: {{ s['docker_status'] }}</li>
<li>Docker status: {{ s['backend_status'] }}</li>
{% if s['error_message'] is not none %}
<li>Error: {{ s['error_message'] }}</li>
{% endif %}
{% if s['docker_status'] == 'started' %}
{% if s['backend_status'] == 'started' %}
{% for p in s['description']['ports'] %}
<li><a href="{{ p['protocol'] }}://{{ s['ip_address'] }}:{{ p['port_number'] }}{{ p['path'] }}">{{ p['name'] }}</a></li>
{% endfor %}
......
......@@ -49,7 +49,6 @@ def load_configuration(test_conf=None):
# Common options
argparser.add_argument('--debug', action='store_true', help='Enable debug output')
argparser.add_argument('--swarm', help='Swarm/Docker API endpoint (ex.: zk://zk1:2181,zk2:2181 or http://swarm:2380)', default='http://localhost:2375')
argparser.add_argument('--deployment-name', help='name of this Zoe deployment', default='prod')
argparser.add_argument('--dbname', help='DB name', default='zoe')
......@@ -87,7 +86,11 @@ def load_configuration(test_conf=None):
argparser.add_argument('--scheduler-class', help='Scheduler class to use for scheduling ZApps', choices=['ZoeSimpleScheduler', 'ZoeElasticScheduler'], default='ZoeSimpleScheduler')
argparser.add_argument('--scheduler-policy', help='Scheduler policy to use for scheduling ZApps', choices=['FIFO', 'SIZE'], default='FIFO')
argparser.add_argument('--backend', choices=['OldSwarm'], default='OldSwarm')
argparser.add_argument('--backend', choices=['OldSwarm', 'OldSwarmNewAPI'], default='OldSwarmNewAPI')
# Docker Swarm backend options
argparser.add_argument('--backend-swarm-url', help='Swarm/Docker API endpoint (ex.: zk://zk1:2181,zk2:2181 or http://swarm:2380)', default='http://localhost:2375')
argparser.add_argument('--backend-swarm-zk-path', help='Swarm/Docker optional ZooKeeper path for Swarm Znodes', default='/docker')
argparser.add_argument('--cookie-secret', help='secret used to encrypt cookies', default='changeme')
......
......@@ -23,6 +23,7 @@ from zoe_lib.state import Execution, Service
from zoe_master.backends.base import BaseBackend
from zoe_master.backends.old_swarm.backend import OldSwarmBackend
from zoe_master.backends.old_swarm_new_api.backend import OldSwarmNewAPIBackend
from zoe_master.exceptions import ZoeStartExecutionFatalException, ZoeStartExecutionRetryException
log = logging.getLogger(__name__)
......@@ -33,6 +34,8 @@ def _get_backend() -> BaseBackend:
backend_name = get_conf().backend
if backend_name == 'OldSwarm':
return OldSwarmBackend(get_conf())
elif backend_name == 'OldSwarmNewAPI':
return OldSwarmNewAPIBackend(get_conf())
else:
log.error('Unknown backend selected')
assert False
......@@ -92,7 +95,7 @@ def service_list_to_containers(execution: Execution, service_list: List[Service]
return "fatal"
else:
execution.set_running()
return "ok"
return "ok"
def start_all(execution: Execution) -> str:
......
......@@ -128,11 +128,11 @@ def zookeeper_swarm(zk_server_list: str, path='/docker') -> str:
zk_client.stop()
return master.decode('utf-8')
def consul_swarm(consul_ip: str, path='/docker') -> str:
def consul_swarm(consul_ip: str) -> str:
"""
Using consul as discovery service, find the currently active Swarm master.
:param consul_ip: consul ip address
:param path: Swarm path in Consul
:return: Swarm master connection string
"""
leader_key = 'docker/swarm/leader'
......@@ -141,6 +141,7 @@ def consul_swarm(consul_ip: str, path='/docker') -> str:
master = key_val[1]['Value']
return master.decode('utf-8')
class SwarmClient:
"""The Swarm client class that wraps the Docker API."""
def __init__(self, opts: Namespace) -> None:
......@@ -148,7 +149,7 @@ class SwarmClient:
url = opts.swarm
if 'zk://' in url:
url = url[len('zk://'):]
manager = zookeeper_swarm(url)
manager = zookeeper_swarm(url, opts.backend_swarm_zk_path)
elif 'consul://' in url:
url = url[len('consul://'):]
manager = consul_swarm(url)
......
This diff is collapsed.
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zoe backend implementation for old-style stand-alone Docker Swarm."""
import logging
from typing import Dict
from zoe_lib.config import get_conf
from zoe_lib.exceptions import ZoeLibException, ZoeNotEnoughResourcesException
from zoe_lib.state import Execution, Service
from zoe_master.backends.old_swarm.api_client import DockerContainerOptions, SwarmClient
from zoe_master.exceptions import ZoeStartExecutionRetryException, ZoeStartExecutionFatalException, ZoeException
from zoe_master.workspace.filesystem import ZoeFSWorkspace
import zoe_master.backends.common
import zoe_master.backends.base
from zoe_master.backends.old_swarm.threads import SwarmMonitor, SwarmStateSynchronizer
from zoe_master.stats import NodeStats, ClusterStats # pylint: disable=unused-import
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# These two module-level variables hold the references to the monitor and checker threads.
# They start as None and are assigned by OldSwarmNewAPIBackend.init().
_monitor = None
_checker = None
class OldSwarmNewAPIBackend(zoe_master.backends.base.BaseBackend):
    """Zoe backend implementation for old-style stand-alone Docker Swarm.

    Translates Zoe services into Docker containers through ``SwarmClient`` and
    manages the module-level monitor and checker helper objects.
    """

    def __init__(self, opts):
        """Create the backend and its Swarm client from the given options."""
        super().__init__(opts)
        self.swarm = SwarmClient(opts)

    @classmethod
    def init(cls, state):
        """Initializes Swarm backend starting the event monitoring thread.

        :param state: state object handed to both ``SwarmMonitor`` and ``SwarmStateSynchronizer``
        """
        global _monitor, _checker
        _monitor = SwarmMonitor(state)
        _checker = SwarmStateSynchronizer(state)

    @classmethod
    def shutdown(cls):
        """Performs a clean shutdown of the resources used by Swarm backend.

        Safe to call even when ``init()`` was never invoked: references that
        are still ``None`` are skipped instead of raising ``AttributeError``.
        """
        for thread in (_monitor, _checker):
            if thread is not None:
                thread.quit()

    def spawn_service(self, execution: Execution, service: Service, env_subst_dict: Dict):
        """Spawn a service, translating a Zoe Service into a Docker container.

        :param execution: the execution the service belongs to
        :param service: the service to start
        :param env_subst_dict: substitution values used for the environment and the command template
        :raises ZoeStartExecutionRetryException: the Swarm client reported not enough free resources
        :raises ZoeStartExecutionFatalException: any other Zoe error raised while spawning the container
        """
        conf = get_conf()

        copts = DockerContainerOptions()
        copts.gelf_log_address = conf.gelf_address
        copts.name = service.dns_name
        copts.set_memory_limit(service.resource_reservation.memory)
        copts.network_name = conf.overlay_network_name
        copts.labels = {
            'zoe.execution.name': execution.name,
            'zoe.execution.id': str(execution.id),
            'zoe.service.name': service.name,
            'zoe.service.id': str(service.id),
            'zoe.owner': execution.user_id,
            'zoe.deployment_name': conf.deployment_name,
            'zoe.type': 'app_service',
            'zoe.monitor': 'true' if service.is_monitor else 'false',
        }

        # Always disable autorestart, regardless of what the execution description asks for.
        copts.restart = False

        env_vars = zoe_master.backends.common.gen_environment(service, env_subst_dict)
        for name, value in env_vars:
            copts.add_env_variable(name, value)

        # Only the ports explicitly marked as exposed are published.
        for port in service.ports:
            if port.expose:
                copts.ports.append(port.number)

        for volume in service.volumes:
            if volume.type == "host_directory":
                copts.add_volume_bind(volume.path, volume.mount_point, volume.readonly)
            else:
                log.warning('Docker Swarm backend does not support volume type {}'.format(volume.type))

        # Placement constraints from service.description are not applied by this backend.

        fswk = ZoeFSWorkspace()
        if fswk.can_be_attached():
            copts.add_volume_bind(fswk.get_path(execution.user_id), fswk.get_mountpoint(), False)
            copts.add_env_variable('ZOE_WORKSPACE', fswk.get_mountpoint())

        # The same dictionary is used for templates in the command
        copts.set_command(service.command.format(**env_subst_dict))

        try:
            cont_info = self.swarm.spawn_container(service.image_name, copts)
        except ZoeNotEnoughResourcesException as e:
            service.set_error('Not enough free resources to satisfy reservation request')
            raise ZoeStartExecutionRetryException('Not enough free resources to satisfy reservation request for service {}'.format(service.name)) from e
        except (ZoeException, ZoeLibException) as e:
            raise ZoeStartExecutionFatalException(str(e)) from e

        # The container address is looked up on the overlay network the container was attached to.
        service.set_active(cont_info["id"], cont_info['ip_address'][conf.overlay_network_name])

    def terminate_service(self, service: Service) -> None:
        """Terminate and delete a container."""
        self.swarm.terminate_container(service.backend_id, delete=True)

    def platform_state(self) -> ClusterStats:
        """Get the platform state.

        The free memory and cores of every node are computed here as total minus reserved.
        """
        info = self.swarm.info()
        for node in info.nodes:  # type: NodeStats
            node.memory_free = node.memory_total - node.memory_reserved
            node.cores_free = node.cores_total - node.cores_reserved
        return info
# Copyright (c) 2016, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Monitor for the Swarm event stream."""
import logging
import threading
import time
from zoe_lib.config import get_conf
from zoe_lib.state import SQLManager, Service
from zoe_master.backends.old_swarm.api_client import SwarmClient
# Module-level logger named after this module.
log = logging.getLogger(__name__)
class SwarmMonitor(threading.Thread):
    """The monitor.

    A daemon thread that listens to the Swarm event stream and records
    container lifecycle events as backend statuses on the matching services.
    """

    def __init__(self, state: SQLManager) -> None:
        """Store the state handle and start the thread immediately."""
        super().__init__()
        # Plain attributes replace the deprecated setName()/setDaemon() helpers.
        self.name = 'monitor'
        self.daemon = True
        self.stop = False
        self.state = state
        self.start()

    def run(self):
        """The thread loop.

        Keeps (re)connecting to the event stream until quit() is called.
        """
        log.info("Monitor thread started")
        swarm = SwarmClient(get_conf())
        # Looping on the stop flag lets quit() really end the thread.
        # NOTE(review): presumably event_listener() returns once the callback returns False - confirm.
        while not self.stop:
            try:
                swarm.event_listener(self._event_cb)
            except Exception:
                log.exception('Exception in monitor thread')
            if not self.stop:
                time.sleep(1)  # wait a bit before retrying the connection

    def _event_cb(self, event: dict) -> bool:
        """Dispatch one event; return False once the thread was asked to stop."""
        event_type = event['Type']
        if event_type == 'container':
            self._container_event(event)
        elif event_type not in ('volume', 'network', 'swarm', 'image'):
            # Known non-container event types are ignored silently, anything else is traced.
            log.debug('Unmanaged event type: {}'.format(event_type))
            log.debug(str(event))
        return not self.stop

    def _container_event(self, event: dict):
        """Update the backend status of the service referenced by a container event."""
        attributes = event['Actor']['Attributes']
        # Ignore containers that do not belong to this Zoe deployment.
        if 'zoe.deployment_name' not in attributes:
            return
        if attributes['zoe.deployment_name'] != get_conf().deployment_name:
            return

        # NOTE(review): the label value is presumably a string, not an int - confirm service_list() accepts it.
        service_id = attributes.get('zoe.service.id')
        if service_id is None:
            # A missing label is skipped instead of raising KeyError out of the event listener.
            log.debug('Container event without a zoe.service.id label: {}'.format(event))
            return

        service = self.state.service_list(only_one=True, id=service_id)
        if service is None:
            return

        # Substring matching on the action is kept on purpose, in the original order.
        action = event['Action']
        if 'create' in action:
            service.set_backend_status(service.BACKEND_CREATE_STATUS)
        elif 'start' in action:
            service.set_backend_status(service.BACKEND_START_STATUS)
        elif 'die' in action or 'kill' in action or 'stop' in action:
            service.set_backend_status(service.BACKEND_DIE_STATUS)
        elif 'oom' in action:
            service.set_backend_status(service.BACKEND_OOM_STATUS)
            log.warning('Service {} got killed by an OOM condition'.format(service.id))
        elif 'destroy' in action:
            service.set_backend_status(service.BACKEND_DESTROY_STATUS)
        else:
            log.debug('Unmanaged container action: {}'.format(action))

    def quit(self):
        """Stops the thread.

        Only sets the stop flag, which is read by the event callback and by the run loop.
        """
        self.stop = True
# Seconds the checker thread sleeps between two passes (the value is handed to time.sleep()).
CHECK_INTERVAL = 300
class SwarmStateSynchronizer(threading.Thread):
    """The Swarm Checker.

    A daemon thread that periodically compares the services known to Zoe with
    the containers listed by Swarm and fixes up stale backend statuses.
    """

    def __init__(self, state: SQLManager) -> None:
        """Store the state handle and start the thread immediately."""
        super().__init__()
        # Plain attributes replace the deprecated setName()/setDaemon() helpers.
        self.name = 'checker'
        self.daemon = True
        self.stop = False
        self.state = state
        self.start()

    def _find_dead_service(self, container_list, service: Service):
        """Loop through the containers and try to update the service status.

        A matching container in the 'exited' state marks the service as dead;
        no matching container at all marks the service as destroyed.
        """
        for container in container_list:
            if container['id'] != service.backend_id:
                continue
            if container['status'] == 'exited':
                log.info('resetting status of service {}, died with no event'.format(service.name))
                service.set_backend_status(service.BACKEND_DIE_STATUS)
            # The container was found, no need to scan the rest of the list.
            return
        service.set_backend_status(service.BACKEND_DESTROY_STATUS)

    def run(self):
        """The thread loop.

        Runs one synchronization pass every CHECK_INTERVAL seconds until quit() is called.
        """
        log.info("Checker thread started")
        swarm = SwarmClient(get_conf())
        while not self.stop:
            try:
                service_list = self.state.service_list()
                container_list = swarm.list(only_label={'zoe.deployment_name': get_conf().deployment_name})
                for service in service_list:
                    assert isinstance(service, Service)
                    # Services already marked as dead or destroyed need no further check.
                    if service.backend_status in (service.BACKEND_DESTROY_STATUS, service.BACKEND_DIE_STATUS):
                        continue
                    self._find_dead_service(container_list, service)
            except Exception:
                # A failed pass must not kill the thread: log it and retry at the next interval.
                log.exception('Exception in checker thread')

            time.sleep(CHECK_INTERVAL)

    def quit(self):
        """Stops the thread.

        Only sets the stop flag; the loop notices it after the current sleep ends.
        """
        self.stop = True
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment