Commit e5e6971c authored by Daniele Venzano's avatar Daniele Venzano

Backend abstraction and new docker api backend

parent 91c913ad
...@@ -21,7 +21,7 @@ import psycopg2.extras ...@@ -21,7 +21,7 @@ import psycopg2.extras
import zoe_api.exceptions import zoe_api.exceptions
from zoe_lib.config import get_conf from zoe_lib.config import get_conf
SQL_SCHEMA_VERSION = 1 # ---> Increment this value every time the schema changes !!! <--- SQL_SCHEMA_VERSION = 3 # ---> Increment this value every time the schema changes !!! <---
def version_table(cur): def version_table(cur):
...@@ -73,8 +73,10 @@ def create_tables(cur): ...@@ -73,8 +73,10 @@ def create_tables(cur):
execution_id INT REFERENCES execution, execution_id INT REFERENCES execution,
service_group TEXT NOT NULL, service_group TEXT NOT NULL,
name TEXT NOT NULL, name TEXT NOT NULL,
docker_id TEXT NULL DEFAULT NULL, backend_id TEXT NULL DEFAULT NULL,
docker_status TEXT NOT NULL DEFAULT 'undefined' backend_status TEXT NOT NULL DEFAULT 'undefined',
ip_address CIDR NULL DEFAULT NULL,
essential BOOLEAN NOT NULL DEFAULT FALSE
)''') )''')
......
...@@ -157,7 +157,7 @@ def exec_get_cmd(args): ...@@ -157,7 +157,7 @@ def exec_get_cmd(args):
service = cont_api.get(c_id) service = cont_api.get(c_id)
print('Service {} (ID: {})'.format(service['name'], service['id'])) print('Service {} (ID: {})'.format(service['name'], service['id']))
print(' - zoe status: {}'.format(service['status'])) print(' - zoe status: {}'.format(service['status']))
print(' - docker status: {}'.format(service['docker_status'])) print(' - backend status: {}'.format(service['docker_status']))
if service['error_message'] is not None: if service['error_message'] is not None:
print(' - error: {}'.format(service['error_message'])) print(' - error: {}'.format(service['error_message']))
if service['docker_status'] == 'started': if service['docker_status'] == 'started':
......
...@@ -49,7 +49,6 @@ def load_configuration(test_conf=None): ...@@ -49,7 +49,6 @@ def load_configuration(test_conf=None):
# Common options # Common options
argparser.add_argument('--debug', action='store_true', help='Enable debug output') argparser.add_argument('--debug', action='store_true', help='Enable debug output')
argparser.add_argument('--swarm', help='Swarm/Docker API endpoint (ex.: zk://zk1:2181,zk2:2181 or http://swarm:2380)', default='http://localhost:2375')
argparser.add_argument('--deployment-name', help='name of this Zoe deployment', default='prod') argparser.add_argument('--deployment-name', help='name of this Zoe deployment', default='prod')
argparser.add_argument('--dbname', help='DB name', default='zoe') argparser.add_argument('--dbname', help='DB name', default='zoe')
...@@ -87,11 +86,19 @@ def load_configuration(test_conf=None): ...@@ -87,11 +86,19 @@ def load_configuration(test_conf=None):
argparser.add_argument('--service-log-path', help='Save service logs in this directory, EXPERIMENTAL', default='') argparser.add_argument('--service-log-path', help='Save service logs in this directory, EXPERIMENTAL', default='')
argparser.add_argument('--scheduler-class', help='Scheduler class to use for scheduling ZApps', default='ZoeSimpleScheduler') argparser.add_argument('--scheduler-class', help='Scheduler class to use for scheduling ZApps', default='ZoeSimpleScheduler')
argparser.add_argument('--scheduler-policy', help='Scheduler policy to use for scheduling ZApps', choices=['FIFO', 'SIZE'], default='FIFO')
argparser.add_argument('--docker-tls-cert', help='Docker TLS certificate file', default='cert.pem') argparser.add_argument('--docker-tls-cert', help='Docker TLS certificate file', default='cert.pem')
argparser.add_argument('--docker-tls-key', help='Docker TLS private key file', default='key.pem') argparser.add_argument('--docker-tls-key', help='Docker TLS private key file', default='key.pem')
argparser.add_argument('--docker-tls-ca', help='Docker TLS CA certificate file', default='ca.pem') argparser.add_argument('--docker-tls-ca', help='Docker TLS CA certificate file', default='ca.pem')
# Docker Swarm backend options
argparser.add_argument('--backend', choices=['OldSwarm', 'OldSwarmNewAPI'], default='OldSwarmNewAPI')
argparser.add_argument('--backend-swarm-url', help='Swarm/Docker API endpoint (ex.: zk://zk1:2181,zk2:2181 or http://swarm:2380)', default='http://localhost:2375')
argparser.add_argument('--backend-swarm-zk-path', help='Swarm/Docker optional ZooKeeper path for Swarm Znodes', default='/docker')
argparser.add_argument('--cookie-secret', help='secret used to encrypt cookies', default='changeme')
opts = argparser.parse_args() opts = argparser.parse_args()
if opts.debug: if opts.debug:
argparser.print_values() argparser.print_values()
......
This diff is collapsed.
...@@ -173,6 +173,7 @@ class Service: ...@@ -173,6 +173,7 @@ class Service:
execution = self.sql_manager.execution_list(only_one=True, id=self.execution_id) execution = self.sql_manager.execution_list(only_one=True, id=self.execution_id)
return execution.user_id return execution.user_id
@property
def is_dead(self): def is_dead(self):
"""Returns True if this service is not running.""" """Returns True if this service is not running."""
return self.backend_status == self.BACKEND_DESTROY_STATUS or self.backend_status == self.BACKEND_OOM_STATUS or self.backend_status == self.BACKEND_DIE_STATUS return self.backend_status == self.BACKEND_DESTROY_STATUS or self.backend_status == self.BACKEND_OOM_STATUS or self.backend_status == self.BACKEND_DIE_STATUS
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The base class that all backends should implement."""
from zoe_lib.state import Service
from zoe_master.stats import ClusterStats
from zoe_master.backends.service_instance import ServiceInstance
class BaseBackend:
    """Interface that every Zoe container backend has to implement.

    All operations below raise ``NotImplementedError`` here; concrete backends
    are expected to override each of them.
    """

    def __init__(self, conf):
        """Accept the configuration object. The base class itself does not use it."""

    def init(self, state):
        """Initialize the backend.

        Typical work done here: locate the current API endpoint, open a
        connection to it, negotiate the API version. Backend-related threads
        may be started here as well. Called only once, at Zoe startup.
        """
        raise NotImplementedError

    def shutdown(self):
        """Cleanly release the resources used by the backend.

        Any thread that was started in ``init()`` should be terminated here.
        Called when Zoe shuts down.
        """
        raise NotImplementedError

    def spawn_service(self, service_instance: ServiceInstance):
        """Create and start the container for a service.

        The backend translates the configuration parameters carried by the
        ``ServiceInstance`` object into backend-specific container options and
        starts the container.

        Implementations should either:

        * raise ``ZoeStartExecutionRetryException`` on a temporary error
        * raise ``ZoeStartExecutionFatalException`` on a fatal error
        * return a backend-specific ID that Zoe will use later to interact
          with the running container
        """
        raise NotImplementedError

    def terminate_service(self, service: Service) -> None:
        """Terminate the container corresponding to a service."""
        raise NotImplementedError

    def platform_state(self) -> ClusterStats:
        """Report the current state of the platform.

        Implementations should fill in a new ``ClusterStats`` object at each
        call, with fresh statistics on the available nodes and on resource
        availability. The result is used for taking scheduling decisions.
        """
        raise NotImplementedError
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The high-level interface that Zoe uses to talk to the configured container backend."""
import os
import logging
from zoe_lib.state import Service, Execution, VolumeDescription
from zoe_lib.config import get_conf
from zoe_master.workspace.filesystem import ZoeFSWorkspace
log = logging.getLogger(__name__)
def gen_environment(service: Service, execution: Execution):
    """Build the environment variables that every container of an execution receives.

    Returns a list of ``(name, value)`` tuples describing the execution, the
    service, the owner, the deployment name, the workspace mount point and,
    as last entry, the comma-separated DNS names of all the services of the
    execution.
    """
    workspace = ZoeFSWorkspace()
    variables = [
        ('ZOE_EXECUTION_ID', execution.id),
        ('ZOE_EXECUTION_NAME', execution.name),
        ('ZOE_SERVICE_GROUP', service.service_group),
        ('ZOE_SERVICE_NAME', service.name),
        ('ZOE_SERVICE_ID', service.id),
        ('ZOE_OWNER', execution.user_id),
        ('ZOE_DEPLOYMENT_NAME', get_conf().deployment_name),
        ('ZOE_MY_DNS_NAME', service.dns_name),
        ('ZOE_WORKSPACE', workspace.get_mountpoint()),
    ]

    # Let each container know the DNS names of all the services of its execution
    peer_names = ','.join([peer.dns_name for peer in execution.services])
    variables.append(('ZOE_EXECUTION_SERVICE_LIST', peer_names))
    return variables
def _create_logs_directories(exec_id, service_name):
    """Make sure the host directory for a service's logs exists and return it.

    The directory is ``<logs_base_path>/<deployment_name>/<exec_id>/<service_name>``,
    where the first two components are read from the global configuration.

    Returns the directory path, or ``None`` if it could not be created. The
    failure is logged, not raised, so the caller can go on without a logs
    volume.
    """
    conf = get_conf()
    path = os.path.join(conf.logs_base_path, conf.deployment_name, str(exec_id), service_name)
    try:
        # exist_ok: a directory left behind by a previous start attempt of the
        # same service is usable as-is and must not be reported as a failure.
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        log.error('Cannot create path {}: {}'.format(path, str(e)))
        return None
    return path
def gen_volumes(service: Service, execution: Execution):
    """Build the list of default volumes that every container receives.

    The list always starts with the workspace volume obtained from
    ``ZoeFSWorkspace`` for the owner of the execution. When the logs directory
    for the service can be created, a second volume mounting it at ``/logs``
    is appended.
    """
    volumes = [ZoeFSWorkspace().get(execution.user_id)]

    logs_dir = _create_logs_directories(execution.id, service.name)
    if logs_dir is None:
        # The logs directory is unavailable: only the workspace is mounted
        return volumes

    volumes.append(VolumeDescription((logs_dir, '/logs', True)))
    return volumes
def gen_labels(service: Service, execution: Execution):
    """Build the labels attached to a container.

    The labels are useful for identifying containers in monitoring systems.
    Execution and service IDs are converted to strings.
    """
    labels = dict(
        zoe_execution_name=execution.name,
        zoe_execution_id=str(execution.id),
        zoe_service_name=service.name,
        zoe_service_id=str(service.id),
        zoe_owner=execution.user_id,
        zoe_deployment_name=get_conf().deployment_name,
        zoe_type='app_service',
    )
    return labels
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The high-level interface that Zoe uses to talk to the configured container backend."""
import logging
from typing import List
from zoe_lib.config import get_conf
from zoe_lib.state import Execution, Service
from zoe_master.backends.base import BaseBackend
from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.backends.old_swarm.backend import OldSwarmBackend
import zoe_master.backends.old_swarm.api_client
from zoe_master.backends.old_swarm_new_api.backend import OldSwarmNewAPIBackend
import zoe_master.backends.old_swarm_new_api.api_client
from zoe_master.exceptions import ZoeStartExecutionFatalException, ZoeStartExecutionRetryException, ZoeException
log = logging.getLogger(__name__)
def _get_backend() -> BaseBackend:
    """Instantiate the backend selected by the ``backend`` configuration option.

    A new backend object is built at each call.

    Raises:
        ZoeException: if the Docker client library required by the selected
            backend is not available, or if the configured backend name is
            not one of the known ones.
    """
    conf = get_conf()
    backend_name = conf.backend
    if backend_name == 'OldSwarm':
        if not zoe_master.backends.old_swarm.api_client.AVAILABLE:
            raise ZoeException('The OldSwarm backend requires docker-py version <= 1.10.2')
        return OldSwarmBackend(conf)
    if backend_name == 'OldSwarmNewAPI':
        if not zoe_master.backends.old_swarm_new_api.api_client.AVAILABLE:
            raise ZoeException('The OldSwarmNewAPI backend requires docker python version >= 2.0.2')
        return OldSwarmNewAPIBackend(conf)

    log.error('Unknown backend selected')
    # Raise explicitly instead of ``assert False``: assertions are removed when
    # Python runs with -O and this function would then silently return None.
    raise ZoeException('Unknown backend selected: {}'.format(backend_name))
def initialize_backend(state):
    """Run the ``init()`` step of the configured backend, passing it ``state``."""
    _get_backend().init(state)
def shutdown_backend():
    """Run the ``shutdown()`` step of the configured backend."""
    _get_backend().shutdown()
def service_list_to_containers(execution: Execution, service_list: List[Service]) -> str:
    """Try to start a subset of the services of an execution.

    Services are spawned one at a time, sorted by their ``startup_order``.
    The first failure stops the loop: the error message is recorded on the
    execution and the whole execution is terminated.

    Returns:
        ``"ok"`` when all the services have been spawned, ``"requeue"`` when a
        ``ZoeStartExecutionRetryException`` was raised (temporary failure),
        ``"fatal"`` when a ``ZoeStartExecutionFatalException`` or any other
        exception was raised.
    """
    backend = _get_backend()
    for service in sorted(service_list, key=lambda svc: svc.startup_order):
        service.set_starting()
        instance = ServiceInstance(execution, service)
        try:
            backend_id = backend.spawn_service(instance)
        except ZoeStartExecutionRetryException as ex:
            log.warning('Temporary failure starting service {} of execution {}: {}'.format(service.id, execution.id, ex.message))
            service.set_error(ex.message)
            execution.set_error_message(ex.message)
            terminate_execution(execution)
            execution.set_scheduled()
            return "requeue"
        except ZoeStartExecutionFatalException as ex:
            log.error('Fatal error trying to start service {} of execution {}: {}'.format(service.id, execution.id, ex.message))
            execution.set_error_message(ex.message)
            terminate_execution(execution)
            execution.set_error()
            return "fatal"
        except Exception as ex:
            # Backends are expected to raise only the two exceptions above
            log.error('Fatal error trying to start service {} of execution {}'.format(service.id, execution.id))
            log.exception('BUG, this error should have been caught earlier')
            execution.set_error_message(str(ex))
            terminate_execution(execution)
            execution.set_error()
            return "fatal"

        # Every handler above returns, so this point is reached only on success
        service.set_active(backend_id)

    return "ok"
def start_all(execution: Execution) -> str:
    """Start all the services of an execution.

    If an error occurs some containers may already have been created and need
    to be cleaned up. The return value is the one produced by
    ``service_list_to_containers``.
    """
    log.debug('starting all services for execution {}'.format(execution.id))
    execution.set_starting()
    every_service = execution.services
    return service_list_to_containers(execution, every_service)
def start_essential(execution) -> str:
    """Start only the essential services of an execution.

    The return value is the one produced by ``service_list_to_containers``.
    """
    log.debug('starting essential services for execution {}'.format(execution.id))
    execution.set_starting()
    essential = execution.essential_services
    return service_list_to_containers(execution, essential)
def start_elastic(execution) -> str:
    """Start the elastic services of an execution that are in the runnable status.

    The return value is the one produced by ``service_list_to_containers``.
    """
    runnable = []
    for candidate in execution.elastic_services:
        if candidate.status == Service.RUNNABLE_STATUS:
            runnable.append(candidate)
    return service_list_to_containers(execution, runnable)
def terminate_execution(execution: Execution) -> None:
    """Terminate an execution, asking the backend to terminate its services.

    Services that have no backend ID are skipped. Once all the services have
    been processed the execution is marked as terminated.
    """
    # NOTE(review): the nesting of this loop body was reconstructed from a
    # flattened copy of the file; confirm that set_inactive() and the debug
    # message belong to the "has a backend ID" case.
    execution.set_cleaning_up()
    backend = _get_backend()
    for service in execution.services:
        assert isinstance(service, Service)
        if service.backend_id is None:
            continue
        service.set_terminating()
        backend.terminate_service(service)
        service.set_inactive()
        log.debug('Service {} terminated'.format(service.name))
    execution.set_terminated()
def get_platform_state():
    """Query the container backend for the state of the platform.

    Platform state includes information on free/reserved resources for each
    node. This information is used for advanced scheduling.
    """
    return _get_backend().platform_state()
This diff is collapsed.
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zoe backend implementation for old-style stand-alone Docker Swarm."""
import logging
from zoe_lib.config import get_conf
from zoe_lib.exceptions import ZoeLibException, ZoeNotEnoughResourcesException
from zoe_lib.state import Service
from zoe_master.backends.old_swarm.api_client import DockerContainerOptions, SwarmClient
from zoe_master.exceptions import ZoeStartExecutionRetryException, ZoeStartExecutionFatalException, ZoeException
import zoe_master.backends.base
from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.backends.old_swarm.threads import SwarmMonitor, SwarmStateSynchronizer
from zoe_master.stats import NodeStats, ClusterStats # pylint: disable=unused-import
log = logging.getLogger(__name__)
# These two module-level variables hold the references to the monitor and checker threads
_monitor = None
_checker = None
class OldSwarmBackend(zoe_master.backends.base.BaseBackend):
    """Zoe backend implementation for old-style stand-alone Docker Swarm."""

    def __init__(self, opts):
        """Create the Swarm client used by this backend from the configuration ``opts``."""
        super().__init__(opts)
        self.swarm = SwarmClient(opts)

    @classmethod
    def init(cls, state):
        """Create the event monitor and the state synchronizer for the Swarm backend.

        References to both objects are kept in module-level variables so that
        ``shutdown()`` can stop them later.
        """
        global _monitor, _checker
        _monitor = SwarmMonitor(state)
        _checker = SwarmStateSynchronizer(state)

    @classmethod
    def shutdown(cls):
        """Perform a clean shutdown of the resources used by the Swarm backend.

        Safe to call even if ``init()`` never ran: threads that were not
        created are skipped.
        """
        for thread in (_monitor, _checker):
            if thread is not None:
                thread.quit()

    def spawn_service(self, service_instance: ServiceInstance):
        """Spawn a service, translating a Zoe Service into a Docker container.

        Returns the ``docker_id`` reported by the Swarm client for the new
        container.

        Raises:
            ZoeStartExecutionRetryException: the Swarm client reported that
                there are not enough free resources.
            ZoeStartExecutionFatalException: the Swarm client raised any other
                ``ZoeException`` or ``ZoeLibException``.
        """
        conf = get_conf()
        copts = DockerContainerOptions()
        copts.gelf_log_address = conf.gelf_address
        copts.name = service_instance.hostname
        copts.set_memory_limit(service_instance.memory_limit)
        copts.network_name = conf.overlay_network_name
        copts.labels = service_instance.labels
        # Always disable auto restart
        copts.restart = False

        for name, value in service_instance.environment:
            copts.add_env_variable(name, value)

        for port in service_instance.ports:
            if port.expose:
                copts.ports.append(port.port_number)

        for volume in service_instance.volumes:
            if volume.type == "host_directory":
                copts.add_volume_bind(volume.path, volume.mount_point, volume.readonly)
            else:
                log.warning('Docker Swarm backend does not support volume type {}'.format(volume.type))

        copts.set_entrypoint(service_instance.entrypoint)
        copts.set_command(service_instance.command)

        try:
            cont_info = self.swarm.spawn_container(service_instance.image_name, copts)
        except ZoeNotEnoughResourcesException as e:
            # Keep the original exception as the cause so the traceback shows it
            raise ZoeStartExecutionRetryException('Not enough free resources to satisfy reservation request for service {}'.format(service_instance.name)) from e
        except (ZoeException, ZoeLibException) as e:
            raise ZoeStartExecutionFatalException(str(e)) from e

        return cont_info["docker_id"]

    def terminate_service(self, service: Service) -> None:
        """Terminate and delete a container."""
        self.swarm.terminate_container(service.backend_id, delete=True)

    def platform_state(self) -> ClusterStats:
        """Get the platform state.

        The free memory and free cores of each node are computed here as
        total minus reserved before the statistics are returned.
        """
        info = self.swarm.info()
        for node in info.nodes:  # type: NodeStats
            node.memory_free = node.memory_total - node.memory_reserved
            node.cores_free = node.cores_total - node.cores_reserved
        return info
...@@ -19,14 +19,14 @@ import logging ...@@ -19,14 +19,14 @@ import logging
import threading import threading
import time import time
from zoe_lib.swarm_client import SwarmClient
from zoe_lib.config import get_conf from zoe_lib.config import get_conf
from zoe_lib.sql_manager import SQLManager from zoe_lib.state import SQLManager, Service
from zoe_master.backends.old_swarm.api_client import SwarmClient
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
class ZoeMonitor(threading.Thread): class SwarmMonitor(threading.Thread):
"""The monitor.""" """The monitor."""
def __init__(self, state: SQLManager) -> None: def __init__(self, state: SQLManager) -> None:
...@@ -52,10 +52,6 @@ class ZoeMonitor(threading.Thread): ...@@ -52,10 +52,6 @@ class ZoeMonitor(threading.Thread):
def _event_cb(self, event: dict) -> bool: def _event_cb(self, event: dict) -> bool:
if event['Type'] == 'container': if event['Type'] == 'container':
self._container_event(event) self._container_event(event)
elif event['Type'] == 'network':
pass
elif event['Type'] == 'image':
pass
else: else:
log.debug('Unmanaged event type: {}'.format(event['Type'])) log.debug('Unmanaged event type: {}'.format(event['Type']))
log.debug(str(event)) log.debug(str(event))
...@@ -73,19 +69,19 @@ class ZoeMonitor(threading.Thread): ...@@ -73,19 +69,19 @@ class ZoeMonitor(threading.Thread):
service_id = event['Actor']['Attributes']['zoe.service.id'] # type: int service_id = event['Actor']['Attributes']['zoe.service.id'] # type: int
service = self.state.service_list(only_one=True, id=service_id) service = self.state.service_list(only_one=True, id=service_id)
if 'exec' in event['Action']: if service is None:
pass return
elif 'create' in event['Action']: if 'create' in event['Action']:
service.set_docker_status(service.DOCKER_CREATE_STATUS) service.set_backend_status(service.BACKEND_CREATE_STATUS)
elif 'start' in event['Action']: elif 'start' in event['Action']:
service.set_docker_status(service.DOCKER_START_STATUS) service.set_backend_status(service.BACKEND_START_STATUS)
elif 'die' in event['Action'] or 'kill' in event['Action'] or 'stop' in event['Action']: elif 'die' in event['Action'] or 'kill' in event['Action'] or 'stop' in event['Action']:
service.set_docker_status(service.DOCKER_DIE_STATUS) service.set_backend_status(service.BACKEND_DIE_STATUS)
elif 'oom' in event['Action']: elif 'oom' in event['Action']:
service.set_docker_status(service.DOCKER_OOM_STATUS) service.set_backend_status(service.BACKEND_OOM_STATUS)
log.warning('Service {} got killed by an OOM condition'.format(service.id)) log.warning('Service {} got killed by an OOM condition'.format(service.id))
elif 'destroy' in event['Action']: elif 'destroy' in event['Action']:
service.set_docker_status(service.DOCKER_DESTROY_STATUS) service.set_backend_status(service.BACKEND_DESTROY_STATUS)
else: else:
log.debug('Unmanaged container action: {}'.format(event['Action'])) log.debug('Unmanaged container action: {}'.format(event['Action']))
...@@ -94,38 +90,49 @@ class ZoeMonitor(threading.Thread): ...@@ -94,38 +90,49 @@ class ZoeMonitor(threading.Thread):
self.stop = True self.stop = True
SAMPLE_EVENT = { CHECK_INTERVAL = 300
'node': {
'Name': 'bf18',
'Id': 'VPCL:E5GW:WON3:2DPV:WFO7:EVNO:ZAKS:V2PA:PGKU:RSM7:AAR3:EAV7', class SwarmStateSynchronizer(threading.Thread):
'Addr': '192.168.47.18:2375', """The Swarm Checker."""
'Ip': '192.168.47.18'
}, def __init__(self, state: SQLManager) -> None:
'timeNano': 1469622892143470822, super().__init__()
'Actor': { self.setName('checker')
'ID': 'e4d3e639c1ec2107262f19cf6e57406cf83e376ef4f131461c3f256d0ce64e13', self.stop = False
'Attributes': { self.state = state
'node.ip': '192.168.47.18', self.setDaemon(True)
'image': 'docker-registry:5000/zoerepo/spark-submit',
'node.name': 'bf18', self.start()
'node.addr': '192.168.47.18:2375',
'zoe.service.name': 'spark-submit0', def _find_dead_service(self, container_list, service: Service):
'name': 'spark-submit0-60-prod', """Loop through the containers and try to update the service status."""
'zoe.owner': 'milanesio', found = False
'zoe.deployment_name': 'prod', for container in container_list:
'com.docker.swarm.id': 'de7515d8839c461523e8326c552b45da0f9bd0f9af4f68d4d5a55429533405d4', if container['id'] == service.backend_id:
'zoe.execution.id': '60', found = True
'zoe.monitor': 'true', if container['status'] == 'exited':
'zoe.execution.name': 'testebob', log.info('resetting status of service {}, died with no event'.format(service.name))
'node.id': 'VPCL:E5GW:WON3:2DPV:WFO7:EVNO:ZAKS:V2PA:PGKU:RSM7:AAR3:EAV7', service.set_backend_status(service.BACKEND_DIE_STATUS)
'zoe.service.id': '233', if not found:
'zoe.type': 'app_service' service.set_backend_status(service.BACKEND_DESTROY_STATUS)
}
}, def run(self):
'status': 'start', """The thread loop."""
'Action': 'start', log.info("Checker thread started")
'id': 'e4d3e639c1ec2107262f19cf6e57406cf83e376ef4f131461c3f256d0ce64e13', swarm = SwarmClient(get_conf())
'time': 1469622892, while not self.stop:
'Type': 'container', service_list = self.state.service_list()
'from': 'docker-registry:5000/zoerepo/spark-submit node:bf18' container_list = swarm.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
}
for service in service_list:
assert isinstance(service, Service)
if service.backend_status == service.BACKEND_DESTROY_STATUS or service.backend_status == service.BACKEND_DIE_STATUS:
continue
self._find_dead_service(container_list, service)
time.sleep(CHECK_INTERVAL)
def quit(self):
"""Stops the thread."""
self.stop = True
This diff is collapsed.
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zoe backend implementation for old-style stand-alone Docker Swarm."""
import logging
from zoe_lib.exceptions import ZoeLibException, ZoeNotEnoughResourcesException
from zoe_lib.state import Service
from zoe_master.backends.old_swarm_new_api.api_client import SwarmClient
from zoe_master.exceptions import ZoeStartExecutionRetryException, ZoeStartExecutionFatalException, ZoeException
import zoe_master.backends.base
from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.backends.old_swarm_new_api.threads import SwarmStateSynchronizer
from zoe_master.stats import NodeStats, ClusterStats # pylint: disable=unused-import
log = logging.getLogger(__name__)
# This module-level variable holds the reference to the checker thread
_checker = None
class OldSwarmNewAPIBackend(zoe_master.backends.base.BaseBackend):
"""Zoe backend implementation for old-style stand-alone Docker Swarm."""
def __init__(self, opts):
super().__init__(opts)
self.swarm = SwarmClient()
@classmethod