Commit ddf1f8a3 authored by Daniele Venzano
Browse files

Implement a new backend that communicates directly with Docker engines

parent 9be889df
......@@ -61,3 +61,4 @@ target/
state.zoe
/zoe*.conf
zoepass.csv
/docker.conf
......@@ -21,7 +21,7 @@ import psycopg2.extras
import zoe_api.exceptions
from zoe_lib.config import get_conf
SQL_SCHEMA_VERSION = 4 # ---> Increment this value every time the schema changes !!! <---
SQL_SCHEMA_VERSION = 5 # ---> Increment this value every time the schema changes !!! <---
def version_table(cur):
......@@ -75,6 +75,7 @@ def create_tables(cur):
name TEXT NOT NULL,
backend_id TEXT NULL DEFAULT NULL,
backend_status TEXT NOT NULL DEFAULT 'undefined',
backend_host TEXT NULL DEFAULT NULL,
ip_address CIDR NULL DEFAULT NULL,
essential BOOLEAN NOT NULL DEFAULT FALSE
)''')
......
......@@ -95,7 +95,7 @@ def load_configuration(test_conf=None):
argparser.add_argument('--scheduler-class', help='Scheduler class to use for scheduling ZApps', choices=['ZoeSimpleScheduler', 'ZoeElasticScheduler'], default='ZoeSimpleScheduler')
argparser.add_argument('--scheduler-policy', help='Scheduler policy to use for scheduling ZApps', choices=['FIFO', 'SIZE'], default='FIFO')
argparser.add_argument('--backend', choices=['Swarm', 'Kubernetes'], default='Swarm')
argparser.add_argument('--backend', choices=['Swarm', 'Kubernetes', 'DockerEngine'], default='Swarm')
# Docker Swarm backend options
argparser.add_argument('--backend-swarm-url', help='Swarm/Docker API endpoint (ex.: zk://zk1:2181,zk2:2181 or http://swarm:2380)', default='http://localhost:2375')
......@@ -104,6 +104,9 @@ def load_configuration(test_conf=None):
argparser.add_argument('--backend-swarm-tls-key', help='Docker TLS private key file', default='key.pem')
argparser.add_argument('--backend-swarm-tls-ca', help='Docker TLS CA certificate file', default='ca.pem')
# Docker Engine backend options
argparser.add_argument('--backend-docker-config-file', help='Location of the Docker Engine config file', default='docker.conf')
# Kubernetes backend
argparser.add_argument('--kube-config-file', help='Kubernetes configuration file', default='/opt/zoe/kube.conf')
argparser.add_argument('--kube-namespace', help='The namespace that Zoe operates on', default='default')
......
......@@ -115,6 +115,7 @@ class Service:
self.service_group = d['service_group']
self.backend_id = d['backend_id']
self.backend_status = d['backend_status']
self.backend_host = d['backend_host']
self.ip_address = d['ip_address']
if self.ip_address is not None and ('/32' in self.ip_address or '/128' in self.ip_address):
......@@ -176,6 +177,7 @@ class Service:
'backend_id': self.backend_id,
'ip_address': self.ip_address,
'backend_status': self.backend_status,
'backend_host': self.backend_host,
'essential': self.essential,
'proxy_address': self.proxy_address
}
......@@ -190,10 +192,11 @@ class Service:
def set_inactive(self):
    """Mark the service as not running.

    Writes the inactive status through the SQL manager and clears the
    backend ID, IP address and backend host in the same update, then mirrors
    the change on this object and resets every port.
    """
    # A single update covers every field; a second call that omits
    # backend_host would only repeat part of this one.
    self.sql_manager.service_update(self.id, status=self.INACTIVE_STATUS, backend_id=None, ip_address=None, backend_host=None)
    self.status = self.INACTIVE_STATUS
    for port in self.ports:
        port.reset()
    self.backend_host = None
def set_starting(self):
"""The service is being created by Docker."""
......@@ -225,6 +228,12 @@ class Service:
log.debug("service {}, backend status updated to {}".format(self.id, new_status))
self.backend_status = new_status
def assign_backend_host(self, backend_host):
    """Record which backend host this service has been assigned to.

    The assignment is written through the SQL manager first and only then
    mirrored on this object.

    :param backend_host: the host value to store for this service
    """
    service_id = self.id
    self.sql_manager.service_update(service_id, backend_host=backend_host)
    log.debug('service {} assigned to host {}'.format(service_id, backend_host))
    self.backend_host = backend_host
@property
def dns_name(self):
"""Getter for the DNS name of this service as it will be registered in Docker's DNS."""
......
......@@ -194,7 +194,7 @@ class SQLManager:
"""Adds a new service to the state."""
cur = self._cursor()
status = 'created'
query = cur.mogrify('INSERT INTO service (id, status, error_message, execution_id, name, service_group, description, essential) VALUES (DEFAULT, %s,NULL,%s,%s,%s,%s,%s) RETURNING id', (status, execution_id, name, service_group, description, is_essential))
query = cur.mogrify('INSERT INTO service (id, status, execution_id, name, service_group, description, essential) VALUES (DEFAULT,%s,%s,%s,%s,%s,%s) RETURNING id', (status, execution_id, name, service_group, description, is_essential))
cur.execute(query)
self.conn.commit()
return cur.fetchone()[0]
......
# Copyright (c) 2016, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Interface to the low-level Docker API."""
import logging
from typing import Iterable, Callable, Dict, Any
import docker
import docker.tls
import docker.errors
import docker.utils
import docker.models.containers
import requests.exceptions
from zoe_lib.config import get_conf
from zoe_lib.state import Service, VolumeDescriptionHostPath
from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.backends.docker.config import DockerHostConfig # pylint: disable=unused-import
from zoe_master.exceptions import ZoeException, ZoeNotEnoughResourcesException
log = logging.getLogger(__name__)
# Fail at import time when the installed Docker package does not provide the
# DockerClient class. Only the presence of the attribute is checked, so no
# client object is created as a side effect of importing this module.
if not hasattr(docker, 'DockerClient'):
    log.error('Docker package does not have the DockerClient attribute')
    raise ImportError('Wrong Docker library version')
class DockerClient:
    """The client class that wraps the Docker API."""

    def __init__(self, docker_config: DockerHostConfig) -> None:
        """
        Build a client for the engine described by ``docker_config``.

        :param docker_config: name, address and TLS settings of the engine
        :raises ZoeException: if the Docker library fails to create the client
        """
        self.name = docker_config.name
        if docker_config.tls:
            tls = docker.tls.TLSConfig(client_cert=(docker_config.tls_cert, docker_config.tls_key), verify=docker_config.tls_ca)
        else:
            tls = None

        try:
            self.cli = docker.DockerClient(base_url=docker_config.address, version="auto", tls=tls)
        except docker.errors.DockerException as e:
            raise ZoeException("Cannot connect to Docker host {} at address {}: {}".format(docker_config.name, docker_config.address, str(e))) from e

    def info(self) -> Dict:
        """Retrieve engine statistics."""
        return self.cli.info()

    def spawn_container(self, service_instance: ServiceInstance) -> Dict[str, Any]:
        """
        Create and start a new container.

        :param service_instance: description of the container to run
        :return: the summary dictionary built by ``_container_summary``
        :raises ZoeNotEnoughResourcesException: if Docker reports that no resources are available
        :raises ZoeException: for any other failure
        """
        # A None binding is passed for every exposed port.
        port_bindings = {}  # type: Dict[str, Any]
        for port in service_instance.ports:
            port_bindings[str(port.number) + '/' + port.proto] = None

        environment = {name: value for name, value in service_instance.environment}

        volumes = {}
        for volume in service_instance.volumes:
            if volume.type == "host_directory":
                assert isinstance(volume, VolumeDescriptionHostPath)
                volumes[volume.path] = {'bind': volume.mount_point, 'mode': ("ro" if volume.readonly else "rw")}
            else:
                log.error('Swarm backend does not support volume type {}'.format(volume.type))

        if service_instance.memory_limit is not None:
            mem_limit = service_instance.memory_limit.max
            mem_reservation = service_instance.memory_limit.min
            if mem_reservation == mem_limit:
                # Keep the reservation strictly below the hard limit.
                mem_reservation -= 1
        else:
            mem_limit = 0
            mem_reservation = 0

        cpu_period = 100000
        if service_instance.core_limit is not None:
            # The core limit may be fractional; the quota is sent as an integer.
            cpu_quota = int(cpu_period * service_instance.core_limit.max)
        else:
            cpu_quota = cpu_period

        if get_conf().gelf_address != '':
            log_config = {
                "type": "gelf",
                "config": {
                    'gelf-address': get_conf().gelf_address,
                    'labels': ",".join(service_instance.labels)
                }
            }
        else:
            log_config = {
                "type": "json-file",
                "config": {}
            }

        # run() either returns the new container or raises before anything is
        # bound to a local name, so the handlers below have no container
        # object they could clean up.
        try:
            cont = self.cli.containers.run(image=service_instance.image_name,
                                           command=service_instance.command,
                                           detach=True,
                                           environment=environment,
                                           hostname=service_instance.hostname,
                                           labels=service_instance.labels,
                                           log_config=log_config,
                                           cpu_period=cpu_period,
                                           cpu_quota=cpu_quota,
                                           mem_limit=mem_limit,
                                           mem_reservation=mem_reservation,
                                           memswap_limit=0,
                                           name=service_instance.name,
                                           network_disabled=False,
                                           network_mode=get_conf().overlay_network_name,
                                           ports=port_bindings,
                                           working_dir=service_instance.work_dir,
                                           volumes=volumes)
        except docker.errors.ImageNotFound as e:
            raise ZoeException(message='Image not found') from e
        except docker.errors.APIError as e:
            # The explanation may be bytes or text depending on the library
            # version, so normalise it before comparing.
            explanation = e.explanation
            if isinstance(explanation, bytes):
                explanation = explanation.decode('utf-8', errors='replace')
            if explanation == 'no resources available to schedule container':
                raise ZoeNotEnoughResourcesException(message=str(e)) from e
            raise ZoeException(message=str(e)) from e
        except Exception as e:
            raise ZoeException(str(e)) from e

        # Fetch the container again so the summary is built from fresh attributes.
        cont = self.cli.containers.get(cont.id)
        return self._container_summary(cont)

    def _container_summary(self, container: docker.models.containers.Container) -> Dict[str, Any]:
        """
        Translate a docker-specific container object into a simple dictionary.

        The result has the keys ``id``, ``name``, ``labels``, ``host``,
        ``ip_address`` (network name -> address or None), ``state``,
        ``running`` and ``ports`` (port -> (host ip, host port) or None).
        """
        attrs = container.attrs
        info = {
            "id": container.id,
            "ip_address": {},
            "name": container.name,
            'labels': attrs['Config']['Labels']
        }  # type: Dict[str, Any]

        # The host name is stored as a plain string; 'N/A' is used when the
        # attributes carry no node information.
        try:
            info['host'] = attrs['Node']['Name']
        except KeyError:
            info['host'] = 'N/A'

        networks = attrs["NetworkSettings"]["Networks"]
        if networks is not None:
            for net, net_settings in networks.items():
                ip_address = net_settings['IPAddress']
                info["ip_address"][net] = ip_address if len(ip_address) > 0 else None

        status = container.status
        # NOTE(review): the Docker inspect payload reports OOM kills through the
        # State.OOMKilled flag rather than as a status string; the literal
        # 'OOMKilled' status is still accepted for compatibility.
        state = attrs.get('State')
        oom_killed = isinstance(state, dict) and bool(state.get('OOMKilled'))

        if status == 'running' or status == 'restarting':
            info["state"] = Service.BACKEND_START_STATUS
            info["running"] = True
        elif status == 'OOMKilled' or (status == 'exited' and oom_killed):
            info["state"] = Service.BACKEND_OOM_STATUS
            info["running"] = False
        elif status == 'paused' or status == 'exited':
            info["state"] = Service.BACKEND_DIE_STATUS
            info["running"] = False
        elif status == 'created':
            info["state"] = Service.BACKEND_CREATE_STATUS
            info["running"] = False
        else:
            log.error('Unknown container status: {}'.format(status))
            info["state"] = Service.BACKEND_UNDEFINED_STATUS
            info["running"] = False

        info['ports'] = {}
        ports = attrs['NetworkSettings']['Ports']
        if ports is not None:
            for port, bindings in ports.items():
                # A missing or empty binding list means the port is not published.
                if bindings:
                    info['ports'][port] = (bindings[0]['HostIp'], bindings[0]['HostPort'])
                else:
                    info['ports'][port] = None

        return info

    def inspect_container(self, docker_id: str) -> Dict[str, Any]:
        """
        Retrieve information about a container.

        :param docker_id: the container to inspect
        :return: the summary dictionary built by ``_container_summary``
        :raises ZoeException: if the container cannot be retrieved
        """
        try:
            cont = self.cli.containers.get(docker_id)
        except Exception as e:
            raise ZoeException(str(e)) from e
        return self._container_summary(cont)

    def terminate_container(self, docker_id: str, delete=False) -> None:
        """
        Terminate a container.

        A container that cannot be found is silently ignored.

        :param docker_id: The container to terminate
        :type docker_id: str
        :param delete: If True, also delete the container files
        :type delete: bool
        :return: None
        """
        try:
            cont = self.cli.containers.get(docker_id)
        except docker.errors.NotFound:
            return

        cont.stop(timeout=5)
        if delete:
            try:
                cont.remove(force=True)
            except docker.errors.APIError as e:
                log.warning(str(e))

    def event_listener(self, callback: Callable[[Dict[str, Any]], bool]) -> None:
        """
        Listen for engine events until the callback asks to stop.

        Each decoded event is passed to ``callback``; the loop ends as soon as
        the callback returns a false value. An exception raised by the
        callback is logged and the loop moves on to the next event.

        :param callback: called with every event, returns False to stop listening
        """
        event_gen = self.cli.events(decode=True)
        while True:
            try:
                event = next(event_gen)
            except requests.exceptions.RequestException:
                log.warning('Docker closed event connection, retrying...')
                event_gen = self.cli.events(decode=True)
                continue

            try:
                res = callback(event)
            except Exception:
                log.exception('Uncaught exception in swarm event callback')
                log.warning('event was: {}'.format(event))
                continue
            if not res:
                break

    def list(self, only_label=None) -> Iterable[dict]:
        """
        List running or defined containers.

        :param only_label: dictionary of label names and values that a container must carry to be listed
        :return: a list of container summaries
        :raises ZoeException: if the engine cannot be queried
        """
        try:
            ret = self.cli.containers.list(all=True)
        except docker.errors.APIError as ex:
            raise ZoeException(str(ex)) from ex
        except requests.exceptions.RequestException as ex:
            raise ZoeException(str(ex)) from ex

        if only_label is None:
            only_label = {}

        conts = []
        for cont_info in ret:
            # Treat a missing label set as empty instead of failing on None.
            labels = cont_info.attrs['Config']['Labels'] or {}
            match = all(key in labels and labels[key] == value for key, value in only_label.items())
            if match:
                conts.append(self._container_summary(cont_info))

        return conts

    def stats(self, docker_id: str, stream: bool):
        """
        Retrieve the resource usage statistics of a container.

        :param docker_id: the container to query
        :param stream: passed through to the Docker ``stats`` call
        :return: whatever the Docker call returns, or None on a Docker error
        """
        try:
            cont = self.cli.containers.get(docker_id)
        except (docker.errors.NotFound, docker.errors.APIError):
            return None

        try:
            return cont.stats(stream=stream)
        except docker.errors.APIError:
            return None

    def logs(self, docker_id: str, stream: bool, follow=None):
        """
        Retrieve the logs of the selected container.

        Both stdout and stderr are requested, with timestamps and no tail limit.

        :param docker_id: the container to query
        :param stream: passed through to the Docker ``logs`` call
        :param follow: passed through to the Docker ``logs`` call
        :return: whatever the Docker call returns, or None on a Docker error
        """
        try:
            cont = self.cli.containers.get(docker_id)
        except (docker.errors.NotFound, docker.errors.APIError):
            return None

        try:
            return cont.logs(stdout=True, stderr=True, follow=follow, stream=stream, timestamps=True, tail='all')
        except docker.errors.APIError:
            return None
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zoe backend implementation for old-style stand-alone Docker Swarm."""
import logging
from zoe_lib.state import Service
from zoe_lib.config import get_conf
from zoe_master.exceptions import ZoeStartExecutionRetryException, ZoeStartExecutionFatalException, ZoeException, ZoeNotEnoughResourcesException
import zoe_master.backends.base
from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.backends.docker.threads import DockerStateSynchronizer
from zoe_master.backends.docker.api_client import DockerClient
from zoe_master.backends.docker.config import DockerConfig, DockerHostConfig # pylint: disable=unused-import
from zoe_master.stats import ClusterStats # pylint: disable=unused-import
log = logging.getLogger(__name__)
# This module-level variable holds the reference to the checker thread
_checker = None
class DockerEngineBackend(zoe_master.backends.base.BaseBackend):
    """Zoe backend implementation that talks directly to Docker engines."""

    def __init__(self, opts):
        """Initialise the base backend and load the list of configured Docker hosts."""
        super().__init__(opts)
        self.docker_config = DockerConfig().read_config()

    def _get_config(self, host) -> DockerHostConfig:
        """
        Return the configuration of the host with the given name.

        :param host: the host name to look up
        :raises ZoeException: if no configured host has that name
        """
        for conf in self.docker_config:
            if conf.name == host:
                return conf
        # Fail here with a clear message rather than handing None to DockerClient.
        raise ZoeException('Docker host {} is not defined in the Docker backend configuration'.format(host))

    @classmethod
    def init(cls, state):
        """Initialize the backend by creating the state synchronizer."""
        global _checker
        _checker = DockerStateSynchronizer(state)

    @classmethod
    def shutdown(cls):
        """Perform a clean shutdown of the resources used by the backend."""
        # Nothing to stop if init() was never called.
        if _checker is not None:
            _checker.quit()

    def spawn_service(self, service_instance: ServiceInstance):
        """
        Spawn a service, translating a Zoe Service into a Docker container.

        :return: a tuple with the container ID and its IP address on the overlay network
        :raises ZoeStartExecutionRetryException: if the engine reports not enough resources
        :raises ZoeStartExecutionFatalException: on any other Zoe-level failure, including an unknown host
        """
        try:
            conf = self._get_config(service_instance.backend_host)
            engine = DockerClient(conf)
            cont_info = engine.spawn_container(service_instance)
        except ZoeNotEnoughResourcesException:
            raise ZoeStartExecutionRetryException('Not enough free resources to satisfy reservation request for service {}'.format(service_instance.name))
        except ZoeException as e:
            raise ZoeStartExecutionFatalException(str(e))

        return cont_info["id"], cont_info['ip_address'][get_conf().overlay_network_name]

    def terminate_service(self, service: Service) -> None:
        """
        Terminate and delete a container.

        :raises ZoeException: if the service's host is unknown or the client cannot be created
        """
        conf = self._get_config(service.backend_host)
        engine = DockerClient(conf)
        engine.terminate_container(service.backend_id, delete=True)

    def platform_state(self) -> ClusterStats:
        """Get the platform state."""
        return _checker.get_platform_stats()

    def service_log(self, service: Service):
        """
        Get the log of a service as a stream, without following it.

        :raises ZoeException: if the service's host is unknown or the client cannot be created
        """
        conf = self._get_config(service.backend_host)
        engine = DockerClient(conf)
        return engine.logs(service.backend_id, True, False)
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parses Docker-specific configuration file."""
import configparser
from typing import List
import logging
from zoe_lib.config import get_conf
log = logging.getLogger(__name__)
class DockerHostConfig:
    """Plain container for the static settings of one Docker host."""

    def __init__(self):
        # Identity and endpoint, both unset until filled in by the caller.
        self.name = self.address = None
        # TLS is off by default and its certificate, key and CA are unset.
        self.tls = False
        self.tls_cert = self.tls_key = self.tls_ca = None
        # Every instance gets its own, initially empty, label list.
        self.labels = list()
class DockerConfig:
    """A class that holds the configuration for the Docker Engine backend."""

    def __init__(self):
        """Remember the configuration file location taken from the Zoe configuration."""
        self.conffile = get_conf().backend_docker_config_file

    def read_config(self) -> List[DockerHostConfig]:
        """
        Parse the configuration file.

        Every section describes one host: the section name becomes the host
        name and the ``address``, ``use_tls`` and ``labels`` keys are
        required. ``tls_cert``, ``tls_ca`` and ``tls_key`` are required only
        when ``use_tls`` is true. A section with a missing key or an invalid
        ``use_tls`` value is logged and skipped.

        :return: the list of correctly configured hosts, possibly empty
        """
        config = configparser.ConfigParser()
        # read() skips files it cannot open and returns the ones it parsed.
        if not config.read(self.conffile):
            log.error('Cannot read Docker backend configuration file {}'.format(self.conffile))

        hosts = []
        for section in config.sections():
            options = config[section]
            host = DockerHostConfig()
            host.name = section
            try:
                host.address = options['address']
                host.tls = config.getboolean(section, 'use_tls')
                if host.tls:
                    host.tls_cert = options['tls_cert']
                    host.tls_ca = options['tls_ca']
                    host.tls_key = options['tls_key']
                # Ignore blanks around the commas and drop empty entries.
                host.labels = [label.strip() for label in options['labels'].split(',') if label.strip()]
            except (KeyError, configparser.NoOptionError) as e:
                # getboolean() reports a missing key with NoOptionError, not
                # KeyError; both carry the key name as their first argument.
                log.error('Error in Docker backend configuration, missing key {} in section {}'.format(e.args[0], section))
                continue
            except ValueError as e:
                log.error('Error in Docker backend configuration, invalid value in section {}: {}'.format(section, e))
                continue

            hosts.append(host)

        if not hosts:
            log.error('Host list is empty, verify your docker backend configuration!')

        return hosts
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Monitor for the Swarm event stream."""
import logging
import threading
import time
from copy import deepcopy
from zoe_lib.config import get_conf
from zoe_lib.state import SQLManager, Service
from zoe_master.backends.docker.api_client import DockerClient
from zoe_master.backends.docker.config import DockerConfig, DockerHostConfig # pylint: disable=unused-import
from zoe_master.exceptions import ZoeException
from zoe_master.stats import ClusterStats, NodeStats
log = logging.getLogger(__name__)
CHECK_INTERVAL = 10
THREAD_POOL_SIZE = 10
class DockerStateSynchronizer(threading.Thread):
"""The Docker Checker."""
def __init__(self, state: SQLManager) -> None:
super().__init__()
self.setName('checker')