Commit ae258a66 authored by Daniele Venzano

Consider all docker errors as fatal, except the "not enough resources" one

Expose container-level error messages to the user through the service error_message field
Closes #31
parent 7d10f22b
......@@ -33,6 +33,9 @@
<ul>
<li>Zoe status: {{ s['status'] }}</li>
<li>Docker status: {{ s['docker_status'] }}</li>
{% if s['error_message'] is not none %}
<li>Error: {{ s['error_message'] }}</li>
{% endif %}
{% if s['docker_status'] == 'started' %}
{% for p in s['description']['ports'] %}
<li><a href="{{ p['protocol'] }}://{{ s['ip_address'] }}:{{ p['port_number'] }}{{ p['path'] }}">{{ p['name'] }}</a></li>
......
......@@ -158,6 +158,8 @@ def exec_get_cmd(args):
print('Service {} (ID: {})'.format(service['name'], service['id']))
print(' - zoe status: {}'.format(service['status']))
print(' - docker status: {}'.format(service['docker_status']))
if service['error_message'] is not None:
print(' - error: {}'.format(service['error_message']))
if service['docker_status'] == 'started':
ip = service['ip_address']
for port in service['description']['ports']:
......
......@@ -46,3 +46,8 @@ class InvalidApplicationDescription(ZoeAPIException):
"""
def __init__(self, msg):
super().__init__("Error: " + msg)
class ZoeNotEnoughResourcesException(ZoeLibException):
    """Signal that a service could not be started because too few resources were available."""
......@@ -321,6 +321,7 @@ class Service(Base):
INACTIVE_STATUS = "inactive"
ACTIVE_STATUS = "active"
STARTING_STATUS = "starting"
ERROR_STATUS = "error"
DOCKER_UNDEFINED_STATUS = 'undefined'
DOCKER_CREATE_STATUS = 'created'
......@@ -377,7 +378,12 @@ class Service(Base):
def set_active(self, docker_id):
    """Mark the service as active and record the ID of its Docker container.

    Any error message stored for the service is cleared, both through the
    SQL manager and on this in-memory object.

    :param docker_id: identifier of the container that backs this service
    """
    # One update carries the status, the container ID and the cleared error
    # message, so the service row is written exactly once.
    self.sql_manager.service_update(self.id, status=self.ACTIVE_STATUS, docker_id=docker_id, error_message=None)
    self.error_message = None
def set_error(self, error_message):
    """Mark the service as failed: it could not be created/started.

    The status is switched to ERROR_STATUS and the message is stored through
    the SQL manager.

    :param error_message: human-readable description of the failure
    """
    self.sql_manager.service_update(self.id, status=self.ERROR_STATUS, error_message=error_message)
    # Mirror the stored message on this object, as set_active() does when it
    # clears it, so code holding this instance does not see a stale value.
    self.error_message = error_message
def set_docker_status(self, new_status):
"""Docker has emitted an event related to this service."""
......
......@@ -34,7 +34,7 @@ import docker.utils
import requests.packages
from zoe_master.stats import SwarmStats, SwarmNodeStats
from zoe_lib.exceptions import ZoeLibException
from zoe_lib.exceptions import ZoeLibException, ZoeNotEnoughResourcesException
log = logging.getLogger(__name__)
......@@ -223,6 +223,13 @@ class SwarmClient:
ports=options.ports,
labels=options.labels)
self.cli.start(container=cont.get('Id'))
except docker.errors.APIError as e:
if cont is not None:
self.cli.remove_container(container=cont.get('Id'), force=True)
if e.explanation == b'no resources available to schedule container':
raise ZoeNotEnoughResourcesException(message=e.explanation.decode('utf-8'))
else:
raise ZoeLibException(message=e.explanation.decode('utf-8'))
except Exception as e:
if cont is not None:
self.cli.remove_container(container=cont.get('Id'), force=True)
......
......@@ -22,7 +22,7 @@ from zoe_master.workspace.filesystem import ZoeFSWorkspace
from zoe_master.exceptions import ZoeStartExecutionRetryException, ZoeStartExecutionFatalException, ZoeException
from zoe_lib.config import get_conf
from zoe_lib.exceptions import ZoeLibException
from zoe_lib.exceptions import ZoeLibException, ZoeNotEnoughResourcesException
from zoe_lib.sql_manager import Execution, Service
from zoe_lib.swarm_client import DockerContainerOptions, SwarmClient
......@@ -61,7 +61,9 @@ def _gen_environment(service, env_subst_dict, copts):
try:
env_value = env_value.format(**env_subst_dict)
except KeyError:
raise ZoeStartExecutionFatalException("unknown variable in expression '{}', known variables are: {}".format(env_value, list(env_subst_dict.keys())))
# The detailed diagnosis goes into the service's error_message field; the
# exception itself only carries a short summary.
error_msg = "Unknown variable in environment expression '{}', known variables are: {}".format(env_value, list(env_subst_dict.keys()))
service.set_error(error_msg)
# Fill in the service name: without .format() the message was raised with a
# literal, unsubstituted "{}" placeholder.
raise ZoeStartExecutionFatalException("Service {} has wrong environment expression".format(service.name))
copts.add_env_variable(env_name, env_value)
......@@ -116,10 +118,11 @@ def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict)
try:
cont_info = swarm.spawn_container(service.description['docker_image'], copts)
except ZoeException as e:
raise ZoeStartExecutionRetryException(str(e))
except ZoeLibException as e:
raise ZoeStartExecutionRetryException(str(e))
except ZoeNotEnoughResourcesException:
service.set_error('Not enough free resources to satisfy reservation request')
raise ZoeStartExecutionRetryException('Not enough free resources to satisfy reservation request for service {}'.format(service.name))
except (ZoeException, ZoeLibException) as e:
raise ZoeStartExecutionFatalException(str(e))
service.set_active(cont_info["docker_id"])
......@@ -128,7 +131,8 @@ def _spawn_service(execution: Execution, service: Service, env_subst_dict: dict)
try:
swarm.connect_to_network(service.docker_id, net)
except ZoeException as e:
raise ZoeStartExecutionFatalException(str(e))
service.set_error(str(e))
raise ZoeStartExecutionFatalException("Failed to attach network {} to service {}".format(net, service.name))
return
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment