Commit c9dd1e1f authored by Daniele Venzano 🏇

Merge branch 'devel/fixes' into 'master'

Add a status page for administrators with the scheduler status

See merge request !17
parents 865df6c8 15991623
...@@ -23,6 +23,7 @@ import zoe_api.web.start ...@@ -23,6 +23,7 @@ import zoe_api.web.start
import zoe_api.web.websockets import zoe_api.web.websockets
import zoe_api.web.executions import zoe_api.web.executions
import zoe_api.web.zapp_shop import zoe_api.web.zapp_shop
import zoe_api.web.status
from zoe_lib.version import ZOE_API_VERSION, ZOE_VERSION from zoe_lib.version import ZOE_API_VERSION, ZOE_VERSION
...@@ -50,7 +51,9 @@ def web_init(api_endpoint) -> List[tornado.web.URLSpec]: ...@@ -50,7 +51,9 @@ def web_init(api_endpoint) -> List[tornado.web.URLSpec]:
tornado.web.url(r'/zapp-shop', zoe_api.web.zapp_shop.ZAppShopHomeWeb, route_args, name='zappshop'), tornado.web.url(r'/zapp-shop', zoe_api.web.zapp_shop.ZAppShopHomeWeb, route_args, name='zappshop'),
tornado.web.url(r'/zapp-shop/logo/([a-z\-.]+)', zoe_api.web.zapp_shop.ZAppLogoWeb, route_args, name='zappshop_logo'), tornado.web.url(r'/zapp-shop/logo/([a-z\-.]+)', zoe_api.web.zapp_shop.ZAppLogoWeb, route_args, name='zappshop_logo'),
tornado.web.url(r'/zapp-shop/start/([0-9a-z\-.]+)', zoe_api.web.zapp_shop.ZAppStartWeb, route_args, name='zappshop_start') tornado.web.url(r'/zapp-shop/start/([0-9a-z\-.]+)', zoe_api.web.zapp_shop.ZAppStartWeb, route_args, name='zappshop_start'),
tornado.web.url(r'/status', zoe_api.web.status.StatusEndpointWeb, route_args, name='status')
] ]
return web_routes return web_routes
......
...@@ -188,3 +188,60 @@ div.status_line { ...@@ -188,3 +188,60 @@ div.status_line {
clear: both; clear: both;
} }
/* Status page: per-node resource bars, scheduler queues and service boxes. */

/* Heading shown above each node's details. */
.node_name {
margin-top: 1em;
font-size: larger;
}
/* Full-width green bar standing for a node's total memory / cores.
   It is the bottom layer (z-index 0) of the three stacked elements. */
div.node_detail div.memory_total,
div.node_detail div.cores_total {
border: 1px solid black;
background-color: green;
width: 100%;
position: relative;
z-index: 0;
}
/* Red bar for the reserved share, floated to the left edge of the total bar.
   No width is set here; it has the highest z-index (2) of the three layers. */
div.node_detail div.memory_reserved,
div.node_detail div.cores_reserved {
background-color: red;
display: inline-block;
left: 0;
z-index: 2;
position: relative;
float: left;
}
/* Text label inside the total bar: block-level, full width, aligned to the
   end of the line, stacked between the two bars (z-index 1). */
div.node_detail div.memory_total span,
div.node_detail div.cores_total span {
text-align: end;
width: 100%;
position: relative;
left: 0;
display: block;
z-index: 1;
}
/* A queue is a single flex row of items; overflow: auto adds scrolling
   when the items do not fit. */
div.scheduler_queue {
display: flex;
overflow: auto;
}
/* One entry of a queue; flex: none keeps it from growing or shrinking. */
div.queue_item {
margin-right: 5px;
flex: none;
}
/* Default look of a service box: dashed red border. The two modifier
   classes below override the border style and colour respectively. */
div.service {
border: 1px dashed red;
margin-top: 3px;
padding: 5px;
}
/* Modifier: solid border instead of dashed. */
div.essential {
border-style: solid;
}
/* Modifier: green border instead of red. */
div.running {
border-color: green;
}
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Main points of entry for the Zoe web interface."""
from zoe_api.api_endpoint import APIEndpoint # pylint: disable=unused-import
from zoe_api.web.utils import get_auth, catch_exceptions
from zoe_api.web.custom_request_handler import ZoeRequestHandler
class StatusEndpointWeb(ZoeRequestHandler):
    """Web handler for the administrator-only scheduler status page."""

    def initialize(self, **kwargs):
        """Run the parent initialization and keep a reference to the API endpoint.

        The endpoint is read from the ``api_endpoint`` keyword argument.
        """
        super().initialize(**kwargs)
        self.api_endpoint = kwargs['api_endpoint']  # type: APIEndpoint

    @catch_exceptions
    def get(self):
        """Render ``status.html`` with the scheduler statistics.

        Requests that are not authenticated, or whose role is not ``admin``,
        are redirected to the URL given in the ``next`` argument (``/login``
        when that argument is absent).
        """
        uid, role = get_auth(self)
        is_admin = uid is not None and role == 'admin'
        if not is_admin:
            return self.redirect(self.get_argument('next', u'/login'))

        stats = self.api_endpoint.statistics_scheduler(uid, role)

        # The template needs the execution behind every ID listed in either
        # queue; the waiting queue is resolved first, then the running one.
        executions_in_queue = {
            exec_id: self.api_endpoint.execution_by_id(uid, role, exec_id)
            for queue_name in ('queue', 'running_queue')
            for exec_id in stats[queue_name]
        }

        self.render(
            'status.html',
            uid=uid,
            role=role,
            stats=stats,
            executions_in_queue=executions_in_queue,
        )
...@@ -15,6 +15,11 @@ ...@@ -15,6 +15,11 @@
<div class="nav-item"> <div class="nav-item">
<a href="{{ reverse_url("execution_list") }}">Executions</a> <a href="{{ reverse_url("execution_list") }}">Executions</a>
</div> </div>
{% if role == "admin" %}
<div class="nav-item">
<a href="{{ reverse_url("status") }}">Status</a>
</div>
{% endif %}
</div> </div>
<div id="user_info"> <div id="user_info">
{{ uid }} ({{ role }}) <a href="{{ reverse_url("logout") }}">logout</a> {{ uid }} ({{ role }}) <a href="{{ reverse_url("logout") }}">logout</a>
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
<h2>Detailed information for execution {{ e.name }}</h2> <h2>Detailed information for execution {{ e.name }}</h2>
<div id="contents"> <div id="contents">
<ul> <ul>
<li>ID: {{ e.id }}</li>
<li>Application name: {{ e.description['name'] }}</li> <li>Application name: {{ e.description['name'] }}</li>
<li>Owner: {{ e.user_id }}</li> <li>Owner: {{ e.user_id }}</li>
<li>Status: {{ e.status }}</li> <li>Status: {{ e.status }}</li>
...@@ -56,7 +57,7 @@ ...@@ -56,7 +57,7 @@
{% endif %} {% endif %}
<ul> <ul>
{% for s in services_info %} {% for s in services_info %}
<li class="container_name">{{ s['name'] }}</li> <li class="container_name">{{ s['name'] }} ({{ s['id'] }})</li>
<ul> <ul>
<li>Zoe status: {{ s['status'] }}</li> <li>Zoe status: {{ s['status'] }}</li>
<li>Backend status: {{ s['backend_status'] }}</li> <li>Backend status: {{ s['backend_status'] }}</li>
......
{% extends "base_user.html" %}
{% block title %}Zoe system status{% endblock %}

{% block content %}
{# Renders one service as a bordered box. The CSS classes encode the state:
   "essential" switches the border from dashed to solid, "running" switches
   it from red to green. Defined once here because both queues use it. #}
{% macro service_box(service) %}
<div class="service {{ 'essential' if service.essential }} {{ 'running' if service.status == service.ACTIVE_STATUS }}">
    {{ service['name'] }}<br/>
    M: <script>format_bytes({{ service['resource_reservation']['memory']['max'] }});</script><br/>
    C: {{ service['resource_reservation']['cores']['max'] }}
</div>
{% endmacro %}

<h2>Zoe system status</h2>

<h3>Scheduler</h3>
<ul>
    <li>Queue length: <span id="sched_queue_len">{{ stats.queue_length }}</span></li>
    <li>Running queue length: <span id="sched_running_queue_len">{{ stats.running_length }}</span></li>
    <li>On-going clean-up threads: <span id="termination_threads_count">{{ stats.termination_threads_count }}</span></li>
</ul>

<h4>Queue</h4>
<p>Service border legend:</p>
<ul>
    <li>Green/red: service is active/inactive. Active services have been scheduled and placed.</li>
    <li>Solid/dashed: service is essential/elastic</li>
</ul>

<div class="scheduler_queue">
    {% for exec_id in stats['queue'] %}
    <div class="queue_item" id="{{ exec_id }}">
        <a href="{{ reverse_url('execution_inspect', exec_id) }}">{{ exec_id }}</a>
        {% set services = executions_in_queue[exec_id].services %}
        {# Essential services are listed first, elastic ones after. #}
        {% for service in services if service.essential %}
        {{ service_box(service) }}
        {% endfor %}
        {% for service in services if not service.essential %}
        {{ service_box(service) }}
        {% endfor %}
    </div>
    {% else %}
    <p>The queue is empty.</p>
    {% endfor %}
</div>

<h4>Running queue</h4>
<p>This queue is unsorted, all services here should be green.</p>

<div class="scheduler_queue">
    {% for exec_id in stats['running_queue'] %}
    <div class="queue_item" id="{{ exec_id }}">
        <a href="{{ reverse_url('execution_inspect', exec_id) }}">{{ exec_id }}</a>
        {% set services = executions_in_queue[exec_id].services %}
        {# Essential services are listed first, elastic ones after. #}
        {% for service in services if service.essential %}
        {{ service_box(service) }}
        {% endfor %}
        {% for service in services if not service.essential %}
        {{ service_box(service) }}
        {% endfor %}
    </div>
    {% else %}
    <p>The running queue is empty.</p>
    {% endfor %}
</div>

<h3>Platform</h3>
<ul>
    <li>Total containers: {{ stats.platform_stats.container_count }}</li>
    <li>Total memory: <script>format_bytes({{ stats.platform_stats.memory_total }}, 2) </script></li>
    <li>Total cores: {{ stats.platform_stats.cores_total }}</li>
</ul>

<div class="platform_node_detail">
    {% for node in stats.platform_stats.nodes %}
    {# A node that reports a total of 0 would make the division fail and take
       the whole page down with it, so such a node is shown as 0% reserved. #}
    {% set memory_pct = (node['memory_reserved'] * 100 / node['memory_total']) if node['memory_total'] else 0 %}
    {% set cores_pct = (node['cores_reserved'] * 100 / node['cores_total']) if node['cores_total'] else 0 %}
    <div class="node_detail">
        <div class="node_name">{{ node['name'] }}</div>
        <div class="container_count">{{ node['container_count'] }} containers</div>
        <div class="memory_total">
            <div class="memory_reserved" style="width: {{ memory_pct }}%;">&nbsp;</div><span>{{ '%0.2f' % (memory_pct,) }}% memory reserved</span>
        </div>
        <div class="cores_total">
            <div class="cores_reserved" style="width: {{ cores_pct }}%;">&nbsp;</div><span>{{ '%0.2f' % (cores_pct,) }}% cores reserved</span>
        </div>
    </div>
    {% endfor %}
</div>
{% endblock %}
...@@ -109,6 +109,9 @@ class WebSocketEndpointWeb(tornado.websocket.WebSocketHandler): ...@@ -109,6 +109,9 @@ class WebSocketEndpointWeb(tornado.websocket.WebSocketHandler):
continue continue
self.write_message(log_line) self.write_message(log_line)
elif request['command'] == 'system_status':
stats = self.api_endpoint.statistics_scheduler(self.uid, self.role)
self.write_message(json.dumps(stats))
else: else:
response = { response = {
'status': 'error', 'status': 'error',
......
...@@ -24,6 +24,7 @@ from zoe_lib.state import Execution, Service ...@@ -24,6 +24,7 @@ from zoe_lib.state import Execution, Service
from zoe_master.backends.base import BaseBackend from zoe_master.backends.base import BaseBackend
from zoe_master.backends.service_instance import ServiceInstance from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.exceptions import ZoeStartExecutionFatalException, ZoeStartExecutionRetryException, ZoeException from zoe_master.exceptions import ZoeStartExecutionFatalException, ZoeStartExecutionRetryException, ZoeException
from zoe_master.stats import ClusterStats # pylint: disable=unused-import
try: try:
from zoe_master.backends.swarm.backend import SwarmBackend from zoe_master.backends.swarm.backend import SwarmBackend
...@@ -154,7 +155,7 @@ def terminate_execution(execution: Execution) -> None: ...@@ -154,7 +155,7 @@ def terminate_execution(execution: Execution) -> None:
execution.set_terminated() execution.set_terminated()
def get_platform_state(): def get_platform_state() -> ClusterStats:
"""Retrieves the state of the platform by querying the container backend. Platform state includes information on free/reserved resources for each node. This information is used for advanced scheduling.""" """Retrieves the state of the platform by querying the container backend. Platform state includes information on free/reserved resources for each node. This information is used for advanced scheduling."""
backend = _get_backend() backend = _get_backend()
return backend.platform_state() return backend.platform_state()
...@@ -49,6 +49,11 @@ class ZoeElasticScheduler: ...@@ -49,6 +49,11 @@ class ZoeElasticScheduler:
self.loop_th = threading.Thread(target=self._thread_wrapper, name='scheduler') self.loop_th = threading.Thread(target=self._thread_wrapper, name='scheduler')
self.loop_th.start() self.loop_th.start()
self.state = state self.state = state
for execution in self.state.execution_list(status='running'):
if execution.all_services_running:
self.queue_running.append(execution)
else:
self.queue.append(execution)
def trigger(self): def trigger(self):
"""Trigger a scheduler run.""" """Trigger a scheduler run."""
...@@ -255,8 +260,16 @@ class ZoeElasticScheduler: ...@@ -255,8 +260,16 @@ class ZoeElasticScheduler:
def stats(self): def stats(self):
"""Scheduler statistics.""" """Scheduler statistics."""
if self.policy == "SIZE":
queue = sorted(self.queue, key=lambda execution: execution.size)
else:
queue = self.queue
return { return {
'queue_length': len(self.queue), 'queue_length': len(self.queue),
'running_length': len(self.queue_running), 'running_length': len(self.queue_running),
'termination_threads_count': len(self.async_threads) 'termination_threads_count': len(self.async_threads),
'queue': [s.id for s in queue],
'running_queue': [s.id for s in self.queue_running],
'platform_stats': get_platform_state().serialize()
} }
...@@ -43,6 +43,22 @@ class NodeStats(Stats): ...@@ -43,6 +43,22 @@ class NodeStats(Stats):
self.status = None self.status = None
self.error = '' self.error = ''
def serialize(self):
    """Return this node's statistics as a plain dictionary.

    Every key is the name of the instance attribute its value is read from;
    the values are the attribute objects themselves, not copies.
    """
    exported_fields = (
        'name',
        'container_count',
        'cores_total',
        'cores_reserved',
        'cores_free',
        'memory_total',
        'memory_reserved',
        'memory_free',
        'labels',
        'status',
        'error',
    )
    return {field: getattr(self, field) for field in exported_fields}
class ClusterStats(Stats): class ClusterStats(Stats):
"""Stats related to the whole cluster.""" """Stats related to the whole cluster."""
...@@ -53,10 +69,11 @@ class ClusterStats(Stats): ...@@ -53,10 +69,11 @@ class ClusterStats(Stats):
self.cores_total = 0 self.cores_total = 0
self.nodes = [] self.nodes = []
def serialize(self):
class SchedulerStats(Stats): """Convert the object into a dict."""
"""Stats related to the scheduler.""" return {
def __init__(self): 'container_count': self.container_count,
super().__init__() 'memory_total': self.memory_total,
self.count_waiting = 0 'cores_total': self.cores_total,
self.waiting_list = [] 'nodes': [x.serialize() for x in self.nodes]
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment