Commit 15991623 authored by Daniele Venzano

Add a status page for administrators with the scheduler status

parent e1a88f0f
......@@ -23,6 +23,7 @@ import zoe_api.web.start
import zoe_api.web.websockets
import zoe_api.web.executions
import zoe_api.web.zapp_shop
import zoe_api.web.status
from zoe_lib.version import ZOE_API_VERSION, ZOE_VERSION
......@@ -50,7 +51,9 @@ def web_init(api_endpoint) -> List[tornado.web.URLSpec]:
tornado.web.url(r'/zapp-shop', zoe_api.web.zapp_shop.ZAppShopHomeWeb, route_args, name='zappshop'),
tornado.web.url(r'/zapp-shop/logo/([a-z\-.]+)', zoe_api.web.zapp_shop.ZAppLogoWeb, route_args, name='zappshop_logo'),
tornado.web.url(r'/zapp-shop/start/([0-9a-z\-.]+)', zoe_api.web.zapp_shop.ZAppStartWeb, route_args, name='zappshop_start')
tornado.web.url(r'/zapp-shop/start/([0-9a-z\-.]+)', zoe_api.web.zapp_shop.ZAppStartWeb, route_args, name='zappshop_start'),
tornado.web.url(r'/status', zoe_api.web.status.StatusEndpointWeb, route_args, name='status')
]
return web_routes
......
......@@ -188,3 +188,60 @@ div.status_line {
clear: both;
}
/* stats page */
/* Node name shown at the top of each node's detail section. */
.node_name {
margin-top: 1em;
font-size: larger;
}
/* Outer resource bar: full width, green background, black border.
   It holds the red "reserved" bar and the text label defined below. */
div.node_detail div.memory_total,
div.node_detail div.cores_total {
border: 1px solid black;
background-color: green;
width: 100%;
position: relative;
z-index: 0;
}
/* Inner red bar for the reserved share. Its width is not set here:
   the status template supplies it as an inline percentage style. */
div.node_detail div.memory_reserved,
div.node_detail div.cores_reserved {
background-color: red;
display: inline-block;
left: 0;
z-index: 2;
position: relative;
float: left;
}
/* Text label inside the outer bar, aligned to the end of the line. */
div.node_detail div.memory_total span,
div.node_detail div.cores_total span {
text-align: end;
width: 100%;
position: relative;
left: 0;
display: block;
z-index: 1;
}
/* Scheduler queue: flex container; overflow: auto lets content that does not fit scroll. */
div.scheduler_queue {
display: flex;
overflow: auto;
}
/* One execution in the queue; flex: none keeps the item from growing or shrinking. */
div.queue_item {
margin-right: 5px;
flex: none;
}
/* Service box. Default look is a dashed red border, which the page legend
   describes as an elastic, inactive service. */
div.service {
border: 1px dashed red;
margin-top: 3px;
padding: 5px;
}
/* Essential services get a solid border instead of a dashed one. */
div.essential {
border-style: solid;
}
/* Active services get a green border instead of a red one. */
div.running {
border-color: green;
}
# Copyright (c) 2017, Daniele Venzano
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Main points of entry for the Zoe web interface."""
from zoe_api.api_endpoint import APIEndpoint # pylint: disable=unused-import
from zoe_api.web.utils import get_auth, catch_exceptions
from zoe_api.web.custom_request_handler import ZoeRequestHandler
class StatusEndpointWeb(ZoeRequestHandler):
    """Request handler that renders the scheduler status page for administrators."""

    def initialize(self, **kwargs):
        """Run the parent initialization and keep the ``api_endpoint`` keyword argument."""
        super().initialize(**kwargs)
        self.api_endpoint = kwargs['api_endpoint']  # type: APIEndpoint

    @catch_exceptions
    def get(self):
        """Render ``status.html`` with the scheduler statistics.

        Callers that are not authenticated, or whose role is not ``admin``,
        are redirected to the ``next`` request argument (default ``/login``).
        """
        uid, role = get_auth(self)
        is_admin = uid is not None and role == 'admin'
        if not is_admin:
            return self.redirect(self.get_argument('next', u'/login'))

        stats = self.api_endpoint.statistics_scheduler(uid, role)

        # The statistics only carry execution IDs; the template needs the
        # execution objects, so resolve every ID found in either queue.
        executions_in_queue = {}
        for queue_key in ('queue', 'running_queue'):
            for exec_id in stats[queue_key]:
                executions_in_queue[exec_id] = self.api_endpoint.execution_by_id(uid, role, exec_id)

        self.render(
            'status.html',
            uid=uid,
            role=role,
            stats=stats,
            executions_in_queue=executions_in_queue,
        )
......@@ -15,6 +15,11 @@
<div class="nav-item">
<a href="{{ reverse_url("execution_list") }}">Executions</a>
</div>
{% if role == "admin" %}
<div class="nav-item">
<a href="{{ reverse_url("status") }}">Status</a>
</div>
{% endif %}
</div>
<div id="user_info">
{{ uid }} ({{ role }}) <a href="{{ reverse_url("logout") }}">logout</a>
......
......@@ -18,6 +18,7 @@
<h2>Detailed information for execution {{ e.name }}</h2>
<div id="contents">
<ul>
<li>ID: {{ e.id }}</li>
<li>Application name: {{ e.description['name'] }}</li>
<li>Owner: {{ e.user_id }}</li>
<li>Status: {{ e.status }}</li>
......@@ -56,7 +57,7 @@
{% endif %}
<ul>
{% for s in services_info %}
<li class="container_name">{{ s['name'] }}</li>
<li class="container_name">{{ s['name'] }} ({{ s['id'] }})</li>
<ul>
<li>Zoe status: {{ s['status'] }}</li>
<li>Backend status: {{ s['backend_status'] }}</li>
......
{% extends "base_user.html" %}
{% block title %}Zoe system status{% endblock %}
{% block content %}
<h2>Zoe system status</h2>
<h3>Scheduler</h3>
<ul>
<li>Queue length: <span id="sched_queue_len">{{ stats.queue_length }}</span></li>
<li>Running queue length: <span id="sched_running_queue_len">{{ stats.running_length }}</span></li>
<li>On-going clean-up threads: <span id="termination_threads_count">{{ stats.termination_threads_count }}</span></li>
</ul>
<h4>Queue</h4>
<p>Service border legend:</p>
<ul>
<li>Green/red: service is active/inactive. Active services have been scheduled and placed.</li>
<li>Solid/dashed: service is essential/elastic</li>
</ul>
<div class="scheduler_queue">
    {# One item per execution in the scheduler queue; essential services are rendered before the others. #}
    {% for exec_id in stats['queue'] %}
    <div class="queue_item" id="{{ exec_id }}">
        <a href="{{ reverse_url('execution_inspect', exec_id) }}">{{ exec_id }}</a>
        {% for svc in executions_in_queue[exec_id].services if svc.essential %}
        <div class="service essential {{ 'running' if svc.status == svc.ACTIVE_STATUS }}">
            {{ svc['name'] }}<br/>
            M: <script>format_bytes({{ svc['resource_reservation']['memory']['max'] }});</script><br/>
            C: {{ svc['resource_reservation']['cores']['max'] }}
        </div>
        {% endfor %}
        {% for svc in executions_in_queue[exec_id].services if not svc.essential %}
        <div class="service {{ 'running' if svc.status == svc.ACTIVE_STATUS }}">
            {{ svc['name'] }}<br/>
            M: <script>format_bytes({{ svc['resource_reservation']['memory']['max'] }});</script><br/>
            C: {{ svc['resource_reservation']['cores']['max'] }}
        </div>
        {% endfor %}
    </div>
    {% else %}
    {# Rendered only when the queue holds no executions. #}
    <p>The queue is empty.</p>
    {% endfor %}
</div>
<h4>Running queue</h4>
<p>This queue is unsorted, all services here should be green.</p>
<div class="scheduler_queue">
    {# One item per execution in the running queue; essential services are rendered before the others. #}
    {% for exec_id in stats['running_queue'] %}
    <div class="queue_item" id="{{ exec_id }}">
        <a href="{{ reverse_url('execution_inspect', exec_id) }}">{{ exec_id }}</a>
        {% for svc in executions_in_queue[exec_id].services if svc.essential %}
        <div class="service essential {{ 'running' if svc.status == svc.ACTIVE_STATUS }}">
            {{ svc['name'] }}<br/>
            M: <script>format_bytes({{ svc['resource_reservation']['memory']['max'] }});</script><br/>
            C: {{ svc['resource_reservation']['cores']['max'] }}
        </div>
        {% endfor %}
        {% for svc in executions_in_queue[exec_id].services if not svc.essential %}
        <div class="service {{ 'running' if svc.status == svc.ACTIVE_STATUS }}">
            {{ svc['name'] }}<br/>
            M: <script>format_bytes({{ svc['resource_reservation']['memory']['max'] }});</script><br/>
            C: {{ svc['resource_reservation']['cores']['max'] }}
        </div>
        {% endfor %}
    </div>
    {% else %}
    {# Rendered only when the running queue holds no executions. #}
    <p>The running queue is empty.</p>
    {% endfor %}
</div>
<h3>Platform</h3>
<ul>
<li>Total containers: {{ stats.platform_stats.container_count }}</li>
<li>Total memory: <script>format_bytes({{ stats.platform_stats.memory_total }}, 2) </script></li>
<li>Total cores: {{ stats.platform_stats.cores_total }}</li>
</ul>
<div class="platform_node_detail">
    {# Per-node resource bars: the reserved share is drawn as a percentage of the node total. #}
    {% for node in stats.platform_stats.nodes %}
    {# Compute each percentage once. A node reporting a total of 0 is shown as 0% reserved
       instead of aborting the whole page with a division by zero. #}
    {% set memory_pct = node['memory_reserved'] * 100 / node['memory_total'] if node['memory_total'] else 0 %}
    {% set cores_pct = node['cores_reserved'] * 100 / node['cores_total'] if node['cores_total'] else 0 %}
    <div class="node_detail">
        <div class="node_name">{{ node['name'] }}</div>
        <div class="container_count">{{ node['container_count'] }} containers</div>
        <div class="memory_total">
            <div class="memory_reserved" style="width: {{ memory_pct }}%;">&nbsp;</div><span>{{ '%0.2f' % (memory_pct,) }}% memory reserved</span>
        </div>
        <div class="cores_total">
            <div class="cores_reserved" style="width: {{ cores_pct }}%;">&nbsp;</div><span>{{ '%0.2f' % (cores_pct,) }}% cores reserved</span>
        </div>
    </div>
    {% endfor %}
</div>
{% endblock %}
......@@ -109,6 +109,9 @@ class WebSocketEndpointWeb(tornado.websocket.WebSocketHandler):
continue
self.write_message(log_line)
elif request['command'] == 'system_status':
stats = self.api_endpoint.statistics_scheduler(self.uid, self.role)
self.write_message(json.dumps(stats))
else:
response = {
'status': 'error',
......
......@@ -24,6 +24,7 @@ from zoe_lib.state import Execution, Service
from zoe_master.backends.base import BaseBackend
from zoe_master.backends.service_instance import ServiceInstance
from zoe_master.exceptions import ZoeStartExecutionFatalException, ZoeStartExecutionRetryException, ZoeException
from zoe_master.stats import ClusterStats # pylint: disable=unused-import
try:
from zoe_master.backends.swarm.backend import SwarmBackend
......@@ -154,7 +155,7 @@ def terminate_execution(execution: Execution) -> None:
execution.set_terminated()
def get_platform_state():
def get_platform_state() -> ClusterStats:
    """Query the container backend for the current state of the platform.

    The platform state includes information on free/reserved resources for
    each node. This information is used for advanced scheduling.
    """
    return _get_backend().platform_state()
......@@ -49,6 +49,11 @@ class ZoeElasticScheduler:
self.loop_th = threading.Thread(target=self._thread_wrapper, name='scheduler')
self.loop_th.start()
self.state = state
for execution in self.state.execution_list(status='running'):
if execution.all_services_running:
self.queue_running.append(execution)
else:
self.queue.append(execution)
def trigger(self):
"""Trigger a scheduler run."""
......@@ -255,8 +260,16 @@ class ZoeElasticScheduler:
def stats(self):
    """Scheduler statistics.

    Returns a dict with:

    * ``queue_length``: number of executions in the queue
    * ``running_length``: number of executions in the running queue
    * ``termination_threads_count``: number of entries in ``async_threads``
    * ``queue``: IDs of the queued executions, ordered by execution size
      when the policy is ``"SIZE"``, otherwise in queue order
    * ``running_queue``: IDs of the executions in the running queue
    * ``platform_stats``: serialized result of ``get_platform_state()``
    """
    if self.policy == "SIZE":
        # sorted() builds a new list, so the scheduler's own queue is not reordered.
        queue = sorted(self.queue, key=lambda execution: execution.size)
    else:
        queue = self.queue

    return {
        'queue_length': len(self.queue),
        'running_length': len(self.queue_running),
        'termination_threads_count': len(self.async_threads),
        'queue': [execution.id for execution in queue],
        'running_queue': [execution.id for execution in self.queue_running],
        'platform_stats': get_platform_state().serialize()
    }
......@@ -43,6 +43,22 @@ class NodeStats(Stats):
self.status = None
self.error = ''
def serialize(self):
    """Return the node statistics as a plain dict, one entry per exported attribute."""
    exported_fields = (
        'name',
        'container_count',
        'cores_total',
        'cores_reserved',
        'cores_free',
        'memory_total',
        'memory_reserved',
        'memory_free',
        'labels',
        'status',
        'error',
    )
    # Keys are emitted in the order listed above.
    return {field: getattr(self, field) for field in exported_fields}
class ClusterStats(Stats):
"""Stats related to the whole cluster."""
......@@ -53,10 +69,11 @@ class ClusterStats(Stats):
self.cores_total = 0
self.nodes = []
class SchedulerStats(Stats):
    """Scheduler-related statistics."""

    def __init__(self):
        """Start with a waiting count of zero and an empty waiting list."""
        super().__init__()
        self.count_waiting = 0
        self.waiting_list = []
def serialize(self):
    """Return the cluster statistics as a plain dict, with every node serialized as well."""
    scalar_fields = ('container_count', 'memory_total', 'cores_total')
    summary = {field: getattr(self, field) for field in scalar_fields}
    # Each node supplies its own dict through its serialize() method.
    summary['nodes'] = [node.serialize() for node in self.nodes]
    return summary
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment