Commit 4bf272be authored by Daniele Venzano

Add per-node service list to the status page

parent 59a86198
...@@ -218,6 +218,15 @@ div.running { ...@@ -218,6 +218,15 @@ div.running {
border-color: green; border-color: green;
} }
div.node_detail {
width: 48%;
height: 28em;
float: left;
border: dashed darkgray 1px;
margin: 0.2em;
padding-left: 0.5em;
}
div.pie-plots { div.pie-plots {
width: 20em; width: 20em;
height: 22em; height: 22em;
...@@ -230,3 +239,16 @@ div.plot-container { ...@@ -230,3 +239,16 @@ div.plot-container {
div.plot-container p { div.plot-container p {
text-align: center; text-align: center;
} }
h3.section {
clear: both;
}
table.service-distrib th {
border-bottom: 1px double black;
}
table.service-distrib {
border-spacing: 10px;
border-collapse: separate;
}
...@@ -45,11 +45,14 @@ class StatusEndpointWeb(ZoeRequestHandler): ...@@ -45,11 +45,14 @@ class StatusEndpointWeb(ZoeRequestHandler):
for exec_id in stats['running_queue']: for exec_id in stats['running_queue']:
executions_in_queue[exec_id] = self.api_endpoint.execution_by_id(uid, role, exec_id) executions_in_queue[exec_id] = self.api_endpoint.execution_by_id(uid, role, exec_id)
max_service_count = max([len(node['services']) for node in stats['platform_stats']['nodes']])
template_vars = { template_vars = {
"uid": uid, "uid": uid,
"role": role, "role": role,
"stats": stats, "stats": stats,
"executions_in_queue": executions_in_queue "executions_in_queue": executions_in_queue,
"max_service_count": max_service_count
} }
self.render('status.html', **template_vars) self.render('status.html', **template_vars)
...@@ -8,7 +8,16 @@ ...@@ -8,7 +8,16 @@
{% block content %} {% block content %}
<h2>Zoe system status</h2> <h2>Zoe system status</h2>
<h3>Scheduler</h3> <div>
<h3 class="section">Index</h3>
<ul>
<li><a href="#scheduler">Scheduler</a></li>
<li><a href="#platform">Platform</a></li>
<li><a href="#service-distrib">Service distribution</a></li>
</ul>
</div>
<h3 class="section"><a name="scheduler">Scheduler</a></h3>
<ul> <ul>
<li>Queue length: <span id="sched_queue_len">{{ stats.queue_length }}</span></li> <li>Queue length: <span id="sched_queue_len">{{ stats.queue_length }}</span></li>
...@@ -82,7 +91,7 @@ ...@@ -82,7 +91,7 @@
{% endif %} {% endif %}
</div> </div>
<h3>Platform</h3> <h3 class="section"><a name="platform">Platform</a></h3>
<ul> <ul>
<li>Total containers: {{ stats.platform_stats.container_count }}</li> <li>Total containers: {{ stats.platform_stats.container_count }}</li>
<li>Total memory: <script>format_bytes({{ stats.platform_stats.memory_total }}, 2) </script></li> <li>Total memory: <script>format_bytes({{ stats.platform_stats.memory_total }}, 2) </script></li>
...@@ -197,4 +206,30 @@ ...@@ -197,4 +206,30 @@
{% endfor %} {% endfor %}
</div> </div>
<h3 class="section"><a name="service-distrib">Service distribution</a></h3>
<table class="service-distrib">
<thead>
<tr>
<th class="cell-host">Host</th>
<th colspan="{{ max_service_count }}">Services and reserved resources</th>
</tr>
</thead>
<tbody>
{% for node in stats.platform_stats.nodes %}
<tr>
<td class="cell-host">{{ node.name }}</td>
{% for service in node.services %}
<td><a href="{{ reverse_url('execution_inspect', service['execution_id']) }}">{{ service['name'] }}</a> (M: <script>format_bytes({{ service['description']['resources']['memory']['max'] }});</script> C: {{ service['description']['resources']['cores']['max'] }})</td>
{% endfor %}
</tr>
{% endfor %}
</tbody>
</table>
<script>
function refresh_page() {
document.location.reload();
}
setInterval(refresh_page, 15000);
</script>
{% endblock %} {% endblock %}
...@@ -19,7 +19,7 @@ import logging ...@@ -19,7 +19,7 @@ import logging
from typing import List from typing import List
from zoe_lib.config import get_conf from zoe_lib.config import get_conf
from zoe_lib.state import Execution, Service from zoe_lib.state import Execution, Service, SQLManager # pylint: disable=unused-import
from zoe_master.backends.base import BaseBackend from zoe_master.backends.base import BaseBackend
from zoe_master.backends.service_instance import ServiceInstance from zoe_master.backends.service_instance import ServiceInstance
...@@ -166,7 +166,10 @@ def terminate_execution(execution: Execution) -> None: ...@@ -166,7 +166,10 @@ def terminate_execution(execution: Execution) -> None:
execution.set_terminated() execution.set_terminated()
def get_platform_state() -> ClusterStats: def get_platform_state(state: SQLManager) -> ClusterStats:
"""Retrieves the state of the platform by querying the container backend. Platform state includes information on free/reserved resources for each node. This information is used for advanced scheduling.""" """Retrieves the state of the platform by querying the container backend. Platform state includes information on free/reserved resources for each node. This information is used for advanced scheduling."""
backend = _get_backend() backend = _get_backend()
return backend.platform_state() platform_state = backend.platform_state()
for node in platform_state.nodes:
node.services = state.service_list(backend_host=node.name, backend_status=Service.BACKEND_START_STATUS)
return platform_state
...@@ -48,7 +48,8 @@ class ServiceInstance: ...@@ -48,7 +48,8 @@ class ServiceInstance:
'zoe.service.id': str(service.id), 'zoe.service.id': str(service.id),
'zoe.owner': execution.user_id, 'zoe.owner': execution.user_id,
'zoe.deployment_name': get_conf().deployment_name, 'zoe.deployment_name': get_conf().deployment_name,
'zoe.type': 'app_service' 'zoe.type': 'service_{}'.format('essential' if service.essential else 'elastic'),
'zoe.zapp_size': execution.size
} }
if service.is_monitor: if service.is_monitor:
self.labels['zoe_monitor'] = 'true' self.labels['zoe_monitor'] = 'true'
......
...@@ -113,9 +113,6 @@ class SwarmClient: ...@@ -113,9 +113,6 @@ class SwarmClient:
"""Retrieve Swarm statistics. The Docker API returns a mess difficult to parse.""" """Retrieve Swarm statistics. The Docker API returns a mess difficult to parse."""
info = self.cli.info() info = self.cli.info()
pl_status = ClusterStats() pl_status = ClusterStats()
pl_status.container_count = info["Containers"]
pl_status.memory_total = info["MemTotal"]
pl_status.cores_total = info["NCPU"]
# SystemStatus is a list... # SystemStatus is a list...
idx = 0 # Role, skip idx = 0 # Role, skip
...@@ -135,7 +132,10 @@ class SwarmClient: ...@@ -135,7 +132,10 @@ class SwarmClient:
node_stats.docker_endpoint = info["SystemStatus"][idx + node][1] node_stats.docker_endpoint = info["SystemStatus"][idx + node][1]
idx2 += 1 # ID, skip idx2 += 1 # ID, skip
idx2 += 1 # Status idx2 += 1 # Status
node_stats.status = info["SystemStatus"][idx + node + idx2][1] if info["SystemStatus"][idx + node + idx2][1] == 'Healthy':
node_stats.status = 'online'
else:
node_stats.status = 'offline'
idx2 += 1 # Containers idx2 += 1 # Containers
node_stats.container_count = int(info["SystemStatus"][idx + node + idx2][1].split(' ')[0]) node_stats.container_count = int(info["SystemStatus"][idx + node + idx2][1].split(' ')[0])
idx2 += 1 # CPUs idx2 += 1 # CPUs
......
...@@ -190,7 +190,7 @@ class ZoeElasticScheduler: ...@@ -190,7 +190,7 @@ class ZoeElasticScheduler:
log.debug("-> {}".format(job)) log.debug("-> {}".format(job))
try: try:
platform_state = get_platform_state() platform_state = get_platform_state(self.state)
except ZoeException: except ZoeException:
log.error('Cannot retrieve platform state, cannot schedule') log.error('Cannot retrieve platform state, cannot schedule')
for job in jobs_to_attempt_scheduling: for job in jobs_to_attempt_scheduling:
...@@ -286,5 +286,5 @@ class ZoeElasticScheduler: ...@@ -286,5 +286,5 @@ class ZoeElasticScheduler:
'termination_threads_count': len(self.async_threads), 'termination_threads_count': len(self.async_threads),
'queue': [s.id for s in queue], 'queue': [s.id for s in queue],
'running_queue': [s.id for s in self.queue_running], 'running_queue': [s.id for s in self.queue_running],
'platform_stats': get_platform_state().serialize() 'platform_stats': get_platform_state(self.state).serialize()
} }
...@@ -42,10 +42,11 @@ class NodeStats(Stats): ...@@ -42,10 +42,11 @@ class NodeStats(Stats):
self.labels = {} self.labels = {}
self.status = None self.status = None
self.error = '' self.error = ''
self.services = []
def serialize(self): def serialize(self):
"""Convert the object into a dict.""" """Convert the object into a dict."""
return { ret = {
'name': self.name, 'name': self.name,
'container_count': self.container_count, 'container_count': self.container_count,
'cores_total': self.cores_total, 'cores_total': self.cores_total,
...@@ -56,8 +57,12 @@ class NodeStats(Stats): ...@@ -56,8 +57,12 @@ class NodeStats(Stats):
'memory_free': self.memory_free, 'memory_free': self.memory_free,
'labels': self.labels, 'labels': self.labels,
'status': self.status, 'status': self.status,
'error': self.error 'error': self.error,
'services': []
} }
for service in self.services:
ret['services'].append(service.serialize())
return ret
class ClusterStats(Stats): class ClusterStats(Stats):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment