Commit dc68cf0d authored by Daniele Venzano

Remove rpyc and convert more client methods to new rpc

parent ba2bb2ba
from configparser import ConfigParser
rpycconf = {
'client_rpyc_autodiscovery': True,
'client_rpyc_server': None,
'client_rpyc_port': None,
ipcconf = {
'server': None,
'port': None,
}
config_paths = [
......
......@@ -3,41 +3,28 @@
import argparse
import logging
from rpyc.utils.server import ThreadedServer
from zoe_scheduler.rpyc_service import ZoeSchedulerRPCService
from zoe_scheduler.scheduler import zoe_sched
from zoe_scheduler.periodic_tasks import PeriodicTaskManager
from zoe_scheduler.ipc import ZoeIPCServer
from common.object_storage import init_history_paths
log = logging.getLogger('zoe')
loop = None
rpyc_server = None
def sigint_handler():
log.warning('CTRL-C detected, terminating event loop...')
loop.stop()
zoe_sched.stop_tasks()
rpyc_server.stop()
try:
loop.run_forever()
except RuntimeError:
pass
def process_arguments() -> argparse.Namespace:
argparser = argparse.ArgumentParser(description="Zoe Scheduler - Container Analytics as a Service scheduling component")
argparser.add_argument('-d', '--debug', action='store_true', help='Enable debug output')
argparser.add_argument('--rpyc-no-auto-register', action='store_true', help='Do not register automatically in the RPyC registry')
argparser.add_argument('--ipc-server-port', type=int, default=8723, help='Port the IPC server should bind to')
return argparser.parse_args()
def main():
global loop, rpyc_server
args = process_arguments()
if args.debug:
logging.basicConfig(level=logging.DEBUG)
......@@ -45,8 +32,6 @@ def main():
logging.basicConfig(level=logging.INFO)
logging.getLogger('requests').setLevel(logging.WARNING)
rpyc_logger = logging.getLogger('rpyc')
rpyc_logger.setLevel(logging.WARNING)
ipc_server = ZoeIPCServer(zoe_sched, args.ipc_server_port)
......@@ -55,16 +40,11 @@ def main():
tm = PeriodicTaskManager()
rpyc_server = ThreadedServer(ZoeSchedulerRPCService, '0.0.0.0', port=4000,
auto_register=not args.rpyc_no_auto_register,
protocol_config={"allow_public_attrs": True},
logger=rpyc_logger)
zoe_sched.init_tasks(tm)
ipc_server.start_loop()
ipc_server.start_thread()
rpyc_server.start()
zoe_sched.loop()
tm.stop_all()
......
......@@ -8,7 +8,7 @@ from tornado.ioloop import IOLoop
from zoe_web import app
from common.configuration import rpycconf
from common.configuration import ipcconf
log = logging.getLogger("zoe_web")
......@@ -16,8 +16,8 @@ log = logging.getLogger("zoe_web")
def process_arguments() -> argparse.Namespace:
argparser = argparse.ArgumentParser(description="Zoe Web - Container Analytics as a Service web client")
argparser.add_argument('-d', '--debug', action='store_true', default=False, help='Enable debug output')
argparser.add_argument('--rpyc-server', default=None, help='Specify an RPyC server instead of using autodiscovery')
argparser.add_argument('--rpyc-port', default=4000, type=int, help='Specify an RPyC server port, default is 4000')
argparser.add_argument('--ipc-server', default='localhost', help='Address of the Zoe scheduler process')
argparser.add_argument('--ipc-port', default=8723, type=int, help='Port of the Zoe scheduler process')
return argparser.parse_args()
......@@ -31,12 +31,8 @@ def main():
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("tornado").setLevel(logging.WARNING)
if args.rpyc_server is None:
rpycconf['client_rpyc_autodiscovery'] = True
else:
rpycconf['client_rpyc_autodiscovery'] = False
rpycconf['client_rpyc_server'] = args.rpyc_server
rpycconf['client_rpyc_port'] = args.rpyc_port
ipcconf['server'] = args.ipc_server
ipcconf['port'] = args.ipc_port
log.info("Starting HTTP server...")
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
......
from zoe_client.client import ZoeClient, get_zoe_client
from zoe_client.client import ZoeClient
import base64
import logging
import rpyc
from sqlalchemy.orm.exc import NoResultFound
from zoe_client.ipc import ZoeIPCClient
from common.state import AlchemySession
from common.state.application import ApplicationState, SparkNotebookApplicationState, SparkSubmitApplicationState, SparkApplicationState, Application
from common.state.container import ContainerState
from common.state.execution import ExecutionState, SparkSubmitExecutionState, Execution
from common.state.execution import ExecutionState, Execution
from common.state.proxy import ProxyState
from common.state.user import UserState
from common.application_resources import SparkApplicationResources
from common.exceptions import UserIDDoesNotExist, ApplicationStillRunning
import common.object_storage as storage
from common.configuration import zoeconf, rpycconf
from common.configuration import zoeconf
from zoe_client.entities import User
log = logging.getLogger(__name__)
......@@ -26,16 +25,9 @@ NOTEBOOK_IMAGE = REGISTRY + "/zoerepo/spark-notebook"
class ZoeClient:
def __init__(self, rpyc_server=None, rpyc_port=4000):
self.ipc_server = ZoeIPCClient("localhost")
self.rpyc_server = rpyc_server
self.rpyc_port = rpyc_port
def __init__(self, ipc_server='localhost', ipc_port=8723):
self.ipc_server = ZoeIPCClient(ipc_server, ipc_port)
self.state = AlchemySession()
if self.rpyc_server is None:
self.server_connection = rpyc.connect_by_service("ZoeSchedulerRPC")
else:
self.server_connection = rpyc.connect(self.rpyc_server, self.rpyc_port)
self.server = self.server_connection.root
# Applications
def application_get(self, application_id: int) -> Application:
......@@ -149,11 +141,7 @@ class ZoeClient:
# Containers
def container_stats(self, container_id):
try:
self.state.query(ContainerState).filter_by(id=container_id).one()
except NoResultFound:
return None
return self.server.container_stats(container_id)
return self.ipc_server.ask('container_stats', container_id=container_id)
# Executions
def execution_delete(self, execution_id: int) -> None:
......@@ -193,51 +181,23 @@ class ZoeClient:
return None
def execution_spark_new(self, application_id: int, name, commandline=None, spark_options=None) -> bool:
try:
application = self.state.query(ApplicationState).filter_by(id=application_id).one()
except NoResultFound:
return None
if type(application) is SparkSubmitApplicationState:
if commandline is None:
raise ValueError("Spark submit application requires a commandline")
execution = SparkSubmitExecutionState(name=name,
application_id=application.id,
status="submitted",
commandline=commandline,
spark_opts=spark_options)
else:
execution = ExecutionState(name=name,
application_id=application.id,
status="submitted")
self.state.add(execution)
self.state.commit()
ret = self.server.execution_schedule(execution.id)
return ret
ret = self.ipc_server.ask('execution_spark_new', application_id=application_id, name=name, commandline=commandline, spark_options=spark_options)
return ret is not None
def execution_terminate(self, execution_id: int) -> None:
try:
self.state.query(ExecutionState).filter_by(id=execution_id).one()
except NoResultFound:
pass
self.server.execution_terminate(execution_id)
ret = self.ipc_server.ask('execution_terminate', execution_id=execution_id)
return ret is not None
# Logs
def log_get(self, container_id: int) -> str:
try:
self.state.query(ContainerState).filter_by(id=container_id).one()
except NoResultFound:
return None
else:
ret = self.server.log_get(container_id)
return ret
clog = self.ipc_server.ask('log_get', container_id=container_id)
if clog is not None:
return clog['log']
def log_history_get(self, execution_id):
try:
execution = self.state.query(ExecutionState).filter_by(id=execution_id).one()
except NoResultFound:
return None
return storage.logs_archive_download(execution)
data = self.ipc_server.ask('log_history_get', execution_id=execution_id)
log_data = base64.b64decode(data['zip_data'])
return log_data
# Platform
def platform_stats(self) -> dict:
......@@ -263,10 +223,3 @@ class ZoeClient:
user_dict = self.ipc_server.ask('user_get_by_email', user_email=email)
if user_dict is not None:
return User(user_dict)
def get_zoe_client() -> ZoeClient:
if rpycconf['client_rpyc_autodiscovery']:
return ZoeClient()
else:
return ZoeClient(rpycconf['client_rpyc_server'], rpycconf['client_rpyc_port'])
import base64
import logging
import threading
......@@ -5,7 +6,11 @@ from sqlalchemy.orm.exc import NoResultFound
import zmq
from common.state import AlchemySession
from common.state.application import ApplicationState, SparkSubmitApplicationState
from common.state.container import ContainerState
from common.state.execution import ExecutionState, SparkSubmitExecutionState
from common.state.user import UserState
import common.object_storage as storage
from zoe_scheduler.scheduler import ZoeScheduler
......@@ -18,18 +23,23 @@ class ZoeIPCServer:
self.socket = self.context.socket(zmq.REP)
self.socket.bind("tcp://*:%s" % port)
self.th = None
self.state = AlchemySession()
self.state = None
self.sched = scheduler
def start_loop(self):
def start_thread(self):
    """Create the daemon thread named "IPC server" that runs ``self._loop`` and start it.

    The thread object is stored in ``self.th`` before it is started.
    """
    worker = threading.Thread(target=self._loop, name="IPC server", daemon=True)
    self.th = worker
    worker.start()
def _loop(self):
self.state = AlchemySession() # thread-local session
log.debug("IPC server thread started")
while True:
message = self.socket.recv_json()
reply = self._dispatch(message)
try:
reply = self._dispatch(message)
except:
log.exception("Uncaught exception in IPC server thread")
reply = self._reply_error('exception')
self.socket.send_json(reply)
def _dispatch(self, message: dict) -> dict:
......@@ -49,16 +59,80 @@ class ZoeIPCServer:
return func(**message["args"])
def _reply_ok(self, reply: dict) -> dict:
def _reply_ok(self, **reply) -> dict:
    """Build a success envelope whose 'answer' is the dict of keyword arguments given."""
    envelope = {'status': 'ok'}
    envelope['answer'] = reply
    return envelope
def _reply_error(self, error_msg: str) -> dict:
    """Build an error envelope carrying ``error_msg`` as its 'answer'."""
    envelope = dict(status='error')
    envelope['answer'] = error_msg
    return envelope
# ############# Exposed methods below ################
# Containers
def container_stats(self, container_id: int) -> dict:
    """Reply with the statistics the platform reports for ``container_id``.

    The stats object returned by ``self.sched.platform.container_stats`` is
    converted with ``to_dict()`` and its entries become the reply's answer.
    """
    stats = self.sched.platform.container_stats(container_id)
    as_dict = stats.to_dict()
    return self._reply_ok(**as_dict)
# Executions
def execution_spark_new(self, application_id: int, name: str, commandline=None, spark_options=None) -> dict:
    """Create an execution for an application and submit it to the scheduler.

    :param application_id: ID of the application to execute.
    :param name: name given to the new execution.
    :param commandline: command line, mandatory when the application is a
        ``SparkSubmitApplicationState``.
    :param spark_options: passed as ``spark_opts`` to spark-submit executions.
    :return: an OK reply when the scheduler accepted the execution, otherwise
        an error reply (unknown application, or execution refused).
    :raises ValueError: if a spark-submit application is given no commandline.
    """
    try:
        application = self.state.query(ApplicationState).filter_by(id=application_id).one()
    except NoResultFound:
        return self._reply_error('no such application')

    # Exact type match kept on purpose: only spark-submit applications take a commandline.
    if type(application) is SparkSubmitApplicationState:
        if commandline is None:
            raise ValueError("Spark submit application requires a commandline")
        execution = SparkSubmitExecutionState(name=name,
                                              application_id=application.id,
                                              status="submitted",
                                              commandline=commandline,
                                              spark_opts=spark_options)
    else:
        execution = ExecutionState(name=name,
                                   application_id=application.id,
                                   status="submitted")
    self.state.add(execution)

    if not self.sched.incoming(execution):
        # The refusal must be returned to the caller: previously the error
        # reply was built and dropped, and an OK reply was sent instead.
        self.state.rollback()
        return self._reply_error('admission control refused this application execution')

    execution.set_scheduled()
    self.state.commit()
    return self._reply_ok()
def execution_terminate(self, execution_id: int) -> dict:
    """Ask the scheduler to terminate an execution.

    A dedicated session is opened for the operation and is always closed,
    whether the termination succeeds or not.

    :param execution_id: ID of the execution to terminate.
    :return: an OK reply once the termination has been committed, or an
        error reply if no execution has that ID.
    """
    state = AlchemySession()
    try:
        try:
            execution = state.query(ExecutionState).filter_by(id=execution_id).one()
        except NoResultFound:
            # Same handling as the other lookups in this class: report the
            # missing row instead of letting the exception escape.
            return self._reply_error('no such execution')
        self.sched.execution_terminate(state, execution)
        state.commit()
    finally:
        # Close on every path so a failure does not leak the session.
        state.close()
    return self._reply_ok()
# Logs
def log_get(self, container_id: int) -> dict:
    """Reply with the log of a container under the 'log' key.

    Returns an error reply when no container has the given ID.
    """
    try:
        container = self.state.query(ContainerState).filter_by(id=container_id).one()
    except NoResultFound:
        return self._reply_error('no such container')
    # Fetched outside the try so platform errors are not caught by it.
    container_log = self.sched.platform.log_get(container)
    return self._reply_ok(log=container_log)
def log_history_get(self, execution_id) -> dict:
    """Reply with the archived logs of an execution, base64-encoded.

    The archive returned by ``storage.logs_archive_download`` is base64-encoded
    and sent as an ASCII string under the 'zip_data' key. Returns an error
    reply when no execution has the given ID.
    """
    try:
        execution = self.state.query(ExecutionState).filter_by(id=execution_id).one()
    except NoResultFound:
        return self._reply_error('no such execution')
    archive = storage.logs_archive_download(execution)
    encoded = base64.b64encode(archive).decode('ascii')
    return self._reply_ok(zip_data=encoded)
# Platform
def platform_stats(self):
def platform_stats(self) -> dict:
ret = self.sched.platform_status.stats()
return self._reply_ok(ret.to_dict())
return self._reply_ok(**ret.to_dict())
# Users
def user_get(self, user_id) -> dict:
......@@ -67,7 +141,7 @@ class ZoeIPCServer:
except NoResultFound:
return self._reply_error('no such user')
else:
return self._reply_ok(user.to_dict())
return self._reply_ok(**user.to_dict())
def user_get_by_email(self, user_email) -> dict:
try:
......@@ -75,10 +149,10 @@ class ZoeIPCServer:
except NoResultFound:
return self._reply_error('no such user')
else:
return self._reply_ok(user.to_dict())
return self._reply_ok(**user.to_dict())
def user_new(self, email: str) -> dict:
user = UserState(email=email)
self.state.add(user)
self.state.commit()
return self._reply_ok(user.to_dict())
return self._reply_ok(**user.to_dict())
......@@ -45,7 +45,9 @@ class ProxyManager:
def _get_proxy_entries(self):
state = AlchemySession()
return state.query(ProxyState).all()
ret = state.query(ProxyState).all()
state.close()
return ret
def _generate_file(self, proxy_entries):
output = ""
......@@ -100,5 +102,6 @@ class ProxyManager:
proxy.container.cluster.execution.termination_notice = False
if something_to_commit:
state.commit()
state.close()
pm = ProxyManager()
import rpyc
from sqlalchemy.orm.exc import NoResultFound
from zoe_scheduler.scheduler import zoe_sched
from zoe_scheduler.stats import PlatformStats, ContainerStats
from common.state import AlchemySession, ContainerState
from common.state.execution import ExecutionState
class ZoeSchedulerRPCService(rpyc.Service):
    """RPyC service whose ``exposed_*`` methods forward requests to the scheduler."""

    # Scheduler instance shared by every connection.
    sched = zoe_sched

    def on_connect(self):
        """Connection hook; nothing to do."""

    def on_disconnect(self):
        """Disconnection hook; nothing to do."""

    def exposed_container_stats(self, container_id: int) -> ContainerStats:
        """Return the platform's statistics for ``container_id``."""
        platform = self.sched.platform
        return platform.container_stats(container_id)

    def exposed_execution_schedule(self, execution_id: int) -> bool:
        """Submit an existing execution to the scheduler.

        The execution is marked as scheduled and committed only when
        ``self.sched.incoming`` returns a true value; that value is returned.
        """
        session = AlchemySession()
        execution = session.query(ExecutionState).filter_by(id=execution_id).one()
        accepted = self.sched.incoming(execution)
        if not accepted:
            return accepted
        execution.set_scheduled()
        session.commit()
        return accepted

    def exposed_execution_terminate(self, execution_id: int) -> bool:
        """Terminate an execution through the scheduler, commit, and return True."""
        session = AlchemySession()
        lookup = session.query(ExecutionState).filter_by(id=execution_id)
        self.sched.execution_terminate(session, lookup.one())
        session.commit()
        return True

    def exposed_log_get(self, container_id: int) -> str:
        """Return the log of a container, or None when the container is unknown."""
        session = AlchemySession()
        try:
            container = session.query(ContainerState).filter_by(id=container_id).one()
        except NoResultFound:
            return None
        platform = self.sched.platform
        return platform.log_get(container)

    def exposed_platform_stats(self) -> PlatformStats:
        """Return the current platform statistics."""
        status = self.sched.platform_status
        return status.stats()
......@@ -75,7 +75,6 @@ class ZoeScheduler:
def init_tasks(self, tm: PeriodicTaskManager):
tm.add_task("platform status updater", self.platform_status.update, zoeconf.interval_status_refresh)
tm.add_task("scheduler", self.schedule, zoeconf.interval_scheduler_task)
tm.add_task("proxy access timestamp updater", pm.update_proxy_access_timestamps, zoeconf.interval_proxy_update_accesses)
tm.add_task("execution health checker", self.platform.check_executions_health, zoeconf.interval_check_health)
......@@ -95,6 +94,11 @@ class ZoeScheduler:
else: # Some error happened
log.error('Execution ID {} cannot be started'.format(execution_id))
def loop(self):  # FIXME the scheduler should wait on events, not sleep
    """Run the scheduling pass forever, sleeping between passes.

    The pause is re-read from ``zoeconf.interval_scheduler_task`` on every
    iteration. The method does not return on its own.
    """
    while True:
        self.schedule()
        pause = zoeconf.interval_scheduler_task
        time.sleep(pause)
def schedule(self):
self._check_runnable()
......
......@@ -9,4 +9,3 @@ app.register_blueprint(web_bp, url_prefix='')
app.register_blueprint(api_bp, url_prefix='/api')
app.secret_key = zoeconf.cookies_secret_key
......@@ -3,7 +3,8 @@ from zipfile import is_zipfile
from flask import Blueprint, jsonify, request, session, abort, send_file
from zoe_client import get_zoe_client
from zoe_client import ZoeClient
from common.configuration import ipcconf
from common.exceptions import ApplicationStillRunning
api_bp = Blueprint('api', __name__)
......@@ -21,11 +22,11 @@ def _api_check_user(zoe_client):
@api_bp.route('/status/basic')
def status_basic():
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
platform_stats = client.platform_stats()
ret = {
'num_nodes': len(platform_stats.swarm.nodes),
'num_containers': platform_stats.swarm.container_count
'num_nodes': len(platform_stats['swarm']['nodes']),
'num_containers': platform_stats['swarm']['container_count']
}
return jsonify(**ret)
......@@ -34,7 +35,7 @@ def status_basic():
def login():
form_data = request.form
email = form_data["email"]
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = client.user_get_by_email(email)
if user is None:
user = client.user_new(email)
......@@ -44,7 +45,7 @@ def login():
@api_bp.route('/applications/new', methods=['POST'])
def application_new():
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = _api_check_user(client)
form_data = request.form
......@@ -66,7 +67,7 @@ def application_new():
@api_bp.route('/applications/delete/<app_id>', methods=['GET', 'POST'])
def application_delete(app_id):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
try:
......@@ -79,7 +80,7 @@ def application_delete(app_id):
@api_bp.route('/applications/download/<int:app_id>')
def application_binary_download(app_id: int):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
data = client.application_get_binary(app_id)
......@@ -91,7 +92,7 @@ def application_binary_download(app_id: int):
@api_bp.route('/executions/new', methods=['POST'])
def execution_new():
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
form_data = request.form
......@@ -111,7 +112,7 @@ def execution_new():
@api_bp.route('/executions/logs/container/<int:container_id>')
def execution_logs(container_id: int):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
log = client.log_get(container_id)
......@@ -122,20 +123,20 @@ def execution_logs(container_id: int):
@api_bp.route('/executions/stats/container/<int:container_id>')
def execution_stats(container_id: int):
client = get_zoe_client()
def container_stats(container_id: int):
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
stats = client.container_stats(container_id)
if stats is None:
return jsonify(status="error", msg="no stats found")
else:
return jsonify(status="ok", **stats.to_dict())
return jsonify(status="ok", **stats)
@api_bp.route('/executions/terminate/<int:exec_id>')
def execution_terminate(exec_id: int):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
client.execution_terminate(exec_id)
......@@ -145,7 +146,7 @@ def execution_terminate(exec_id: int):
@api_bp.route('/history/logs/<int:execution_id>')
def history_logs_get(execution_id: int):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
_api_check_user(client)
logs = client.log_history_get(execution_id)
......
from flask import render_template
from zoe_client import get_zoe_client
from zoe_client import ZoeClient
from common.configuration import ipcconf
from zoe_web.web import web_bp
import zoe_web.utils as web_utils
@web_bp.route('/apps/new')
def application_new():
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
template_vars = {
......@@ -19,7 +20,7 @@ def application_new():
@web_bp.route('/executions/new/<app_id>')
def execution_new(app_id):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
application = client.application_get(app_id)
......@@ -33,7 +34,7 @@ def execution_new(app_id):
@web_bp.route('/executions/terminate/<exec_id>')
def execution_terminate(exec_id):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
execution = client.execution_get(exec_id)
......@@ -47,7 +48,7 @@ def execution_terminate(exec_id):
@web_bp.route('/apps/delete/<app_id>')
def application_delete(app_id):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
application = client.application_get(app_id)
......@@ -61,7 +62,7 @@ def application_delete(app_id):
@web_bp.route('/executions/inspect/<execution_id>')
def execution_inspect(execution_id):
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
execution = client.execution_get(execution_id)
......
from flask import render_template
from zoe_client import get_zoe_client
from zoe_client import ZoeClient
from common.configuration import ipcconf
from zoe_web.web import web_bp
import zoe_web.utils as web_utils
from common.state.execution import Execution
......@@ -13,7 +14,7 @@ def index():
@web_bp.route('/home')
def home():
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
apps = client.application_list(user.id)
template_vars = {
......
from flask import render_template
from zoe_client import get_zoe_client
from zoe_client import ZoeClient
from common.configuration import ipcconf
from zoe_web.web import web_bp
import zoe_web.utils as web_utils
@web_bp.route('/status/platform')
def status_platform():
client = get_zoe_client()
client = ZoeClient(ipcconf['server'], ipcconf['port'])
user = web_utils.check_user(client)
platform_stats = client.platform_stats()
......
......@@ -5,8 +5,8 @@
<h3>Scheduler stats</h3>
<ul>
<li>Running applications: {{ platform.scheduler.count_running }}</li>
<li>Waiting applications: {{ platform.scheduler.count_waiting }}</li>
<li>Running applications: {{ platform['scheduler']['count_running'] }}</li>
<li>Waiting applications: {{ platform['scheduler']['count_waiting'] }}</li>
</ul>
<h3>Swarm status</h3>
......
......@@ -3,18 +3,23 @@
from argparse import ArgumentParser, Namespace
import logging
from zipfile import is_zipfile
from pprint import pprint
from zoe_client import get_zoe_client
from zoe_client import ZoeClient
from common.state import create_tables
from common.configuration import zoeconf, rpycconf
from common.configuration import zoeconf
argparser = None
def stats_cmd(_):
client = get_zoe_client()
def get_zoe_client(args) -> ZoeClient: