Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in / Register
Toggle navigation
main
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Simone Rossi
main
Commits
722a5e2d
Commit
722a5e2d
authored
Dec 04, 2018
by
Simone Rossi
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[add] Add submission to zoe
parent
a3d39e28
Pipeline
#11162
canceled with stages
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
66 additions
and
26 deletions
+66
-26
grid_queue.py
scripts/grid_queue.py
+57
-26
grid_sample.json
scripts/grid_sample.json
+9
-0
No files found.
scripts/grid_queue.py
View file @
722a5e2d
...
...
@@ -35,6 +35,10 @@ import json
from
zoe_cmd.utils
import
read_auth
from
zoe_cmd.api_lib
import
ZoeAPI
import
zoe_lib.exceptions
from
zoe_lib.state.execution
import
Execution
from
zoe_cmd.api_lib.api_base
import
retry
LOG_FORMAT
=
'[
%(asctime)-15
s
%(levelname)
s
%(name)
s]
%(message)
s'
logger
=
logging
.
getLogger
()
...
...
@@ -150,7 +154,7 @@ class ParameterGrid(object):
# Reverse so most frequent cycling parameter comes first
keys
,
values_lists
=
zip
(
*
sorted
(
sub_grid
.
items
())[::
-
1
])
sizes
=
[
len
(
v_list
)
for
v_list
in
values_lists
]
total
=
np
.
product
(
sizes
)
total
=
product
(
sizes
)
if
ind
>=
total
:
# Try the next grid
...
...
@@ -176,7 +180,14 @@ def get_all_jobs(bin, executable, param_grid) -> list:
def
get_current_usage
(
api
):
running_executions
=
list
(
filter
(
lambda
e
:
(
e
[
'status'
]
==
'running'
or
e
[
'status'
]
==
'queued'
),
"""
Get all running or ready to run executions
:param api:
:return: list
"""
running_executions
=
list
(
filter
(
lambda
e
:
(
e
[
'status'
]
==
Execution
.
RUNNING_STATUS
or
e
[
'status'
]
==
Execution
.
QUEUED_STATUS
or
e
[
'status'
]
==
Execution
.
SUBMIT_STATUS
),
api
.
executions
.
list
()))
used_jobs
=
len
(
running_executions
)
...
...
@@ -197,73 +208,93 @@ def get_user_quota(api, user_id):
def
get_zapp_resources
(
zapp
):
return
zapp
[
'services'
][
0
][
'resources'
][
'cores'
][
'min'
],
zapp
[
'services'
][
0
][
'resources'
][
'memory'
][
'min'
],
@
retry
(
zoe_lib
.
exceptions
.
ZoeAPIException
)
def
submit_zapp
(
zapp
,
command
,
name
,
api
):
"""Submits one ZApp for execution."""
zapp
[
'services'
][
0
][
'command'
]
=
command
try
:
ret
=
api
.
executions
.
start
(
name
,
zapp
)
except
zoe_lib
.
exceptions
.
ZoeAPIException
as
e
:
logger
.
error
(
'Error starting ZApp: {}'
.
format
(
str
(
e
)))
ret
=
None
return
ret
def
main
():
# Definition of command line arg
Args
=
namedtuple
(
'Args'
,
[
'auth_file'
])
fauth
=
os
.
path
.
join
(
os
.
getenv
(
'HOME'
),
'.zoerc'
)
args
=
Args
(
fauth
)
auth
=
read_auth
(
args
)
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'name'
,
type
=
str
,
help
=
'Base application name'
)
parser
.
add_argument
(
'appfile'
,
type
=
str
,
help
=
'Base application description'
)
parser
.
add_argument
(
'gridfile'
,
type
=
str
,
help
=
"Json file with parameter configurations"
,
default
=
'experiment.json'
)
args
=
parser
.
parse_args
()
# Connect to zoe API
api
=
ZoeAPI
(
auth
[
'url'
],
auth
[
'user'
],
auth
[
'pass'
])
# Parse the application description (json file)
with
open
(
args
.
appfile
)
as
data_file
:
zapp
=
json
.
load
(
data_file
)
# Test if zapp file is valid
if
not
api
.
validation
.
validate
(
zapp
):
logger
.
error
(
'Zapp file not valid! Please provide a valid application description'
)
sys
.
exit
(
-
1
)
# Parse experiment setup file and prepare all combination of hyperparameters
with
open
(
args
.
gridfile
)
as
data_file
:
grid
=
json
.
load
(
data_file
)
list_of_jobs
=
get_all_jobs
(
grid
[
'bin'
],
grid
[
'executable'
],
grid
[
'hyperparameters'
])
api
=
ZoeAPI
(
auth
[
'url'
],
auth
[
'user'
],
auth
[
'pass'
])
user_id
=
13
# TODO: get this from somewhere
user_id
=
13
# TODO: get this from somewhere
# Get user quota (this is supposed to be fixed during an experiment - if not: move this in the next loop)
quota_jobs
,
quota_cores
,
quota_memory
=
get_user_quota
(
api
,
user_id
)
logger
.
info
(
'User
%
s (id:
%
d) can run
%
d concurrent jobs with up to
%
d cores and
%.0
f GB of memory'
%
(
auth
[
'user'
],
user_id
,
quota_jobs
,
quota_cores
,
quota_memory
/
(
1
<<
30
)))
# Get requested resources for one job
zapp_cores
,
zapp_memory
=
get_zapp_resources
(
zapp
)
list_of_jobs
=
get_all_jobs
(
grid
[
'bin'
],
grid
[
'executable'
],
grid
[
'hyperparameters'
])
job_count
=
0
while
len
(
list_of_jobs
)
!=
0
:
number_new_jobs
=
len
(
list_of_jobs
)
used_jobs
,
used_cores
,
used_memory
=
get_current_usage
(
api
)
logger
.
info
(
'User
%
s (id:
%
d) has currently
%
d running jobs (
%
d cores and
%.0
f GB of memory)'
%
logger
.
debug
(
'User
%
s (id:
%
d) has currently
%
d running jobs (
%
d cores and
%.0
f GB of memory)'
%
(
auth
[
'user'
],
user_id
,
used_jobs
,
used_cores
,
used_memory
/
(
1
<<
30
)))
zapp_cores
,
zapp_memory
=
get_zapp_resources
(
zapp
)
logger
.
info
(
'User
%
s (id:
%
d) has
%
d pending jobs (
%
d cores and
%.0
f GB of memory)'
%
logger
.
debug
(
'User
%
s (id:
%
d) has
%
d pending jobs (
%
d cores and
%.0
f GB of memory)'
%
(
auth
[
'user'
],
user_id
,
number_new_jobs
,
number_new_jobs
*
zapp_cores
,
number_new_jobs
*
zapp_memory
/
(
1
<<
30
)))
constrains
=
min
([
# Number of new jobs to schedule need to satisfy all three constrains (concurrent ex, cores and memory)
count_next_jobs_to_schedule
=
min
([
quota_jobs
-
used_jobs
,
int
((
quota_cores
-
used_cores
)
/
zapp_cores
),
int
((
quota_memory
-
used_memory
)
/
zapp_memory
)
])
if
constrains
>
0
:
logger
.
info
(
'Scheduling
%
d new jobs'
%
constrains
)
if
count_next_jobs_to_schedule
>
0
:
logger
.
info
(
'Scheduling
%
d new jobs'
%
count_next_jobs_to_schedule
)
for
i
in
range
(
count_next_jobs_to_schedule
):
name
=
args
.
name
+
'-
%02
d'
%
job_count
# name-00; name-01; name-02
next_job
=
list_of_jobs
.
pop
()
job_count
+=
1
exec_id
=
submit_zapp
(
zapp
,
next_job
,
name
,
api
)
if
exec_id
is
None
:
logger
.
error
(
'Continuous failing while submitting new execution. Skipping this job.'
)
continue
logger
.
info
(
'Job submitted with ID
%
d'
%
exec_id
)
for
i
in
range
(
constrains
):
new_command
=
list_of_jobs
.
pop
()
# TODO: submit to zoe
#print(new_command)
else
:
logger
.
warning
(
'Quota exceeded. No
ne
jobs can be scheduled now! Sleeping'
)
logger
.
warning
(
'Quota exceeded. No jobs can be scheduled now! Sleeping'
)
time
.
sleep
(
15
)
# user_endpoint = APIEndpoint(api, )
if
__name__
==
'__main__'
:
main
()
scripts/grid_sample.json
0 → 100644
View file @
722a5e2d
{
"bin"
:
"python3"
,
"executable"
:
"main.py"
,
"hyperparameters"
:
{
"dataset"
:
[
"mnist"
,
"cifar10"
],
"method"
:
[
"1"
,
"2"
],
"parameter1"
:
[
2
,
6
,
1
]
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment