Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • fsl/fsl_sub_plugin_slurm
1 result
Show changes
Commits on Source (2)
# fsl_sub_plugin_slurm release history
## 1.6.4
* Multiple fixes for GPU class selection when not using constraints
## 1.6.3
* Add option/environment variable to enable nested queuing of tasks - principally of use when using Open OnDemand desktop environments
......
......@@ -73,10 +73,14 @@ For each coprocessor hardware type you need a sub-section given an identifier th
| | resource | This is the name of the SLURM GRES 'type' or constraint that will be used to select this GPU family. |
| | doc | The description that appears in the fsl\_sub help text about this device. |
| | capability | An integer defining the feature set of the device, your most basic device should be given the value 1 and more capable devices higher values, e.g. GTX = 1, Kepler = 2, Pascal = 3, Volta = 4. |
| default\_class | _Class type key_ | The _class selector_ for the class to assign jobs to where a class has not been specified in the fsl\_sub call. For FSL tools that automatically submit to CUDA queues you should aim to select one that has good double-precision performance (K40\|80, P100, V100, A100) and ensure all higher capability devices also have good double-precision. |
| default\_class | _Class type key_ | The _class selector_ for the class to assign jobs to where a class has not been specified in the fsl\_sub call. For FSL tools that automatically submit to CUDA queues you should aim to select one that has good double-precision performance (K40\|80, P100, V100, A30, A100, H100, H200 or B200) and ensure all higher capability devices also have good double-precision. |
| class\_constraint | **False**/string | Whether your SLURM cluster is configured to use constraints to select co-processor models/features. If so this should be set to the name of the feature that selects between the models and the co-processor class _resource_ strings set appropriately to match the available values. |
| presence\_test | _Program path_ (**nvidia-smi** for CUDA) | The name of a program that can be used to look for this coprocessor type, for example nvidia-smi for CUDA devices. Program needs to return non-zero exit status if there are no available coprocessors. |
#### GRES vs Constraints
When using GRES to select GPUs and types, it is only possible to select all GPUs or one class of card, not a mixture. To be able to support this you have two options: either switch to using constraints to select the GPUs, or advise users to add --constraint arguments using the `--extra` fsl_sub argument.
### Queue Definitions
Slurm refers to queues as partitions. The example configuration should contain definitions for the automatically discovered partitions but you should review these, in particular any warnings generated.
......
......@@ -382,17 +382,21 @@ def submit(
gres_items = [cpconf['resource'], str(coprocessor_multi), ]
cpclasses = []
if cpconf.get('classes', False) and coprocessor_class is None:
coprocessor_class = cpconf.get('default_class', None)
cpclasses.append(
cp_class_item(cpconf, coprocessor_class, 'resource'))
if cpconf.get('classes', False):
class_constraint = cpconf.get('class_constraint', False)
if class_constraint:
# SLURM only supports multiple GPU selections when
# using constraints
cpclasses = []
if coprocessor_class is None:
cpclasses.append(
cp_class_item(
cpconf,
cpconf.get('default_class', None),
'resource'))
if (cpconf.get('include_more_capable', True)
and not coprocessor_class_strict):
cp_capability = cp_class_item(
......@@ -400,7 +404,7 @@ def submit(
base_list = [
a for a in cpconf['class_types'].keys()
if (cpconf['class_types'][a]['capability']
> cp_capability)]
>= cp_capability)]
[cpclasses.append(
cpconf['class_types'][a]['resource']) for a in
sorted(
......@@ -416,12 +420,18 @@ def submit(
command_args.append('='.join(
('--constraint', '"{0}"'.format('|'.join(cpclasses)))
))
else:
if len(cpclasses) == 1:
gres_items.insert(1, cpclasses[0])
else:
if len(cpclasses) == 1:
gres_items.insert(1, cpclasses[0])
elif coprocessor_class is not None:
if cpconf.get('include_more_capable', True):
warnings.warn(
"Option 'include_more_capable: True' not "
"supported when not using constraints - "
"limiting to coprocessor class "
+ coprocessor_class)
# SLURM's gres only allows for the generic or a single
# specific GPU resource string
gres_items.insert(
1,
cp_class_item(cpconf, coprocessor_class, 'resource'))
gres.append(":".join(gres_items))
......@@ -1268,7 +1278,7 @@ def build_queue_defs():
# GPUs found
cuda['resource'] = 'gpu'
order = ['m', 'k', 'g', 'p', 'v', 'a']
order = ['m', 'k', 'g', 'p', 'v', 'a', 'h', 'b', ]
cuda['include_more_capable'] = False
key_cmt(
(
......
......@@ -55,3 +55,8 @@ coproc_opts:
# adding it to the GRES? If your cluster instructions say use --constraint (or -C) <class> then set this
# to true.
# If you are told to use --gres gpu:<class>:<qty> then set this to false.
include_more_capable: True # Whether to include more advanced CUDA cards when you select a
# specific class. This cannot be used when not using class_constraint, and will generate a
# warning if class_constraint is False and this is True. In these setups, users can
# specify groups of GPUs using the --extra option with the appropriate SLURM constraint
# argument, assuming that you have the necessary features configured for the devices.
......@@ -3,6 +3,7 @@ import copy
import datetime
import io
import os
import pytest
import subprocess
import tempfile
import unittest
......@@ -69,7 +70,7 @@ copro_opts:
uses_modules: True
module_parent: cuda
no_binding: True
class_constriant: True
class_constraint: True
queues:
a.q:
''')
......@@ -431,6 +432,8 @@ class TestSubmit(unittest.TestCase):
self.submission_time_str = ('# Submission time (H:M:S DD/MM/YYYY): '
+ self.now.strftime("%H:%M:%S %d/%m/%Y"))
self.cmd_str = '\n'.join(('', ' '.join(self.cmd)))
self.gpu_argv = [
'fsl_sub', '--coprocessor', 'cuda', '-q', self.queue, ]
def submit_str(
self, cmd=None, threads=1, copy_env=False,
......@@ -876,115 +879,285 @@ class TestSubmit(unittest.TestCase):
input=expected_script
)
def test_GPU(
def test_GPU_without_classes(
self, mock_sprun, mock_cpconf,
mock_srbs, mock_qsub,
mock_getcwd):
with self.subTest("GPU with constraints"):
w_conf = copy.deepcopy(self.config)
w_conf['copro_opts']['cuda']['class_constraint'] = False
self.mocks[
'fsl_sub_plugin_slurm.method_config'
].return_value = w_conf['method_opts']['slurm']
mock_cpconf.return_value = w_conf['copro_opts']['cuda']
cmd_argv = ['fsl_sub', '--coprocessor', 'cuda', '-q', self.queue, ]
cmd_argv.extend(self.cmd)
expected_cmd = ['/usr/bin/sbatch']
expected_script = self.submit_str(
cmd=' '.join(cmd_argv),
gpu_lines=['--gres=gpu:k80:1', ])
mock_sprun.return_value = subprocess.CompletedProcess(
expected_cmd, 0,
stdout=self.qsub_out, stderr=None)
cmd_argv = ['fsl_sub', '--coprocessor', 'cuda', '-q', self.queue, ]
cmd_argv.extend(self.cmd)
with patch('fsl_sub.utils.sys.argv', cmd_argv):
job_id = self.plugin.submit(
command=self.cmd,
job_name=self.job_name,
queue=self.queue,
coprocessor='cuda'
)
self.assertEqual(self.jid, job_id)
mock_sprun.assert_called_once_with(
expected_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
input=expected_script
w_conf = copy.deepcopy(self.config)
cmd_argv = list(self.gpu_argv)
w_conf['copro_opts']['cuda']['classes'] = False
self.mocks[
'fsl_sub_plugin_slurm.method_config'
].return_value = w_conf['method_opts']['slurm']
mock_cpconf.return_value = w_conf['copro_opts']['cuda']
cmd_argv.extend(self.cmd)
expected_cmd = ['/usr/bin/sbatch']
expected_script = self.submit_str(
cmd=' '.join(cmd_argv),
gpu_lines=['--gres=gpu:1', ])
mock_sprun.return_value = subprocess.CompletedProcess(
expected_cmd, 0,
stdout=self.qsub_out, stderr=None)
with patch('fsl_sub.utils.sys.argv', cmd_argv):
job_id = self.plugin.submit(
command=self.cmd,
job_name=self.job_name,
queue=self.queue,
coprocessor='cuda'
)
self.assertEqual(self.jid, job_id)
mock_sprun.assert_called_once_with(
expected_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
input=expected_script
)
mock_sprun.reset_mock()
mock_sprun.reset_mock()
with self.subTest("GPU without constraints"):
w_conf = copy.deepcopy(self.config)
w_conf['copro_opts']['cuda']['classes'] = False
self.mocks[
'fsl_sub_plugin_slurm.method_config'
].return_value = w_conf['method_opts']['slurm']
mock_cpconf.return_value = w_conf['copro_opts']['cuda']
cmd_argv = ['fsl_sub', '--coprocessor', 'cuda', '-q', self.queue, ]
cmd_argv.extend(self.cmd)
expected_cmd = ['/usr/bin/sbatch']
expected_script = self.submit_str(
cmd=' '.join(cmd_argv),
gpu_lines=['--gres=gpu:1', ])
mock_sprun.return_value = subprocess.CompletedProcess(
expected_cmd, 0,
stdout=self.qsub_out, stderr=None)
cmd_argv = ['fsl_sub', '--coprocessor', 'cuda', '-q', self.queue, ]
cmd_argv.extend(self.cmd)
with patch('fsl_sub.utils.sys.argv', cmd_argv):
job_id = self.plugin.submit(
command=self.cmd,
job_name=self.job_name,
queue=self.queue,
coprocessor='cuda'
)
self.assertEqual(self.jid, job_id)
mock_sprun.assert_called_once_with(
expected_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
input=expected_script
def test_GPU_multiple(
        self, mock_sprun, mock_cpconf,
        mock_srbs, mock_qsub,
        mock_getcwd):
    # Requesting two GPUs (coprocessor_multi=2) with 'classes' disabled
    # should yield a single '--gres=gpu:2' sbatch directive in the
    # generated submission script.
    # NOTE(review): mock_sprun/mock_cpconf etc. are injected by class- or
    # method-level patch decorators not visible in this hunk — confirm.
    w_conf = copy.deepcopy(self.config)
    cmd_argv = list(self.gpu_argv)
    w_conf['copro_opts']['cuda']['classes'] = False
    self.mocks[
        'fsl_sub_plugin_slurm.method_config'
    ].return_value = w_conf['method_opts']['slurm']
    mock_cpconf.return_value = w_conf['copro_opts']['cuda']
    cmd_argv.extend(self.cmd)
    expected_cmd = ['/usr/bin/sbatch']
    expected_script = self.submit_str(
        cmd=' '.join(cmd_argv),
        gpu_lines=['--gres=gpu:2', ])
    mock_sprun.return_value = subprocess.CompletedProcess(
        expected_cmd, 0,
        stdout=self.qsub_out, stderr=None)
    with patch('fsl_sub.utils.sys.argv', cmd_argv):
        job_id = self.plugin.submit(
            command=self.cmd,
            job_name=self.job_name,
            queue=self.queue,
            coprocessor='cuda',
            coprocessor_multi=2,
        )
    self.assertEqual(self.jid, job_id)
    mock_sprun.assert_called_once_with(
        expected_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        input=expected_script
    )
mock_sprun.reset_mock()
with self.subTest("With GPU constraints"):
w_conf = self.config
w_conf['copro_opts']['cuda']['set_visible'] = True
w_conf['copro_opts']['cuda']['class_constraint'] = 'gpu_sku'
self.mocks[
'fsl_sub_plugin_slurm.method_config'
].return_value = w_conf['method_opts']['slurm']
mock_cpconf.return_value = w_conf['copro_opts']['cuda']
cmd_argv = ['fsl_sub', '--coprocessor', 'cuda', '-q', self.queue, ]
cmd_argv.extend(self.cmd)
expected_cmd = ['/usr/bin/sbatch']
expected_script = self.submit_str(
cmd=' '.join(cmd_argv),
gpu_lines=[
'--constraint="k80|p100"',
'--gres=gpu:1', ])
mock_sprun.return_value = subprocess.CompletedProcess(
expected_cmd, 0,
stdout=self.qsub_out, stderr=None)
with patch('fsl_sub.utils.sys.argv', cmd_argv):
def test_GPU_without_constraints(
        self, mock_sprun, mock_cpconf,
        mock_srbs, mock_qsub,
        mock_getcwd):
    # With 'class_constraint' disabled and no class requested, the GPU
    # should be selected with a plain '--gres=gpu:1' line and no
    # '--constraint' line.
    w_conf = self.config
    w_conf['copro_opts']['cuda']['class_constraint'] = False
    self.mocks[
        'fsl_sub_plugin_slurm.method_config'
    ].return_value = w_conf['method_opts']['slurm']
    mock_cpconf.return_value = w_conf['copro_opts']['cuda']
    cmd_argv = list(self.gpu_argv)
    cmd_argv.extend(self.cmd)
    expected_cmd = ['/usr/bin/sbatch']
    expected_script = self.submit_str(
        cmd=' '.join(cmd_argv),
        gpu_lines=['--gres=gpu:1', ])
    mock_sprun.return_value = subprocess.CompletedProcess(
        expected_cmd, 0,
        stdout=self.qsub_out, stderr=None)
    with patch('fsl_sub.utils.sys.argv', cmd_argv):
        job_id = self.plugin.submit(
            command=self.cmd,
            job_name=self.job_name,
            queue=self.queue,
            coprocessor='cuda'
        )
    self.assertEqual(self.jid, job_id)
    mock_sprun.assert_called_once_with(
        expected_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        input=expected_script
    )
def test_GPU_with_specific_gres(
        self, mock_sprun, mock_cpconf,
        mock_srbs, mock_qsub,
        mock_getcwd):
    # With constraints disabled and 'include_more_capable' off,
    # requesting class 'K' should put the class's resource string
    # directly into the GRES request: '--gres=gpu:k80:1'.
    w_conf = self.config
    w_conf['copro_opts']['cuda']['class_constraint'] = False
    w_conf['copro_opts']['cuda']['include_more_capable'] = False
    self.mocks[
        'fsl_sub_plugin_slurm.method_config'
    ].return_value = w_conf['method_opts']['slurm']
    mock_cpconf.return_value = w_conf['copro_opts']['cuda']
    cmd_argv = list(self.gpu_argv)
    # Insert the class selection just after the '-q <queue>' arguments.
    cmd_argv[3:3] = ['--coprocessor_class', 'K']
    cmd_argv.extend(self.cmd)
    expected_cmd = ['/usr/bin/sbatch']
    expected_script = self.submit_str(
        cmd=' '.join(cmd_argv),
        gpu_lines=['--gres=gpu:k80:1', ])
    mock_sprun.return_value = subprocess.CompletedProcess(
        expected_cmd, 0,
        stdout=self.qsub_out, stderr=None)
    with patch('fsl_sub.utils.sys.argv', cmd_argv):
        job_id = self.plugin.submit(
            command=self.cmd,
            job_name=self.job_name,
            queue=self.queue,
            coprocessor='cuda',
            coprocessor_class='K'
        )
    self.assertEqual(self.jid, job_id)
    mock_sprun.assert_called_once_with(
        expected_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        input=expected_script
    )
def test_GPU_more_capable_warning(
self, mock_sprun, mock_cpconf,
mock_srbs, mock_qsub,
mock_getcwd):
w_conf = self.config
w_conf['copro_opts']['cuda']['class_constraint'] = False
w_conf['copro_opts']['cuda']['include_more_capable'] = True
self.mocks[
'fsl_sub_plugin_slurm.method_config'
].return_value = w_conf['method_opts']['slurm']
mock_cpconf.return_value = w_conf['copro_opts']['cuda']
cmd_argv = list(self.gpu_argv)
cmd_argv[3:3] = ['--coprocessor_class', 'K']
cmd_argv.extend(self.cmd)
expected_cmd = ['/usr/bin/sbatch']
expected_script = self.submit_str(
cmd=' '.join(cmd_argv),
gpu_lines=['--gres=gpu:k80:1', ])
mock_sprun.return_value = subprocess.CompletedProcess(
expected_cmd, 0,
stdout=self.qsub_out, stderr=None)
with patch('fsl_sub.utils.sys.argv', cmd_argv):
with pytest.warns(
UserWarning,
match="Option 'include_more_capable: True' not "
"supported when not using constraints - "
"limiting to coprocessor class K"):
job_id = self.plugin.submit(
command=self.cmd,
job_name=self.job_name,
queue=self.queue,
coprocessor='cuda'
coprocessor='cuda',
coprocessor_class='K'
)
self.assertEqual(self.jid, job_id)
mock_sprun.assert_called_once_with(
expected_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
input=expected_script
mock_sprun.assert_called_once_with(
expected_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
input=expected_script
)
def test_GPU_without_constraints_class(
        self, mock_sprun, mock_cpconf,
        mock_srbs, mock_qsub,
        mock_getcwd):
    # Strict class selection ('--coprocessor_class_strict') with
    # constraints enabled should emit a single-class
    # '--constraint="k80"' plus a generic '--gres=gpu:1' (no
    # more-capable classes ORed in).
    w_conf = self.config
    w_conf['copro_opts']['cuda']['class_constraint'] = True
    self.mocks[
        'fsl_sub_plugin_slurm.method_config'
    ].return_value = w_conf['method_opts']['slurm']
    mock_cpconf.return_value = w_conf['copro_opts']['cuda']
    cmd_argv = list(self.gpu_argv)
    # Insert class + strict flags just after the '-q <queue>' arguments.
    cmd_argv[3:3] = [
        '--coprocessor_class', 'K',
        '--coprocessor_class_strict', ]
    cmd_argv.extend(self.cmd)
    expected_cmd = ['/usr/bin/sbatch']
    expected_script = self.submit_str(
        cmd=' '.join(cmd_argv),
        gpu_lines=[
            '--constraint="k80"',
            '--gres=gpu:1',
        ])
    mock_sprun.return_value = subprocess.CompletedProcess(
        expected_cmd, 0,
        stdout=self.qsub_out, stderr=None)
    with patch('fsl_sub.utils.sys.argv', cmd_argv):
        job_id = self.plugin.submit(
            command=self.cmd,
            job_name=self.job_name,
            queue=self.queue,
            coprocessor='cuda',
            coprocessor_class='K',
            coprocessor_class_strict=True
        )
    self.assertEqual(self.jid, job_id)
    mock_sprun.assert_called_once_with(
        expected_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        input=expected_script
    )
def test_GPU_with_multiple_constraints(
        self, mock_sprun, mock_cpconf,
        mock_srbs, mock_qsub,
        mock_getcwd):
    # With constraints enabled and 'include_more_capable' on,
    # requesting class 'K' should OR in the more capable class too,
    # giving '--constraint="k80|p100"' plus a generic '--gres=gpu:1'.
    w_conf = self.config
    w_conf['copro_opts']['cuda']['set_visible'] = True
    w_conf['copro_opts']['cuda']['class_constraint'] = True
    # BUG FIX: the key was misspelt 'include_mode_capable', so the real
    # option was never set and the test only passed because
    # 'include_more_capable' defaults to True in the plugin.
    w_conf['copro_opts']['cuda']['include_more_capable'] = True
    self.mocks[
        'fsl_sub_plugin_slurm.method_config'
    ].return_value = w_conf['method_opts']['slurm']
    mock_cpconf.return_value = w_conf['copro_opts']['cuda']
    cmd_argv = list(self.gpu_argv)
    # Insert the class selection just after the '-q <queue>' arguments.
    cmd_argv[3:3] = ['--coprocessor_class', 'K']
    cmd_argv.extend(self.cmd)
    expected_cmd = ['/usr/bin/sbatch']
    expected_script = self.submit_str(
        cmd=' '.join(cmd_argv),
        gpu_lines=[
            '--constraint="k80|p100"',
            '--gres=gpu:1', ])
    mock_sprun.return_value = subprocess.CompletedProcess(
        expected_cmd, 0,
        stdout=self.qsub_out, stderr=None)
    with patch('fsl_sub.utils.sys.argv', cmd_argv):
        job_id = self.plugin.submit(
            command=self.cmd,
            job_name=self.job_name,
            queue=self.queue,
            coprocessor='cuda',
            coprocessor_class='K'
        )
    self.assertEqual(self.jid, job_id)
    mock_sprun.assert_called_once_with(
        expected_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        input=expected_script
    )
def test_submit_wrapper_copy_env(
self, mock_sprun, mock_cpconf,
......
PLUGIN_VERSION = '1.6.3'
PLUGIN_VERSION = '1.6.4'