
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: fsl/fsl_sub_plugin_slurm
Commits on Source (2)
......@@ -7,7 +7,8 @@
* Detection and handling of constraints in --extra_args
* Fix handling of job name when none provided
* Refactor log location handling
* Add support for Symmetric MultiThreading to avoid over specifying RAM
* Change to requesting memory via --mem to avoid issues with SMT systems
* Add an additional environment variable, FSLSUB_NORAMLIMIT, to allow the user to temporarily disable notifying SLURM of RAM requirements
## 1.6.4
......
......@@ -35,7 +35,7 @@ The configuration for the SLURM plugin is in the _method\_opts_ section, under t
| mail\_support | True/**False** | Whether the grid installation is configured to send email on job events. |
| mail\_modes | Dictionary of option lists | If the grid has email notifications turned on, this option configures the submission options for different verbosity levels, 'b' = job start, 'e' = job end, 'a' = job abort, 'f' = all events, 'n' = no mail. Each event type should then have a list of submission mail arguments that will be applied to the submitted job. Typically, these should not be edited. |
| mail\_mode | b/e/a/f/**n** | Which of the above mail_modes to use by default. |
| notify\_ram\_usage | **True**/False | Whether to notify SLURM of the RAM you have requested. SLURM is typically configured to give jobs a small RAM allocation so you will invariably need this set to true. |
| notify\_ram\_usage | **True**/False | Whether to notify SLURM of the RAM you have requested. SLURM is typically configured to give jobs a small RAM allocation, so you will invariably need this set to true. This helps the scheduler allocate resources efficiently; however, this RAM limit will be enforced. If you are unsure of your RAM requirements, this can be disabled on a per-job basis by setting the environment variable FSLSUB_NORAMLIMIT to '1' (or 'True'); see the usage example further below. |
| set\_time\_limit | True/**False** | Whether to notify SLURM of the expected **maximum** run-time of your job. This helps the scheduler fill in reserved slots (e.g. for parallel environment jobs); however, this time limit will be enforced, resulting in a job being killed if it is exceeded, even if this is less than the queue run-time limit. This can be disabled on a per-job basis by setting the environment variable FSLSUB_NOTIMELIMIT to '1' (or 'True'). |
| array\_holds | **True**/False | Enable support for array holds, e.g. sub-task 1 waits for parent sub-task 1. |
| array\_limit | **True**/False | Enable limiting the number of concurrent array tasks. |
......@@ -48,7 +48,6 @@ The configuration for the SLURM plugin is in the _method\_opts_ section, under t
| extra\_args | **[]**/List | List of additional SLURM arguments to pass through to the scheduler. |
| strict_dependencies | True/**False** | Whether to use 'afterok' (True) or 'afterany' (False) when specifying simple job dependencies. This can also be controlled by the environment variable `FSLSUB_STRICTDEPS` (True="1", False="0"). |
| allow_nested_queuing | True/**False** | Whether fsl_sub, when called from within a cluster job, should be able to submit further jobs (True) or run subsequent jobs with the shell plugin (False). You can override this on a per-job or session basis using the environment variable `FSLSUB_NESTED`. |
| smt | True/**False** | When Symmetric Multi-Threading is turned on for SLURM processing nodes, jobs will receive a minimum of two threads. If notify\_ram\_usage is turned on then SLURM will be told that you require the requested RAM **per CPU** (thread) and thus you will be allocated twice the amount of memory you requested. Setting this to True will halve the requested RAM for single-thread jobs. |
### Coprocessor Configuration
......
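As noted in the notify\_ram\_usage and set\_time\_limit rows above, both notifications can be switched off for a single submission from the environment. Below is a minimal sketch, not the project's own documentation: the queue name and script are placeholders, the `-q`/`-R` flags mirror those used in the plugin's own tests, and it assumes the `fsl_sub` command is on the PATH.

```python
import os
import subprocess

# Copy the current environment and disable the RAM and run-time
# notifications for this one submission ('1' or 'True' both work).
env = dict(os.environ)
env['FSLSUB_NORAMLIMIT'] = '1'
env['FSLSUB_NOTIMELIMIT'] = '1'

# Hypothetical job: request a queue and 16GB of RAM as in the tests;
# with the overrides set, neither --mem nor a time limit should be
# passed on to sbatch.
subprocess.run(
    ['fsl_sub', '-q', 'short.q', '-R', '16', './my_analysis.sh'],
    env=env,
    check=True,
)
```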
......@@ -33,7 +33,6 @@ from fsl_sub.coprocessors import (
from fsl_sub.shell_modules import loaded_modules
from fsl_sub.utils import (
    affirmative,
    split_ram_by_slots,
    human_to_ram,
    parse_array_specifier,
    bash_cmd,
......@@ -535,40 +534,18 @@ def submit(
    else:
        array_limit_modifier = ""
    if mconf.get('smt', False) and threads == 1:
        # We will automatically get two virtual threads, so increase
        # threads to ensure that SLURM's memory calculation is correct
        threads = 2
    if jobram:
        # Slurm defaults to dividing up the task into multiple cpu
        # requests, automatically reducing memory per cpu value.
        # However, we have already done this, so we need to
        # reduce the RAM requirements.
        mem_specifier = 'mem-per-gpu' if 'copros' in queue_config(queue) \
            else 'mem-per-cpu'
        if mem_specifier == 'mem-per-cpu':
            jobram = split_ram_by_slots(jobram, threads)
        else:
            try:
                gpus = int(coprocessor_multi)
            except ValueError:
                gpus = 1  # Cannot process complete coprocessor request
                warnings.warn(
                    "Coprocessor_multi definition is complex "
                    f"{coprocessor_multi} - defaulting to calculating "
                    "memory requirements for a single GPU")
            jobram = split_ram_by_slots(jobram, gpus)
        if mconf['notify_ram_usage']:
        try:
            no_set_rlimit = (
                os.environ['FSLSUB_NORAMLIMIT'] == '1'
                or affirmative(os.environ['FSLSUB_NORAMLIMIT']))
        except Exception:
            no_set_rlimit = False
        if (mconf.get('notify_ram_usage', False)
                and not no_set_rlimit):
            command_args.append(
                '='.join((
                    f'--{mem_specifier}',
                    str(jobram) + fsl_sub.consts.RAMUNITS
                ))
            )
                f'--mem={str(jobram)}{fsl_sub.consts.RAMUNITS}')
    try:
        no_set_tlimit = (
            os.environ['FSLSUB_NOTIMELIMIT'] == '1'
......
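In short, the replacement code path collapses the old mem-per-cpu/mem-per-gpu splitting into a single --mem request for the whole job, skipped entirely when FSLSUB_NORAMLIMIT is set. A standalone sketch of that behaviour follows; `mem_arg` is a hypothetical helper (not part of the plugin), the accepted "affirmative" strings are an assumption, and the 'G' unit stands in for fsl_sub.consts.RAMUNITS.

```python
import os


def mem_arg(jobram, notify_ram_usage, ram_units='G'):
    """Return the sbatch memory argument for a job, or None.

    Sketch of the new logic: one --mem request for the whole job,
    suppressed when the user sets FSLSUB_NORAMLIMIT.
    """
    no_set_rlimit = os.environ.get('FSLSUB_NORAMLIMIT', '').lower() in (
        '1', 'true', 'yes')
    if jobram and notify_ram_usage and not no_set_rlimit:
        return f'--mem={jobram}{ram_units}'
    return None


print(mem_arg(16, True))   # '--mem=16G' when the override is unset
```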
......@@ -49,10 +49,6 @@ method_opts:
        # if an earlier job fails - equivalent environment variable FSLSUB_STRICTDEPS (0=False)
        allow_nested_queuing: False # Do you want fsl_sub to be able to submit to a cluster when already
        # running in a batch job. See also FSLSUB_NESTED environment variable.
        smt: False # Do you have Symmetric Multi-Threading enabled, e.g. Hyperthreading or equivalents?
        # Set this to true to ensure that memory requests (see 'notify_ram_usage' above) don't request
        # twice the RAM when only a single thread is requested (SLURM always gives you two threads in this
        # case)
coproc_opts:
    cuda:
        class_constraint: False # Does the SLURM cluster use constraints to specify GPU types rather than
......
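The strict-dependency behaviour configured above can likewise be relaxed per session. The sketch below uses the documented FSLSUB_STRICTDEPS values ('1'=strict, '0'=relaxed); it assumes that `-j` is fsl_sub's hold/dependency flag and that the submitted job ID is printed on stdout, and the queue and scripts are placeholders.

```python
import os
import subprocess

env = dict(os.environ)
# Relax dependencies for this session: child jobs use 'afterany' rather
# than 'afterok', so they still run if the parent fails (0 = False).
env['FSLSUB_STRICTDEPS'] = '0'

# Hypothetical two-stage pipeline; the second job waits on the first.
parent = subprocess.run(
    ['fsl_sub', '-q', 'short.q', './stage_one.sh'],
    env=env, capture_output=True, text=True, check=True,
).stdout.strip()

subprocess.run(
    ['fsl_sub', '-q', 'short.q', '-j', parent, './stage_two.sh'],
    env=env, check=True,
)
```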
......@@ -1610,49 +1610,6 @@ class TestSubmit(unittest.TestCase):
            input=expected_script
        )

    def test_submit_smt(
            self, mock_sprun, mock_cpconf,
            mock_qsub, mock_getcwd):
        cmd_argv = ['fsl_sub', '-q', self.queue, '-R', '16', ]
        cmd_argv.extend(self.cmd)
        # Turn on smt
        w_conf = copy.deepcopy(self.config)
        w_conf['method_opts']['slurm']['smt'] = True
        w_conf['method_opts']['slurm']['notify_ram_usage'] = True
        self.mocks[
            'fsl_sub_plugin_slurm.method_config'
        ].return_value = w_conf['method_opts']['slurm']
        expected_cmd = ['/usr/bin/sbatch']
        expected_script = self.submit_str(
            threads=2,
            cmd=' '.join(cmd_argv),
            notify_ram=True,
            ram="8G")
        mock_sprun.return_value = subprocess.CompletedProcess(
            expected_cmd, 0,
            stdout=self.qsub_out, stderr=None)
        with patch('fsl_sub.utils.sys.argv', cmd_argv):
            self.assertEqual(
                self.jid,
                self.plugin.submit(
                    command=self.cmd,
                    job_name=self.job_name,
                    queue=self.queue,
                    threads=1,
                    parallel_env=None,
                    jobram=16
                )
            )
        mock_sprun.assert_called_once_with(
            expected_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            input=expected_script
        )

    def test_submit_threads(
            self, mock_sprun, mock_cpconf,
            mock_qsub, mock_getcwd):
......