diff --git a/doc/fsl.utils.rst b/doc/fsl.utils.rst index d7bf157ed3154a6c3abe9fefe100da7548ea4087..76631a31b4156a09fee7ca689bf964d8da6f9d02 100644 --- a/doc/fsl.utils.rst +++ b/doc/fsl.utils.rst @@ -8,6 +8,7 @@ fsl.utils.cache fsl.utils.deprecated fsl.utils.ensure + fsl.utils.filetree fsl.utils.fslsub fsl.utils.idle fsl.utils.imcp diff --git a/fsl/utils/filetree/__init__.py b/fsl/utils/filetree/__init__.py index b5ffec3ef7d33eb05b20f6e0b2a4ae2b2b26f538..73589296af8b84d6d5677b03dde9058a229b2827 100644 --- a/fsl/utils/filetree/__init__.py +++ b/fsl/utils/filetree/__init__.py @@ -1,4 +1,278 @@ -"""mc_filetree - Easy format to define intput/output files in a python pipeline""" +""" +Easy format to define input/output files in a python pipeline. + +The goal is to separate the definition of the input/output filenames from the actual code +by defining a directory tree (i.e., FileTree) in a separate file from the code. + +Loading FileTrees +----------------- +.. code-block:: python + + from fsl.utils.filetree import FileTree, tree_directories + tree = FileTree.read('bids_raw') + +This creates a `tree` object that describes input/output filenames +for your pipeline based on `this file <trees/bids_raw.tree>` + +:py:func:`filetree.FileTree.read` will search through the `filetree.tree_directories` list of directories +for any FileTrees matching the given name. This list by default includes the current directory. Of course, +a full path to the requested FileTree can also be provided. This includes all FileTrees defined +`here <https://git.fmrib.ox.ac.uk/fsl/fslpy/tree/master/fsl/utils/filetree/trees>`_. 
+ +FileTree format +--------------- +The FileTrees are defined in a simple to type format, where indentation is used to indicate subdirectories, for example: + +:: + + # Any text following a #-character can be used for comments + parent + file1.txt + child + file2 + file3.txt + file4.txt + +In the top-level directory this represents one file ("file4.txt") and one directory ("parent"). The directory +contains two files ("file1.txt" and "file3.txt") and one directory ("child") which contains a single file ("file2"). + +Individual aspects of this format are defined in more detail below. + +Short names +^^^^^^^^^^^ +Each directory and file in the FileTree is assigned a short name for convenient access. +For example, for the FileTree + +:: + + parent + file1.txt + child + file2 + file3.txt + file4.txt + +We can load this FileTree using + +.. code-block:: python + + >>> tree = FileTree.read(<tree filename>) + >>> tree.get('file2') + 'parent/child/file2' + >>> tree.get('child') + 'parent/child' + +These filenames will be returned whether the underlying file exists or not (see :py:func:`filetree.FileTree.get`). + +By default the short name will be the name of the file or directory without extension (i.e., everything before the first dot). +The short name can be explicitly set by including it in round brackets behind the filename, +so ``left_hippocampus_segment_from_first.nii.gz (Lhipp)`` will have the short name "Lhipp" +rather than "left_hippocampus_segment_from_first". This allows changing of the filenames +without having to alter the short names used to refer to those filenames in the code. + +Variables +^^^^^^^^^ +FileTrees can have placeholders for variables such as subject id: + +:: + + {subject} + T1w.nii.gz + {hemi}_pial.surf.gii (pial) + +Any part of the directory or file names contained within curly brackets will have to be filled when getting the path: + +.. 
code-block:: python + + >>> tree = FileTree.read(<tree filename>, subject='A') + >>> tree.get('T1w') + 'A/T1w.nii.gz' + >>> B_tree = tree.update(subject='B') + >>> B_tree.get('T1w') + 'B/T1w.nii.gz' + >>> tree.get('pial') # note that pial was explicitly set as the short name in the file above + # Raises a MissingVariable error as the hemi variable is not defined + +Variables can be either set during initialisation of the FileTree or by :py:func:`filetree.FileTree.update`, which +returns a new `FileTree` rather than updating the existing one. + +Finally, initial values for the variables can be set in the FileTree itself, for example in + +:: + + hemi = left + + {subject} + T1w.nii.gz + {hemi}_pial.surf.gii (pial) + +the variable "hemi" will be "left" unless explicitly set during initialisation or updating of the `FileTree`. + +Optional Variables +^^^^^^^^^^^^^^^^^^ +Normally having undefined variables will lead to :py:exc:`filetree.MissingVariable` being raised. +This can be avoided by putting these variables in square brackets, indicating that they can simply +be skipped. For example for the FileTree: + +:: + + {subject} + [{session}] + T1w[_{session}].nii.gz (T1w) + +.. code-block:: python + + >>> tree = FileTree.read(<tree filename>, subject='A') + >>> tree.get('T1w') + 'A/T1w.nii.gz' + >>> tree.update(session='test').get('T1w') + 'A/test/T1w_test.nii.gz' + +Note that if any variable within the square brackets is missing, any text within those square brackets is omitted. + +Extensive use of optional variables can be found in the +`FileTree of the BIDS raw data format <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/bids_raw.tree>`_. + +Sub-trees +^^^^^^^^^ +FileTrees can include other FileTrees within their directory structure. 
For example, + +:: + + {subject} + topup + b0.nii.gz + ->topup basename=out (topup) + eddy + ->eddy (eddy) + nodif_brain_mask.nii.gz + Diffusion + ->Diffusion (diff) + ->dti (dti) + +which might represent a diffusion MRI pipeline, which contains references to the predefined trees for the +`topup <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/topup.tree>`_, +`eddy <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/eddy.tree>`_, +`Diffusion <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/Diffusion.tree>`_, and +`dti <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/dti.tree>`_ +FileTrees describing the input/output of various FSL tools. + +The general format of this is: +``-><tree name> [<variable in sub-tree>=<value>, ...] (<sub-tree short name>)`` + +The filenames defined in the sub-trees can be accessed using a "/" in the short name: + +.. code-block:: python + + >>> tree = FileTree.read(<tree filename>, subject='A') + >>> tree.get('dti/FA') + 'A/Diffusion/dti_FA.nii.gz' + >>> tree.get('topup/fieldcoef') + 'A/topup/out_fieldcoef.nii.gz' + +Extensive use of sub-trees can be found in +`the FileTree of the HCP pre-processed directory structure <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/HCP_directory.tree>`_, +which amongst others refers to +`the HCP surface directory format FileTree <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/HCP_Surface.tree>`_. + +Example pipeline +---------------- +A very simple pipeline to run BET on every subject can start with a simple FileTree like +:: + + {subject} + T1w.nii.gz + T1w_brain.nii.gz (bet_output) + T1w_brain_mask.nii.gz (bet_mask) + + +Assuming that the input T1w's already exist, we can then simply run BET for every subject using: + +.. 
code-block:: python + + from fsl.utils.filetree import FileTree + from fsl.wrappers.bet import bet + tree = FileTree.read(<tree filename>) + variables = tree.get_all_vars('T1w') # extract the set of variables for all existing T1w files + for single_variable_set in variables: + T1w_tree = tree.update(**single_variable_set) + # get retrieves the filenames based on the current set of variables + # make_dir=True ensures that the output directory containing the "bet_output" actually exists + bet(input=T1w_tree.get('T1w'), output=T1w_tree.get('bet_output', make_dir=True), mask=True) + +If later on our input files change, because for some subjects we added a second session, we could keep our script +and simply update the FileTree: +:: + + {subject} + [ses-{session}] + T1w.nii.gz + T1w_brain.nii.gz (bet_output) + T1w_brain_mask.nii.gz (bet_mask) + +Note the square brackets around the session sub-directory. This indicates that this sub-directory is optional and +will only be present if the "session" variable is defined (see `Optional variables`_). + +This means that the script run with this updated tree will run bet on each T1-weighted image even for a directory +structure like: +:: + + subjectA/ + T1w.nii.gz + subjectB/ + ses-01/ + T1w.nii.gz + ses-02/ + T1w.nii.gz + +If we get told off that our script is writing the output to the same directory as our input data, +altering this behaviour is again as simple as altering the FileTree to something like: +:: + + raw_data + {subject} + [ses-{session}] + T1w.nii.gz + processed_data + {subject} + [ses-{session}] + bet + {subject}[_{session}]_T1w_brain.nii.gz (bet_output) + {subject}[_{session}]_T1w_brain_mask.nii.gz (bet_mask) + +Note that we also encoded the subject and session ID in the output filename. + +Some tools like FSL's FAST produce many output files. 
Rather than entering all +of these files in our FileTree by hand you can include them all at once by including `Sub-trees`_: + +:: + + raw_data + {subject} + [ses-{session}] + T1w.nii.gz + processed_data + {subject} + [ses-{session}] + bet + {subject}[_{session}]_T1w_brain.nii.gz (bet_output) + {subject}[_{session}]_T1w_brain_mask.nii.gz (bet_mask) + fast + ->fast basename={subject}[_{session}] (segment) + +Here we chose to set the "basename" of the FAST output to a combination of the subject and if available session ID. + +Within the script we can generate the fast output by running + +.. code-block:: python + + from fsl.wrappers.fast import fast + fast(imgs=[T1w_tree.get('T1w')], out=T1w_tree.get('segment/basename')) + +The output files will be available as `T1w_tree.get('segment/<variable name>')`, where `<variable name>` is one +of the short variable names defined in the +`FAST FileTree <https://git.fmrib.ox.ac.uk/fsl/fslpy/blob/master/fsl/utils/filetree/trees/fast.tree>`_. +""" __author__ = 'Michiel Cottaar <Michiel.Cottaar@ndcn.ox.ac.uk>' diff --git a/fsl/utils/filetree/filetree.py b/fsl/utils/filetree/filetree.py index 15bd8bb8841d49900362be6f2cd53860f05f782b..93bf98070bfffd4054af95a62105e7e3cb9deb58 100644 --- a/fsl/utils/filetree/filetree.py +++ b/fsl/utils/filetree/filetree.py @@ -18,7 +18,8 @@ class FileTree(object): """ Contains the input/output filename tree - Properties + Properties: + - ``templates``: dictionary mapping short names to filename templates - ``variables``: dictionary mapping variables in the templates to specific values (variables set to None are explicitly unset) - ``sub_trees``: filename trees describing specific sub-directories @@ -41,12 +42,15 @@ class FileTree(object): @property def parent(self, ): + """ + Parent FileTree, of which this sub-tree is a sub-directory + """ return self._parent @property def all_variables(self, ): """ - All tree variables including those from the parent tree + All tree variables including those 
inherited from the parent tree """ if self.parent is None: return dict(self.variables) @@ -59,7 +63,7 @@ class FileTree(object): Gets a variable used to fill out the template :param name: variable name - :param default: default variables (if not set an error is raised for a missing variable) + :param default: default variables (if not set a MissingVariable error is raised if a variable is missing) :return: value of the variable """ variables = self.all_variables @@ -93,8 +97,9 @@ class FileTree(object): - '../' characters in short_name refer to parents For example: - - eddy/output refers to the output in the eddy sub_tree (i.e. self.sub_trees['eddy'].templates['output'] - - ../other/name refers to the 'other' sub-tree of the parent tree (i.e., self.parent.sub_trees['other'].templates['name'] + + - "eddy/output" refers to the "output" in the "eddy" sub_tree (i.e. ``self.sub_trees['eddy'].templates['output']``) + - "../other/name" refers to the "other" sub-tree of the parent tree (i.e., ``self.parent.sub_trees['other'].templates['name']``) :param short_name: name of the template :return: tuple with the template and the variables corresponding to the template @@ -160,9 +165,9 @@ class FileTree(object): Gets all existing directory/file names matching a specific pattern :param short_name: short name of the path template - :param glob_vars: sequence of undefined variables that can take any possible values when looking for matches on the disk - Any defined variables in `glob_vars` will be ignored. - If glob_vars is set to 'all', all undefined variables will be used to look up matches + :param glob_vars: sequence of undefined variables that can take any possible values when looking for matches on the disk. + Any defined variables in `glob_vars` will be ignored. + If glob_vars is set to 'all', all undefined variables will be used to look up matches. 
:return: sorted sequence of paths """ text, variables = self.get_template(short_name) @@ -173,9 +178,9 @@ class FileTree(object): Gets all the parameters that generate existing filenames :param short_name: short name of the path template - :param glob_vars: sequence of undefined variables that can take any possible values when looking for matches on the disk - Any defined variables in `glob_vars` will be ignored. - If glob_vars is set to 'all', all undefined variables will be used to look up matches + :param glob_vars: sequence of undefined variables that can take any possible values when looking for matches on the disk. + Any defined variables in `glob_vars` will be ignored. + If glob_vars is set to 'all', all undefined variables will be used to look up matches. :return: sequence of dictionaries with the variables settings used to generate each filename """ return tuple(self.extract_variables(short_name, fn) for fn in self.get_all(short_name, glob_vars=glob_vars)) @@ -184,8 +189,9 @@ class FileTree(object): """ Creates a new filetree with updated variables - :arg variables: new values for the variables - Setting variables to None will explicitly unset them + :param variables: new values for the variables + Setting variables to None will explicitly unset them + :return: New FileTree with same templates for directory names and filenames, but updated variables """ new_tree = deepcopy(self) new_tree.variables.update(variables) @@ -198,7 +204,7 @@ class FileTree(object): :param short_name: short name of the path template :param filename: filename matching the template :return: variables needed to get to the given filename - Variables with None value are optional variables in the template that were not used + Variables with None value are optional variables in the template that were not used """ text, _ = self.get_template(short_name) return utils.extract_variables(text, filename, self.variables) @@ -234,11 +240,11 @@ class FileTree(object): :param on_disk: if True 
checks whether the files exist on disk :param error: if True raises a helpful error when the check fails :param glob_vars: sequence of undefined variables that can take any possible values when looking for matches on the disk - If `glob_vars` contains any defined variables, it will be ignored. + If `glob_vars` contains any defined variables, it will be ignored. :return: True if short names exist and optionally exist on disk (False otherwise) :raise: - - ValueError if error is set and the tree is incomplete - - IOError if error is set and any files are missing from the disk + - ValueError if error is set and the tree is incomplete + - IOError if error is set and any files are missing from the disk """ if isinstance(short_names, str): short_names = (short_names, ) @@ -273,11 +279,11 @@ class FileTree(object): """ Reads a FileTree from a specific file - The return type is ``cls`` unless the tree_name has been previously registered - The return type of any sub-tree is FileTree unless the tree_name has been previously registered + The return type is ``cls`` unless the tree_name has been previously registered. + The return type of any sub-tree is ``FileTree`` unless the tree_name has been previously registered. :param tree_name: file containing the filename tree. - Can provide the filename of a tree file or the name for a tree in the ``filetree.tree_directories``. + Can provide the filename of a tree file or the name for a tree in the ``filetree.tree_directories``. 
:param directory: parent directory of the full tree (defaults to current directory) :param variables: variable settings :return: dictionary from specifier to filename diff --git a/fsl/utils/filetree/parse.py b/fsl/utils/filetree/parse.py index 2e5195361b2fe1b51bb0dcb60219949adaff9a50..c89b381fd38142158f78319c7b070a331bb875a5 100644 --- a/fsl/utils/filetree/parse.py +++ b/fsl/utils/filetree/parse.py @@ -30,11 +30,12 @@ def read_line(line: str) -> Tuple[int, PurePath, str]: """ Parses line from the tree file - :param line: input line from a *.tree file + :param line: input line from a \*.tree file :return: Tuple with: - - number of spaces in front of the name - - name of the file or the sub_tree - - short name of the file + + - number of spaces in front of the name + - name of the file or the sub_tree + - short name of the file """ if line.strip()[:1] == '->': return read_subtree_line(line) @@ -54,12 +55,13 @@ def read_subtree_line(line: str, directory: str) -> Tuple[int, "filetree.FileTre """ Parses the line defining a sub_tree - :param line: input line from a *.tree file + :param line: input line from a \*.tree file :param directory: containing directory :return: Tuple with - - number of spaces in front of the name - - sub_tree - - short name of the sub_tree + + - number of spaces in front of the name + - sub_tree + - short name of the sub_tree """ match = re.match(r'^(\s*)->\s*(\S*)(.*)\((\S*)\)', line) if match is None: diff --git a/fsl/utils/filetree/utils.py b/fsl/utils/filetree/utils.py index f78046fff8f6878309225b55ae22d6c8dab519ff..4ad5f63aff9164531b1a491056f9f9574f963b70 100644 --- a/fsl/utils/filetree/utils.py +++ b/fsl/utils/filetree/utils.py @@ -27,7 +27,7 @@ def get_all(template, variables, glob_vars=()): :param template: template :param variables: (incomplete) mapping of variable names to values :param glob_vars: sequence of undefined variables that can take any possible values when looking for matches on the disk - If `glob_vars` contains any defined 
variables, it will be ignored. + If `glob_vars` contains any defined variables, it will be ignored. :return: sequence of filenames """ filled = fill_known(template, variables)