#
# query.py (fsl/utils/filetree/query.py)
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
# Author: Michiel Cottaar <michiel.cottaar@.ndcn.ox.ac.uk>
#
"""This module contains the :class:`FileTreeQuery` class, which can be used to
search for files in a directory described by a `.FileTree`. A
``FileTreeQuery`` object returns :class:`Match` objects which each represent a
file that is described by the ``FileTree``, and which is present in the
directory.

The following utility functions, used by the ``FileTreeQuery`` class, are also
defined in this module:

.. autosummary::
   :nosignatures:

   scan
   allVariables
"""


import logging
import collections
import os.path as op
from typing import Dict, List, Tuple

import numpy as np


log = logging.getLogger(__name__)


class FileTreeQuery(object):
    """The ``FileTreeQuery`` class uses a :class:`.FileTree` to search
    a directory for files which match a specific query.

    All files are located once, when the ``FileTreeQuery`` is created
    (see :func:`scan`). They are then stored in one N-dimensional array
    per short name, so that :meth:`query` is a simple array lookup.
    """

    def __init__(self, tree):
        """Create a ``FileTreeQuery``.

        :arg tree: The ``FileTree`` object to be scanned and queried.
        """

        # Find all files present in the directory
        # (as Match objects), and find all variables,
        # plus their values, and all short names,
        # that are present in the directory.
        matches = scan(tree)
        allvars, shortnamevars = allVariables(tree, matches)

        # Now we are going to build a series of ND
        # arrays to store Match objects. We create
        # one array for each short name. Each axis
        # in an array corresponds to a variable
        # present in files of that short name type,
        # and each position along an axis corresponds
        # to one value of that variable.
        #
        # These arrays will be used to store and
        # retrieve Match objects - given a short
        # name and a set of variable values, we
        # can quickly find the corresponding Match
        # object (or objects).

        # matcharrays contains {shortname : ndarray}
        # mappings, and varidxs contains
        # {shortname : {variable : {value : index}}}
        # mappings
        matcharrays = {}
        varidxs = {}

        for shortname, snvars in shortnamevars.items():

            snvarlens = [len(allvars[v]) for v in snvars]

            # An ND array for this short name. Each
            # element is a Match object, or nan.
            # np.full (rather than zeros + [:] = nan)
            # also works for the 0-dimensional array
            # that is needed when a short name has no
            # variables at all. The builtin ``object``
            # is used because the ``np.object`` alias
            # no longer exists in recent numpy.
            matcharray = np.full(snvarlens, np.nan, dtype=object)

            # indices into the match array
            # for each variable value
            snvaridxs = {v: {n: i for i, n in enumerate(allvars[v])}
                         for v in snvars}

            matcharrays[shortname] = matcharray
            varidxs[shortname] = snvaridxs

        # Populate the match arrays
        for match in matches:
            snvars = shortnamevars[match.short_name]
            snvaridxs = varidxs[match.short_name]
            snarr = matcharrays[match.short_name]
            idx = []
            for var in snvars:

                # TODO handle optional variables. Need
                #      an extra element on each axis which
                #      represents a missing value
                val = match.variables[var]
                idx.append(snvaridxs[var][val])

            snarr[tuple(idx)] = match

        self.__allvars = allvars
        self.__shortnamevars = shortnamevars
        self.__matches = matches
        self.__matcharrays = matcharrays
        self.__varidxs = varidxs

    def axes(self, short_name) -> List[str]:
        """Returns a list containing the names of variables present in files
        of the given ``short_name`` type, in the same order of the axes of
        :class:`Match` arrays that are returned by the :meth:`query` method.

        A ``KeyError`` is raised if ``short_name`` is not known.
        """
        # Return a copy, so that callers cannot
        # corrupt the internal axis ordering.
        return list(self.__shortnamevars[short_name])

    def variables(self, short_name=None) -> Dict[str, List]:
        """Return a dict of ``{variable : [values]}`` mappings.
        This dict describes all variables and their possible values in
        the tree.

        If a ``short_name`` is specified, only variables which are present in
        files of that ``short_name`` type are returned.
        """
        if short_name is None:
            return dict(self.__allvars)
        else:
            varnames = self.__shortnamevars[short_name]
            return {var: self.__allvars[var] for var in varnames}

    @property
    def short_names(self) -> List[str]:
        """Returns a list containing all short names of the ``FileTree`` that
        are present in the directory.
        """
        return list(self.__shortnamevars.keys())

    def query(self, short_name, **variables):
        """Search for files of the given ``short_name``, which match
        the specified ``variables``.

        :arg short_name: Short name of files to search for.

        All other arguments are assumed to be ``variable=value`` pairs,
        used to restrict which matches are returned. A variable which is
        not specified, or which is given the value ``'*'``, is not
        restricted. Arguments which are not variables of ``short_name``
        are ignored.

        :returns: A ``numpy.ndarray`` with one axis per variable of
                  ``short_name`` (see :meth:`axes`). Each element is
                  either a :class:`Match` object, or ``nan`` where no
                  file is present. Axes of restricted variables have
                  length 1.

        A ``KeyError`` is raised if ``short_name`` is not known, or if a
        value is given which is not present for that variable.
        """

        allvarnames = self.__shortnamevars[short_name]
        varidxs = self.__varidxs[short_name]
        matcharray = self.__matcharrays[short_name]
        slc = []

        for var in allvarnames:

            val = variables.get(var, '*')

            # We're using np.newaxis to retain
            # the full dimensionality of the
            # array, so that the axis labels
            # returned by the axes() method
            # are valid.
            if val == '*':
                slc.append(slice(None))
            else:
                slc.extend([np.newaxis, varidxs[var][val]])

        # The trailing Ellipsis is a no-op for ND
        # arrays, but guarantees that an array (not
        # a bare element) is returned for short
        # names which have no variables.
        return matcharray[tuple(slc) + (Ellipsis,)]


def scan(tree):
    """Scans the directory of the given ``FileTree`` to find all files which
    match a tree template. Sub-trees are scanned recursively.

    :arg tree: The ``FileTree`` object

    :return: list of :class:`Match` objects
    """

    matches = []
    for template in tree.templates:
        for filename in tree.get_all(template, glob_vars='all'):

            if not op.isfile(filename):
                continue

            # Variables without a value are dropped
            variables = tree.extract_variables(template, filename)
            variables = {var: val
                         for var, val in variables.items()
                         if val is not None}

            matches.append(Match(filename, template, variables))

    # NOTE(review): sub_trees is presumably a {name : FileTree}
    # mapping - confirm against FileTree. Both a mapping and an
    # iterable of (name, tree) pairs are accepted here.
    sub_trees = tree.sub_trees
    if hasattr(sub_trees, 'items'):
        sub_trees = sub_trees.items()

    for tree_name, sub_tree in sub_trees:
        matches.extend(scan(sub_tree))

    return matches


def allVariables(tree, matches) -> Tuple[Dict[str, List], Dict[str, List]]:
    """Identifies the ``FileTree`` variables which are actually represented
    in files in the directory.

    :arg tree:    The ``FileTree`` object (currently unused)
    :arg matches: list of ``Match`` objects (e.g. as returned by
                  :func:`scan`)

    :returns: a tuple containing two dicts:

               - A dict of ``{ variable : [values] }`` mappings containing all
                 variables and their possible values present in the given list
                 of ``Match`` objects.

               - A dict of ``{ short_name : [variables] }`` mappings,
                 containing the variables which are relevant to each short
                 name. A short name whose files contain no variables is
                 mapped to an empty list.
    """
    allvars = collections.defaultdict(set)
    allshortnames = collections.defaultdict(set)

    for m in matches:

        # Make sure that every short name gets an
        # entry, even if its files have no variables.
        shortnamevars = allshortnames[m.short_name]

        for var, val in m.variables.items():
            allvars[var].add(val)
            shortnamevars.add(var)

    # Sorted, so that array axis/index ordering is deterministic
    allvars = {var: sorted(vals)
               for var, vals in allvars.items()}
    allshortnames = {sn: sorted(snvars)
                     for sn, snvars in allshortnames.items()}

    return allvars, allshortnames


class Match(object):
    """A ``Match`` object represents a file with a name matching a template in
    a ``FileTree``.
    """

    def __init__(self, filename, short_name, variables):
        """Create a ``Match`` object. All arguments are added as attributes.

        :arg filename:   name of existing file
        :arg short_name: template identifier
        :arg variables:  Dictionary of ``{variable : value}`` mappings
                         containing all variables present in the file name.
        """
        self.filename = filename
        self.short_name = short_name
        self.variables = dict(variables)

    def __repr__(self):
        """Returns a string representation of this ``Match``. """
        return 'Match({})'.format(self.filename)

    def __str__(self):
        """Returns a string representation of this ``Match``. """
        return repr(self)