From 689f5fc3324c9f07796c847839b20c33bad0d867 Mon Sep 17 00:00:00 2001
From: Paul McCarthy <pauldmccarthy@gmail.com>
Date: Wed, 6 Feb 2019 17:53:06 +0000
Subject: [PATCH] ENH: Fleshed out FileTreeQuery implementation. Need to write
 some tests.

---
 fsl/utils/filetree/query.py | 264 ++++++++++++++++++++++++++++--------
 1 file changed, 207 insertions(+), 57 deletions(-)

diff --git a/fsl/utils/filetree/query.py b/fsl/utils/filetree/query.py
index 22827dea2..ba1b8d973 100644
--- a/fsl/utils/filetree/query.py
+++ b/fsl/utils/filetree/query.py
@@ -5,7 +5,20 @@
 # Author: Paul McCarthy <pauldmccarthy@gmail.com>
 # Author: Michiel Cottaar <michiel.cottaar@.ndcn.ox.ac.uk>
 #
-"""
+"""This module contains the :class:`FileTreeQuery` class, which can be used to
+search for files in a directory described by a :class:`.FileTree`. A
+``FileTreeQuery`` object returns :class:`Match` objects which each represent a
+file that is described by the ``FileTree``, and which is present in the
+directory.
+
+The following utility functions, used by the ``FileTreeQuery`` class, are also
+defined in this module:
+
+.. autosummary::
+   :nosignatures:
+
+   scan
+   allVariables
 """
 
 
@@ -13,107 +26,244 @@ import logging
 import collections
 
 import os.path as op
-from typing import Dict, Set, List
+from typing import Dict, List, Tuple
+
+import numpy as np
 
 
 log = logging.getLogger(__name__)
 
 
 class FileTreeQuery(object):
+    """The ``FileTreeQuery`` class uses a :class:`.FileTree` to search
+    a directory for files which match a specific query.
+
+
+    """
+
 
     def __init__(self, tree):
+        """Create a ``FileTreeQuery``.
+
+        :arg tree: The ``FileTree`` object to scan and index.
         """
+
+        # Find all files present in the directory
+        # (as Match objects), and find all variables,
+        # plus their values, and all short names,
+        # that are present in the directory.
+        matches                = scan(tree)
+        allvars, shortnamevars = allVariables(tree, matches)
+
+        # Now we are going to build a series of ND
+        # arrays to store Match objects. We create
+        # one array for each short name. Each axis
+        # in an array corresponds to a variable
+        # present in files of that short name type,
+        # and each position along an axis corresponds
+        # to one value of that variable.
+        #
+        # These arrays will be used to store and
+        # retrieve Match objects - given a short
+        # name and a set of variable values, we
+        # can quickly find the corresponding Match
+        # object (or objects).
+
+        # matcharrays contains {shortname : ndarray}
+        # mappings, and varidxs contains
+        # {shortname : {varvalue : index}} mappings
+        matcharrays = {}
+        varidxs     = {}
+
+        for shortname, snvars in shortnamevars.items():
+
+            # Number of values of each variable
+            snvarlens = [len(allvars[v]) for v in snvars]
+
+            # An ND array for this short name. Each
+            # element is a Match object, or nan ("..."
+            # also fills a 0-D array, unlike ":").
+            matcharray      = np.zeros(snvarlens, dtype=object)
+            matcharray[...] = np.nan
+
+            # indices into the match array
+            # for each variable value
+            snvaridxs = {}
+            for v in snvars:
+                snvaridxs[v] = {n : i for i, n in enumerate(allvars[v])}
+
+            matcharrays[shortname] = matcharray
+            varidxs[    shortname] = snvaridxs
+
+        # Populate the match arrays
+        for match in matches:
+            snvars    = shortnamevars[match.short_name]
+            snvaridxs = varidxs[      match.short_name]
+            snarr     = matcharrays[  match.short_name]
+            idx       = []
+            for var in snvars:
+
+                # TODO handle optional variables. Need
+                # an extra element on each axis which
+                # represents a missing value
+                val = match.variables[var]
+                idx.append(snvaridxs[var][val])
+
+            snarr[tuple(idx)] = match
+
+        self.__allvars       = allvars
+        self.__shortnamevars = shortnamevars
+        self.__matches       = matches
+        self.__matcharrays   = matcharrays
+        self.__varidxs       = varidxs
+
+
+    def axes(self, short_name) -> List[str]:
+        """Returns a list containing the names of variables present in files
+        of the given ``short_name`` type, in the same order as the axes of
+        :class:`Match` arrays that are returned by the :meth:`query` method.
         """
-        self.__tree      = tree
-        self.__matches   = Match.scan(tree)
-        self.__variables = Match.allVariables(tree, self.__matches)
+        return self.__shortnamevars[short_name]
 
 
-    def variables(self) -> Dict[str, Set]:
+    def variables(self, short_name=None) -> Dict[str, List]:
         """Return a dict of ``{variable : [values]}`` mappings.
         This dict describes all variables and their possible values in
         the tree.
+
+        When a ``short_name`` is given, the dict is restricted to the
+        variables present in files of that ``short_name`` type.
         """
-        return dict(self.__variables)
+        if short_name is not None:
+            names = self.__shortnamevars[short_name]
+            return {name : self.__allvars[name] for name in names}
+
+        return dict(self.__allvars)
 
 
-    def query(self, **variables) -> List[str]:
-        """Return all ``Match`` objects which match the given set of
-        ``variable=value`` arguments.
+    @property
+    def short_names(self) -> List[str]:
+        """Returns a list of all short names of the ``FileTree`` which
+        are present in the directory.
         """
-        hits = []
+        return list(self.__shortnamevars)
 
-        for m in self.__matches:
-            if all([m.variables.get(n, None) == v
-                    for n, v in variables.items()]):
-                hits.append(m)
 
-        return hits
+    def query(self, short_name, **variables):
+        """Search for files of the given ``short_name``, which match
+        the specified ``variables``.
 
+        :arg short_name: Short name of files to search for.
 
-class Match(object):
-    """
-    Filename matching a template in the file tree
+        All other arguments are ``variable=value`` pairs (``'*'`` matches any).
+        """
+
+        varnames    = list(variables.keys())
+        allvarnames = self.__shortnamevars[short_name]
+        varidxs     = self.__varidxs[    short_name]
+        matcharray  = self.__matcharrays[short_name]
+        slc         = []
+
+        for var in allvarnames:
+
+            if var in varnames: val = variables[var]
+            else:               val = '*'
+
+            # We're using np.newaxis to retain
+            # the full dimensionality of the
+            # array, so that the axis labels
+            # returned by the axes() method
+            # are valid.
+            if val == '*': slc.append(slice(None))
+            else:          slc.extend([np.newaxis, varidxs[var][val]])
+
+        return matcharray[tuple(slc)]
+
+
+def scan(tree):
+    """Scans the directory of the given ``FileTree`` to find all files which
+    match a tree template.
+
+    :return: list of :class:`Match` objects, including those of sub-trees
     """
 
-    @staticmethod
-    def allVariables(tree, matches) -> Dict[str, Set]:
-        """Returns a dict of ``{ variable : [values] }`` mappings
-        containing all variables and their possible values present
-        in the given list of ``Match`` objects.
-        """
-        allvars = collections.defaultdict(set)
+    matches = []
+    for template in tree.templates:
+        for filename in tree.get_all(template, glob_vars='all'):
 
-        for m in matches:
-            for var, val in m.variables.items():
-                allvars[var].add(val)
-        return allvars
+            if not op.isfile(filename):
+                continue
 
+            variables = tree.extract_variables(template, filename)
+            variables = {var : val
+                         for var, val in variables.items()
+                         if val is not None}
 
-    @staticmethod
-    def scan(tree):
-        """
-        Scans the disk to find any matches
+            matches.append(Match(filename, template, variables))
 
-        :return: list of :class:`Match` objects
-        """
+    for tree_name, sub_tree in tree.sub_trees:
+        matches.extend(scan(sub_tree))
+
+    return matches
 
-        matches = []
-        for template in tree.templates:
-            for filename in tree.get_all(template, glob_vars='all'):
 
-                if not op.isfile(filename):
-                    continue
+def allVariables(tree, matches) -> Tuple[Dict[str, List], Dict[str, List]]:
+    """Identifies the ``FileTree`` variables which are actually represented
+    in files in the directory.
 
-                variables = tree.extract_variables(template, filename)
-                variables = {var : val
-                             for var, val in variables.items()
-                             if val is not None}
+    :arg tree:    The ``FileTree`` object (currently unused)
+    :arg matches: list of ``Match`` objects (e.g. as returned by :func:`scan`)
 
-                matches.append(Match(filename, template, variables))
+    :returns: a tuple containing two dicts:
 
-        for tree_name, sub_tree in tree.sub_trees:
-            matches.extend(Match.scan(sub_tree))
+               - A dict of ``{ variable : [values] }`` mappings containing all
+                 variables and their possible values present in the given list
+                 of ``Match`` objects.
 
-        return matches
+               - A dict of ``{ short_name : [variables] }`` mappings,
+                 containing the variables which are relevant to each short
+                 name (empty for short names whose files have no variables).
+    """
+    allvars       = collections.defaultdict(set)
+    allshortnames = collections.defaultdict(set)
+
+    for m in matches:
+        allshortnames[m.short_name].update(m.variables.keys())
+        for var, val in m.variables.items():
+            allvars[var].add(val)
+
+    allvars       = {var : sorted(vals)
+                     for var, vals in allvars.items()}
+    allshortnames = {sn  : sorted(snvars)
+                     for sn, snvars in allshortnames.items()}
+
+    return allvars, allshortnames
+
+
+class Match(object):
+    """A ``Match`` object represents a file with a name matching a template in
+    a ``FileTree``.
+    """
 
 
     def __init__(self, filename, short_name, variables):
-        """
-        Defines a new match
+        """Create a ``Match`` object. All arguments are stored as attributes.
 
-        :param filename: name of existing file
-        :param short_name: template identifier
-        :param variables: variable values
+        :arg filename:   name of existing file
+        :arg short_name: template identifier
+        :arg variables:  Dictionary of ``{variable : value}`` mappings
+                         for the file name; stored as a copy (``dict()``).
         """
-        self.filename = filename
+        self.filename   = filename
         self.short_name = short_name
-        self.variables = dict(variables)
+        self.variables  = dict(variables)
 
 
     def __repr__(self):
-        return self.filename
+        """Returns a string of the form ``Match(<filename>)``."""
+        return 'Match({})'.format(self.filename)
 
 
     def __str__(self):
+        """Returns the same string as :meth:`__repr__`."""
         return repr(self)
-- 
GitLab