query.py 11.3 KB
Newer Older
1
2
#!/usr/bin/env python
#
3
# query.py - The FileTreeQuery class
4
5
6
7
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
# Author: Michiel Cottaar <michiel.cottaar@.ndcn.ox.ac.uk>
#
8
9
10
11
12
13
14
15
16
17
18
19
20
21
"""This module contains the :class:`FileTreeQuery` class, which can be used to
search for files in a directory described by a `.FileTree`. A
``FileTreeQuery`` object returns :class:`Match` objects which each represent a
file that is described by the ``FileTree``, and which is present in the
directory.

The following utility functions, used by the ``FileTreeQuery`` class, are also
defined in this module:

.. autosummary::
   :nosignatures:

   scan
   allVariables
"""
23
24
25
26


import logging
import collections
27
28

import os.path as op
29
30
31
from typing import Dict, List, Tuple

import numpy as np
32

33
34
from . import FileTree

35
36
37
38
39

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


class FileTreeQuery(object):
    """The ``FileTreeQuery`` class uses a :class:`.FileTree` to search
    a directory for files which match a specific query.

    A ``FileTreeQuery`` scans the contents of a directory which is described
    by a :class:`.FileTree`, and identifies all file types (a.k.a. _templates_
    or _short names_) that are present, and the values of variables within each
    short name that are present. The :meth:`query` method can be used to
    retrieve files which match a specific short name, and variable values.

    The :meth:`query` method returns a multi-dimensional ``numpy.array``
    which contains :class:`Match` objects, where each dimension represents
    one variable for the short name in question.

    Example usage::

        >>> from fsl.utils.filetree import FileTree, FileTreeQuery

        >>> tree  = FileTree.read('bids_raw', './my_bids_data')
        >>> query = FileTreeQuery(tree)

        >>> query.axes('anat_image')
        ['acq', 'ext', 'modality', 'participant', 'rec', 'run_index',
         'session']

        >>> query.variables('anat_image')
        {'acq': [None],
         'ext': ['.nii.gz'],
         'modality': ['T1w', 'T2w'],
         'participant': ['01', '02', '03'],
         'rec': [None],
         'run_index': [None, '01', '02', '03'],
         'session': [None]}

        >>> query.query('anat_image', participant='01')
        array([[[[[[[Match(./my_bids_data/sub-01/anat/sub-01_T1w.nii.gz)],
                    [nan],
                    [nan],
                    [nan]]]],

                 [[[[Match(./my_bids_data/sub-01/anat/sub-01_T2w.nii.gz)],
                    [nan],
                    [nan],
                    [nan]]]]]]], dtype=object)
    """


    def __init__(self, tree):
        """Create a ``FileTreeQuery``. The contents of the tree directory are
        scanned via the :func:`scan` function, which may take some time for
        large data sets.

        :arg tree: The :class:`.FileTree` object
        """

        # Find all files present in the directory
        # (as Match objects), and find all variables,
        # plus their values, and all short names,
        # that are present in the directory.
        matches                = scan(tree)
        allvars, shortnamevars = allVariables(tree, matches)

        # Now we are going to build a series of ND
        # arrays to store Match objects. We create
        # one array for each short name. Each axis
        # in an array corresponds to a variable
        # present in files of that short name type,
        # and each position along an axis corresponds
        # to one value of that variable.
        #
        # These arrays will be used to store and
        # retrieve Match objects - given a short
        # name and a set of variable values, we
        # can quickly find the corresponding Match
        # object (or objects).

        # matcharrays contains {shortname : ndarray}
        # mappings, and varidxs contains
        # {shortname : {variable : {value : index}}}
        # mappings
        matcharrays = {}
        varidxs     = {}

        for shortname, snvars in shortnamevars.items():

            snvarlens = [len(allvars[v]) for v in snvars]

            # An ND array for this short name. Each
            # element is a Match object, or nan. The
            # builtin ``object`` is used as the dtype
            # because the ``np.object`` alias was
            # deprecated in numpy 1.20 and removed in
            # 1.24. Ellipsis assignment fills arrays
            # of any dimensionality (including 0-d).
            matcharray      = np.empty(snvarlens, dtype=object)
            matcharray[...] = np.nan

            # indices into the match array
            # for each variable value
            snvaridxs = {
                v : {val : i for i, val in enumerate(allvars[v])}
                for v in snvars}

            matcharrays[shortname] = matcharray
            varidxs[    shortname] = snvaridxs

        # Populate the match arrays
        for match in matches:
            snvars    = shortnamevars[match.short_name]
            snvaridxs = varidxs[      match.short_name]
            snarr     = matcharrays[  match.short_name]

            # Match.variables returns a fresh
            # copy on every access - fetch once
            mvars = match.variables
            idx   = tuple(snvaridxs[var][mvars[var]] for var in snvars)

            snarr[idx] = match

        self.__allvars       = allvars
        self.__shortnamevars = shortnamevars
        self.__matches       = matches
        self.__matcharrays   = matcharrays
        self.__varidxs       = varidxs


    def axes(self, short_name) -> List[str]:
        """Returns a list containing the names of variables present in files
        of the given ``short_name`` type, in the same order of the axes of
        :class:`Match` arrays that are returned by the :meth:`query` method.

        A new list is returned on every call, so modifying it does not
        affect this ``FileTreeQuery``.

        :arg short_name: Short name to look up. A ``KeyError`` is raised if
                         no files of this type were found.
        """
        return list(self.__shortnamevars[short_name])


    def variables(self, short_name=None) -> Dict[str, List]:
        """Return a dict of ``{variable : [values]}`` mappings.
        This dict describes all variables and their possible values in
        the tree.

        If a ``short_name`` is specified, only variables which are present in
        files of that ``short_name`` type are returned.

        :arg short_name: Optional short name used to restrict the variables
                         that are returned. A ``KeyError`` is raised if no
                         files of this type were found.
        """
        if short_name is None:
            return {var : list(vals) for var, vals in self.__allvars.items()}
        else:
            varnames = self.__shortnamevars[short_name]
            return {var : list(self.__allvars[var]) for var in varnames}


    @property
    def short_names(self) -> List[str]:
        """Returns a list containing all short names of the ``FileTree`` that
        are present in the directory.
        """
        return list(self.__shortnamevars.keys())


    def query(self, short_name, **variables):
        """Search for files of the given ``short_name``, which match
        the specified ``variables``. All hits are returned for variables
        that are unspecified.

        :arg short_name: Short name of files to search for.

        All other arguments are assumed to be ``variable=value`` pairs,
        used to restrict which matches are returned. All values are returned
        for variables that are not specified, or variables which are given a
        value of ``'*'``. Arguments which do not name a variable of
        ``short_name`` are ignored.

        :returns: A ``numpy.array`` of ``Match`` objects, with axes
                  corresponding to the labels returned by the :meth:`axes`
                  method.

        A ``KeyError`` is raised if ``short_name`` is unknown, or if a value
        is requested which is not present for that variable.
        """

        allvarnames = self.__shortnamevars[short_name]
        varidxs     = self.__varidxs[      short_name]
        matcharray  = self.__matcharrays[  short_name]
        slc         = []

        for var in allvarnames:

            val = variables.get(var, '*')

            # We're using np.newaxis to retain
            # the full dimensionality of the
            # array, so that the axis labels
            # returned by the axes() method
            # are valid.
            if val == '*': slc.append(slice(None))
            else:          slc.extend([np.newaxis, varidxs[var][val]])

        return matcharray[tuple(slc)]


231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
class Match(object):
    """A ``Match`` object represents a file with a name matching a template in
    a ``FileTree``.  The :func:`scan` function and
    :meth:`FileTreeQuery.query` method both return ``Match`` objects.

    The attributes of a ``Match`` are exposed through read-only properties.
    ``Match`` objects can be compared for equality, are ordered by file
    name, and are hashable, so they may be stored in sets or used as
    dictionary keys.
    """


    def __init__(self, filename, short_name, variables):
        """Create a ``Match`` object. All arguments are added as attributes.

        :arg filename:   name of existing file
        :arg short_name: template identifier
        :arg variables:  Dictionary of ``{variable : value}`` mappings
                         containing all variables present in the file name.
        """
        self.__filename   = filename
        self.__short_name = short_name

        # take a copy, so later changes to the
        # caller's dict do not affect this Match
        self.__variables  = dict(variables)


    @property
    def filename(self):
        """Name of the file that this ``Match`` represents. """
        return self.__filename


    @property
    def short_name(self):
        """Short name (template identifier) of the matched template. """
        return self.__short_name


    @property
    def variables(self):
        """A copy of the ``{variable : value}`` mappings for this file. """
        return dict(self.__variables)


    def __eq__(self, other):
        """Two ``Match`` objects are equal if their file names, short names,
        and variables are all equal. A ``Match`` is never equal to an object
        of another type.
        """
        return (isinstance(other, Match)            and
                self.filename   == other.filename   and
                self.short_name == other.short_name and
                self.variables  == other.variables)


    def __hash__(self):
        """Returns a hash which is consistent with :meth:`__eq__`.

        Defining ``__eq__`` on its own would make ``Match`` objects
        unhashable. The variables are stored in a dict, which cannot be
        hashed, so only the file name and short name are used - equal
        ``Match`` objects always agree on both of these.
        """
        return hash((self.filename, self.short_name))


    def __lt__(self, other):
        """``Match`` objects are ordered by file name. Comparison with an
        object of another type returns ``False``.
        """
        return isinstance(other, Match) and self.filename < other.filename


    def __le__(self, other):
        """``Match`` objects are ordered by file name. Comparison with an
        object of another type returns ``False``.
        """
        return isinstance(other, Match) and self.filename <= other.filename


    def __repr__(self):
        """Returns a string representation of this ``Match``. """
        return 'Match({})'.format(self.filename)


    def __str__(self):
        """Returns a string representation of this ``Match``. """
        return repr(self)


def scan(tree : FileTree) -> List[Match]:
    """Scans the directory of the given ``FileTree`` to find all files which
    match a tree template. Any sub-trees of ``tree`` are scanned recursively.

    :arg tree: The ``FileTree`` to scan.

    :return: list of :class:`Match` objects
    """

    matches = []
    for template in tree.templates:
        for filename in tree.get_all(template, glob_vars='all'):

            # skip anything that is not a regular
            # file (e.g. a matching directory)
            if not op.isfile(filename):
                continue

            variables = dict(tree.extract_variables(template, filename))

            matches.append(Match(filename, template, variables))

    # NOTE(review): sub_trees looks like a {name : FileTree}
    # mapping - confirm against FileTree. Iterating a mapping
    # directly would yield only its keys, so a mapping is
    # handled explicitly, while an iterable of (name, tree)
    # pairs is still accepted.
    sub_trees = tree.sub_trees
    if hasattr(sub_trees, 'values'):
        sub_trees = sub_trees.values()
    else:
        sub_trees = [sub_tree for _, sub_tree in sub_trees]

    # Recurse via this function - Match
    # does not provide a scan method.
    for sub_tree in sub_trees:
        matches.extend(scan(sub_tree))

    return matches
313

314

315
316
317
def allVariables(
        tree    : FileTree,
        matches : List[Match]) -> Tuple[Dict[str, List], Dict[str, List]]:
    """Identifies the ``FileTree`` variables which are actually represented
    in files in the directory.

    :arg tree:    The ``FileTree`` object (not used by this function).
    :arg matches: list of ``Match`` objects (e.g. as returned by :func:`scan`)

    :returns: a tuple containing two dicts:

               - A dict of ``{ variable : [values] }`` mappings containing all
                 variables and their possible values present in the given list
                 of ``Match`` objects. The values are sorted, with ``None``
                 ordered as if it were an empty string.

               - A dict of ``{ short_name : [variables] }`` mappings,
                 containing the (sorted) variables which are relevant to each
                 short name. Every short name in ``matches`` has an entry,
                 which is an empty list if its files contain no variables.
    """
    allvars       = collections.defaultdict(set)
    allshortnames = collections.defaultdict(set)

    for m in matches:

        # Look the short name up before looping over
        # its variables, so that short names without
        # any variables are still registered.
        snvars = allshortnames[m.short_name]

        for var, val in m.variables.items():
            allvars[var].add(val)
            snvars      .add(var)

    # allow us to compare None with strings
    def key(v):
        return '' if v is None else v

    allvars       = {var : sorted(vals, key=key)
                     for var, vals in allvars.items()}
    allshortnames = {sn  : sorted(snvars)
                     for sn, snvars in allshortnames.items()}

    return allvars, allshortnames