#!/usr/bin/env python
#
# query.py - The FileTreeQuery class
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
# Author: Michiel Cottaar <michiel.cottaar@.ndcn.ox.ac.uk>
#
8
"""This module contains the :class:`FileTreeQuery` class, which can be used to
search for files in a directory described by a :class:`.FileTree`. A
``FileTreeQuery`` object returns :class:`Match` objects which each represent a
file that is described by the ``FileTree``, and which is present in the
directory.

The following utility functions, used by the ``FileTreeQuery`` class, are also
defined in this module:

.. autosummary::
   :nosignatures:

   scan
   allVariables
"""
23
24


25
26
27
import              logging
import              collections
import functools as ft
28
29

import os.path as op
30
31
32
from typing import Dict, List, Tuple

import numpy as np
33

34
35
from fsl.utils.deprecated import deprecated
from .                    import FileTree
36

37
38
39
40
41

log = logging.getLogger(__name__)


class FileTreeQuery(object):
    """The ``FileTreeQuery`` class uses a :class:`.FileTree` to search
    a directory for files which match a specific query.

    A ``FileTreeQuery`` scans the contents of a directory which is described
    by a :class:`.FileTree`, and identifies all file types (a.k.a. *templates*
    or *short names*) that are present, and the values of variables within each
    short name that are present. The :meth:`query` method can be used to
    retrieve files which match a specific template, and variable values.

    The :meth:`query` method returns a collection of :class:`Match` objects,
    each of which represents one file which matches the query.

    Example usage::

        >>> from fsl.utils.filetree import FileTree, FileTreeQuery

        >>> tree  = FileTree.read('bids_raw', './my_bids_data')
        >>> query = FileTreeQuery(tree)

        >>> query.axes('anat_image')
        ['acq', 'ext', 'modality', 'participant', 'rec', 'run_index',
         'session']

        >>> query.variables('anat_image')
        {'acq': [None],
         'ext': ['.nii.gz'],
         'modality': ['T1w', 'T2w'],
         'participant': ['01', '02', '03'],
         'rec': [None],
         'run_index': [None, '01', '02', '03'],
         'session': [None]}

        >>> query.query('anat_image', participant='01')
        [Match(./my_bids_data/sub-01/anat/sub-01_T1w.nii.gz),
         Match(./my_bids_data/sub-01/anat/sub-01_T2w.nii.gz)]


    Matches for templates contained within sub-trees are referred to by
    constructing a hierarchical path from the sub-tree template name(s),
    and the template name - see the :meth:`Match.full_name` method.
    """


    def __init__(self, tree):
        """Create a ``FileTreeQuery``. The contents of the tree directory are
        scanned via the :func:`scan` function, which may take some time for
        large data sets.

        :arg tree: The :class:`.FileTree` object
        """

        # Hard-code into the templates any pre-defined variables
        tree = tree.partial_fill()

        # Find all files present in the directory
        # (as Match objects), and find all variables,
        # plus their values, and all templates,
        # that are present in the directory.
        matches               = scan(tree)
        allvars, templatevars = allVariables(tree, matches)

        # Now we are going to build a series of ND
        # arrays to store Match objects. We create
        # one array for each template. Each axis
        # in an array corresponds to a variable
        # present in files of that template type,
        # and each position along an axis corresponds
        # to one value of that variable.
        #
        # These arrays will be used to store and
        # retrieve Match objects - given a template
        # and a set of variable values, we can
        # quickly find the corresponding Match
        # object (or objects).

        # matcharrays contains {template : ndarray}
        # mappings, and varidxs contains
        # {template : {variable : {value : index}}}
        # mappings
        matcharrays = {}
        varidxs     = {}

        for template, tvars in templatevars.items():

            tvarlens = [len(allvars[v]) for v in tvars]

            # "Scalar" match objects - templates
            # which have no variables, and for
            # which zero or one file is present
            if len(tvarlens) == 0:
                tvarlens = 1

            # An ND array for this template. Each
            # element is a Match object, or nan.
            #
            # The builtin ``object`` is used as the
            # dtype, because the ``np.object`` alias
            # was deprecated in numpy 1.20 and has
            # been removed from numpy 1.24 onwards.
            matcharray    = np.zeros(tvarlens, dtype=object)
            matcharray[:] = np.nan

            # indices into the match array
            # for each variable value
            tvaridxs = {}
            for v in tvars:
                tvaridxs[v] = {n : i for i, n in enumerate(allvars[v])}

            matcharrays[template] = matcharray
            varidxs[    template] = tvaridxs

        # Populate the match arrays
        for match in matches:
            name     = match.full_name
            tvars    = templatevars[name]
            tvaridxs = varidxs[     name]
            tarr     = matcharrays[ name]

            # Match.variables returns a fresh copy
            # on every access, so only read it once
            mvars    = match.variables

            if len(mvars) == 0:
                idx = [0]
            else:
                idx = [tvaridxs[var][mvars[var]] for var in tvars]

            tarr[tuple(idx)] = match

        self.__tree          = tree
        self.__allvars       = allvars
        self.__templatevars  = templatevars
        self.__matches       = matches
        self.__matcharrays   = matcharrays
        self.__varidxs       = varidxs


    def axes(self, template) -> List[str]:
        """Returns a list containing the names of variables present in files
        of the given ``template`` type, in the same order of the axes of
        :class:`Match` arrays that are returned by the :meth:`query` method.

        A copy is returned, so the caller may modify it without affecting
        this ``FileTreeQuery``. A ``KeyError`` is raised if no files of the
        given ``template`` were found.
        """
        return list(self.__templatevars[template])


    def variables(self, template=None) -> Dict[str, List]:
        """Return a dict of ``{variable : [values]}`` mappings.
        This dict describes all variables and their possible values in
        the tree.

        If a ``template`` is specified, only variables which are present in
        files of that ``template`` type are returned.
        """
        if template is None:
            return {var : list(vals) for var, vals in self.__allvars.items()}
        else:
            varnames = self.__templatevars[template]
            return {var : list(self.__allvars[var]) for var in varnames}


    @property
    def tree(self):
        """Returns the :class:`.FileTree` associated with this
        ``FileTreeQuery``.
        """
        return self.__tree


    @property
    def templates(self) -> List[str]:
        """Returns a list containing all templates of the ``FileTree`` that
        are present in the directory.
        """
        return list(self.__templatevars.keys())


    @property
    @deprecated('2.6.0', '3.0.0', 'Use templates instead')
    def short_names(self) -> List[str]:
        """Returns a list containing all templates of the ``FileTree`` that
        are present in the directory.
        """
        return self.templates


    def query(self, template, asarray=False, **variables):
        """Search for files of the given ``template``, which match
        the specified ``variables``. All hits are returned for variables
        that are unspecified.

        :arg template: Template of files to search for.

        :arg asarray:  If ``True``, the relevant :class:`Match` objects are
                       returned in a ND ``numpy.array`` where each
                       dimension corresponds to a variable for the
                       ``templates`` in question (as returned by
                       :meth:`axes`). Otherwise (the default), they are
                       returned in a list.

        All other arguments are assumed to be ``variable=value`` pairs,
        used to restrict which matches are returned. All values are returned
        for variables that are not specified, or variables which are given a
        value of ``'*'``. Variables which are not relevant to the requested
        ``template`` are ignored.

        A ``KeyError`` is raised if no files of the given ``template`` were
        found, or if a value is requested which was not found for the
        corresponding variable.

        :returns: A list  of ``Match`` objects, (or a ``numpy.array`` if
                  ``asarray=True``).
        """

        allvarnames = self.__templatevars[template]
        varidxs     = self.__varidxs[     template]
        matcharray  = self.__matcharrays[ template]
        slc         = []

        for var in allvarnames:

            val = variables.get(var, '*')

            # We're using np.newaxis to retain
            # the full dimensionality of the
            # array, so that the axis labels
            # returned by the axes() method
            # are valid.
            if val == '*': slc.append(slice(None))
            else:          slc.extend([np.newaxis, varidxs[var][val]])

        result = matcharray[tuple(slc)]

        # Empty slots in the array contain nan,
        # so are dropped from the list form.
        if asarray: return result
        else:       return [m for m in result.flat if isinstance(m, Match)]
265
266


267
@ft.total_ordering
class Match(object):
    """A ``Match`` object represents a file with a name matching a template in
    a ``FileTree``.  The :func:`scan` function and
    :meth:`FileTreeQuery.query` method both return ``Match`` objects.

    ``Match`` objects are compared for equality on their file name, template,
    tree and variables, and are ordered by file name. They are hashable, with
    a hash that is consistent with equality.
    """


    def __init__(self, filename, template, tree, variables):
        """Create a ``Match`` object. All arguments are added as attributes.

        :arg filename:   name of existing file
        :arg template:   template identifier
        :arg tree:       :class:`.FileTree` which contains this ``Match``
        :arg variables:  Dictionary of ``{variable : value}`` mappings
                         containing all variables present in the file name.
                         A copy of this dictionary is stored.
        """
        self.__filename   = filename
        self.__template   = template
        self.__tree       = tree
        self.__variables  = dict(variables)


    @property
    def filename(self):
        """The name of the file represented by this ``Match``. """
        return self.__filename


    @property
    @deprecated('2.6.0', '3.0.0', 'Use template instead')
    def short_name(self):
        """Deprecated alias for :meth:`template`. """
        return self.template


    @property
    def template(self):
        """The identifier of the template which this ``Match`` matched. """
        return self.__template


    @property
    def full_name(self):
        """The ``full_name`` of a ``Match`` is a combination of the
        ``template`` (i.e. the matched template), and the name(s) of
        the relevant ``FileTree`` objects.

        It allows one to unambiguously identify the location of a ``Match``
        in a ``FileTree`` hierarchy, where the same ``short_name`` may be
        used in different sub-trees.
        """

        # Walk up from the tree containing this
        # match, collecting the name of every tree
        # which has a parent - the root tree (which
        # has no parent) is therefore not included.
        names = []
        tree  = self.tree

        while tree.parent is not None:
            names.append(tree.name)
            tree = tree.parent

        # Names were collected innermost-first,
        # but are reported outermost-first
        names.reverse()

        return '/'.join(names + [self.template])


    @property
    def tree(self):
        """The :class:`.FileTree` which contains this ``Match``. """
        return self.__tree


    @property
    def variables(self):
        """A copy of the ``{variable : value}`` mappings for this ``Match``.
        """
        return dict(self.__variables)


    def __eq__(self, other):
        """Two ``Match`` objects are equal if they have the same file name,
        template and variables, and refer to the very same tree object.
        """
        return (isinstance(other, Match)            and
                self.filename   == other.filename   and
                self.template   == other.template   and
                self.tree       is other.tree       and
                self.variables  == other.variables)


    def __hash__(self):
        """Returns a hash which is consistent with :meth:`__eq__`.

        Defining ``__eq__`` without ``__hash__`` makes a class unhashable,
        so the hash is defined explicitly. It is calculated from the file
        name and template, which are always the same for equal ``Match``
        objects.
        """
        return hash((self.filename, self.template))


    def __lt__(self, other):
        """``Match`` objects are ordered by their file name. """
        return isinstance(other, Match) and self.filename < other.filename


    def __repr__(self):
        """Returns a string representation of this ``Match``. """
        return 'Match({}: {})'.format(self.full_name, self.filename)


    def __str__(self):
        """Returns a string representation of this ``Match``. """
        return repr(self)


def scan(tree : FileTree) -> List[Match]:
    """Searches the directory of the given ``FileTree`` for files which match
    any of its templates.

    Every template in the tree is expanded to all candidate combinations of
    its variables, and a :class:`Match` is created for each candidate which
    is an existing file. Sub-trees are then scanned recursively, and their
    matches are added after the matches of this tree.

    :arg tree: :class:`.FileTree` to scan
    :returns:  list of :class:`Match` objects
    """

    found = []

    for template in tree.templates:
        for variables in tree.get_all_vars(template, glob_vars='all'):

            filename = tree.update(**variables).get(template)

            # Only candidates which are regular files are kept
            if op.isfile(filename):
                found.append(Match(filename, template, tree, variables))

    for sub_tree in tree.sub_trees.values():
        found.extend(scan(sub_tree))

    return found
387

388

389
390
391
def allVariables(
        tree    : FileTree,
        matches : List[Match]) -> Tuple[Dict[str, List], Dict[str, List]]:
    """Works out which ``FileTree`` variables, and which values of those
    variables, are actually represented in the given matches.

    :arg tree:    The ``FileTree`` object (not used by the current
                  implementation).
    :arg matches: list of ``Match`` objects (e.g. as returned by :func:`scan`)

    :returns: a tuple containing two dicts:

               - A dict of ``{ variable : [values] }`` mappings containing all
                 variables and their possible values present in the given list
                 of ``Match`` objects. The values are sorted, with ``None``
                 ordered as if it were an empty string.

               - A dict of ``{ full_name : [variables] }`` mappings,
                 containing the sorted names of the variables which are
                 relevant to each template.
    """

    values_by_var    = collections.defaultdict(set)
    vars_by_template = {}

    for match in matches:

        # Templates without any variables still get an (empty) entry
        template_vars = vars_by_template.setdefault(match.full_name, set())

        for var, val in match.variables.items():
            values_by_var[var].add(val)
            template_vars.add(var)

    # None cannot be compared with strings,
    # so it is sorted as an empty string
    def none_as_empty(value):
        return '' if value is None else value

    sorted_values = {}
    for var, vals in values_by_var.items():
        sorted_values[var] = sorted(vals, key=none_as_empty)

    sorted_vars = {}
    for name, names in vars_by_template.items():
        sorted_vars[name] = sorted(names)

    return sorted_values, sorted_vars