#!/usr/bin/env python
#
# path.py - Utility functions for working with file/directory paths.
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
"""This module contains a few utility functions for working with file system
paths.


.. autosummary::
   :nosignatures:

   deepest
   shallowest
   allFiles
   hasExt
   addExt
   removeExt
   getExt
   splitExt
   getFileGroup
   removeDuplicates
   uniquePrefix
"""


import os.path as op
29
30
import            os
import            glob
31
32


33
class PathError(Exception):
    """``Exception`` class raised by the functions defined in this module
    when something goes wrong (e.g. a file cannot be found, or a file
    prefix cannot be resolved unambiguously).
    """


40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def deepest(path, suffixes):
    """Walks from ``path`` up towards the file system root, and returns the
    first (i.e. deepest) directory whose name ends with one of the given
    ``suffixes``.

    :arg path:     Path to start searching from.

    :arg suffixes: Sequence of suffixes to test each directory against.

    :returns:      The matching path (without any trailing separators), or
                   ``None`` if neither ``path`` nor any of its parents end
                   with one of the ``suffixes``.
    """

    while True:

        current = path.strip()

        # Ran out of parent directories to test
        if current in ('', op.sep):
            return None

        current = current.rstrip(op.sep)

        if current.endswith(tuple(suffixes)):
            return current

        # No match at this level - move up to the parent
        path = op.dirname(current)


def shallowest(path, suffixes):
    """Finds the shallowest directory which ends with one of the given
    sequence of suffixes, or returns ``None`` if no directories end
    with any of the suffixes.

    :arg path:     Path to start searching from.

    :arg suffixes: Sequence of suffixes to test each directory against.

    :returns:      The matching path closest to the file system root
                   (without any trailing separators), or ``None`` if there
                   is no match.
    """

    match = None

    # Walk from the given path up to the root of the
    # file system. Every match found overwrites the
    # previous one, so the match that remains at the
    # end is the one closest to the root. Iterating
    # (rather than recursing) keeps very deep paths
    # from hitting the recursion limit.
    while True:

        current = path.strip()

        # We've reached the root of the file system
        if current in ('', op.sep):
            return match

        current = current.rstrip(op.sep)

        if current.endswith(tuple(suffixes)):
            match = current

        path = op.dirname(current)
81
82


83
84
85
86
87
88
89
90
91
92
93
94
95
96
def allFiles(root):
    """Recursively collects the paths of every file located beneath the
    ``root`` directory.

    :arg root: Directory to search.

    :returns:  A list of file paths, each one prefixed with the directory
               in which it was found (as reported by ``os.walk``).
    """

    return [op.join(parent, name)
            for parent, _, names in os.walk(root)
            for name in names]


97
98
99
100
def hasExt(path, allowedExts):
    """Convenience function which returns ``True`` if the given ``path``
    ends with any of the given ``allowedExts``, ``False`` otherwise.

    :arg path:        File name/path to test.

    :arg allowedExts: Sequence of file extensions to test against.
    """
    # str.endswith accepts a tuple of candidate suffixes
    return path.endswith(tuple(allowedExts))
102
103


104
def addExt(prefix,
           allowedExts=None,
           mustExist=True,
           defaultExt=None,
           fileGroups=None,
           unambiguous=True):
    """Adds a file extension to the given file ``prefix``.

    If ``mustExist`` is False, and the file does not already have a
    supported extension, the default extension is appended and the new
    file name returned. If the prefix already has a supported extension,
    it is returned unchanged.

    If ``mustExist`` is ``True`` (the default), the function checks to see
    if any files exist that have the given prefix, and a supported file
    extension.  A :exc:`PathError` is raised if:

       - No files exist with the given prefix and a supported extension.

       - ``unambiguous is True``, more than one file exists with the given
         prefix and a supported extension, and the extensions of those
         files do not match exactly one of the ``fileGroups``.

    Otherwise the full file name is returned.

    :arg prefix:      The file name prefix to modify.

    :arg allowedExts: List of allowed file extensions. This list is not
                      modified.

    :arg mustExist:   Whether the file must exist or not.

    :arg defaultExt:  Default file extension to use.

    :arg fileGroups:  Recognised file groups - see :func:`getFileGroup`.

    :arg unambiguous: If ``True`` (the default), and more than one file
                      exists with the specified ``prefix``, a
                      :exc:`PathError` is raised. Otherwise, a list
                      containing *all* matching files is returned.
    """

    # Work on a private copy of the allowed
    # extensions, so the caller's list is not
    # modified when the default extension is
    # added to it below.
    if allowedExts is None: allowedExts = []
    else:                   allowedExts = list(allowedExts)

    # File groups are a list of sequences
    if fileGroups is None:
        fileGroups = []

    if defaultExt is not None and defaultExt not in allowedExts:
        allowedExts.append(defaultExt)

    if not mustExist:

        # the provided file name already
        # ends with a supported extension
        if hasExt(prefix, allowedExts):
            return prefix

        if defaultExt is not None: return prefix + defaultExt
        else:                      return prefix

    # If no allowed extensions were
    # provided, or the provided prefix
    # already ends with a supported
    # extension, check to see that it
    # exists.
    if len(allowedExts) == 0 or hasExt(prefix, allowedExts):
        allPaths = [prefix]

    # Otherwise, make a bunch of file names, one per
    # supported extension, and test to see if exactly
    # one of them exists.
    else:
        allPaths = [prefix + ext for ext in allowedExts]

    # Discard candidates which do not exist
    allPaths = [p for p in allPaths if op.isfile(p)]
    nexists  = len(allPaths)

    # Could not find any supported file
    # with the specified prefix
    if nexists == 0:
        raise PathError('Could not find a supported file '
                        'with prefix "{}"'.format(prefix))

    # If ambiguity is ok, return
    # all matching paths
    elif not unambiguous:
        return allPaths

    # Ambiguity is not ok! More than
    # one supported file with the
    # specified prefix.
    elif nexists > 1:

        # Get the suffixes of all existing
        # files, and see if they match any
        # file groups.
        suffixes     = [getExt(p, allowedExts) for p in allPaths]
        groupMatches = [sorted(suffixes) == sorted(g) for g in fileGroups]

        # Is there a match for a file suffix group?
        # If not, multiple files with the specified
        # prefix exist, and there is no way to
        # resolve the ambiguity.
        if sum(groupMatches) != 1:
            raise PathError('More than one file with '
                            'prefix "{}"'.format(prefix))

        # Otherwise, we return a path
        # to the file which matches the
        # first suffix in the group.
        groupIdx = groupMatches.index(True)
        allPaths = [prefix + fileGroups[groupIdx][0]]

    # Return the full file name of the
    # supported file that was found
    return allPaths[0]
217
218


219
220
def removeExt(filename, allowedExts=None):
    """Returns the base name of the given file name, i.e. the first element
    of the tuple returned by :func:`splitExt`.

    :arg filename:    The file name to strip.

    :arg allowedExts: Allowed/recognised file extensions.
    """

    return splitExt(filename, allowedExts)[0]
223
224


225
226
def getExt(filename, allowedExts=None):
    """Returns the extension of the given file name, i.e. the second element
    of the tuple returned by :func:`splitExt`.

    :arg filename:    The file name to query.

    :arg allowedExts: Allowed/recognised file extensions.
    """

    return splitExt(filename, allowedExts)[1]
229
230


231
232
def splitExt(filename, allowedExts=None):
    """Returns the base name and the extension from the given file name.

    If ``allowedExts`` is ``None``, this function is equivalent to using::

        os.path.splitext(filename)

    If ``allowedExts`` is provided, but the file does not end with an allowed
    extension, a tuple containing ``(filename, '')`` is returned.

    If the file ends with more than one of the ``allowedExts``, the first
    matching extension in ``allowedExts`` is used.

    :arg filename:    The file name to split.

    :arg allowedExts: Allowed/recognised file extensions.

    :returns:         A tuple containing ``(base, extension)``.
    """

    # If allowedExts is not specified,
    # we just use op.splitext
    if allowedExts is None:
        return op.splitext(filename)

    # Otherwise, split the file name
    # at the first allowed extension
    # that it ends with.
    for ext in allowedExts:
        if filename.endswith(ext):

            # Split at a position counted from the start
            # of the name. A negative slice index would
            # break on an empty extension, as [:-0] is
            # an empty string.
            split = len(filename) - len(ext)
            return filename[:split], filename[split:]

    # No match, assume there is no extension
    return filename, ''


266
267
268
269
270
def getFileGroup(path,
                 allowedExts=None,
                 fileGroups=None,
                 fullPaths=True,
                 unambiguous=False):
    """If the given ``path`` is part of a ``fileGroup``, returns a list
    containing the paths to all files in the group (including the
    ``path`` itself).

    If the ``path`` does not appear to be part of a file group, or appears to
    be part of an incomplete file group, a list containing only the ``path``
    is returned.

    If the ``path`` does not exist, or appears to be part of more than one
    file group, a :exc:`PathError` is raised.

    File groups can be used to specify a collection of file suffixes which
    should always exist alongside each other. This can be used to resolve
    ambiguity when multiple files exist with the same ``prefix`` and supported
    extensions (e.g. ``file.hdr`` and ``file.img``). The file groups are
    specified as a list of sequences, for example::

        [('.img',    '.hdr'),
         ('.img.gz', '.hdr.gz')]

    If you specify ``fileGroups=[('.img', '.hdr')]`` and ``prefix='file'``, and
    both ``file.img`` and ``file.hdr`` exist, the :func:`addExt` function would
    return ``file.img`` (i.e. the file which matches the first extension in
    the group).

    Similarly, if you call the :func:`.imcp.imcp` or :func:`.imcp.immv`
    functions with the above parameters, both ``file.img`` and ``file.hdr``
    will be moved.

    .. note:: The primary use-case of file groups is to resolve ambiguity with
              respect to NIFTI and ANALYSE75 image pairs. By specifying
              ``fileGroups=[('.img', '.hdr'), ('.img.gz', '.hdr.gz')]``, the
              :func:`addExt`, :func:`.imcp.immv` and :func:`.imcp.imcp`
              functions are able to figure out what you mean when you specify
              ``file``, and both ``file.hdr`` and ``file.img`` (or
              ``file.hdr.gz`` and ``file.img.gz``) exist.

    :arg path:        Path to the file. Must contain the file extension.

    :arg allowedExts: Allowed/recognised file extensions.

    :arg fileGroups:  Recognised file groups.

    :arg fullPaths:   If ``True`` (the default), full file paths (relative to
                      the ``path``) are returned. Otherwise, only the file
                      extensions in the group are returned.

    :arg unambiguous: Defaults to ``False``. If ``True``, and the path
                      is not unambiguously part of one group, or part of
                      no groups, a :exc:`PathError` is raised.
                      Otherwise, the path is returned.
    """

    # Raises a PathError if the path does not exist
    path      = addExt(path, allowedExts, mustExist=True, fileGroups=fileGroups)
    base, ext = splitExt(path, allowedExts)

    # The value returned when the path is
    # not resolved to a complete file group
    if fullPaths: ungrouped = [path]
    else:         ungrouped = [ext]

    if fileGroups is None:
        return ungrouped

    matchedGroups     = []
    matchedGroupFiles = []
    fullMatches       = 0
    partialMatches    = 0

    for group in fileGroups:

        # Skip groups which do not
        # contain the path's extension
        if ext != '' and ext not in group:
            continue

        groupFiles = [base + s for s in group]
        exist      = [op.exists(f) for f in groupFiles]

        # At least one file in the group exists
        # (note that full matches are counted
        # as partial matches too)
        if any(exist):
            partialMatches += 1

        # Every file in the group exists
        if all(exist):
            fullMatches += 1
            matchedGroups    .append(group)
            matchedGroupFiles.append(groupFiles)

    # Path is not part of any group
    if partialMatches == 0:
        return ungrouped

    # If the given path is part of more
    # than one existing file group, we
    # can't resolve this ambiguity.
    if fullMatches > 1:
        raise PathError('Path is part of multiple '
                        'file groups: {}'.format(path))

    # If the unambiguous flag is not set,
    # we don't care about partial matches
    if not unambiguous:
        partialMatches = 0

    # The path is unambiguously part of a
    # complete file group - return the whole
    # group, in the order in which the group
    # was specified
    if fullMatches == 1 and partialMatches <= 1:
        if fullPaths: return matchedGroupFiles[0]
        else:         return matchedGroups[    0]

    # The path appears to be part of
    # an incomplete group - this is
    # potentially ambiguous, so give
    # up (but see the partialMatches
    # clobber above).
    elif partialMatches > 0:
        raise PathError('Path is part of an incomplete '
                        'file group: {}'.format(path))

    else:
        return ungrouped

389
390

def removeDuplicates(paths, allowedExts=None, fileGroups=None):
    """Reduces the list of ``paths`` down to those which are unique with
    respect to the specified ``fileGroups``.

    For example, if you have a directory containing::

        001.hdr
        001.img
        002.hdr
        002.img
        003.hdr
        003.img

    And you call ``removeDuplicates`` like so::

         paths       = ['001.img', '001.hdr',
                        '002.img', '002.hdr',
                        '003.img', '003.hdr']

         allowedExts = ['.img',  '.hdr']
         fileGroups  = [('.img', '.hdr')]

         removeDuplicates(paths, allowedExts, fileGroups)

    The returned list will be::

         ['001.img', '002.img', '003.img']

    If you provide ``allowedExts``, you may specify incomplete ``paths`` (i.e.
    without extensions), as long as there are no path ambiguities.

    A :exc:`PathError` will be raised if any of the ``paths`` do not exist,
    or if there are any ambiguities with respect to incomplete paths.

    :arg paths:       List of paths to reduce.

    :arg allowedExts: Allowed/recognised file extensions.

    :arg fileGroups:  Recognised file groups - see :func:`getFileGroup`.

    :returns:         A list containing one path for each distinct file
                      group (or stand-alone file), in order of first
                      appearance in ``paths``.
    """

    unique = []

    for path in paths:

        groupFiles = getFileGroup(path, allowedExts, fileGroups)

        # Only the first file of each group is kept, and
        # only if no other member of the same group has
        # already been added.
        if not any(p in unique for p in groupFiles):
            unique.append(groupFiles[0])

    return unique
441
442
443
444
445
446


def uniquePrefix(path):
    """Return the shortest prefix for the given file name which unambiguously
    identifies it, relative to the other files in the same directory.

    Raises a :exc:`PathError` if a unique prefix could not be found (which
    will never happen if the path is valid), or if the ``path`` does not
    contain a file name.

    :arg path: Path to the file of interest.

    :returns:  The directory of ``path`` joined with the shortest leading
               portion of its file name that matches exactly one entry in
               that directory.
    """

    dirname, filename = op.split(path)

    # A path which is empty, or which ends
    # with a separator, has no file name
    # from which a prefix could be built.
    if filename == '':
        raise PathError('Path does not contain a '
                        'file name: "{}"'.format(path))

    # Start with the first character of the file
    # name, and find everything that it matches.
    idx    = 0
    prefix = op.join(dirname, filename[0])
    hits   = glob.glob('{}*'.format(prefix))

    # Keep extending the prefix, one character
    # at a time, until it matches a single file
    while len(hits) != 1:

        # Should never happen if path is valid
        if len(hits) == 0 or idx >= len(filename) - 1:
            raise PathError('No unique prefix for {}'.format(filename))

        # Not unique - narrow down the existing
        # hits using the next character
        idx    += 1
        prefix  = prefix + filename[idx]
        hits    = [h for h in hits if h.startswith(prefix)]

    return prefix