"""Data Processing Functions

Description:

5
    This folder contains several functions which, either on their own or included in larger pieces of software, perform processing tasks on the data.
Andrei Roibu's avatar
Andrei Roibu committed
6

7
8
9
10
11
Usage:

    To use content from this folder, import the functions and instantiate them as you wish to use them:

        from utils.data_utils import function_name
Andrei Roibu's avatar
Andrei Roibu committed
12
13
14

"""

import os
import h5py
from glob import glob
import numpy as np
import nibabel as nib
import torch
import torch.utils.data as data
import nibabel as nb
import random
import configparser

def directory_reader(folder_location, write_txt=False):
    """Produces a list of data-tags which are accessible

    This function looks in a large data directory and returns a list of sub-directories which are accessible.
    This is required because, at present, not all UK Biobank data is accessible due to privacy restrictions.

    Args:
        folder_location (str): A string containing the address of the required directory.
        write_txt (bool): Flag indicating if a .txt file should be created.

    Returns:
        subDirectoryList (list): A list of strings containing the accessible sub-directories. Optionally, this list is also written out as a .txt file.
    """

    subDirectoryList = []

    if write_txt:
        out_file = open("files.txt", 'w')

    for directory in os.listdir(folder_location):
        if os.path.isdir(os.path.join(folder_location, directory)):
            filename = os.path.join(folder_location, directory)
            if os.access(filename, os.R_OK):
                if write_txt:
                    out_file.write(directory + '\n')
                subDirectoryList.append(directory)

    if write_txt:
        out_file.close()

    return subDirectoryList
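
# Example usage of directory_reader (a minimal sketch; the path below is hypothetical):
#
#     subjects = directory_reader('/path/to/subjectsAll/', write_txt=True)
#     print("Accessible subject directories: {}".format(len(subjects)))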

def data_test_train_validation_split(folder_location, train_percentage, validation_percentage):
    """Produces lists of train, test and validation data

    This function looks at the list of all accessible sub-directories and splits it into three lists of sub-directories:
    one for training, one for testing and one for validation. These lists are written out as .txt files.
    
    Args:
        folder_location (str): A string containing the address of the required directory.
        train_percentage (int): Percentage of data to be used for training
        validation_percentage (int): Percentage of data to be used for validation

    """

    subDirectoryList = directory_reader(folder_location)

    random.shuffle(subDirectoryList)

    subDirectoryList = np.array(subDirectoryList)

    train, validation, test = np.split(subDirectoryList, [int(train_percentage/100 * len(subDirectoryList)), int((train_percentage+validation_percentage)/100 * len(subDirectoryList))])

    np.savetxt('train.txt', train, fmt='%s')
    np.savetxt('test.txt', test, fmt='%s')
    np.savetxt('validation.txt', validation, fmt='%s')
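
# Example usage of data_test_train_validation_split (a minimal sketch; the path and percentages are hypothetical):
#
#     data_test_train_validation_split('/path/to/subjectsAll/', train_percentage=90, validation_percentage=5)
#     # Writes train.txt, validation.txt and test.txt (here a 90/5/5 split) to the working directory.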

def update_shuffling_flag(file_name):
    """ Update shuffling flag

    Changes shuffling flag in settings to False once data has been shuffled

    Args:
        file_name (str): The settings file name
    """

    config = configparser.ConfigParser()
    config.read(file_name)
    config.set('DATA', 'data_split_flag', 'False')
    with open(file_name, 'w') as configfile:
        config.write(configfile)
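
# Example usage of update_shuffling_flag (a minimal sketch; 'settings.ini' is a hypothetical file
# containing a [DATA] section with a data_split_flag option):
#
#     update_shuffling_flag('settings.ini')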

def tract_sum_generator(folder_path):
    """Sums the tracts of different dMRI files

    THIS FUNCTION IS NOT DEPRECATED: SummedTractMaps generated remotely

    When performing subject-specific probabilistic diffusion tractography using standard-space protocols, 27 tracts are created.
    This function loops through all the tracts of each subject, sums them, and writes the summed tract map to disk as a NIfTI (.nii.gz) file.

    Args:
        folder_path (str): A string containing the address of the required directory.
    """

    tractMapName = 'tracts/tractsNorm.nii.gz'

    subDirectoryList = directory_reader(folder_path)

    viableSubDirectories = len(subDirectoryList)
    counter = 0

    if not os.path.exists('/well/win/users/hsv459/functionmapper/datasets/dMRI'):
        if not os.path.exists('/well/win/users/hsv459/functionmapper/datasets'):
            os.mkdir('/well/win/users/hsv459/functionmapper/datasets')
        os.mkdir('/well/win/users/hsv459/functionmapper/datasets/dMRI')

    for subDirectory in subDirectoryList:
        tractedMapsPath = os.path.join(folder_path, str(
            subDirectory), 'dMRI/autoptx_preproc/tracts/')

        sum_flag = False  # This flag indicates if this is the first tract to be summed

        print("Summing tracts for subject {}/{}".format(counter + 1, viableSubDirectories))

        for tract in os.listdir(tractedMapsPath):
            if os.path.isdir(os.path.join(tractedMapsPath, tract)):
                tractedMapPath = os.path.join(
                    tractedMapsPath, tract, tractMapName)
                tractedMapImg = nib.load(tractedMapPath)

                tractedMap = tractedMapImg.get_fdata()

                # The affine array stores the relationship between voxel coordinates in the image data array and coordinates in the reference space

                tractedMapAffine = tractedMapImg.affine

                if sum_flag == False:
                    tractedMapSum = np.copy(tractedMap)
                    sum_flag = True
                else:
                    tractedMapSum = np.add(tractedMapSum, tractedMap)

        tractedMapSumPath = '/well/win/users/hsv459/functionmapper/datasets/dMRI'
        tractsSumName = str(subDirectory) + ".nii.gz"
        tractedMapSumImg = nib.Nifti1Image(tractedMapSum, tractedMapAffine)
        nib.save(tractedMapSumImg, os.path.join(
            tractedMapSumPath, tractsSumName))

        counter += 1

    return None

class DataMapper(data.Dataset):
    """Data Mapper Class.

    This class represents a generic parent class for mapping between keys and data samples.
    It is a subclass of data.Dataset, inheriting its functionality.
    The class is composed of an __init__ constructor, a __getitem__() method, which supports fetching a data sample for a given key, and a __len__() method, which returns the size of the dataset.

    Args:
        X (HDF5 datafile): hierarchically organized input data
        y (HDF5 datafile): hierarchically organized output data

    Returns:
        input_data (torch.tensor): Tensor representation of the input data
        label_data (torch.tensor): Tensor representation of the output data
        int: length of the output
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        input_data = torch.from_numpy(self.X[index])
        label_data = torch.from_numpy(self.y[index])
        return input_data, label_data

    def __len__(self):
        return len(self.y)


def get_datasets(data_parameters):
    """Data Loader Function.

    This function loads the various data files and returns the relevant mapped datasets.

    Args:
        data_parameters (dict): Dictionary containing relevant information for the datafiles.
        data_parameters = {
            'data_directory': 'path/to/directory',
            'training_data': 'training_data',
            'training_targets': 'training_targets',
            'testing_data': 'testing_data',
            'testing_targets': 'testing_targets'
        }

    Returns:
        tuple: the relevant train and test datasets

    """

    training_data = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['training_data']), 'r')
    testing_data = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['testing_data']), 'r')

    training_labels = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['training_targets']), 'r')
    testing_labels = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['testing_targets']), 'r')

    return (
        DataMapper(training_data['data'][()], training_labels['label'][()]),
        DataMapper(testing_data['data'][()], testing_labels['label'][()])
    )
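
# Example usage of get_datasets (a minimal sketch; the file names below are hypothetical, and the
# HDF5 files are expected to contain 'data' and 'label' datasets respectively):
#
#     data_parameters = {
#         'data_directory': '/path/to/datasets/',
#         'training_data': 'training_data.h5',
#         'training_targets': 'training_targets.h5',
#         'testing_data': 'testing_data.h5',
#         'testing_targets': 'testing_targets.h5'
#     }
#     train_dataset, test_dataset = get_datasets(data_parameters)
#     train_loader = data.DataLoader(train_dataset, batch_size=4, shuffle=True)
#     for input_batch, target_batch in train_loader:
#         pass  # forward pass, loss computation, etc.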


def create_folder(path):
    """Folder Creator

    A function which creates a folder at a given path if one does not exist

    Args:
        path (str): destination to check for folder existence
    """

    if not os.path.exists(path):
        os.mkdir(path)


def load_file_paths(data_directory, data_list, targets_directory=None):
    """File Loader

    This function returns a list of combined file paths for the input data and labelled output data.

    Args:
        data_directory (str): Path to input data directory
        data_list (str): Path to a .txt file containing the input files for consideration
        targets_directory (str): Path to labelled data (Y-equivalent); None if during evaluation.

    Returns:
        file_paths (list): List containing the input data and target labelled output data

    Raises:
        ValueError: "Invalid data entry - check code and data entry format!"
    """

    if data_list:
        with open(data_list) as data_list_file:
            volumes_to_be_used = data_list_file.read().splitlines()
    else:
        volumes_to_be_used = [files for files in os.listdir(data_directory)]

    if targets_directory is None:
        file_paths = [[os.path.join(data_directory, volume)]
                      for volume in volumes_to_be_used]
    else:
        file_paths = [[os.path.join(data_directory, volume), os.path.join(
            targets_directory, volume)] for volume in volumes_to_be_used]

    return file_paths
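
# Example usage of load_file_paths (a minimal sketch; all paths and list files are hypothetical):
#
#     # Training/validation: paired [input, target] paths
#     paths = load_file_paths('/path/to/inputs/', 'train.txt', targets_directory='/path/to/targets/')
#     # Evaluation: single-element [input] paths
#     evaluation_paths = load_file_paths('/path/to/inputs/', 'test.txt')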


def load_and_preprocess(file_paths, orientation):
    """Load & Preprocess

    This function is composed of two other function calls: one which loads the data, and another which preprocesses the data to the required format.
    # TODO: Need to check if any more preprocessing would be required besides summing the tracts!

    Args:
        file_paths (list): List containing the input data and target labelled output data
        orientation (str): String detailing the current view (COR, SAG, AXL)

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
    """

    volume, label_map, header = load(file_paths, orientation)

    return volume, label_map, header
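
# Example usage of load_and_preprocess (a minimal sketch, continuing the hypothetical
# load_file_paths() example above; each element of 'paths' is an [input, target] pair):
#
#     volume, label_map, header = load_and_preprocess(paths[0], orientation="sagittal")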


def load(file_path, orientation):
    """Load Data Function

    This function loads the required data files and extracts relevant information from them.

    Args:
        file_path (list): List containing the input data and target labelled output data
        orientation (str): String detailing the current view (COR, SAG, AXL)

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
    """

    nifty_volume, label_map = nb.load(file_path[0]), nb.load(file_path[1])
    volume, label_map = nifty_volume.get_fdata(), label_map.get_fdata()

    # Do we need min-max normalization here? Will need to check when debugging and testing
    volume = (volume - np.min(volume)) / (np.max(volume) - np.min(volume))

    volume, label_map = set_orientation(volume, label_map, orientation)

    return volume, label_map, nifty_volume.header


def preprocess():

    # IDEA - Might be worth adding the summed tracts function either to here, or the preprocessor file!
    """Data Pre-Processing Function

    This function is intended to carry out specific pre-processing operations on the loaded data.

    As things are - no specific pre-processing currently required!
    """

    return None


def set_orientation(volume, label_map, orientation):
    """Load Data Orientation

    This function modifies the orientation of the input and output data depending on the required orientation.

    Args:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        orientation (str): String detailing the current view (COR, SAG, AXL)

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.

    Raises:
        ValueError: Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagittal<<
    """

    # TODO: will need to check if these alignments correspond with our data.
    # These alignments work for ADNI

    if orientation == "sagittal":
347
        return volume, label_map  # This is assumed to be the default orientation
348
    elif orientation == "axial":
349
        return volume.transpose((1, 2, 0)), label_map.transpose((1, 2, 0))
350
    elif orientation == "coronal":
351
        return volume.transpose((2, 0, 1)), label_map.transpose((2, 0, 1))
352
    else:
353
354
355
        raise ValueError(
            "Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagital<< ")


def load_and_preprocess_evaluation(file_path, orientation, min_max=True):
    """Load & Preprocessing before evaluation

    This function loads a NIfTI file and returns its volume and header information

    Args:
        file_path (str): Path to the desired file
        orientation (str): String detailing the current view (COR, SAG, AXL)
        min_max (bool): Flag for inducing min-max normalization of the volume

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata

    Raises:
        ValueError: "Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagittal<< "
    """

    nifty_volume = nb.load(file_path[0])

    volume = nifty_volume.get_fdata()
    header = nifty_volume.header

    if min_max:
        volume = (volume - np.min(volume)) / (np.max(volume) - np.min(volume))
    else:
        volume = np.round(volume)

    if orientation == "sagittal":
386
        return volume  # This is assumed to be the default orientation
387
388
389
390
391
    elif orientation == "axial":
        return volume.transpose((1, 2, 0))
    elif orientation == "coronal":
        return volume.transpose((2, 0, 1))
    else:
392
393
        raise ValueError(
            "Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagital<< ")

    return volume, header
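
# Example usage of load_and_preprocess_evaluation (a minimal sketch; the file path is hypothetical):
#
#     volume, header = load_and_preprocess_evaluation(['/path/to/volume.nii.gz'], orientation="axial", min_max=True)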


if __name__ == "__main__":

    folder_location = '/well/win-biobank/projects/imaging/data/data3/subjectsAll/'
    tract_sum_generator(folder_location)