"""Data Processing Functions

Description:
-------------
This file contains several functions which, either on their own or included in larger pieces of software, perform processing tasks on the data.

Usage
-------------
To use content from this file, import the functions and instantiate them as you wish to use them:

    from utils.data_utils import function_name

"""

import os
import h5py
from glob import glob
import numpy as np
import nibabel as nib
import torch
import torch.utils.data as data


def directory_reader(folder_location):
    """Produces a list of of data-tags which are accessible

    This function looks in a large data directory and returns a list of sub-directories which are accessible.
    This is done because, currently, not all UK Biobank data is accessible due to privacy issues.

    Args:
        folder_location (str): A string containing the address of the required directory.
    
    Returns:
        list: A list of strings containing the available sub-directories. The list is also written out to a files.txt file.

    Raises:
        None
    """

    out_file = open("files.txt", 'w')

    subDirectoryList = []

    for directory in os.listdir(folder_location):
        if os.path.isdir(os.path.join(folder_location, directory)):
            filename = os.path.join(folder_location, directory)
            if os.access(filename, os.R_OK):
                out_file.write(directory + '\n')
                subDirectoryList.append(directory)

    out_file.close()

    return subDirectoryList

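# A minimal usage sketch for directory_reader (the path mirrors the one used in
# __main__ at the bottom of this file):
#
#     accessible_subjects = directory_reader('/well/win-biobank/projects/imaging/data/data3/subjectsAll/')
#     print("Found {} accessible subject directories".format(len(accessible_subjects)))
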
def tract_sum_generator(folder_path):
    """Sums the tracts of different dMRI files

    When performing subject-specific probabilistic diffusion tractography using standard-space protocols, 27 tracts are created.
    This function loops through all the tracts, sums them and returns the summed tract map.
    This function also outputs the summed tract map as a Nifti (.nii.gz) file.

    Args:
        folder_path (str): A string containing the address of the required directory.

    Returns:
        None

    Raises:
        None

    """

    tractMapName = 'tracts/tractsNorm.nii.gz'

    subDirectoryList = directory_reader(folder_path)

    viableSubDirectories = len(subDirectoryList)
    counter = 0

    if not os.path.exists('/well/win/users/hsv459/functionmapper/datasets/dMRI'):
        if not os.path.exists('/well/win/users/hsv459/functionmapper/datasets'):
            os.mkdir('/well/win/users/hsv459/functionmapper/datasets')
        os.mkdir('/well/win/users/hsv459/functionmapper/datasets/dMRI')

    for subDirectory in subDirectoryList:
        tractedMapsPath = os.path.join(folder_path, str(subDirectory), 'dMRI/autoptx_preproc/tracts/')

        sum_flag = False  # This is a flag showing whether this is the first tract to be summed

        print("Summing the tracts for subject: {}/{}".format(counter, viableSubDirectories))

        for tract in os.listdir(tractedMapsPath):
            if os.path.isdir(os.path.join(tractedMapsPath, tract)):
                tractedMapPath = os.path.join(tractedMapsPath, tract, tractMapName)
                tractedMapImg = nib.load(tractedMapPath)

                tractedMap = tractedMapImg.get_fdata()

                # The affine array stores the relationship between voxel coordinates in the image data array and coordinates in the reference space.

                tractedMapAffine = tractedMapImg.affine 

                if sum_flag == False:
                    tractedMapSum = np.copy(tractedMap)
                    sum_flag = True
                else:
                    tractedMapSum = np.add(tractedMapSum, tractedMap)

        
        tractedMapSumPath = '/well/win/users/hsv459/functionmapper/datasets/dMRI'
        tractsSumName = str(subDirectory) + ".nii.gz"
        tractedMapSumImg = nib.Nifti1Image(tractedMapSum, tractedMapAffine)
        nib.save(tractedMapSumImg, os.path.join(tractedMapSumPath, tractsSumName))

        counter += 1

    return None

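# A minimal usage sketch for tract_sum_generator (the UK Biobank path matches
# the one used in __main__ below; the output location is hard-coded inside the
# function):
#
#     tract_sum_generator('/well/win-biobank/projects/imaging/data/data3/subjectsAll/')
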
class DataMapper(data.Dataset):
    """Data Mapper Class.

    This class represents a generic parent class for mapping between keys and data samples.
    The class represents a subclass/child class of data.Dataset, inheriting its functionality.
    This class is composed of a __init__ constructor, a __getitem__(), supporting fetching a data sample for a given key, and __len__(), which returns the size of the dataset.

    Args:
        X (HDF5 datafile): hierarchically organized input data
        y (HDF5 datafile): hierarchically organized output data 
    
    Returns:
        input_data (torch.tensor): Tensor representation of the input data
        label_data (torch.tensor): Tensor representation of the output data
        int: length of the output

    Raises:
        None
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        input_data = torch.from_numpy(self.X[index])
        label_data = torch.from_numpy(self.y[index])
        return input_data, label_data

    def __len__(self):
        return len(self.y)

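# A minimal sketch of how a DataMapper pairs with a torch DataLoader (the batch
# size and shuffling are illustrative choices, not requirements of this class):
#
#     train_dataset, test_dataset = get_datasets(data_parameters)
#     train_loader = data.DataLoader(train_dataset, batch_size=32, shuffle=True)
#     for input_batch, label_batch in train_loader:
#         pass  # forward pass, loss computation, etc.
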
def get_datasets(data_parameters):
    """Data Loader Function.

    This function loads the various data files and returns the relevant mapped datasets.

    Args:
        data_parameters (dict): Dictionary containing relevant information for the datafiles.
        data_parameters = {
            data_directory: 'path/to/directory'
            training_data: 'training_data_file'
            training_targets: 'training_targets_file'
            testing_data: 'testing_data_file'
            testing_targets: 'testing_targets_file'
        }
    
    Returns:
        tuple: the relevant train and test datasets

    Raises:
        None
    """

    training_data = h5py.File(os.path.join(data_parameters['data_directory'], data_parameters['training_data']), 'r')
    testing_data = h5py.File(os.path.join(data_parameters['data_directory'], data_parameters['testing_data']), 'r')

    training_labels = h5py.File(os.path.join(data_parameters['data_directory'], data_parameters['training_targets']), 'r')
    testing_labels = h5py.File(os.path.join(data_parameters['data_directory'], data_parameters['testing_targets']), 'r')

    return (
        DataMapper(training_data['data'][()], training_labels['label'][()]),
        DataMapper(testing_data['data'][()], testing_labels['label'][()])
    )
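
# A minimal sketch of the data_parameters dictionary expected above (file names
# are placeholders; the HDF5 files are assumed to contain 'data' and 'label'
# keys, as read above):
#
#     data_parameters = {
#         'data_directory': '/path/to/datasets/',
#         'training_data': 'training_data.h5',
#         'training_targets': 'training_targets.h5',
#         'testing_data': 'testing_data.h5',
#         'testing_targets': 'testing_targets.h5',
#     }
#     train_dataset, test_dataset = get_datasets(data_parameters)
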
def create_folder(path):
    """Folder Creator

    A function which creates a folder at a given path if one does not exist

    Args:
        path (str): destination to check for folder existence

    Returns:
        None

    Raises:
        None
    """
    
    if not os.path.exists(path):
        os.mkdir(path)

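# Design note: os.mkdir only creates the final path component, so the parent
# directory must already exist. If nested creation is ever needed, the usual
# stdlib alternative is:
#
#     os.makedirs(path, exist_ok=True)
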
def load_file_paths(data_directory, targets_directory, data_list):
    """File Loader

    This function returns a list of combined file paths for the input data and labelled output data.

    Args:
        data_directory (str): Path to input data directory
        targets_directory (str): Path to labelled data (Y-equivalent)
        data_list (str): Path to a .txt file containing the input files for consideration

    Returns:
        file_paths (list): List containing the input data and target labelled output data

    Raises:
        None
    """

    if data_list:
        with open(data_list) as data_list_file:
            volumes_to_be_used = data_list_file.read().splitlines()
    else:
        volumes_to_be_used = os.listdir(data_directory)

    # mri/orig.mgz - TODO this needs to be changed in accordance with the actual file names - to be defined later!
    file_paths = [[os.path.join(data_directory, volume, 'mri/orig.mgz'), os.path.join(targets_directory, volume, 'label.mgz')] for volume in volumes_to_be_used]

    return file_paths
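
# A minimal usage sketch (the directories and data.txt list file are
# illustrative; the mri/orig.mgz layout above is itself a TODO placeholder):
#
#     file_paths = load_file_paths('/path/to/data', '/path/to/targets', 'data.txt')
#     volume, label_map, header = load_and_preprocess(file_paths[0], orientation='sagittal')
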
def load_and_preprocess(file_paths, orientation):
    """Load & Preprocess

    This function is composed of two other function calls: one that calls a function loading the data, and another which preprocesses the data to the required format.
    # TODO: Need to check if any more preprocessing would be required besides summing the tracts!

    Args:
        file_paths (list): List containing the input data and target labelled output data
        orientation (str): String detailing the current view (COR, SAG, AXL)
    
    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
    
    Raises:
        None
    """

    volume, label_map, header = load(file_paths, orientation)

    return volume, label_map, header

def load(file_paths, orientation):
    """Load Data Function

    This function loads the required data files and extracts relevant information from it.

    Args:
        file_paths (list): List containing the input data and target labelled output data
        orientation (str): String detailing the current view (COR, SAG, AXL)
    
    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
    
    Raises:
        None
    """

    nifty_volume, nifty_label_map = nib.load(file_paths[0]), nib.load(file_paths[1])
    volume, label_map = nifty_volume.get_fdata(), nifty_label_map.get_fdata()

    # Do we need min-max normalization here? Will need to check when debugging and testing
    volume = (volume - np.min(volume)) / (np.max(volume) - np.min(volume))

    volume, label_map = set_orientation(volume, label_map, orientation)

    return volume, label_map, nifty_volume.header

def preprocess():
    """Data Preprocessing Function

    This function carries out several specific pre-processing operations on the loaded data.

    As things are - no specific pre-processing is currently required!
    """

    # IDEA - It might be worth adding the summed tracts function either here, or to the preprocessor file!

    pass

def set_orientation(volume, label_map, orientation):
    """Load Data Orientation

    This function modifies the orientation of the input and output data depending on the required orientation.

    Args:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        orientation (str): String detailing the current view (COR, SAG, AXL)
    
    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
    
    Raises:
        ValueError: Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagittal<<
    """

    # TODO: will need to check if these alignments correspond with our data.
    # These alignments work for ADNI

    if orientation == "sagittal":
        return volume, label_map  # This is assumed to be the default orientation
    elif orientation == "axial":
        return volume.transpose((1, 2, 0)), label_map.transpose((1, 2, 0))
    elif orientation == "coronal":
        # ASSUMPTION: the coronal case mirrors the axial one with the remaining
        # cyclic permutation; this needs checking against our data (see TODO above).
        return volume.transpose((2, 0, 1)), label_map.transpose((2, 0, 1))
    else:
        raise ValueError("Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagittal<<")

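# A quick shape check of the transposes above on a toy array (the dimensions
# are made up for illustration):
#
#     vol = np.zeros((181, 217, 181))
#     axial_vol, _ = set_orientation(vol, vol, "axial")
#     print(axial_vol.shape)  # (217, 181, 181)
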
if __name__ == "__main__":

    folder_location = '/well/win-biobank/projects/imaging/data/data3/subjectsAll/'
    tract_sum_generator(folder_location)