"""Data Processing Functions

Description:

    This module contains several functions which, either on their own or as part of larger pieces of software, perform processing tasks on the data.

Usage:

    To use content from this module, import the required functions and call them as needed:

        from utils.data_utils import function_name
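
    For example, a minimal sketch (assuming the package layout above; the data path below is a placeholder):

        from utils.data_utils import directory_reader

        subjects = directory_reader('/path/to/subject/folders', subject_number=10)
        print(len(subjects), 'accessible subjects')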

"""

import os
import h5py
import pickle

from glob import glob
import numpy as np
import nibabel as nib
import torch
import torch.utils.data as data
import configparser

from fsl.data.image import Image
from fsl.utils.image.resample import resampleToPixdims
from sklearn.model_selection import KFold, train_test_split


def directory_reader(folder_location, subject_number=None, write_txt=False):
    """Produces a list of data-tags which are accessible

    This function looks in a large data directory and returns a list of the sub-directories which are accessible.
    This is required because, at present, not all UK Biobank data is accessible due to privacy restrictions.

    Args:
        folder_location (str): A string containing the address of the required directory.
        subject_number (int): Number of subjects to be considered for a job. Useful when training on fewer subjects than the total number available in the data folder.
        write_txt (bool): Flag indicating if a .txt file should be created.

    Returns:
        subDirectoryList (list): A list of strings containing the available sub-directories. These are also written to a .txt file if write_txt is True.
    """

    if write_txt:
        out_file = open("files.txt", 'w')

    subDirectoryList = []

    number_of_subjects = 0

    if subject_number is None:
        subject_number = len(os.listdir(os.path.join(
            os.path.expanduser("~"), folder_location)))

    for directory in os.listdir(folder_location):
        if number_of_subjects < subject_number:
            if os.path.isdir(os.path.join(folder_location, directory)) and os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/")) and os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/")):
                filename = os.path.join(folder_location, directory)
                if os.access(filename, os.R_OK):
                    if write_txt:
                        out_file.write(directory)
                        out_file.write("\n")
                    subDirectoryList.append(directory)
                    number_of_subjects += 1
        else:
            break

    if write_txt:
        out_file.close()

    return subDirectoryList


def data_file_reader(data_file_path, folder_location, subject_number=None):
    """Data File reader

    This function reads a .txt file containing the desired subject IDs and discards subjects for which the required data folders are not present.

    Args:
        data_file_path (str): Path to the file containing the data
        folder_location (str): A string containing the address of the required directory.
        subject_number (int): Number of subjects to be considered for a job. Useful when training on fewer subjects than the total number available in the data folder.

    Returns:
        subDirectoryList (list): A list of strings containing the available sub-directories
    """

    with open(data_file_path) as files:
        subDirectoryList = files.read().split('\n')
        subDirectoryList.remove('')

    # Iterate over a copy so that entries can safely be removed from the original list.
    for directory in list(subDirectoryList):
        if not os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/")):
            if not os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/")):
                subDirectoryList.remove(directory)

    if subject_number is not None:
        subDirectoryList = subDirectoryList[:subject_number]

    return subDirectoryList


def data_test_train_validation_split(data_folder_name, test_percentage, subject_number, data_directory, train_inputs, train_targets, mean_mask_path, data_file=None, K_fold=None):
    """Produces lists of train, test and validation data

    This function takes the list of all available sub-directories and splits it into training, testing and validation lists.
    These lists are written out as .txt files for later use.

    Args:
        data_folder_name (str): The name of the folder where the output lists are written
        test_percentage (int): Percentage of data to be used for testing
        subject_number (int): Number of subjects to be considered for a job. Useful when training on fewer subjects than the total number available in the data folder.
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        mean_mask_path (str): Path to the dualreg subject mean mask
        K_fold (int): Number of folds for splitting the training data
        data_file (str): Name of *.txt file containing a list of the required data
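
    Example (illustrative sketch; all paths and file names below are placeholders):

        data_test_train_validation_split('datafiles', test_percentage=10, subject_number=100,
                                          data_directory='/path/to/subjects',
                                          train_inputs='dMRI/input_volume.nii.gz',
                                          train_targets='fMRI/target_volume.nii.gz',
                                          mean_mask_path='/path/to/group_mean.nii.gz',
                                          K_fold=None)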

    """

    if data_file is not None:
        subDirectoryList = data_file_reader(
            data_file, data_directory, subject_number)
    else:
        subDirectoryList = directory_reader(data_directory, subject_number)

    subDirectoryList = np.array(subDirectoryList)
    create_folder(data_folder_name)

    train_data, test = train_test_split(
        subDirectoryList, test_size=test_percentage/100, random_state=42, shuffle=True)
    if os.path.exists(os.path.join(data_folder_name, 'test.txt')):
        os.remove(os.path.join(data_folder_name, 'test.txt'))
    np.savetxt(os.path.join(data_folder_name, 'test.txt'), test, fmt='%s')

    if K_fold is None:
        train, validation = train_test_split(
            train_data, test_size=int(len(test)), random_state=42, shuffle=True)

        if os.path.exists(os.path.join(data_folder_name, 'train.txt')):
            os.remove(os.path.join(data_folder_name, 'train.txt'))
        np.savetxt(os.path.join(data_folder_name,
                                'train.txt'), train, fmt='%s')

        if os.path.exists(os.path.join(data_folder_name, 'validation.txt')):
            os.remove(os.path.join(data_folder_name, 'validation.txt'))
        np.savetxt(os.path.join(data_folder_name, 'validation.txt'),
                   validation, fmt='%s')

        print('Generating training dataset min and max...')

        data_extremes(os.path.join(data_folder_name, 'scaling_factors.pkl'),
                      os.path.join(data_folder_name, 'train.txt'),
                      data_directory, train_inputs, train_targets, mean_mask_path)

        print('Global training dataset min and max values generated!')

    else:
        k_fold = KFold(n_splits=K_fold)
        k = 0
        for train_index, validation_index in k_fold.split(train_data):
            train, validation = train_data[train_index], train_data[validation_index]

            if os.path.exists(os.path.join(data_folder_name, 'train' + str(k+1) + '.txt')):
                os.remove(os.path.join(data_folder_name,
                                       'train' + str(k+1) + '.txt'))
            np.savetxt(os.path.join(data_folder_name, 'train' +
                                    str(k+1)+'.txt'), train, fmt='%s')

            if os.path.exists(os.path.join(data_folder_name, 'validation' + str(k+1) + '.txt')):
                os.remove(os.path.join(data_folder_name,
                                       'validation' + str(k+1) + '.txt'))
            np.savetxt(os.path.join(data_folder_name, 'validation' +
                                    str(k+1)+'.txt'), validation, fmt='%s')

            print(
                'Generating training dataset min and max for fold K = {}/{}...'.format(k+1, K_fold))

            data_extremes(os.path.join(data_folder_name, 'scaling_factors'+str(k+1)+'.pkl'),
                          os.path.join(data_folder_name,
                                       'train' + str(k+1) + '.txt'),
                          data_directory, train_inputs, train_targets, mean_mask_path)

            print(
                'Global training dataset min and max values for fold K = {}/{} generated!'.format(k+1, K_fold))

            k += 1


def data_extremes(output_file_path, train_list, data_directory, train_inputs, train_targets, mean_mask_path):
    """Calculates min-max for train dataset

    This function calculates robust minimum and maximum values (1st and 99th percentiles) over the entire training dataset.
    It then saves these values to a .pkl file.

    Args:
        output_file_path (str): Path to the output file
        train_list (str): Path to the file containing the list of training volumes
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        mean_mask_path (str): Path to the dualreg subject mean mask
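
    The saved .pkl file contains [min_input, max_input, min_target, max_target] and can be read
    back with, for example:

        import pickle

        with open('scaling_factors.pkl', 'rb') as f:
            min_input, max_input, min_target, max_target = pickle.load(f)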
    """

    min_input = None
    max_input = None
    min_target = None
    max_target = None

    dualreg_subject_mean = Image(mean_mask_path).data[:, :, :, 0]

    with open(train_list) as volume_list:
        lines = volume_list.read().split('\n')

    for line in lines:
        if line == '':
            pass
        else:
            input_path = os.path.join(data_directory, line, train_inputs)
            target_path = os.path.join(data_directory, line, train_targets)

            input_volume, _ = resampleToPixdims(Image(input_path), (2, 2, 2))
            target_volume = np.subtract(Image(target_path).data[:, :, :, 0], dualreg_subject_mean)

            target_volume[target_volume < 0] = 0

            # TODO: Currently, the volume of the fMRI data is hardcoded, only loading in the DMN. This needs to be updated in later iterations.

#            if min_input == None:
#                min_input = np.min(input_volume)
#            elif min_input > np.min(input_volume):
#                min_input = np.min(input_volume)
#
#            if max_input == None:
#                max_input = np.max(input_volume)
#            elif max_input < np.max(input_volume):
#                max_input = np.max(input_volume)
#
#            if min_target == None:
#                min_target = np.min(target_volume)
#            elif min_target > np.min(target_volume):
#                min_target = np.min(target_volume)
#
#            if max_target == None:
#                max_target = np.max(target_volume)
#            elif max_target < np.max(target_volume):
#                max_target = np.max(target_volume)

            if min_input is None:
                min_input = np.percentile(input_volume, 1)
            elif min_input > np.percentile(input_volume, 1):
                min_input = np.percentile(input_volume, 1)

            if max_input is None:
                max_input = np.percentile(input_volume, 99)
            elif max_input < np.percentile(input_volume, 99):
                max_input = np.percentile(input_volume, 99)

            if min_target is None:
                min_target = np.percentile(target_volume, 1)
            elif min_target > np.percentile(target_volume, 1):
                min_target = np.percentile(target_volume, 1)

            if max_target is None:
                max_target = np.percentile(target_volume, 99)
            elif max_target < np.percentile(target_volume, 99):
                max_target = np.percentile(target_volume, 99)


            del input_volume, target_volume, input_path, target_path

    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    with open(output_file_path, 'wb') as output_file:
        pickle.dump([min_input, max_input, min_target,
                     max_target], output_file)


def update_shuffling_flag(file_name):
    """ Update shuffling flag

    Changes shuffling flag in settings to False once data has been shuffled

    Args:
        file_name (str): The settings file name
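
    The settings file is expected to be an INI-style file containing, for example, a section of the form:

        [DATA]
        data_split_flag = True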
    """

    config = configparser.ConfigParser()
    config.read(file_name)
    config.set('DATA', 'data_split_flag', 'False')
    with open(file_name, 'w') as configfile:
        config.write(configfile)


class DataMapper(data.Dataset):
    """Data Mapper Class

    This class represents a generic parent class for mapping between keys and data samples.
    The class represents a subclass/child class of data.Dataset, inheriting its functionality.
    This class is composed of a __init__ constructor, a __getitem__(), supporting fetching a data sample for a given key, and __len__(), which returns the size of the dataset.
    This class also has several other helper functions.

    Args:
        filename (str): Path to file containing the relevant volume indicator numbers
        data_directory (str): Directory where the various subjects are stored.
        data_file (str): Internal path for each subject to the relevant normalized summed dMRI tracts
        output_targets (str): Internal path for each subject to the relevant rsfMRI data
        mean_mask_path (str): Path to the dualreg subject mean mask
        scaling_factors (str): Path to the file containing the scaling factors
        mean_reduction (bool): Flag indicating if the targets should be de-meaned using the mean_mask_path

    Returns:
        X_volume (torch.tensor): Tensor representation of the input data
        y_volume (torch.tensor): Tensor representation of the output data
        int: length of the output
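
    Example (illustrative sketch; all paths and file names are placeholders):

        dataset = DataMapper('train.txt', 'datasets/', 'dMRI_input.nii.gz',
                             'rfMRI_target.nii.gz', 'mean_mask.nii.gz',
                             'scaling_factors.pkl', mean_reduction=True)
        loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)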

    """

    def __init__(self, filename, data_directory, data_file, output_targets, mean_mask_path, scaling_factors, mean_reduction=False):
        # Initialize everything, and only store in memory the text data file.
        # Memory usage limited by only storing the text string information, not the actual volumes.
        # TODO: Currently, the timepoint in the fMRI data (y_volume) is hardcoded, only loading in the DMN. This needs to be updated in later iterations.
        self.filename = filename
        self.data_directory = data_directory
        self.data_file = data_file
        self.output_targets = output_targets
        self.sample_pairs = []
        self._get_datasets()
        self.mean_mask_path = mean_mask_path
        self.mean_reduction = mean_reduction
        self.scaling_factors = scaling_factors

    def __len__(self):
        return len(self.sample_pairs)

    def __getitem__(self, index):

        X_path, y_path = self.sample_pairs[index]

        X_volume = torch.from_numpy(self.scale_volume(self.resample(X_path), self.scaling_factors, target_flag=False))

        if self.mean_reduction:
            y_volume = torch.from_numpy(self.scale_volume(self.subtract_mean(y_path, self.mean_mask_path), self.scaling_factors, target_flag=True))
        else:
            y_volume = torch.from_numpy(self.scale_volume(self.convert_to_numpy(y_path)[:, :, :, 0], self.scaling_factors, target_flag=True))

        return X_volume, y_volume

    def _get_datasets(self):
        """File path generator

        Helper function which reads all the various strings and generates the required paths.
        """

        with open(self.filename) as files:
            lines = files.read().split('\n')

        for line in lines:
            if line == '':
                pass
            else:
                X_path = os.path.join(
                    os.path.expanduser("~"), self.data_directory, line, self.data_file)
                y_path = os.path.join(
                    os.path.expanduser("~"), self.data_directory, line, self.output_targets)

                self.sample_pairs.append((X_path, y_path))

    def resample(self, path):
        """dMRI Resample

        Helper function downsampling the dMRI data from 1mm to 2mm.
        This is due to GPU memory / RAM limitations during training.
        The resampleToPixdims also carries out data smoothing.

        Args:
            path (str): Path to the relevant volume

        Returns:
            volume_resampled (np.array): Resampled volume
        """
        volume_resampled, _ = resampleToPixdims(
            self.read_data_files(path), (2, 2, 2))

        return volume_resampled

    def read_data_files(self, path):
        """Volume Reader

        Helper function reading the relevant volume.

        Args:
            path (str): Path to the relevant volume

        Returns:
            volume_image (class): fsl.data.image.Image class
        """

        volume_image = Image(path)
        return volume_image

    def convert_to_numpy(self, path):
        """Numpy wrapper

        Helper function wrapping the conversion of a volume to numpy

        Args:
            path (str): Path to the relevant volume.

        Returns:
            volume_numpy (np.array): Numpy array representation of volume data.
        """

        volume_numpy = self.read_data_files(path).data
        return volume_numpy

    def subtract_mean(self, path, mean_mask_path):
        """Mean Mask Substraction

        Helper function which substracts the dualreg mean subject mask

        Args:
            mean_mask_path (str): Path to the dualreg subject mean mask

        Returns:
            subtracted_volume (np.array): Numpy array representation of the subtracted volume data
        """

        dualreg_subject_mean = self.convert_to_numpy(mean_mask_path)[:, :, :, 0]
  
        volume = self.convert_to_numpy(path)[:, :, :, 0]

        subtracted_volume = np.subtract(volume, dualreg_subject_mean)

        return subtracted_volume

    def scale_volume(self, volume, scaling_factors, target_flag=False):
        """Scaling function

        This function reads the scaling factors from the saved file and then scales the data.

        Args:
            volume (np.array): Unscaled volume
            scaling_factors (str): Path to the scaling factor file
            target_flag (bool): Flag signaling if the file is a target or an input

        Returns:
            scaled_volume (np.array): Scaled volume
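
        Example: with the hard-coded input range [0.0, 0.2] used below, an input voxel value of 0.1
        is clipped to that range and normalized to (0.1 - 0.0) / (0.2 - 0.0) = 0.5.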
        """

        with open(scaling_factors, 'rb') as input_file:
            min_input, max_input, min_target, max_target = pickle.load(input_file)

        # Steve Scaling: the hard-coded values below override the scaling factors loaded from file
        min_input, max_input, min_target, max_target = [0.0, 0.2, 0.0, 10.0]

        if not target_flag:
            min_value = min_input
            max_value = max_input
        else:
            min_value = min_target
            max_value = max_target

        # Set all negative elements to 0
        volume[volume < 0] = 0.0        

        # Eliminating outliers
        volume[volume > max_value] = max_value
        volume[volume < min_value] = min_value
        
        # Normalization to [0, 1]
        scaled_volume = np.divide(np.subtract(volume, min_value), np.subtract(max_value, min_value)) 
        # Scaling between [-1, 1]
        # scaled_volume = np.add(-1.0, np.multiply(2.0, np.divide(np.subtract(volume, min_value), np.subtract(max_value, min_value))))
        
        # No scaling performed
        # scaled_volume = volume

        return scaled_volume


def get_datasets(data_parameters):
    """Data Loader Function.

    This function loads the various data files and returns the relevant mapped datasets.

    Args:
        data_parameters (dict): Dictionary containing relevant information for the datafiles.
        data_parameters = {
            data_directory: 'path/to/directory'
            train_data_file: 'training_data'
            train_output_targets: 'training_targets'
            train_list = 'train.txt'
            validation_list = 'validation.txt'
            validation_data_file: 'testing_data'
            validation_target_file: 'testing_targets'
            mean_mask_path: 'path/to/mean_mask'
            mean_reduction: True
            scaling_factors: 'path/to/scaling_factors.pkl'
        }

    Returns:
        tuple: the relevant train and test datasets
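
    Example (illustrative sketch):

        train_data, validation_data = get_datasets(data_parameters)
        train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=True)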
    """

    train_filename = data_parameters['train_list']
    data_directory = data_parameters['data_directory']
    train_data_file = data_parameters['train_data_file']
    train_output_targets = data_parameters['train_output_targets']

    validation_filename = data_parameters['validation_list']
    validation_data_file = data_parameters['validation_data_file']
    validation_output_targets = data_parameters['validation_target_file']

    mean_mask_path = data_parameters['mean_mask_path']
    mean_reduction = data_parameters['mean_reduction']

    scaling_factors = data_parameters['scaling_factors']

    return (
        DataMapper(train_filename, data_directory,
                   train_data_file, train_output_targets, mean_mask_path, scaling_factors, mean_reduction),
        DataMapper(validation_filename, data_directory,
                   validation_data_file, validation_output_targets, mean_mask_path, scaling_factors, mean_reduction)
    )


def create_folder(path):
    """Folder Creator

    A function which creates a folder at a given path if one does not exist

    Args:
        path (str): destination to check for folder existence
    """

    if not os.path.exists(path):
        os.mkdir(path)


def load_file_paths(data_directory, data_list, mapping_data_file, targets_directory=None, target_file=None):
    """File Loader

    This function returns a list of combined file paths for the input data and labelled output data.

    Args:
        data_directory (str): Path to input data directory
        data_list (str): Path to a .txt file containing the input files for consideration
        mapping_data_file (str): Path to the input files
        targets_directory (str): Path to labelled data (Y-equivalent); None if during evaluation.

    Returns:
        file_paths (list): List containing the input data and target labelled output data
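
        For example (illustrative placeholder paths), at evaluation time the returned list has the form:

            [['/data/subject01/dMRI_input.nii.gz'], ['/data/subject02/dMRI_input.nii.gz']]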

    Raises:
        ValueError: "Invalid data entry - check code and data entry format!"
    """

    if data_list:
        with open(data_list) as data_list_file:
            volumes_to_be_used = data_list_file.read().splitlines()
    else:
        volumes_to_be_used = os.listdir(data_directory)

    if targets_directory is None or target_file is None:
        file_paths = [[os.path.join(data_directory, volume, mapping_data_file)]
                      for volume in volumes_to_be_used]
    else:
        file_paths = [[os.path.join(data_directory, volume, mapping_data_file), os.path.join(
            targets_directory, volume)] for volume in volumes_to_be_used]

    return file_paths


def load_and_preprocess_evaluation(file_path):
    """Load & Preprocessing before evaluation

    This function loads a NIfTI file, resamples it to 2 mm voxels, and returns its volume, header and transform information

    Args:
        file_path (str): Path to the desired file

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
        xform (np.array): Array of shape (4, 4), containing the adjusted voxel-to-world transformation for the spatial dimensions of the resampled data

    """

    original_image = Image(file_path[0])
    volume, xform = resampleToPixdims(original_image, (2, 2, 2))
    header = Image(volume, header=original_image.header, xform=xform).header

    return volume, header, xform


def load_and_preprocess_targets(target_path, mean_mask_path):
    """Load & Preprocessing targets before evaluation

    This function loads a NIfTI target file and returns its volume and a de-meaned version of that volume

    Args:
        target_path (str): Path to the desired target file
        mean_mask_path (str): Path to the dualreg subject mean mask

    Returns:
        target (np.array): Array of training image data of data type dtype.
        target_demeaned (np.array): Array of training data from which the group mean has been subtracted

    """

    target = Image(target_path[0]).data[:, :, :, 0]
    target_demeaned = np.subtract(target, Image(mean_mask_path).data[:, :, :, 0])

    return target, target_demeaned


# Deprecated Functions & Classes & Methods:


def set_orientation(volume, label_map, orientation):
    """Load Data Orientation

    This function modifies the orientation of the input and output data depending on the required orientation.

    Args:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        orientation (str): String detailing the current view (COR, SAG, AXL)

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.

    Raises:
        ValueError: Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagital<<
    """

    # TODO: will need to check if these alignments correspond with our data.
    # These alignments work for ADNI

    if orientation == "sagittal":
        return volume, label_map  # This is assumed to be the default orientation
    elif orientation == "axial":
        return volume.transpose((1, 2, 0)), label_map.transpose((1, 2, 0))
    elif orientation == "coronal":
        return volume.transpose((2, 0, 1)), label_map.transpose((2, 0, 1))
    else:
        raise ValueError(
            "Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagital<< ")


def tract_sum_generator(folder_path):
    """Sums the tracts of different dMRI files

    THIS FUNCTION IS NOT DEPRECATED: SummedTractMaps generated remotely

    When performing subject-specific probabilistic diffusion tractography using standard-space protocols, 27 tracts are created.
    This function loops through all the tracts, sums them and returns the summed tract map.
    This function also outputs the summed tract map as a Nifti (.nii.gz) file.

    Args:
        folder_path (str): A string containing the address of the required directory.
    """

    tractMapName = 'tracts/tractsNorm.nii.gz'

    subDirectoryList = directory_reader(folder_path)

    viableSubDirectories = len(subDirectoryList)
    counter = 0

    if not os.path.exists('/well/win/users/hsv459/functionmapper/datasets/dMRI'):
        if not os.path.exists('/well/win/users/hsv459/functionmapper/datasets'):
            os.mkdir('/well/win/users/hsv459/functionmapper/datasets')
        os.mkdir('/well/win/users/hsv459/functionmapper/datasets/dMRI')

    for subDirectory in subDirectoryList:
        tractedMapsPath = os.path.join(folder_path, str(
            subDirectory), 'dMRI/autoptx_preproc/tracts/')

        sum_flag = False  # This is a flag showing us if this is the first tract to be summed

        print("Summing the tract number: {}/{}".format(counter, viableSubDirectories))

        for tract in os.listdir(tractedMapsPath):
            if os.path.isdir(os.path.join(tractedMapsPath, tract)):
                tractedMapPath = os.path.join(
                    tractedMapsPath, tract, tractMapName)
                tractedMapImg = nib.load(tractedMapPath)

                tractedMap = tractedMapImg.get_fdata()

                #  the affine array stores the relationship between voxel coordinates in the image data array and coordinates in the reference space

                tractedMapAffine = tractedMapImg.affine

                if not sum_flag:
                    tractedMapSum = np.copy(tractedMap)
                    sum_flag = True
                else:
                    tractedMapSum = np.add(tractedMapSum, tractedMap)

        tractedMapSumPath = '/well/win/users/hsv459/functionmapper/datasets/dMRI'
        tractsSumName = str(subDirectory) + ".nii.gz"
        tractedMapSumImg = nib.Nifti1Image(tractedMapSum, tractedMapAffine)
        nib.save(tractedMapSumImg, os.path.join(
            tractedMapSumPath, tractsSumName))

        counter += 1

    return None


class DataMapperHDF5(data.Dataset):
    """Data Mapper Class.

    THIS CLASS IS NOT DEPRECATED!

    This class represents a generic parent class for mapping between keys and data samples.
    The class represents a subclass/child class of data.Dataset, inheriting its functionality.
    This class is composed of a __init__ constructor, a __getitem__(), supporting fetching a data sample for a given key, and __len__(), which returns the size of the dataset.

    Args:
        X (HDF5 datafile): hierarchically organized input data
        y (HDF5 datafile): hierarchically organized output data 

    Returns:
        input_data (torch.tensor): Tensor representation of the input data
        label_data (torch.tensor): Tensor representation of the output data
        int: length of the output
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        input_data = torch.from_numpy(self.X[index])
        label_data = torch.from_numpy(self.y[index])
        return input_data, label_data

    def __len__(self):
        return len(self.y)


def get_datasetsHDF5(data_parameters):
    """Data Loader Function.

    THIS FUNCTION IS NOT DEPRECATED: Loader function rewritten. 

    This function loads the various data files and returns the relevant mapped datasets.

    Args:
        data_parameters (dict): Dictionary containing relevant information for the datafiles.
        data_parameters = {
            data_directory: 'path/to/directory'
            train_data_file: 'training_data'
            train_output_targets: 'training_targets'
            train_list = 'train.txt'
            validation_list = 'validation.txt'
            test_list = 'test.txt'
            test_data_file: 'testing_data'
            test_target_file: 'testing_targets'
        }

    Returns:
        tuple: the relevant train and test datasets
    """

    training_data = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['training_data']), 'r')
    testing_data = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['testing_data']), 'r')

    training_labels = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['training_targets']), 'r')
    testing_labels = h5py.File(os.path.join(
        data_parameters['data_directory'], data_parameters['testing_targets']), 'r')

    return (
        DataMapperHDF5(training_data['data'][()],
                       training_labels['label'][()]),
        DataMapperHDF5(testing_data['data'][()], testing_labels['label'][()])
    )


def load_and_preprocess_evaluation2D(file_path, orientation, min_max=True):
    """Load & Preprocessing before evaluation

    This function loads a NIfTI file and returns its volume and header information

    Args:
        file_path (str): Path to the desired file
        orientation (str): String detailing the current view (COR, SAG, AXL)
        min_max (bool): Flag for inducing min-max normalization of the volume

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata

    Raises:
        ValueError: "Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagital<< "
    """

    nifty_volume = nib.load(file_path[0])

    volume = nifty_volume.get_fdata()
    header = nifty_volume.header

    if min_max:
        volume = (volume - np.min(volume)) / (np.max(volume) - np.min(volume))
    else:
        volume = np.round(volume)

    if orientation == "sagittal":
        pass  # This is assumed to be the default orientation
    elif orientation == "axial":
        volume = volume.transpose((1, 2, 0))
    elif orientation == "coronal":
        volume = volume.transpose((2, 0, 1))
    else:
        raise ValueError(
            "Orientation value is invalid. It must be either >>coronal<<, >>axial<< or >>sagital<< ")

    return volume, header


def load_and_preprocess(file_paths, orientation):
    """Load & Preprocess

    This function is composed of two other function calls: one that calls a function loading the data, and another which preprocesses the data to the required format.
    # TODO: Need to check if any more preprocessing would be required besides summing the tracts!

    Args:
        file_paths (list): List containing the input data and target labelled output data
        orientation (str): String detailing the current view (COR, SAG, AXL)

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
    """

    volume, label_map, header = load(file_paths, orientation)

    return volume, label_map, header


def load(file_path, orientation):
    """Load Data Function

    This function loads the required data files and extracts relevant information from it.

    Args:
        file_path (list): List containing the input data and target labelled output data
        orientation (str): String detailing the current view (COR, SAG, AXL)

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        label_map (np.array): Array of labelled image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
    """

    nifty_volume, label_map = nib.load(file_path[0]), nib.load(file_path[1])
    volume, label_map = nifty_volume.get_fdata(), label_map.get_fdata()

    # Do we need min-max normalization here? Will need to check when debugging and testing
    volume = (volume - np.min(volume)) / (np.max(volume) - np.min(volume))

    volume, label_map = set_orientation(volume, label_map, orientation)

    return volume, label_map, nifty_volume.header