"""Data Processing Functions

Description:

    This module contains several functions which, either on their own or as part of larger pieces of software, perform data processing tasks.

Usage:

    To use content from this module, import the required functions and call them as needed:

        from utils.data_utils import function_name
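
    For example, a minimal illustrative call (the folder name 'datasets/' is an
    assumption, not a fixed location):

        from utils.data_utils import directory_reader
        subjects, count = directory_reader('datasets/', subject_number=100)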

"""

import os
import h5py
import pickle
from glob import glob
import numpy as np
import nibabel as nib
import torch
import torch.utils.data as data
import configparser
import pandas as pd
from fsl.data.image import Image
from fsl.utils.image.resample import resampleToPixdims
from sklearn.model_selection import KFold, train_test_split


def directory_reader(folder_location, subject_number=None, write_txt=False):
    """Produces a list of of data-tags which are accessible

    This function looks in a large data directory and returns a list of sub-directories which are accessible.
    This is required because, currently, not all UK Biobank data is accessible due to privacy restrictions.

    Args:
        folder_location (str): A string containing the address of the required directory.
        subject_number (int): Number of subjects to be considered for a job. Useful when training on fewer subjects than the total number available in a data folder.
        write_txt (bool): Flag indicating if a .txt file should be created.

    Returns:
        subDirectoryList (list): A list of strings containing the available sub-directories. Optionally also written out to a .txt file.
        number_of_subjects (int): The number of accessible subjects found.
    """
    if write_txt:
        out_file = open("files.txt", 'w')

    subDirectoryList = []

    number_of_subjects = 0

    if subject_number is None:
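        # Default to the total number of entries in the data folder
        # (resolved relative to the user's home directory).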
        subject_number = len(os.listdir(os.path.join(
            os.path.expanduser("~"), folder_location)))

    for directory in os.listdir(folder_location):
        if number_of_subjects < subject_number:
            if (os.path.isdir(os.path.join(folder_location, directory))
                    and os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/"))
                    and os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/"))):
                filename = os.path.join(folder_location, directory)
                if os.access(filename, os.R_OK):
                    if write_txt:
                        out_file.write(directory + "\n")
                    subDirectoryList.append(directory)
                    number_of_subjects += 1
        else:
            break

    if write_txt:
        out_file.close()

    return subDirectoryList, number_of_subjects


def data_file_reader(data_file_path, folder_location, subject_number=None):
    """Data File reader

    Args:
        data_file_path (str): Path to the file containing the data
        folder_location (str): A string containing the address of the required directory.
        subject_number (int): Number of subjects to be considered for a job. Useful when training on fewer subjects than the total number available in a data folder.

    Returns:
        subDirectoryList (list): A list of strings containing the available sub-directories
        file_counter (int): The number of valid subject directories found
    """

    with open(data_file_path) as volume_list:
        directories = volume_list.read().split('\n')

    file_counter = 0
    subDirectoryList = []
   
    if subject_number is not None:
        for directory in directories:
            if file_counter < subject_number:
                if directory == '':
                    continue
                if (os.path.isdir(os.path.join(folder_location, directory))
                        and os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/"))
                        and os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/"))):
                    subDirectoryList.append(directory)
                    file_counter += 1
    else:
        for directory in directories:
            if directory == '':
                continue
            if (os.path.isdir(os.path.join(folder_location, directory))
                    and os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/"))
                    and os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/"))):
                subDirectoryList.append(directory)
                file_counter += 1

    return subDirectoryList, file_counter


def data_preparation(data_folder_name, test_percentage, subject_number, data_directory, train_inputs, train_targets, rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_file=None, K_fold=None):
    """ Data preparation function

    This function conducts data preparation operations, including regression weight calculation, data splitting and scaling factor calculation.
    It produces lists of train, validation and test data.

    This function looks at the list of all available directories and writes out three lists of sub-directories: those required for training, validation and testing.

    Args:
        data_folder_name (str): The name of the folder where the output data lists are written
        test_percentage (int): Percentage of data to be used for testing
        subject_number (int): Number of subjects to be considered for a job. Useful when training on fewer subjects than the total number available in a data folder.
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask
        dMRI_mean_mask_path (str): Path to the summed tract mean mask
        data_file (str): Name of *.txt file containing a list of the required data
        K_fold (int): Number of folds for splitting the training data

    """

    if data_file is not None:
        subDirectoryList, _ = data_file_reader(
            data_file, data_directory, subject_number)
    else:
        subDirectoryList, _ = directory_reader(data_directory, subject_number)

    # Produce the regression weights (handled by a separate function)

    regression_weight_dataframe_builder(subDirectoryList, data_folder_name, data_directory,
                                        train_inputs, train_targets, rsfMRI_mean_mask_path, dMRI_mean_mask_path)

    # Splitting the data into train-validation-test

    subDirectoryList = np.array(subDirectoryList)
    create_folder(data_folder_name)

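    # Hold out the test set first; the fixed random_state keeps the split reproducible.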
    train_data, test = train_test_split(
        subDirectoryList, test_size=test_percentage/100, random_state=42, shuffle=True)
    if os.path.exists(os.path.join(data_folder_name, 'test.txt')):
        os.remove(os.path.join(data_folder_name, 'test.txt'))
    np.savetxt(os.path.join(data_folder_name, 'test.txt'), test, fmt='%s')

    if K_fold is None:
        train, validation = train_test_split(
            train_data, test_size=int(len(test)), random_state=42, shuffle=True)

        if os.path.exists(os.path.join(data_folder_name, 'train.txt')):
            os.remove(os.path.join(data_folder_name, 'train.txt'))
        np.savetxt(os.path.join(data_folder_name,
                                'train.txt'), train, fmt='%s')

        if os.path.exists(os.path.join(data_folder_name, 'validation.txt')):
            os.remove(os.path.join(data_folder_name, 'validation.txt'))
        np.savetxt(os.path.join(data_folder_name, 'validation.txt'),
                   validation, fmt='%s')

        print('Generating training dataset min and max...')

        # data_extremes(os.path.join(data_folder_name, 'scaling_factors.pkl'),
        #               os.path.join(data_folder_name, 'train.txt'),
        #               data_directory, train_inputs, train_targets, mean_mask_path)

        print('Global training dataset min and max values generated!')

    else:
        k_fold = KFold(n_splits=K_fold)
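        # Each fold writes its own numbered train/validation list files,
        # alongside the single shared test.txt written above.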
        for k, (train_index, validation_index) in enumerate(k_fold.split(train_data)):
            train, validation = train_data[train_index], train_data[validation_index]

            if os.path.exists(os.path.join(data_folder_name, 'train' + str(k+1) + '.txt')):
                os.remove(os.path.join(data_folder_name,
                                       'train' + str(k+1) + '.txt'))
            np.savetxt(os.path.join(data_folder_name, 'train' +
                                    str(k+1)+'.txt'), train, fmt='%s')

            if os.path.exists(os.path.join(data_folder_name, 'validation' + str(k+1) + '.txt')):
                os.remove(os.path.join(data_folder_name,
                                       'validation' + str(k+1) + '.txt'))
            np.savetxt(os.path.join(data_folder_name, 'validation' +
                                    str(k+1)+'.txt'), validation, fmt='%s')

            print(
                'Generating training dataset min and max for fold K = {}/{}...'.format(k+1, K_fold))

            # data_extremes(os.path.join(data_folder_name, 'scaling_factors'+str(k+1)+'.pkl'),
            #               os.path.join(data_folder_name,
            #                            'train' + str(k+1) + '.txt'),
            #               data_directory, train_inputs, train_targets, mean_mask_path)

            print(
                'Global training dataset min and max values for fold K = {}/{} generated!'.format(k+1, K_fold))



def data_extremes(output_file_path, train_list, data_directory, train_inputs, train_targets, mean_mask_path):
    """Calculates min-max for train dataset

    This function calculates robust minimum and maximum values (the 1st and 99th percentiles) over the entire training dataset; using percentiles rather than the absolute extremes keeps the scaling factors robust to outlier voxels.
    It then saves the values to a .pkl file.

    Args:
        output_file_path (str): Path to the output file
        train_list (str): Path to the file containing the list of training volumes
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        mean_mask_path (str): Path to the dualreg subject mean mask
    """

    min_input = None
    max_input = None
    min_target = None
    max_target = None

    dualreg_subject_mean = Image(mean_mask_path).data[:, :, :, 0]

    with open(train_list) as volume_list:
        lines = volume_list.read().split('\n')

    for line in lines:
        if line == '':
            pass
        else:
            input_path = os.path.join(data_directory, line, train_inputs)
            target_path = os.path.join(data_directory, line, train_targets)

            input_volume, _ = resampleToPixdims(Image(input_path), (2, 2, 2))
            target_volume = np.subtract(
                Image(target_path).data[:, :, :, 0], dualreg_subject_mean)

            target_volume[target_volume < 0] = 0

            # TODO: Currently, the volume of the fMRI data is hardcoded, only loading in the DMN. This needs to be updated in later iterations.

        #    if min_input == None:
        #        min_input = np.min(input_volume)
        #    elif min_input > np.min(input_volume):
        #        min_input = np.min(input_volume)

        #    if max_input == None:
        #        max_input = np.max(input_volume)
        #    elif max_input < np.max(input_volume):
        #        max_input = np.max(input_volume)

        #    if min_target == None:
        #        min_target = np.min(target_volume)
        #    elif min_target > np.min(target_volume):
        #        min_target = np.min(target_volume)

        #    if max_target == None:
        #        max_target = np.max(target_volume)
        #    elif max_target < np.max(target_volume):
        #        max_target = np.max(target_volume)

            if min_input is None:
                min_input = np.percentile(input_volume, 1)
            elif min_input > np.percentile(input_volume, 1):
                min_input = np.percentile(input_volume, 1)

            if max_input is None:
                max_input = np.percentile(input_volume, 99)
            elif max_input < np.percentile(input_volume, 99):
                max_input = np.percentile(input_volume, 99)

            if min_target is None:
                min_target = np.percentile(target_volume, 1)
            elif min_target > np.percentile(target_volume, 1):
                min_target = np.percentile(target_volume, 1)

            if max_target is None:
                max_target = np.percentile(target_volume, 99)
            elif max_target < np.percentile(target_volume, 99):
                max_target = np.percentile(target_volume, 99)

            del input_volume, target_volume, input_path, target_path

    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    with open(output_file_path, 'wb') as output_file:
        pickle.dump([min_input, max_input, min_target,
                     max_target], output_file)


def update_shuffling_flag(file_name):
    """ Update shuffling flag

    Changes shuffling flag in settings to False once data has been shuffled

    Args:
        file_name (str): The settings file name
    """

    config = configparser.ConfigParser()
    config.read(file_name)
    config.set('DATA', 'data_split_flag', 'False')
    with open(file_name, 'w') as configfile:
        config.write(configfile)


class DataMapper(data.Dataset):
    """Data Mapper Class

    This class represents a generic parent class for mapping between keys and data samples.
    It is a subclass of data.Dataset, inheriting its functionality.
    It is composed of an __init__ constructor, a __getitem__() method which fetches a data sample for a given key, and a __len__() method which returns the size of the dataset.
    This class also has several other helper functions.

    Args:
        filename (str): Path to file containing the relevant volume indicator numbers
        data_directory (str): Directory where the various subjects are stored.
        data_file (str): Internal path for each subject to the relevant normalized summed dMRI tracts
        output_targets (str): Internal path for each subject to the relevant rsfMRI data
        dMRI_mean_mask_path (str): Path to the summed tracts mean mask
        rsfMRI_mean_mask_path (str): Path to the dualreg mean mask
        scaling_factors (str): Path to the file containing the scaling factors
        regression_weights (str): Path to the file containing the regression_weights
        mean_reduction (bool): Flag indicating if the targets should be de-meaned using the mean_mask_path

    Returns:
        X_volume (torch.tensor): Tensor representation of the input data
        y_volume (torch.tensor): Tensor representation of the output data
        int: length of the output

    """

    def __init__(self, filename, data_directory, data_file, output_targets, dMRI_mean_mask_path, rsfMRI_mean_mask_path, scaling_factors, regression_weights, mean_reduction=False):
        # Initialize everything, and only store in memory the text data file.
        # Memory usage limited by only storing the text string information, not the actual volumes.
        # TODO: Currently, the timepoint in the fMRI data (y_volume) is hardcoded, only loading in the DMN. This needs to be updated in later iterations.
        self.filename = filename
        self.data_directory = data_directory
        self.data_file = data_file
        self.output_targets = output_targets
        self.sample_pairs = []
        self._get_datasets()
        self.dMRI_mean_mask_path = dMRI_mean_mask_path
        self.rsfMRI_mean_mask_path = rsfMRI_mean_mask_path
        self.mean_reduction = mean_reduction
        self.scaling_factors = scaling_factors
        self.regression_weights = regression_weights

    def __len__(self):
        return len(self.sample_pairs)

    def __getitem__(self, index):

        X_path, y_path, subject_id = self.sample_pairs[index]

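        # Two modes: with mean_reduction enabled, the linearly regressed group
        # mean is subtracted from each volume before scaling; otherwise the raw
        # (resampled) volumes are scaled directly.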
        if self.mean_reduction:
            X_volume = torch.from_numpy(self.scale_volume(self.linear_regress_mean(
                X_path, subject_id, target_flag=False), target_flag=False))
            y_volume = torch.from_numpy(self.scale_volume(self.linear_regress_mean(
                y_path, subject_id, target_flag=True), target_flag=True))
        else:
            X_volume = torch.from_numpy(self.scale_volume(
                self.resample(X_path), target_flag=False))
            y_volume = torch.from_numpy(self.scale_volume(
                self.convert_to_numpy(y_path)[:, :, :, 0], target_flag=True))

        return X_volume, y_volume

    def _get_datasets(self):
        """File path generator

        Helper function which reads all the various strings and generates the required paths.
        """

        with open(self.filename) as files:
            lines = files.read().split('\n')

        for line in lines:
            if line == '':
                continue

            X_path = os.path.join(
                os.path.expanduser("~"), self.data_directory, line, self.data_file)
            y_path = os.path.join(
                os.path.expanduser("~"), self.data_directory, line, self.output_targets)

            self.sample_pairs.append((X_path, y_path, line))

    def resample(self, path):
        """dMRI Resample

        Helper function downsampling the dMRI data from 1mm to 2mm.
        This is due to GPU memory / RAM limitations during training.
        The resampleToPixdims also carries out data smoothing.

        Args:
            path (str): Path to the relevant volume

        Returns:
            volume_resampled (np.array): Resampled volume
        """
        volume_resampled, _ = resampleToPixdims(
            self.read_data_files(path), (2, 2, 2))

        return volume_resampled

    def read_data_files(self, path):
        """Volume Reader

        Helper function reading the relevant volume.

        Args:
            path (str): Path to the relevant volume

        Returns:
            volume_image (class): fsl.data.image.Image class
        """

        volume_image = Image(path)
        return volume_image

    def convert_to_numpy(self, path):
        """Numpy wrapper

        Helper function wrapping the conversion of a volume to numpy

        Args:
            path (str): Path to the relevant volume.

        Returns:
            volume_numpy (np.array): Numpy array representation of volume data.
        """

        volume_numpy = self.read_data_files(path).data
        return volume_numpy

    def linear_regress_mean(self, path, subject, target_flag):
        """Linear regressed mean subtraction

        Helper function which subtracts the linearly regressed group-mean mask from the subject volume

        Args:
            path (str): Path to the relevant volume.
            subject (str): Subject ID of the subject volume to be regressed
            target_flag (bool): Flag signaling if the file is a target or an input

        Returns:
            regressed_volume (np.array): Numpy array representation of the subtracted volume data
        """

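        # The stored per-subject regression weight scales the group-mean volume
        # before subtraction, so each subject loses only its own estimated share
        # of the group mean.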
        if not target_flag:
            subject_volume = self.resample(path)
            group_mean = self.convert_to_numpy(self.dMRI_mean_mask_path)
            dataframe_key = 'w_dMRI'
        else:
            subject_volume = self.convert_to_numpy(path)[:, :, :, 0]
            group_mean = self.convert_to_numpy(
                self.rsfMRI_mean_mask_path)[:, :, :, 0]
            dataframe_key = 'w_rsfMRI'

        weight = pd.read_pickle(
            self.regression_weights).loc[subject][dataframe_key]

        regressed_volume = np.subtract(
            subject_volume, np.multiply(weight, group_mean))

        return regressed_volume

    def scale_volume(self, volume, target_flag=False):
        """Scaling function

        This function reads the scaling factors from the saved file and then scales the data.

        Args:
            volume (np.array): Unscaled volume
            target_flag (bool): Flag signaling if the file is a target or an input

        Returns:
            scaled_volume (np.array): Scaled volume
        """

        with open(self.scaling_factors, 'rb') as input_file:
            min_input, max_input, min_target, max_target = pickle.load(
                input_file)

        # "Steve scaling": hardcoded scaling factors which override the values
        # loaded from the scaling factors file above
        min_input, max_input, min_target, max_target = [0.0, 0.2, 0.0, 10.0]

        if not target_flag:
            min_value = min_input
            max_value = max_input
        else:
            min_value = min_target
            max_value = max_target

        # Set all negative elements to 0
        volume[volume < 0] = 0.0

        # Eliminating outliers
        volume[volume > max_value] = max_value
        volume[volume < min_value] = min_value

        # Normalization to [0, 1]
        scaled_volume = np.divide(np.subtract(
            volume, min_value), np.subtract(max_value, min_value))
        # Scaling between [-1, 1]
        # scaled_volume = np.add(-1.0, np.multiply(2.0, np.divide(np.subtract(volume, min_value), np.subtract(max_value, min_value))))

        # No scaling performed
        # scaled_volume = volume

        return scaled_volume


def get_datasets(data_parameters):
    """Data Loader Function.

    This function loads the various data files and returns the relevant mapped datasets.

    Args:
        data_parameters (dict): Dictionary containing relevant information for the data files.
        data_parameters = {
            'data_directory': 'path/to/directory',
            'train_data_file': 'training_data',
            'train_output_targets': 'training_targets',
            'train_list': 'train.txt',
            'validation_list': 'validation.txt',
            'validation_data_file': 'testing_data',
            'validation_target_file': 'testing_targets',
            'dmri_mean_mask_path': 'path/to/dmri_mean_mask',
            'rsfmri_mean_mask_path': 'path/to/rsfmri_mean_mask',
            'mean_reduction': True,
            'scaling_factors': 'scaling_factors.pkl',
            'regression_weights': 'regression_weights.pkl',
        }

    Returns:
        tuple: the relevant train and validation datasets
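
    Example (illustrative; the parameter values shown above are assumptions):

        train_dataset, validation_dataset = get_datasets(data_parameters)
        train_loader = data.DataLoader(train_dataset, batch_size=1, shuffle=True)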
    """

    train_filename = data_parameters['train_list']
    data_directory = data_parameters['data_directory']
    train_data_file = data_parameters['train_data_file']
    train_output_targets = data_parameters['train_output_targets']

    validation_filename = data_parameters['validation_list']
    validation_data_file = data_parameters['validation_data_file']
    validation_output_targets = data_parameters['validation_target_file']

    dMRI_mean_mask_path = data_parameters['dmri_mean_mask_path']
    rsfMRI_mean_mask_path = data_parameters['rsfmri_mean_mask_path']
    mean_reduction = data_parameters['mean_reduction']

    scaling_factors = data_parameters['scaling_factors']
    regression_weights = data_parameters['regression_weights']

    return (
        DataMapper(train_filename, data_directory, train_data_file, train_output_targets,
                   dMRI_mean_mask_path, rsfMRI_mean_mask_path, scaling_factors, regression_weights, mean_reduction),
        DataMapper(validation_filename, data_directory, validation_data_file, validation_output_targets,
                   dMRI_mean_mask_path, rsfMRI_mean_mask_path, scaling_factors, regression_weights, mean_reduction)
    )


def create_folder(path):
    """Folder Creator

    A function which creates a folder at a given path if one does not exist

    Args:
        path (str): destination to check for folder existence
    """

    if not os.path.exists(path):
        os.mkdir(path)


def load_file_paths(data_directory, data_list, mapping_data_file, targets_directory=None, target_file=None):
    """File Loader

    This function returns a list of combined file paths for the input data and labelled output data.

    Args:
        data_directory (str): Path to input data directory
        data_list (str): Path to a .txt file containing the input files for consideration
        mapping_data_file (str): Path to the input files
        targets_directory (str): Path to labelled data (Y-equivalent); None if during evaluation.
        target_file (str): Name of the target file; None if during evaluation.

    Returns:
        file_paths (list): List containing the input data and target labelled output data
        volumes_to_be_used (list): List of the volume names taken into consideration

    Raises:
        ValueError: "Invalid data entry - check code and data entry format!"
    """

    if data_list:
        with open(data_list) as data_list_file:
            volumes_to_be_used = data_list_file.read().splitlines()
    else:
        volumes_to_be_used = os.listdir(data_directory)

    if targets_directory is None or target_file is None:
        file_paths = [[os.path.join(data_directory, volume, mapping_data_file)]
                      for volume in volumes_to_be_used]
    else:
        file_paths = [[os.path.join(data_directory, volume, mapping_data_file), os.path.join(
            targets_directory, volume)] for volume in volumes_to_be_used]

    return file_paths, volumes_to_be_used 


def load_and_preprocess_evaluation(file_path):
    """Load & Preprocessing before evaluation

    This function loads a NIfTI file and returns its volume, header information and voxel-to-world transform

    Args:
        file_path (str): Path to the desired file

    Returns:
        volume (np.array): Array of training image data of data type dtype.
        header (class): 'nibabel.nifti1.Nifti1Header' class object, containing image metadata
        xform (np.array): Array of shape (4, 4), containing the adjusted voxel-to-world transformation for the spatial dimensions of the resampled data

    """

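    # Resample to 2 mm, then wrap the resampled data in a new Image so that the
    # returned header carries the adjusted voxel-to-world transform.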
    original_image = Image(file_path[0])
    volume, xform = resampleToPixdims(original_image, (2, 2, 2))
    header = Image(volume, header=original_image.header, xform=xform).header

    return volume, header, xform


def load_and_preprocess_targets(target_path, mean_mask_path):
    """Load & Preprocessing targets before evaluation

    This function loads a NIfTI file and returns its volume and a de-meaned copy of that volume

    Args:
        target_path (str): Path to the desired target file
        mean_mask_path (str): Path to the dualreg subject mean mask

    Returns:
        target (np.array): Array of training image data of data type dtype.
        target_demeaned (np.array): Array of training data from which the group mean has been subtracted

    """

    target = Image(target_path[0]).data[:, :, :, 0]
    target_demeaned = np.subtract(
        target, Image(mean_mask_path).data[:, :, :, 0])

    return target, target_demeaned


def regression_weight_dataframe_builder(subDirectoryList, data_folder_name, data_directory, train_inputs, train_targets, rsfMRI_mean_mask_path, dMRI_mean_mask_path):
    """Builds a regression weights database

    This function constructs a database containing the dMRI and rsfMRI mean-regression weights for all utilised subjects.
    The function saves this database as a pickle file.

    Args:
        subDirectoryList (list): List of all subjects contained in the train-validation-test dataset. 
        data_folder_name (str): The name of the folder where the string data is being output
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask
        dMRI_mean_mask_path (str): Path to the summed tract mean mask

    """

    regression_weights = {}

    for subject in subDirectoryList:
        w_dMRI, w_rsfMRI = regression_weight_calculator(
            data_directory, subject, train_inputs, train_targets, rsfMRI_mean_mask_path, dMRI_mean_mask_path)
        regression_weights[subject] = [w_dMRI, w_rsfMRI]

    regression_weights_df = pd.DataFrame.from_dict(
        regression_weights, orient='index', columns=['w_dMRI', 'w_rsfMRI'])

    regression_weights_df.to_pickle(os.path.join(
        data_folder_name, 'regression_weights.pkl'))


def regression_weight_calculator(data_directory, subject, train_inputs, train_targets, rsfMRI_mean_mask_path, dMRI_mean_mask_path):
    """ Calculator for linear regression weights

    This function calls the calculator for the weights required for performing linear regression

    Args:
        data_directory (str): A string containing the address of the required directory.
        subject (str): Path to the relevant subject's data file
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask
        dMRI_mean_mask_path (str): Path to the summed tract mean mask

    Returns:
        w_dMRI (float): Linear regression weight for dMRI data
        w_rsfMRI (float): Linear regression weight for rsfMRI data
    """

    w_dMRI = weight_calculator(data_directory, subject, train_inputs, train_targets,
                               rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_type='dmri')
    w_rsfMRI = weight_calculator(data_directory, subject, train_inputs, train_targets,
                                 rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_type='fmri')

    return w_dMRI, w_rsfMRI


def weight_calculator(data_directory, subject, train_inputs, train_targets, rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_type):
    """ Calculator for linear regression weights

    This function calculates the weight required for performing linear regression. The weight is the slope of an ordinary least-squares fit of the subject volume against the corresponding group-mean volume.

    Args:
        data_directory (str): A string containing the address of the required directory.
        subject (str): Path to the relevant subject's data file
        train_inputs (str): Path to the training input files
        train_targets (str): Path to the training target files
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask
        dMRI_mean_mask_path (str): Path to the summed tract mean mask
        data_type (str): Flag indicating the data type ('dmri' or 'fmri')

    Returns:
        w (float): Linear regression weight.
    """

    if data_type == 'dmri':
        mean_path = dMRI_mean_mask_path
        data_path = train_inputs
        mean_volume = Image(mean_path).data
        subject_path = os.path.join(os.path.expanduser(
            "~"), data_directory, subject, data_path)
        subject_volume, _ = resampleToPixdims(Image(subject_path), (2, 2, 2))
    elif data_type == 'fmri':
        mean_path = rsfMRI_mean_mask_path
        data_path = train_targets
        mean_volume = Image(mean_path).data[:, :, :, 0]
        subject_path = os.path.join(os.path.expanduser(
            "~"), data_directory, subject, data_path)
        subject_volume = Image(subject_path).data[:, :, :, 0]
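    else:
        # Defensive guard (added for clarity): any other data_type would leave
        # the volumes undefined and fail with a NameError further down.
        raise ValueError("data_type must be either 'dmri' or 'fmri'")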

    # Ordinary least-squares fit of the subject volume (y) on the group-mean
    # volume (x) via the normal equations: beta_hat = (X^T X)^-1 X^T y.
    x = np.reshape(mean_volume, -1)
    y = np.reshape(subject_volume, -1)
    x_matrix = np.vstack((np.ones(len(x)), x)).T
    beta_hat = np.linalg.inv(x_matrix.T.dot(x_matrix)).dot(x_matrix.T).dot(y)

    # The weight is the slope term of the fit.
    w = beta_hat[1]

    return w