Commit 214a3932 authored by Andrei Roibu's avatar Andrei Roibu
Browse files

separeted data preprocessing from main run call, added storing as hdf5 database

parent 29d9668e
""" Database Generator
Description:
This file contains the required functions for generating the databases required for training the network.
This file is designed to be a standalone package, intended to be run separately from the main network.
Usage:
To use content from this folder, import the functions and instantiate them as you wish to use them:
from utils.hdf5_generator import function_name
"""
import os
import h5py
import numpy as np
import utils.data_utils as data_utils
import utils.preprocessor as preprocessor
from utils.settings import Settings
from utils.common_utils import create_folder
def convert_hdf5(data_parameters, file_information):
    """Preprocess the train/validation data and store it as HDF5 databases.

    The subject lists are obtained either by shuffling/splitting the raw data
    (``data_split_flag``) or by reading pre-computed train/validation lists
    from disk (``train_test_file_read_flag``). Each subject set is then loaded,
    preprocessed and written to the HDF5 files named in ``file_information``.

    Args:
        data_parameters (dict): The ``[DATA]`` section of the settings file
            (flags, paths and preprocessing options).
        file_information (dict): Maps 'train'/'validation' to dicts with
            'input' and 'target' HDF5 output paths.

    Raises:
        ValueError: If neither the split flag nor the read-from-file flag
            is set to True.
    """
    # First, determine the train / validation subject lists.
    if data_parameters['data_split_flag']:
        print('Data is shuffling... This could take a few minutes!')

        # The two call sites only differ by the optional data_file argument,
        # so build the shared keyword arguments once.
        preparation_kwargs = {
            'data_directory': data_parameters['data_directory'],
            'train_inputs': data_parameters['train_data_file'],
            'train_targets': data_parameters['train_output_targets'],
            'rsfMRI_mean_mask_path': data_parameters['rsfmri_mean_mask_path'],
            'dMRI_mean_mask_path': data_parameters['dmri_mean_mask_path'],
        }
        if data_parameters['use_data_file']:
            preparation_kwargs['data_file'] = data_parameters['data_file']

        train_subjects, validation_subjects = preprocessor.data_preparation(
            data_parameters['data_folder_name'],
            data_parameters['test_percentage'],
            data_parameters['subject_number'],
            **preparation_kwargs,
        )

        # Record that the shuffle happened so it is not repeated on re-runs.
        preprocessor.update_shuffling_flag('utils/hdf5_settings.ini')
        print('Data shuffling... Complete!')
    elif data_parameters['train_test_file_read_flag']:
        # Read the subjects from the files!
        train_subjects = data_utils.load_subjects_from_path(
            data_directory=data_parameters['data_directory'],
            data_list=data_parameters['train_list'],
        )
        validation_subjects = data_utils.load_subjects_from_path(
            data_directory=data_parameters['data_directory'],
            data_list=data_parameters['validation_list'],
        )
    else:
        raise ValueError('Either a split flag, or a read-from-file flag must be provided as True')

    # Then, read the train and validation data, process them and write to H5.
    # Both load_datasets calls share everything except the subject list.
    load_kwargs = {
        'data_directory': data_parameters['data_directory'],
        'input_file': data_parameters['train_data_file'],
        'output_target': data_parameters['train_output_targets'],
        'mean_regression_flag': data_parameters['mean_regression_flag'],
        'mean_regression_all_flag': data_parameters['mean_regression_all_flag'],
        'regression_weights_path': data_parameters['regression_weights_path'],
        'dMRI_mean_mask_path': data_parameters['dmri_mean_mask_path'],
        'rsfMRI_mean_mask_path': data_parameters['rsfmri_mean_mask_path'],
        'mean_subtraction_flag': data_parameters['mean_subtraction_flag'],
        'scale_volumes_flag': data_parameters['scale_volumes_flag'],
        'normalize_flag': data_parameters['normalize_flag'],
        'negative_flag': data_parameters['negative_flag'],
        'outlier_flag': data_parameters['outlier_flag'],
        'shrinkage_flag': data_parameters['shrinkage_flag'],
        'hard_shrinkage_flag': data_parameters['hard_shrinkage_flag'],
    }

    print('-> Processing training data:')
    train_dMRI, train_rsfMRI = preprocessor.load_datasets(subjects=train_subjects, **load_kwargs)
    write_hdf5(train_dMRI, train_rsfMRI, file_information, mode='train')

    print('-> Processing validation data:')
    validation_dMRI, validation_rsfMRI = preprocessor.load_datasets(subjects=validation_subjects, **load_kwargs)
    write_hdf5(validation_dMRI, validation_rsfMRI, file_information, mode='validation')
def write_hdf5(input_volumes, target_volumes, file_information, mode):
    """ HDF5 Writer
    Function which writes the hdf5 files.
    Args:
        input_volumes (list): List of all the input volumes.
        target_volumes (list): List of all the target volumes.
        file_information (dict): Dictionary containing the outputs paths for the various databases
        mode (str): String indicating the type of data observed
    """
    output_paths = file_information[mode]
    # Write one file per dataset; the dataset name matches the path key.
    for dataset_name, volumes in (('input', input_volumes), ('target', target_volumes)):
        with h5py.File(output_paths[dataset_name], 'w') as data_handle:
            data_handle.create_dataset(dataset_name, data=volumes)
if __name__ == "__main__":
print('Started Data Generation!')
settings = Settings('utils/hdf5_settings.ini')
data_parameters = settings['DATA']
create_folder(data_parameters['data_folder_name'])
file_information = {
'train': {"input" : os.path.join(data_parameters['data_directory'], data_parameters['input_data_train']),
"target" : os.path.join(data_parameters['data_directory'], data_parameters['target_data_train']),
},
'validation': {"input" : os.path.join(data_parameters['data_directory'], data_parameters['input_data_validation']),
"target" : os.path.join(data_parameters['data_directory'], data_parameters['target_data_validation']),
}
}
convert_hdf5(data_parameters, file_information)
print('Completed Data Generation!')
[DATA]
data_folder_name = "datasets"
use_data_file = False
data_directory = "/well/win-biobank/projects/imaging/data/data3/subjectsAll/"
data_file = "/well/win-biobank/projects/imaging/data/data3/subjectsAll/subj_22k.txt"
data_split_flag = False
train_test_file_read_flag = True
test_percentage = 5
subject_number = 12000
train_list = "datasets/train.txt"
validation_list = "datasets/validation.txt"
regression_weights_path = "datasets/regression_weights.pkl"
train_data_file = "dMRI/autoptx_preproc/tractsNormSummed.nii.gz"
train_output_targets = "fMRI/rfMRI_25.dr/dr_stage2.nii.gz"
rsfmri_mean_mask_path = "utils/mean_dr_stage2.nii.gz"
dmri_mean_mask_path = "utils/mean_tractsNormSummed_downsampled.nii.gz"
mean_regression_flag = False
mean_regression_all_flag = False
mean_subtraction_flag = True
scale_volumes_flag = True
normalize_flag = True
negative_flag = True
outlier_flag = True
shrinkage_flag = False
hard_shrinkage_flag = False
input_data_train = "input_data_train.h5"
target_data_train = "target_data_train.h5"
input_data_validation = "input_data_validation.h5"
target_data_validation = "target_data_validation.h5"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment