"""Data Pre-processing Functions Description: This file contains the required functions for pre-processing the data. Usage: To use content from this folder, import the functions and instantiate them as you wish to use them: from utils.preprocessor import function_name """ import os import pickle import numpy as np import configparser import pandas as pd from fsl.data.image import Image from fsl.utils.image.resample import resampleToPixdims from fsl.utils.image.roi import roi from sklearn.model_selection import train_test_split from common_utils import create_folder def directory_reader(folder_location, subject_number=None, write_txt=False): """Produces a list of of data-tags which are accessible This function looks in a large data directory, and returns a list of sub-directories which are accessible. This is done as currently, not all UK Biobank Data is accessible due to privacy issues. Args: folder_location (str): A string containing the address of the required directory. write_txt (bool): Flag indicating if a .txt file should be created. suject_number (int): Number of subjects to be considered for a job. Useful when wanting to train on datasizes smaller than total datapoints available in a datafolder. Returns: subDirectoryList (list): A list of strings containing the available sub-directories. This is also printed out as a .txt file """ if write_txt == True: out_file = open("files.txt", 'w') subDirectoryList = [] number_of_subjects = 0 if subject_number is None: subject_number = len(os.listdir(os.path.join( os.path.expanduser("~"), folder_location))) for directory in os.listdir(folder_location): if number_of_subjects < subject_number: if os.path.isdir(os.path.join(folder_location, directory)) and os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/")) and os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/")): filename = folder_location+directory if os.access(filename, os.R_OK): string = directory if write_txt == True: out_file.write(string) out_file.write("\n") subDirectoryList.append(directory) number_of_subjects += 1 else: break return subDirectoryList, number_of_subjects def data_file_reader(data_file_path, folder_location, subject_number=None): """Data File reader Args: data_file_path (str): Path to the file containing the data folder_location (str): A string containing the address of the required directory. subject_number (int): Number of subjects to be considered for a job. Useful when wanting to train on datasizes smaller than total datapoints available in a datafolder. 

    Returns:
        subDirectoryList (list): A list of strings containing the available sub-directories.
        file_counter (int): Number of subjects kept from the file.
    """
    with open(data_file_path) as volume_list:
        directories = volume_list.read().split('\n')

    file_counter = 0
    subDirectoryList = []

    for directory in directories:
        if subject_number is not None and file_counter >= subject_number:
            break
        if directory == '':
            continue
        if os.path.isdir(os.path.join(folder_location, directory)) \
                and os.path.exists(os.path.join(folder_location, directory, "dMRI/autoptx_preproc/")) \
                and os.path.exists(os.path.join(folder_location, directory, "fMRI/rfMRI_25.dr/")):
            subDirectoryList.append(directory)
            file_counter += 1

    return subDirectoryList, file_counter


def data_preparation(data_folder_name, test_percentage, subject_number, data_directory,
                     train_inputs, train_targets, rsfMRI_mean_mask_path,
                     dMRI_mean_mask_path, data_file=None):
    """Data preparation function.

    This function conducts data preparation operations, including regression
    weight calculation, data splitting and scaling factor calculation. It
    looks at the list of all available directories and produces the lists of
    sub-directories required for training, testing and validation.

    Args:
        data_folder_name (str): The name of the folder where the string data is being output.
        test_percentage (int): Percentage of data to be used for testing.
        subject_number (int): Number of subjects to be considered for a job. Useful when
            wanting to train on fewer subjects than are available in a data folder.
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files.
        train_targets (str): Path to the training target files.
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask.
        dMRI_mean_mask_path (str): Path to the summed tract mean mask.
        data_file (str): Name of *.txt file containing a list of the required data.

    Returns:
        train (list): List of the train subjects.
        validation (list): List containing the subjects used for validation.
    """
    if data_file is not None:
        subDirectoryList, _ = data_file_reader(
            data_file, data_directory, subject_number)
    else:
        subDirectoryList, _ = directory_reader(data_directory, subject_number)

    # Produce the regression weights - Separate Function
    regression_weight_dataframe_builder(subDirectoryList, data_folder_name, data_directory,
                                        train_inputs, train_targets,
                                        rsfMRI_mean_mask_path, dMRI_mean_mask_path)

    # Splitting the data into train-validation-test
    subDirectoryList = np.array(subDirectoryList)
    # create_folder(data_folder_name)

    train_data, test = train_test_split(
        subDirectoryList, test_size=test_percentage/100, random_state=42, shuffle=True)

    if os.path.exists(os.path.join(data_folder_name, 'test.txt')):
        os.remove(os.path.join(data_folder_name, 'test.txt'))

    np.savetxt(os.path.join(data_folder_name, 'test.txt'), test, fmt='%s')

    # The validation set has the same size as the test set.
    train, validation = train_test_split(
        train_data, test_size=int(len(test)), random_state=42, shuffle=True)

    return train, validation


def update_shuffling_flag(file_name):
    """Update shuffling flag.

    Changes the shuffling flag in the settings file to False once the data
    has been shuffled.

    Args:
        file_name (str): The settings file name.
    """
    config = configparser.ConfigParser()
    config.read(file_name)
    config.set('DATA', 'data_split_flag', 'False')

    with open(file_name, 'w') as configfile:
        config.write(configfile)


def regression_weight_dataframe_builder(subDirectoryList, data_folder_name, data_directory,
                                        train_inputs, train_targets,
                                        rsfMRI_mean_mask_path, dMRI_mean_mask_path):
    """Builds a regression weights database.

    This function constructs a database containing the dMRI and rsfMRI
    mean-regression weights for all utilised subjects and saves it as a
    pickle file.

    Args:
        subDirectoryList (list): List of all subjects contained in the train-validation-test dataset.
        data_folder_name (str): The name of the folder where the string data is being output.
        data_directory (str): A string containing the address of the required directory.
        train_inputs (str): Path to the training input files.
        train_targets (str): Path to the training target files.
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask.
        dMRI_mean_mask_path (str): Path to the summed tract mean mask.
    """
    regression_weights = {}

    for subject in subDirectoryList:
        w_dMRI, w_rsfMRI = regression_weight_calculator(
            data_directory, subject, train_inputs, train_targets,
            rsfMRI_mean_mask_path, dMRI_mean_mask_path)
        regression_weights[subject] = [w_dMRI, w_rsfMRI]

    regression_weights_df = pd.DataFrame.from_dict(
        regression_weights, orient='index', columns=['w_dMRI', 'w_rsfMRI'])
    regression_weights_df.to_pickle(os.path.join(
        data_folder_name, 'regression_weights.pkl'))


def regression_weight_calculator(data_directory, subject, train_inputs, train_targets,
                                 rsfMRI_mean_mask_path, dMRI_mean_mask_path):
    """Calculator for linear regression weights.

    This function calls the weight calculator for the weights required for
    performing linear regression.

    Args:
        data_directory (str): A string containing the address of the required directory.
        subject (str): Path to the relevant subject's data file.
        train_inputs (str): Path to the training input files.
        train_targets (str): Path to the training target files.
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask.
        dMRI_mean_mask_path (str): Path to the summed tract mean mask.

    Returns:
        w_dMRI (float): Linear regression weight for dMRI data.
        w_rsfMRI (float): Linear regression weight for rsfMRI data.
    """
    w_dMRI = weight_calculator(data_directory, subject, train_inputs, train_targets,
                               rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_type='dmri')
    w_rsfMRI = weight_calculator(data_directory, subject, train_inputs, train_targets,
                                 rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_type='fmri')

    return w_dMRI, w_rsfMRI


def weight_calculator(data_directory, subject, train_inputs, train_targets,
                      rsfMRI_mean_mask_path, dMRI_mean_mask_path, data_type):
    """Calculator for linear regression weights.

    This function calculates the weights required for performing linear regression.

    Args:
        data_directory (str): A string containing the address of the required directory.
        subject (str): Path to the relevant subject's data file.
        train_inputs (str): Path to the training input files.
        train_targets (str): Path to the training target files.
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask.
        dMRI_mean_mask_path (str): Path to the summed tract mean mask.
        data_type (str): Flag indicating the data type ('dmri' or 'fmri').

    Returns:
        weight (float): Linear regression weight.
    """
    if data_type == 'dmri':
        mean_path = dMRI_mean_mask_path
        data_path = train_inputs
        mean_volume = Image(mean_path).data
        subject_path = os.path.join(os.path.expanduser(
            "~"), data_directory, subject, data_path)
        subject_volume, _ = resampleToPixdims(Image(subject_path), (2, 2, 2))
    elif data_type == 'fmri':
        mean_path = rsfMRI_mean_mask_path
        data_path = train_targets
        mean_volume = Image(mean_path).data[:, :, :, 0]
        subject_path = os.path.join(os.path.expanduser(
            "~"), data_directory, subject, data_path)
        subject_volume = Image(subject_path).data[:, :, :, 0]

    # Ordinary least squares: the weight is the slope of the subject volume
    # regressed on the group-mean volume.
    x = np.reshape(mean_volume, -1)
    y = np.reshape(subject_volume, -1)
    x_matrix = np.vstack((np.ones(len(x)), x)).T
    beta_hat = np.linalg.inv(x_matrix.T.dot(x_matrix)).dot(x_matrix.T).dot(y)
    w = beta_hat[1]

    return w


def load_datasets(subjects, data_directory, input_file, output_target,
                  mean_regression_flag, mean_regression_all_flag, regression_weights_path,
                  dMRI_mean_mask_path, rsfMRI_mean_mask_path, mean_subtraction_flag,
                  scale_volumes_flag, normalize_flag, minus_one_scaling_flag,
                  negative_flag, outlier_flag, shrinkage_flag, hard_shrinkage_flag,
                  crop_flag):
    """Dataset loader and pre-processor.

    This function acts as a wrapper for loading and pre-processing the datasets.

    Args:
        subjects (list): List of all the subjects to be processed and loaded.
        data_directory (str): Directory where the various subjects are stored.
        input_file (str): Internal path for each subject to the relevant normalized summed dMRI tracts.
        output_target (str): Internal path for each subject to the relevant rsfMRI data.
        mean_regression_flag (bool): Flag indicating if the volumes should be de-meaned by
            regression using the mean_mask_path.
        mean_regression_all_flag (bool): Flag indicating if both the input and target volumes
            should be regressed. If False, only targets are regressed.
        regression_weights_path (str): Path to the file containing the regression weights.
        dMRI_mean_mask_path (str): Path to the summed tracts mean mask.
        rsfMRI_mean_mask_path (str): Path to the dualreg mean mask.
        mean_subtraction_flag (bool): Flag indicating if the targets should be de-meaned by
            subtraction using the mean_mask_path.
        scale_volumes_flag (bool): Flag indicating if the volumes should be scaled.
        normalize_flag (bool): Flag signaling if the volume should be normalized to [0, 1] if True.
        minus_one_scaling_flag (bool): Flag signaling if the volume should be scaled to [-1, 1] if True.
        negative_flag (bool): Flag indicating if all the negative values should be 0-ed.
        outlier_flag (bool): Flag indicating if outliers should be set to the min/max values.
        shrinkage_flag (bool): Flag indicating if shrinkage should be applied.
        hard_shrinkage_flag (bool): Flag indicating if hard shrinkage should be applied.
            If False, soft shrinkage is applied.
        crop_flag (bool): Flag indicating if the volumes should be cropped from 91x109x91 to
            72x90x77 to reduce storage space and speed-up training.

    Returns:
        input_volumes (list): List of all the input volumes.
        target_volumes (list): List of all the target volumes.
    """
    print("Loading and pre-processing data...")

    input_volumes, target_volumes = [], []
    len_subjects = len(subjects)

    for index, subject in enumerate(subjects):
        input_volume, target_volume = load_and_preprocess(subject, data_directory, input_file, output_target,
                                                          mean_regression_flag, mean_regression_all_flag,
                                                          regression_weights_path, dMRI_mean_mask_path,
                                                          rsfMRI_mean_mask_path, mean_subtraction_flag,
                                                          scale_volumes_flag, normalize_flag,
                                                          minus_one_scaling_flag, negative_flag, outlier_flag,
                                                          shrinkage_flag, hard_shrinkage_flag, crop_flag)
        input_volumes.append(input_volume)
        target_volumes.append(target_volume)

        print("\r Processed {:.3f}%: {}/{} inputs, {}/{} targets".format((index + 1) / len_subjects * 100.0,
                                                                         len(input_volumes), len_subjects,
                                                                         len(target_volumes), len_subjects), end='')

    return input_volumes, target_volumes


def load_and_preprocess(subject, data_directory, input_file, output_target,
                        mean_regression_flag, mean_regression_all_flag, regression_weights_path,
                        dMRI_mean_mask_path, rsfMRI_mean_mask_path, mean_subtraction_flag,
                        scale_volumes_flag, normalize_flag, minus_one_scaling_flag,
                        negative_flag, outlier_flag, shrinkage_flag, hard_shrinkage_flag,
                        crop_flag):
    """Subject loader and pre-processor.

    This function acts as a wrapper for loading and pre-processing individual subjects.

    Args:
        subject (str): The identifier number for each subject.
        data_directory (str): Directory where the various subjects are stored.
        input_file (str): Internal path for each subject to the relevant normalized summed dMRI tracts.
        output_target (str): Internal path for each subject to the relevant rsfMRI data.
        mean_regression_flag (bool): Flag indicating if the volumes should be de-meaned by
            regression using the mean_mask_path.
        mean_regression_all_flag (bool): Flag indicating if both the input and target volumes
            should be regressed. If False, only targets are regressed.
        regression_weights_path (str): Path to the file containing the regression weights.
        dMRI_mean_mask_path (str): Path to the summed tracts mean mask.
        rsfMRI_mean_mask_path (str): Path to the dualreg mean mask.
        mean_subtraction_flag (bool): Flag indicating if the targets should be de-meaned by
            subtraction using the mean_mask_path.
        scale_volumes_flag (bool): Flag indicating if the volumes should be scaled.
        normalize_flag (bool): Flag signaling if the volume should be normalized to [0, 1] if True.
        minus_one_scaling_flag (bool): Flag signaling if the volume should be scaled to [-1, 1] if True.
        negative_flag (bool): Flag indicating if all the negative values should be 0-ed.
        outlier_flag (bool): Flag indicating if outliers should be set to the min/max values.
        shrinkage_flag (bool): Flag indicating if shrinkage should be applied.
        hard_shrinkage_flag (bool): Flag indicating if hard shrinkage should be applied.
            If False, soft shrinkage is applied.
        crop_flag (bool): Flag indicating if the volumes should be cropped from 91x109x91 to
            72x90x77 to reduce storage space and speed-up training.

    Returns:
        input_volume (np.array): Numpy array representing the preprocessed input volume.
        target_volume (np.array): Numpy array representing the preprocessed target volume.
    """
    input_volume, target_volume = load_data(subject, data_directory, input_file, output_target, crop_flag)

    input_volume, target_volume = preprocess(input_volume, target_volume, subject,
                                             mean_regression_flag, mean_regression_all_flag,
                                             regression_weights_path, dMRI_mean_mask_path,
                                             rsfMRI_mean_mask_path, mean_subtraction_flag,
                                             scale_volumes_flag, normalize_flag, minus_one_scaling_flag,
                                             negative_flag, outlier_flag, shrinkage_flag,
                                             hard_shrinkage_flag, crop_flag)

    return input_volume, target_volume


def load_data(subject, data_directory, input_file, output_target, crop_flag=False):
    """Load subject data.

    This function generates the relevant paths for the input and target files
    for each subject, and then loads them as numpy arrays.

    Args:
        subject (str): Subject ID of the subject volume to be loaded.
        data_directory (str): Directory where the various subjects are stored.
        input_file (str): Internal path for each subject to the relevant normalized summed dMRI tracts.
        output_target (str): Internal path for each subject to the relevant rsfMRI data.
        crop_flag (bool): Flag indicating if the volumes should be cropped from 91x109x91 to
            72x90x77 to reduce storage space and speed-up training.

    Returns:
        input_volume (np.array): Numpy array representing the loaded input volume.
        target_volume (np.array): Numpy array representing the loaded target volume.
    """
    input_path = os.path.join(os.path.expanduser("~"), data_directory, subject, input_file)
    target_path = os.path.join(os.path.expanduser("~"), data_directory, subject, output_target)

    if crop_flag == False:
        # Resample the dMRI input to 2 mm isotropic voxels; keep the first rsfMRI volume.
        input_volume, _ = resampleToPixdims(Image(input_path), (2, 2, 2))
        target_volume = Image(target_path).data[:, :, :, 0]
    elif crop_flag == True:
        # Resample, then crop both volumes to 72x90x77 using the same ROI.
        input_image = Image(input_path)
        resampled_volume, xform = resampleToPixdims(input_image, (2, 2, 2))
        input_volume = roi(Image(resampled_volume, header=input_image.header, xform=xform),
                           ((9, 81), (10, 100), (0, 77))).data
        target_volume = roi(Image(target_path), ((9, 81), (10, 100), (0, 77))).data[:, :, :, 0]

    return input_volume, target_volume


def preprocess(input_volume, target_volume, subject,
               mean_regression_flag, mean_regression_all_flag, regression_weights_path,
               dMRI_mean_mask_path, rsfMRI_mean_mask_path, mean_subtraction_flag,
               scale_volumes_flag, normalize_flag, minus_one_scaling_flag,
               negative_flag, outlier_flag, shrinkage_flag, hard_shrinkage_flag,
               crop_flag):
    """Conducts pre-processing based on arguments.

    Function which wraps the various pre-processing subfunctions for every volume.

    Args:
        input_volume (np.array): Numpy array representing the un-preprocessed input volume.
        target_volume (np.array): Numpy array representing the un-preprocessed target volume.
        subject (str): Subject ID of the subject volume to be regressed.
        mean_regression_flag (bool): Flag indicating if the volumes should be de-meaned by
            regression using the mean_mask_path.
        mean_regression_all_flag (bool): Flag indicating if both the input and target volumes
            should be regressed. If False, only targets are regressed.
        regression_weights_path (str): Path to the file containing the regression weights.
        dMRI_mean_mask_path (str): Path to the summed tracts mean mask.
        rsfMRI_mean_mask_path (str): Path to the dualreg mean mask.
        mean_subtraction_flag (bool): Flag indicating if the targets should be de-meaned by
            subtraction using the mean_mask_path.
        scale_volumes_flag (bool): Flag indicating if the volumes should be scaled.
        normalize_flag (bool): Flag signaling if the volume should be normalized to [0, 1] if True.
        minus_one_scaling_flag (bool): Flag signaling if the volume should be scaled to [-1, 1] if True.
        negative_flag (bool): Flag indicating if all the negative values should be 0-ed.
        outlier_flag (bool): Flag indicating if outliers should be set to the min/max values.
        shrinkage_flag (bool): Flag indicating if shrinkage should be applied.
        hard_shrinkage_flag (bool): Flag indicating if hard shrinkage should be applied.
            If False, soft shrinkage is applied.
        crop_flag (bool): Flag indicating if the volumes should be cropped from 91x109x91 to
            72x90x77 to reduce storage space and speed-up training.

    Returns:
        input_volume (np.array): Numpy array representing the preprocessed input volume.
        target_volume (np.array): Numpy array representing the preprocessed target volume.
    """
    # NB: scaling_parameters is only defined when one of the de-meaning flags is set,
    # so scale_volumes_flag is expected to be used together with one of them.
    if mean_regression_flag == True:
        if mean_regression_all_flag == True:
            # Regress both inputs and targets
            input_volume = linear_regress_mean(input_volume, subject, regression_weights_path, crop_flag,
                                               target_flag=False, dMRI_mean_mask_path=dMRI_mean_mask_path)
            target_volume = linear_regress_mean(target_volume, subject, regression_weights_path, crop_flag,
                                                target_flag=True, rsfMRI_mean_mask_path=rsfMRI_mean_mask_path)
            # Set scaling parameters to Andrei Scaling
            scaling_parameters = [-0.0626, 0.1146, -14.18, 16.9475]
        elif mean_regression_all_flag == False:
            # Regress only targets, leave inputs as they are
            target_volume = linear_regress_mean(target_volume, subject, regression_weights_path, crop_flag,
                                                target_flag=True, rsfMRI_mean_mask_path=rsfMRI_mean_mask_path)
            # Set scaling parameters to Mixed Scaling
            scaling_parameters = [0.0, 0.2, -14.18, 16.9475]
    elif mean_subtraction_flag == True:
        # Subtract the mean from targets, leave inputs as they are
        target_volume = subtract_mean(target_volume, crop_flag, rsfMRI_mean_mask_path)
        # Set Scaling parameters to Steve Scaling
        scaling_parameters = [0.0, 0.2, 0.0, 10.0]

    if scale_volumes_flag == True:
        input_volume = volume_scaling(input_volume, scaling_parameters, normalize_flag,
                                      minus_one_scaling_flag, negative_flag, outlier_flag,
                                      shrinkage_flag, hard_shrinkage_flag, target_flag=False)
        target_volume = volume_scaling(target_volume, scaling_parameters, normalize_flag,
                                       minus_one_scaling_flag, negative_flag, outlier_flag,
                                       shrinkage_flag, hard_shrinkage_flag, target_flag=True)

    return input_volume, target_volume


def linear_regress_mean(volume, subject, regression_weights_path, crop_flag, target_flag,
                        dMRI_mean_mask_path=None, rsfMRI_mean_mask_path=None):
    """Linear regressed mean subtraction.

    Helper function which subtracts the regressed group-mean mask from a subject volume.

    Args:
        volume (np.array): Numpy array representation of the original volume data.
        subject (str): Subject ID of the subject volume to be regressed.
        regression_weights_path (str): Path to the file containing the regression weights.
        crop_flag (bool): Flag indicating if the volumes should be cropped from 91x109x91 to
            72x90x77 to reduce storage space and speed-up training.
        target_flag (bool): Flag signaling if the file is a target or an input.
        dMRI_mean_mask_path (str): Path to the summed tracts mean mask.
        rsfMRI_mean_mask_path (str): Path to the dualreg mean mask.

    Returns:
        regressed_volume (np.array): Numpy array representation of the regressed volume data.
    """
    if target_flag == False:
        if crop_flag == False:
            group_mean = Image(dMRI_mean_mask_path).data
        elif crop_flag == True:
            group_mean = roi(Image(dMRI_mean_mask_path), ((9, 81), (10, 100), (0, 77))).data
        dataframe_key = 'w_dMRI'
    elif target_flag == True:
        if crop_flag == False:
            group_mean = Image(rsfMRI_mean_mask_path).data[:, :, :, 0]
        elif crop_flag == True:
            group_mean = roi(Image(rsfMRI_mean_mask_path), ((9, 81), (10, 100), (0, 77))).data[:, :, :, 0]
        dataframe_key = 'w_rsfMRI'

    # Subtract the subject-specific multiple of the group mean.
    weight = pd.read_pickle(regression_weights_path).loc[subject][dataframe_key]
    volume = np.subtract(volume, np.multiply(weight, group_mean))

    return volume


def subtract_mean(volume, crop_flag, rsfMRI_mean_mask_path):
    """Mean mask subtraction.

    Helper function which subtracts the dualreg mean subject mask.

    Args:
        volume (np.array): Numpy array representation of the original volume data.
        crop_flag (bool): Flag indicating if the volumes should be cropped from 91x109x91 to
            72x90x77 to reduce storage space and speed-up training.
        rsfMRI_mean_mask_path (str): Path to the dualreg subject mean mask.

    Returns:
        subtracted_volume (np.array): Numpy array representation of the subtracted volume data.
    """
    if crop_flag == False:
        dualreg_subject_mean = Image(rsfMRI_mean_mask_path).data[:, :, :, 0]
    elif crop_flag == True:
        dualreg_subject_mean = roi(Image(rsfMRI_mean_mask_path), ((9, 81), (10, 100), (0, 77))).data[:, :, :, 0]

    volume = np.subtract(volume, dualreg_subject_mean)

    return volume


def volume_scaling(volume, scaling_parameters, normalize_flag, minus_one_scaling_flag,
                   negative_flag, outlier_flag, shrinkage_flag, hard_shrinkage_flag,
                   target_flag):
    """Volume scaling function.

    This function applies various scaling operations to the volumes, based on
    their nature and the employed scaling strategy.

    Args:
        volume (np.array): Numpy array representing the un-scaled volume.
        scaling_parameters (list): List of scaling parameters
            [min_input, max_input, min_target, max_target].
        normalize_flag (bool): Flag signaling if the volume should be normalized to [0, 1] if True.
        minus_one_scaling_flag (bool): Flag signaling if the volume should be scaled to [-1, 1] if True.
        negative_flag (bool): Flag indicating if all the negative values should be 0-ed.
        outlier_flag (bool): Flag indicating if outliers should be set to the min/max values.
        shrinkage_flag (bool): Flag indicating if shrinkage should be applied.
        hard_shrinkage_flag (bool): Flag indicating if hard shrinkage should be applied.
            If False, soft shrinkage is applied.
        target_flag (bool): Flag signaling if the file is a target or an input.

    Returns:
        volume (np.array): Numpy array representing the scaled volume.
""" min_input, max_input, min_target, max_target = scaling_parameters if target_flag == False: min_value = min_input max_value = max_input elif target_flag == True: min_value = min_target max_value = max_target if shrinkage_flag == True: if target_flag == True: lambd = 3.0 # Hard coded, equivalent to tht 1p and 99p values across the whole population in UKBB elif target_flag == False: lambd = 0.003 # Hard coded, equivalent to tht 1p and 99p values across the whole population in UKBB if hard_shrinkage_flag == True: volume = hard_shrinkage(volume, lambd) elif hard_shrinkage_flag == False: volume = soft_shrinkage(volume, lambd) min_value += lambd max_value -= lambd if negative_flag == True: volume[volume < 0.0] = 0.0 min_value = 0.0 if outlier_flag == True: volume[volume > max_value] = max_value volume[volume < min_value] = min_value if normalize_flag == True: # Normalization to [0, 1] volume = np.divide(np.subtract(volume, min_value), np.subtract(max_value, min_value)) elif minus_one_scaling_flag == True: # Scaling between [-1, 1] volume = np.add(-1.0, np.multiply(2.0, np.divide(np.subtract(volume, min_value), np.subtract(max_value, min_value)))) # Else, no scaling occus, but the other flags can still hold true if the scaling flag is true! return volume def hard_shrinkage(volume, lambd): """ Hard Shrinkage This function performs a hard shrinkage on the volumes. volume = { x , x > lambd | x < -lambd 0 , x e [-lambd, lambd] } Args: volume (np.array): Unshrunken volume lambd (float): Threshold parameter Returns: volume (np.array) : Hard shrunk volume """ volume[np.where(np.logical_and(volume >= -lambd, volume <= lambd))] = 0.0 return volume def soft_shrinkage(volume, lambd): """ Soft Shrinkage This function performs a soft shrinkage on the volumes. volume = { x + lambd , x < -lambd 0 , x e [-lambd, lambd] x - lambd , x > lambd } Args: volume (np.array): Unshrunken volume lambd (float): Threshold parameter Returns: volume (np.array) : Soft shrunk volume """ volume[np.where(np.logical_and(volume >= -lambd, volume <= lambd))] = 0.0 volume[volume < -lambd] = volume[volume < -lambd] + lambd volume[volume > lambd] = volume[volume > lambd] - lambd return volume