Commit 0b755689 authored by Andrei-Claudiu Roibu's avatar Andrei-Claudiu Roibu 🖥
Browse files

started adding KFold cross validation - completed data stream

parent 72461c75
......@@ -357,17 +357,18 @@ if __name__ == '__main__':
if data_parameters['data_split_flag'] == True:
if data_parameters['use_data_file'] == True:
data_test_train_validation_split(data_parameters['data_folder_name'],
data_parameters['train_percentage'],
data_parameters['validation_percentage'],
data_parameters['subject_number'],
data_file= data_parameters['data_file'])
data_parameters['test_percentage'],
data_parameters['subject_number'],
data_file= data_parameters['data_file'],
K_fold= data_parameters['k_fold']
)
else:
data_test_train_validation_split(data_parameters['data_folder_name'],
data_parameters['train_percentage'],
data_parameters['validation_percentage'],
data_parameters['subject_number'],
data_directory= data_parameters['data_directory'])
data_parameters['test_percentage'],
data_parameters['subject_number'],
data_directory= data_parameters['data_directory'],
K_fold= data_parameters['k_fold']
)
update_shuffling_flag('settings.ini')
......
[DATA]
data_folder_name = "datasets"
use_data_file = False
data_directory = "../well/win-biobank/projects/imaging/data/data3/subjectsAll/"
data_file = "../well/win-biobank/projects/imaging/data/data3/subjectsAll/subj_22k.txt"
k_fold = None
data_split_flag = False
train_percentage = 90
validation_percentage = 5
test_percentage = 5
subject_number = None
train_list = "train.txt"
validation_list = "validation.txt"
......@@ -12,8 +14,6 @@ train_data_file = "dMRI/autoptx_preproc/tractsNormSummed.nii.gz"
train_output_targets = "fMRI/rfMRI_25.dr/dr_stage2.nii.gz"
validation_data_file = "dMRI/autoptx_preproc/tractsNormSummed.nii.gz"
validation_target_file = "fMRI/rfMRI_25.dr/dr_stage2.nii.gz"
data_file = "../well/win-biobank/projects/imaging/data/data3/subjectsAll/subj_22k.txt"
use_data_file = False
[TRAINING]
training_batch_size = 2
......@@ -31,7 +31,6 @@ learning_rate_scheduler_step_size = 3
learning_rate_scheduler_gamma = 1e-1
use_last_checkpoint = False
final_model_output_file = "finetuned_alldata.pth.tar"
cross_validation = False
[NETWORK]
kernel_heigth = 5
......
......@@ -16,6 +16,7 @@ setup(
'torch',
'h5py',
'fslpy',
'tensorboard'
'tensorboard',
'sklearn',
],
)
......@@ -19,10 +19,10 @@ import numpy as np
import nibabel as nib
import torch
import torch.utils.data as data
import random
import configparser
from fsl.data.image import Image
from fsl.utils.image.resample import resampleToPixdims
from sklearn.model_selection import KFold, train_test_split
def directory_reader(folder_location, subject_number=None, write_txt=False):
......@@ -81,7 +81,7 @@ def data_file_reader(data_file_path):
return subDirectoryList
def data_test_train_validation_split(data_folder_name, train_percentage, validation_percentage, subject_number, data_directory = None, data_file = None):
def data_test_train_validation_split(data_folder_name, test_percentage, subject_number, data_directory= None, data_file= None, K_fold= None):
"""Produces lists of train, test and validation data
This function looks at the list of all available directories and returns three lists of sub-directories.
......@@ -90,9 +90,9 @@ def data_test_train_validation_split(data_folder_name, train_percentage, validat
Args:
data_folder_name (str): The name of the folder where the string data is being output
data_directory (str): A string containing the address of the required directory.
train_percentage (int): Percentage of data to be used for training
validation_percentage (int): Percentage of data to be used for validation
test_percentage (int): Percentage of data to be used for testing
subject_number (int): Number of subjects to be considered for a job. Useful when wanting to train on datasizes smaller than total datapoints available in a datafolder.
K_fold (int): Number of folds for splitting the training data
data_file (str): Name of *.txt file containing a list of the required data
Raises:
......@@ -107,56 +107,32 @@ def data_test_train_validation_split(data_folder_name, train_percentage, validat
raise ValueError(
'Invalid data input! Either a data_file.txt containing all data, or a data_directory string needs to be passed')
random.shuffle(subDirectoryList)
subDirectoryList = np.array(subDirectoryList)
train, validation, test = np.split(subDirectoryList, [int(train_percentage/100 * len(
subDirectoryList)), int((train_percentage+validation_percentage)/100 * len(subDirectoryList))])
create_folder(data_folder_name)
np.savetxt(os.path.join(data_folder_name, 'train.txt'), train, fmt='%s')
train_data, test = train_test_split(subDirectoryList, test_size= test_percentage/100, random_state= 42, shuffle= True)
np.savetxt(os.path.join(data_folder_name, 'test.txt'), test, fmt='%s')
np.savetxt(os.path.join(data_folder_name, 'validation.txt'), validation, fmt='%s')
def data_test_train_validation_split_Kfold_cross_validation(data_folder_name, K_fold, subject_number, data_directory = None, data_file = None):
"""Produces lists of train, test and validation data
This function looks at the list of all available directories and returns three lists of sub-directories.
These lists are the lists required for training, testing and validation.
Args:
data_directory (str): A string containing the address of the required directory.
train_percentage (int): Percentage of data to be used for training
validation_percentage (int): Percentage of data to be used for validation
subject_number (int): Number of subjects to be considered for a job. Useful when wanting to train on datasizes smaller than total datapoints available in a datafolder.
Raises:
ValueError: 'Invalid data input! Either a data_file.txt containing all data, or a data_directory string needs to be passed'
"""
print("Test={}".format(test))
if data_file is None:
subDirectoryList = directory_reader(data_directory, subject_number)
elif data_directory is None:
subDirectoryList = data_file_reader(data_file)
if K_fold is None:
train, validation = train_test_split(train_data, test_size= int(len(test)), random_state= 42, shuffle= True)
np.savetxt(os.path.join(data_folder_name, 'train.txt'), train, fmt='%s')
np.savetxt(os.path.join(data_folder_name, 'validation.txt'), validation, fmt='%s')
print("Train={}, Validation={}".format(train, validation))
else:
raise ValueError(
'Invalid data input! Either a data_file.txt containing all data, or a data_directory string needs to be passed')
random.shuffle(subDirectoryList)
subDirectoryList = np.array(subDirectoryList)
train, validation, test = np.split(subDirectoryList, [int(train_percentage/100 * len(
subDirectoryList)), int((train_percentage+validation_percentage)/100 * len(subDirectoryList))])
create_folder(data_folder_name)
np.savetxt(os.path.join(data_folder_name, 'train.txt'), train, fmt='%s')
np.savetxt(os.path.join(data_folder_name, 'test.txt'), test, fmt='%s')
np.savetxt(os.path.join(data_folder_name, 'validation.txt'), validation, fmt='%s')
k_fold = KFold(n_splits= K_fold)
k = 0
for train_index, validation_index in k_fold.split(train_data):
train, validation = train_data[train_index], train_data[validation_index]
np.savetxt(os.path.join(data_folder_name, 'train'+str(k+1)+'.txt'), train, fmt='%s')
np.savetxt(os.path.join(data_folder_name, 'validation'+str(k+1)+'.txt'), validation, fmt='%s')
print("K={}, Train={}, Validation={}".format(k, train, validation))
k += 1
def update_shuffling_flag(file_name):
""" Update shuffling flag
......@@ -641,25 +617,57 @@ def get_datasetsHDF5(data_parameters):
if __name__ == '__main__':
data_file_path = 'train.txt'
# data_file_path = 'train.txt'
subDirectoryList = data_file_reader(data_file_path)
# subDirectoryList = data_file_reader(data_file_path)
print(subDirectoryList)
print(type(subDirectoryList))
# print(subDirectoryList)
# print(type(subDirectoryList))
folder_location = "../well/win-biobank/projects/imaging/data/data3/subjectsAll/"
# folder_location = "../well/win-biobank/projects/imaging/data/data3/subjectsAll/"
subDirectoryList2 = directory_reader(folder_location)
# subDirectoryList2 = directory_reader(folder_location)
print(subDirectoryList2)
print(type(subDirectoryList2))
# print(subDirectoryList2)
# print(type(subDirectoryList2))
data_folder_name = "datasets"
train_percentage = 90
validation_percentage = 5
test_percentage = 5
K_fold = None
subject_number = None
data_directory = "../well/win-biobank/projects/imaging/data/data3/subjectsAll/"
data_test_train_validation_split(data_folder_name, train_percentage, validation_percentage, subject_number, data_directory = data_directory)
data_test_train_validation_split(data_folder_name, test_percentage, subject_number, data_directory = data_directory, K_fold= K_fold)
# data_test_train_validation_split_Kfold_cross_validation(data_folder_name, K_fold, subject_number, data_directory = data_directory)
# data = np.arange(23)
# K = 10
# test_size = int(len(data)/K)
# for k in range(K):
# if k == 0:
# test_slice, remainder = np.split(data.copy(), [test_size], axis=0)
# print("k= {}, test_slice={}, remainder={}".format(k, test_slice, remainder))
# else:
# remainder[(k-1) * test_size: k *test_size], test_slice = test_slice, remainder[(k-1) * test_size: k * test_size].copy()
# print("k= {}, test_slice={}, remainder={}".format(k, test_slice, remainder))
# print('SKLEARN TIME!')
# from sklearn.model_selection import KFold, train_test_split
# kf = KFold(n_splits=K)
# k = 0
# train_data, test_data = train_test_split(data, test_size= 0.1)
# print('train_data= {}, test_data={}'.format(train_data, test_data))
# for train_index, test_index in kf.split(train_data):
# train, test = train_data[train_index], train_data[test_index]
# print("k= {}, val_slice={}, train={}".format(k, test, train))
# k+=1
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment