diff --git a/getting_started/file_management.ipynb b/getting_started/file_management.ipynb index c3f212ac750af8958df4bf1aed4644872b905652..b63ece9e98275faaec5aab4a6a366f231edf294f 100644 --- a/getting_started/file_management.ipynb +++ b/getting_started/file_management.ipynb @@ -38,7 +38,7 @@ "\n", "\n", "If you are impatient, feel free to dive straight in to the exercises, and use the\n", - "other sections as a reference. You might miss out on some neat tricks though ...\n", + "other sections as a reference. You might miss out on some neat tricks though.\n", "\n", "\n", "* [Managing files and directories](#managing-files-and-directories)\n", @@ -58,10 +58,11 @@ "* [Exercises](#exercises)\n", " * [Re-name subject directories](#re-name-subject-directories)\n", " * [Re-organise a data set](#re-organise-a-data-set)\n", + " * [Re-name subject files](#re-name-subject-files)\n", " * [Compress all uncompressed images](#compress-all-uncompressed-images)\n", " * [Write your own `os.path.splitext`](#write-your-own-os-path-splitext)\n", " * [Write a function to return a specific image file](#write-a-function-to-return-a-specific-image-file)\n", - "\n", + " * [Solutions](#solutions)\n", "\n", "\n", "<a class=\"anchor\" id=\"managing-files-and-directories\"></a>\n", @@ -521,6 +522,7 @@ "> having it return a tuple. For example, the implementation of `op.split` might\n", "> look something like this:\n", ">\n", + ">\n", "> ```\n", "> def mysplit(path):\n", "> dirname = op.dirname(path)\n", @@ -910,33 +912,66 @@ "### Re-name subject directories\n", "\n", "\n", - "Rename the subject directories in raw_mri_data so that the subject IDs are\n", - "padded with zeros, and thus will be able to be sorted alphabetically.\n", + "Write a function which can rename the subject directories in `raw_mri_data` so\n", + "that the subject IDs are padded with zeros, and thus will be able to be sorted\n", + "alphabetically. 
This function:\n", + "\n", + "\n", + " - Should accept the path to the parent directory of the data set\n", + " (`raw_mri_data` in this case).\n", + " - Should be able to handle any number of subjects\n", + " > Hint: `numpy.log10`\n", + "\n", + " - May assume that the subject directory names follow the pattern\n", + " `subj_[id]`, where `[id]` is the integer subject ID.\n", "\n", "\n", "<a class=\"anchor\" id=\"re-organise-a-data-set\"></a>\n", "### Re-organise a data set\n", "\n", "\n", - "Separate the data for each group (patients: 1, 4, 7, 8, 9, and controls: 2, 3,\n", - "5, 6, 10) into sub-directories.\n", + "Write a function which can be used to separate the data for each group\n", + "(patients: 1, 4, 7, 8, 9, and controls: 2, 3, 5, 6, 10) into sub-directories\n", + "`CON` and `PAT`.\n", + "\n", + "This function should work with any number of groups, and should accept three\n", + "parameters:\n", + "\n", + " - The root directory of the data set (e.g. `raw_mri_data`).\n", + " - A list of strings, the labels for each group.\n", + " - A list of lists, with each list containing the subject IDs for one group.\n", + "\n", "\n", + "<a class=\"anchor\" id=\"re-name-subject-files\"></a>\n", + "### Re-name subject files\n", "\n", - "<a class=\"anchor\" id=\"rename-files\"></a>\n", - "### Rename files\n", "\n", + "Write a function which, given a subject directory, renames all of the image\n", + "files for this subject so that they are prefixed with `[group]_subj_[id]`,\n", + "where `[group]` is either `CON` or `PAT`, and `[id]` is the (zero-padded)\n", + "subject ID.\n", "\n", - "Rename all of the scans so that theuy are prefixed with '[group]_subj_[id]',\n", - "where [group] is either CON or PAT, and [id] is the (zero-padded) subject ID.\n", + "\n", + "This function should accept the following parameters:\n", + " - The subject directory\n", + " - The subject group\n", + "\n", + "\n", + "**Bonus 1** Make your function work with both `.nii` and `.nii.gz` files.\n", + 
"\n", + "**Bonus 2** If you completed [the previous exercise](#re-organise-a-data-set),\n", + "write a second function which accepts the data set directory as a sole\n", + "parameter, and then calls the first function for every subject.\n", "\n", "\n", "<a class=\"anchor\" id=\"compress-all-uncompressed-images\"></a>\n", "### Compress all uncompressed images\n", "\n", "\n", - "Learn how to compress a file using the built-in\n", - "[`gzip`](https://docs.python.org/3.5/library/gzip.html) library, and compress\n", - "all of those uncompressed image files.\n", + "Write a function which recursively scans a directory, and replaces all `.nii`\n", + "files with `.nii.gz` files, using the built-in\n", + "[`gzip`](https://docs.python.org/3.5/library/gzip.html) library to perform\n", + "the compression.\n", "\n", "\n", "<a class=\"anchor\" id=\"write-your-own-os-path-splitext\"></a>\n", @@ -954,9 +989,63 @@ "### Write a function to return a specific image file\n", "\n", "\n", - "Write a function which is given a group, numeric subject ID, and scan type\n", - "(t1, t2, task, rest), and returns the fully resolved path to the relevant\n", - "image file." 
+ "Assuming that you have completed the previous exercises, and re-organised\n", + "`raw_mri_data` so that it has the structure:\n", + "\n", + " `raw_mri_data/[group]/subj_[id]/[group]_subj_[id]_[modality].nii.gz`\n", + "\n", + "write a function which is given:\n", + "\n", + " - the data set directory\n", + " - a group label\n", + " - integer subject ID\n", + " - modality (`'t1'`, `'t2'`, `'task'`, `'rest'`)\n", + "\n", + "and which returns the fully resolved path to the relevant image file.\n", + "\n", + " > Hint: Python has [regular\n", + " expressions](https://docs.python.org/3.5/library/re.html) - you might want\n", + " to use one to cope with zero-padding.\n", + "\n", + "**Bonus** Modify the function so the group label does not need to be passed in.\n", + "\n", + "\n", + "<a class=\"anchor\" id=\"solutions\"></a>\n", + "### Solutions\n", + "\n", + "\n", + "Use the `print_solution` function, defined below, to print the solution for a\n", + "specific exercise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pygments import highlight\n", + "from pygments.lexers import PythonLexer\n", + "from pygments.formatters import HtmlFormatter\n", + "import IPython\n", + "\n", + "# Pass the title of the exercise you\n", + "# are interested in to this function\n", + "def print_solution(extitle):\n", + " solfile = ''.join([c.lower() if c.isalnum() else '_' for c in extitle])\n", + " solfile = op.join('.solutions', '{}.py'.format(solfile))\n", + "\n", + " if not op.exists(solfile):\n", + " print('Can\\'t find solution to exercise \"{}\"'.format(extitle))\n", + " return\n", + "\n", + " with open(solfile, 'rt') as f:\n", + " code = f.read()\n", + "\n", + " formatter = HtmlFormatter()\n", + " return IPython.display.HTML('<style type=\"text/css\">{}</style>{}'.format(\n", + " formatter.get_style_defs('.highlight'),\n", + " highlight(code, PythonLexer(), formatter)))" ] } ], diff --git
a/getting_started/file_management.md b/getting_started/file_management.md index c6745e6d37240e9fad8a76424266978737ed4b92..b7b2ecd18f37c3be5380377a8cdd7f6f3bb1ce02 100644 --- a/getting_started/file_management.md +++ b/getting_started/file_management.md @@ -32,7 +32,7 @@ managing file and directory _paths_. If you are impatient, feel free to dive straight in to the exercises, and use the -other sections as a reference. You might miss out on some neat tricks though ... +other sections as a reference. You might miss out on some neat tricks though. * [Managing files and directories](#managing-files-and-directories) @@ -52,10 +52,11 @@ other sections as a reference. You might miss out on some neat tricks though ... * [Exercises](#exercises) * [Re-name subject directories](#re-name-subject-directories) * [Re-organise a data set](#re-organise-a-data-set) + * [Re-name subject files](#re-name-subject-files) * [Compress all uncompressed images](#compress-all-uncompressed-images) * [Write your own `os.path.splitext`](#write-your-own-os-path-splitext) * [Write a function to return a specific image file](#write-a-function-to-return-a-specific-image-file) - + * [Solutions](#solutions) <a class="anchor" id="managing-files-and-directories"></a> @@ -394,6 +395,7 @@ print('Directory and base names: {}'.format(op.split( path))) > having it return a tuple. For example, the implementation of `op.split` might > look something like this: > +> > ``` > def mysplit(path): > dirname = op.dirname(path) @@ -677,33 +679,66 @@ print(op.join(op.sep, 'home', 'fsluser', '.bash_profile')) ### Re-name subject directories -Rename the subject directories in raw_mri_data so that the subject IDs are -padded with zeros, and thus will be able to be sorted alphabetically. +Write a function which can rename the subject directories in `raw_mri_data` so +that the subject IDs are padded with zeros, and thus will be able to be sorted +alphabetically. 
This function: + + + - Should accept the path to the parent directory of the data set + (`raw_mri_data` in this case). + - Should be able to handle any number of subjects + > Hint: `numpy.log10` + + - May assume that the subject directory names follow the pattern + `subj_[id]`, where `[id]` is the integer subject ID. <a class="anchor" id="re-organise-a-data-set"></a> ### Re-organise a data set -Separate the data for each group (patients: 1, 4, 7, 8, 9, and controls: 2, 3, -5, 6, 10) into sub-directories. +Write a function which can be used to separate the data for each group +(patients: 1, 4, 7, 8, 9, and controls: 2, 3, 5, 6, 10) into sub-directories +`CON` and `PAT`. + +This function should work with any number of groups, and should accept three +parameters: + + - The root directory of the data set (e.g. `raw_mri_data`). + - A list of strings, the labels for each group. + - A list of lists, with each list containing the subject IDs for one group. + +<a class="anchor" id="re-name-subject-files"></a> +### Re-name subject files -<a class="anchor" id="rename-files"></a> -### Rename files +Write a function which, given a subject directory, renames all of the image +files for this subject so that they are prefixed with `[group]_subj_[id]`, +where `[group]` is either `CON` or `PAT`, and `[id]` is the (zero-padded) +subject ID. -Rename all of the scans so that theuy are prefixed with '[group]_subj_[id]', -where [group] is either CON or PAT, and [id] is the (zero-padded) subject ID. + +This function should accept the following parameters: + - The subject directory + - The subject group + + +**Bonus 1** Make your function work with both `.nii` and `.nii.gz` files. + +**Bonus 2** If you completed [the previous exercise](#re-organise-a-data-set), +write a second function which accepts the data set directory as a sole +parameter, and then calls the first function for every subject. 
<a class="anchor" id="compress-all-uncompressed-images"></a> ### Compress all uncompressed images -Learn how to compress a file using the built-in -[`gzip`](https://docs.python.org/3.5/library/gzip.html) library, and compress -all of those uncompressed image files. +Write a function which recursively scans a directory, and replaces all `.nii` +files with `.nii.gz` files, using the built-in +[`gzip`](https://docs.python.org/3.5/library/gzip.html) library to perform +the compression. <a class="anchor" id="write-your-own-os-path-splitext"></a> @@ -721,6 +756,56 @@ uncompressed NIFTI images. ### Write a function to return a specific image file -Write a function which is given a group, numeric subject ID, and scan type -(t1, t2, task, rest), and returns the fully resolved path to the relevant -image file. \ No newline at end of file +Assuming that you have completed the previous exercises, and re-organised +`raw_mri_data` so that it has the structure: + + `raw_mri_data/[group]/subj_[id]/[group]_subj_[id]_[modality].nii.gz` + +write a function which is given: + + - the data set directory + - a group label + - integer subject ID + - modality (`'t1'`, `'t2'`, `'task'`, `'rest'`) + +and which returns the fully resolved path to the relevant image file. + + > Hint: Python has [regular + expressions](https://docs.python.org/3.5/library/re.html) - you might want + to use one to cope with zero-padding. + +**Bonus** Modify the function so the group label does not need to be passed in. + + +<a class="anchor" id="solutions"></a> +### Solutions + + +Use the `print_solution` function, defined below, to print the solution for a +specific exercise.
+ + +``` +from pygments import highlight +from pygments.lexers import PythonLexer +from pygments.formatters import HtmlFormatter +import IPython + +# Pass the title of the exercise you +# are interested to this function +def print_solution(extitle): + solfile = ''.join([c.lower() if c.isalnum() else '_' for c in extitle]) + solfile = op.join('.solutions', '{}.py'.format(solfile)) + + if not op.exists(solfile): + print('Can\'t find solution to exercise "{}"'.format(extitle)) + return + + with open(solfile, 'rt') as f: + code = f.read() + + formatter = HtmlFormatter() + return IPython.display.HTML('<style type="text/css">{}</style>{}'.format( + formatter.get_style_defs('.highlight'), + highlight(code, PythonLexer(), formatter))) +``` diff --git a/getting_started/file_management/.solutions/compress_all_uncompressed_images.py b/getting_started/file_management/.solutions/compress_all_uncompressed_images.py new file mode 100644 index 0000000000000000000000000000000000000000..2c180ed7714ce260ccbc91dd753e8785af65dd36 --- /dev/null +++ b/getting_started/file_management/.solutions/compress_all_uncompressed_images.py @@ -0,0 +1,24 @@ +import os.path as op +import os +import gzip + + +def compress_all(dirname): + """Recursively scans through `dirname`, and compresses all `.nii` files + with gzip, replacing them with `.nii.gz` files. + + :arg dirname: Directory to scan for `.nii` files. 
+ """ + + for root, dirs, files in os.walk(dirname): + + uncmpfiles = [f for f in files if f.endswith('.nii')] + + infiles = [op.join(root, uf) for uf in uncmpfiles] + outfiles = ['{}.gz'.format(inf) for inf in infiles] + + for infile, outfile in zip(infiles, outfiles): + with open( infile, 'rb') as inf, \ + gzip.open(outfile, 'wb') as outf: + outf.write(inf.read()) + os.remove(infile) diff --git a/getting_started/file_management/.solutions/re_name_subject_directories.py b/getting_started/file_management/.solutions/re_name_subject_directories.py new file mode 100644 index 0000000000000000000000000000000000000000..38a1e59bacacaaa7bf24155939d9c623cbfee06a --- /dev/null +++ b/getting_started/file_management/.solutions/re_name_subject_directories.py @@ -0,0 +1,39 @@ +import os.path as op +import glob +import shutil +import numpy as np + + +def rename_subject_dirs(dirname): + """Renames all directories in `dirname` which have the form `subj_[id]`, + where `[id]` is an integer specifying the subject IDs. + + Each subject directory is renamed such that the subject IDs are padded + with zeros, thus allowing the directories to be sorted alphabetically. + + :arg dirname: Data set directory. 
+ """ + + # get a list of all + # subject directories + subjdirs = list(glob.glob(op.join(dirname, 'subj_*'))) + + # get a list of subject IDs + subjids = [int(sd.split('_')[1]) for sd in subjdirs] + + # figure out the maximmum + # number of digits we need + ndigits = int(np.ceil(np.log10(max(subjids) + 1))) + + # create a format string + # which will pad an ID with + # the required number of zeros + fmtstr = 'subj_{{:0{}d}}'.format(ndigits) + + # generate new subject + # directory names + newsubjdirs = [op.join(dirname, fmtstr.format(sid)) for sid in subjids] + + # rename each subject dir + for subjdir, newsubjdir in zip(subjdirs, newsubjdirs): + shutil.move(subjdir, newsubjdir) diff --git a/getting_started/file_management/.solutions/re_name_subject_files.py b/getting_started/file_management/.solutions/re_name_subject_files.py new file mode 100644 index 0000000000000000000000000000000000000000..eb3ed871b72417c20c25837bb3e439f37d8c4052 --- /dev/null +++ b/getting_started/file_management/.solutions/re_name_subject_files.py @@ -0,0 +1,72 @@ +import os.path as op +import os +import glob + + +def rename_subject_files(subjdir, group): + """Renames all of the NIFTI files contained in `subjdir`, adding + the prefix `[group]_subj_[id]`, where `[group]` is equal to the + specified `group`, and `[id`] is gleaned from `subjdir` (assumed + to be called `subj_[id]`). + + :arg subjdir: Directory containing NIFTI files for one subject + :arg group: Name of the group this subject belongs to. + """ + + # Normalise the subject directory name. + # We pass subjdir through abspath so that + # this function will accept relative + # paths, and through normpath to ensure + # that there is no trailing slash. + subjdir = op.normpath(op.abspath(subjdir)) + + # Now we can extract the subject id. + # Note that we don't convert the subject + # ID to a string here - this means that + # any zero-padding will be preserved. 
+ subjid = op.basename(subjdir) + subjid = subjid.split('_')[1] + + # Get a list of all nifti images + # in the subject directory. + imgfiles = list(glob.glob(op.join(subjdir, '*.nii')) + + glob.glob(op.join(subjdir, '*.nii.gz'))) + + # Generate new file names + # for all of these images + newimgfiles = ['{}_subj_{}_{}'.format(group, subjid, op.basename(imgf)) + for imgf in imgfiles] + newimgfiles = [op.join(subjdir, imgf) for imgf in newimgfiles] + + # Rename all the images + for imgfile, newimgfile in zip(imgfiles, newimgfiles): + os.rename(imgfile, newimgfile) + + +def rename_all_subject_files(dirname): + """Calls `rename_subject_files` on every subject directory in the specified + `dirname`. + + :arg dirname: Data set directory.. Assumed to contain a sub-directory for + each group, which in turn contain sub-directories for each + subject. + """ + + # get a list of all + # group directories + groupdirs = glob.glob(op.join(dirname, '*')) + groupdirs = [gdir for gdir in groupdirs if op.isdir(gdir)] + + for groupdir in groupdirs: + + # get the group name + group = op.basename(op.normpath(groupdir)) + + # get the list of subject + # directories in this group + subjdirs = glob.glob(op.join(groupdir, 'subj_*')) + + # apply rename_subject_files + # to each subject dir + for subjdir in subjdirs: + rename_subject_files(subjdir, group) diff --git a/getting_started/file_management/.solutions/re_organise_a_data_set.py b/getting_started/file_management/.solutions/re_organise_a_data_set.py new file mode 100644 index 0000000000000000000000000000000000000000..b39435c9c1b1985792289041ce771f8b0a614d6d --- /dev/null +++ b/getting_started/file_management/.solutions/re_organise_a_data_set.py @@ -0,0 +1,36 @@ +import os +import os.path as op +import glob +import shutil + + +def reorganise_data_set(dirname, groupLabels, groups): + """Re-organises the subject directories in the given `dirname` by group, + according to the labels in `groupLabels`, and the group definitions in + 
`groups`. + + :arg dirname: Data set directory. + :arg groupLabels: Sequence of labels, one for each group + :arg groups: Sequence of group definitions - each group is defined + by a sequence of subject IDs. + """ + + # Get lists of subject directories + # and corresponding subject IDs + subjdirs = list(glob.glob(op.join(dirname, 'subj_*'))) + subjids = [int(sd.split('_')[1]) for sd in subjdirs] + + # For each group + for glabel, group in zip(groupLabels, groups): + + # Make the group directory + groupdir = op.join(dirname, glabel) + os.mkdir(groupdir) + + # For each subject in this group + for sid in group: + + # Lookup the subject directory, + # and move it into the group dir + subjdir = subjdirs[subjids.index(sid)] + shutil.move(subjdir, groupdir) diff --git a/getting_started/file_management/.solutions/write_a_function_to_return_a_specific_image_file.py b/getting_started/file_management/.solutions/write_a_function_to_return_a_specific_image_file.py new file mode 100644 index 0000000000000000000000000000000000000000..dad45f5d5fc6d20d9e447b86a33d90a31bb9d4c1 --- /dev/null +++ b/getting_started/file_management/.solutions/write_a_function_to_return_a_specific_image_file.py @@ -0,0 +1,106 @@ +import os.path as op +import os +import re +import glob + + + +def get_image(dirname, group, subjID, modality): + """Finds the NIFTI image for the given `modality, in the specified + `dirname`, which is for the specified `subjID`. + + :arg dirname: Data set directory + :arg group: Group label + :arg subjID: Subject ID + :arg modality: Image modality + :returns: The path to the specified image, or `None` if it cannot be + found. 
+ """ + + # Get the group directory, and + # list of all subject directories + groupdir = op.join(dirname, group) + subjdirs = list(glob.glob(op.join(groupdir, 'subj_*'))) + + # Define a regex which we can + # use to identify the appropriate + # subject directory + subjpat = re.compile('subj_(0*{})'.format(subjID)) + padsubjID = None + + # Look for the relevant subject + # directory. When we find it, we + # store the zero-padded version + # of the subject ID, so we can + # use it to construct the final + # file name. + for subjdir in subjdirs: + match = subjpat.fullmatch(op.basename(subjdir)) + if match is not None: + padsubjID = match.groups(0)[0] + break + + # Could not identify + # subject directory + else: + return None + + # Construct and return + # the relevant file name + fname = '{}_subj_{}_{}.nii.gz'.format(group, padsubjID, modality) + fname = op.join(subjdir, fname) + + if op.exists(fname): + return fname + else: + return None + + +def get_image_nogroup(dirname, subjID, modality): + """Finds the NIFTI image for the given `modality, in the specified + `dirname`, which is for the specified `subjID`, who is in the specified + `group`. + + :arg dirname: Data set directory + :arg subjID: Subject ID + :arg modality: Image modality + :returns: The path to the specified image, or `None` if it cannot be + found. + """ + + # Define a regex which we can + # use to identify the appropriate + # subject directory + subjpat = re.compile('subj_(0*{})'.format(subjID)) + + # Look for the relevant subject + # directory. When we find it, we + # store the zero-padded version + # of the subject ID, so we can + # use it to construct the final + # file name. 
+ subjdir = None + padsubjID = None + for root, dirs, files in os.walk(dirname): + for d in dirs: + match = subjpat.fullmatch(d) + if match is not None: + subjdir = op.join(root, d) + padsubjID = match.groups(0)[0] + + if subjdir is None: + return None + + # Get the name of the group + # this subject is in + group = op.basename(op.dirname(subjdir)) + + # Construct and return + # the relevant file name + fname = '{}_subj_{}_{}.nii.gz'.format(group, padsubjID, modality) + fname = op.join(subjdir, fname) + + if op.exists(fname): + return fname + else: + return None diff --git a/getting_started/file_management/.solutions/write_your_own_os_path_splitext.py b/getting_started/file_management/.solutions/write_your_own_os_path_splitext.py new file mode 100644 index 0000000000000000000000000000000000000000..b3716701065aed5d9d3a33095d737fea0c0b6017 --- /dev/null +++ b/getting_started/file_management/.solutions/write_your_own_os_path_splitext.py @@ -0,0 +1,29 @@ +def nifti_splitext(path, exts=None): + """Splits the given path, assumed to be a NIFTI file, into its + prefix and suffix components. + + :arg path: Path to split + :arg exts: List of recognised file extensions. Defaults to + `['.nii', '.nii.gz']`, but can be overridden. + + :returns: A tuple containing: + - The part of `path` before the extension + - The extension + """ + if exts is None: + exts = ['.nii', '.nii.gz'] + + # Try and find a suffix match + extMatches = [path.endswith(ext) for ext in exts] + + # No match - there is + # no supported extension + if not any(extMatches): + return path, '' + + # Otherwise split the path + # into its base and its extension + extIdx = extMatches.index(True) + extLen = len(exts[extIdx]) + + return path[:-extLen], path[-extLen:]