Commit 71b01b8c authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'rf/subtable_merge_perf' into 'master'

Rf/subtable merge perf

See merge request !59
parents 6e544503 02eda7de
......@@ -2,6 +2,18 @@ FUNPACK changelog
2.2.1 (Monday 4th May 2020)
* Reverted some changes to :meth:`.DataTable.merge` which caused performance
2.2.0 (Friday 1st May 2020)
......@@ -6,7 +6,7 @@
__version__ = '2.2.0'
__version__ = '2.2.1'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
......@@ -12,6 +12,8 @@ class which holds a reference to the loaded data.
import itertools as it
import multiprocessing as mp
import multiprocessing.dummy as mpd
import random
import string
import logging
import contextlib
import collections
......@@ -31,6 +33,11 @@ variable IDs really should not conflict with actual UKB variable IDs.
MODIFIED_COLUMN = ''.join(random.choices(string.ascii_letters, k=20))
"""Flag used internally by the :class:`DataTable` when merging subtables.
class Column:
"""The ``Column`` is a simple container class containing metadata
about a single column in a data file.
......@@ -177,8 +184,9 @@ class DataTable(util.Singleton):
The :meth:`subtable` method can be used to generate a replica
``DataTable`` with a specific subset of columns. It is intended for
parallelisation, so that only the required data is copied over to tasks
running in other processes. The :meth:`subtable` and :meth:`merge` methods
parallelisation, so that child processes are given a view of only the
columns that are relevant to them, and as little copying between processes
as possible takes place. The :meth:`subtable` and :meth:`merge` methods
are intended to be used like so:
1. Create subtables which only contain data for specific columns::
......@@ -199,6 +207,10 @@ class DataTable(util.Singleton):
for subtable in subtables:
Modifications must occur through the :meth:`DataTable.__setitem__`
interface, so it can keep track of which columns have been modified.
Addition or removal of columns or rows on subtables is not supported.
......@@ -209,7 +221,8 @@ class DataTable(util.Singleton):
"""Create a ``DataTable``.
:arg data: ``pandas.DataFrame`` containing the data.
......@@ -221,6 +234,9 @@ class DataTable(util.Singleton):
:arg njobs: Number of jobs to use for parallelising tasks.
:arg mgr: :class:`multiprocessing.Manager` object for
:arg subtable: For internal use. Used to differentiate between the
main ``DataTable``, and child ``DataTable`` objects
created via :meth:`subtable`.
self.__data = data
......@@ -229,6 +245,7 @@ class DataTable(util.Singleton):
self.__cattable = cattable
self.__njobs = njobs
self.__mgr = mgr
self.__subtable = subtable
self.__flags = collections.defaultdict(set)
# The varmap is a dictionary of
......@@ -252,6 +269,7 @@ class DataTable(util.Singleton):
......@@ -264,7 +282,8 @@ class DataTable(util.Singleton):
self.__cattable = state[3]
self.__varmap = state[4]
self.__colmap = state[5]
self.__flags = state[6]
self.__subtable = state[6]
self.__flags = state[7]
self.__njobs = 1
self.__mgr = None
......@@ -286,6 +305,14 @@ class DataTable(util.Singleton):
def isSubtable(self):
"""Returns ``True`` if this ``DataTable`` was created by a
:meth:`subtable` call.
return self.__subtable
def manager(self):
"""Returns a ``multiprocessing.Manager`` for sharing data between
......@@ -550,11 +577,32 @@ class DataTable(util.Singleton):
def __setitem__(self, slc, value):
"""Set the specified slice in the data. This method has
the same interface as the ``pandas.DataFrame.loc`` accessor.
"""Set the specified slice in the data.
This method supports a limited form of the ``pandas.DataFrame.loc``
interface. Slices/labels for both rows and columns must be specified,
and columns may only be specified via a slice, list, tuple, or
individual label.
if not isinstance(slc, tuple) or len(slc) != 2:
raise RuntimeError('DataTable.__setitem__ requires both '
'rows and columns to be indexed.')
self.__data.loc[slc] = value
# Flag the column as modified. This is
# detected by the merge method when
# it merges subtables back in.
if self.isSubtable:
cols = slc[1]
if isinstance(cols, slice):
cols = self.__data.columns[cols]
elif not isinstance(cols, (list, tuple)):
cols = [cols]
for col in cols:
self.addFlag(self.__colmap[col], MODIFIED_COLUMN)
def __len__(self):
"""Returns the number of rows in the data set. """
......@@ -568,7 +616,7 @@ class DataTable(util.Singleton):
This method can be used to create a replica ``DataTable`` where the
underlying ``pandas.DataFrame`` only contains the specified
columns. It is intended to be used when parallelising tasks, so that
only necessary columns are copied back and forth between processes.
child processes are given a view of only the relevant columns.
:arg columns: List of :class:`Column` objects.
:arg rows: Sequence of row indices.
......@@ -591,61 +639,39 @@ class DataTable(util.Singleton):
def merge(self, subtable):
"""Merge the data from the given ``subtable`` into this ``DataTable``.
It is assumed that ``subtable`` contains a sub-set of the columns
in this ``DataTable``.
:arg subtable: ``DataTable`` returned by :meth:`subtable`.
if self.shape[0] == subtable.shape[0]:
subrows = None
subrows = slice(None)
subrows = subtable.index
if self.shape[1] == subtable.shape[1]:
subcols = None
subcols = [ for c in subtable.dataColumns]
# subtable == main table?
if (subrows is None) and (subcols is None):
self.__data = subtable[:, :]
# if subject indices match, we
# can use pd.concat to merge
# the two dataframes fast.
elif subrows is None:
# take a copy of the column names
# so we can restore their ordering
origcols = self.__data.columns
self.__data = self.__data.drop(columns=subcols)
self.__data = pd.concat((self.__data, subtable[:, :]),
# restore column ordering
self.__data = self.__data[origcols]
# otherwise we perform slower
# df.loc based assignment
# (slower, because data copying
# has to take place)
if subcols is None:
subcols = slice(None)
self[subrows, subcols] = subtable[subrows, subcols]
# only copy modified columns - we assume
# that all changes to the subtable
# occurred via DataTable.__setitem__
subcols = [ for c in subtable.dataColumns
if MODIFIED_COLUMN in subtable.getFlags(c)]
if len(subcols) > 0:
self.__data.loc[subrows, subcols] = subtable[subrows, subcols]
for subcol in subtable.dataColumns:
mycol = self.__colmap[]
myflags = self.__flags[mycol]
subflags = subtable.getFlags(subcol)
subflags = subflags.difference((MODIFIED_COLUMN,))
self.__flags[mycol] = myflags.union(subflags)
if subcol.metadata is not None:
File mode changed from 100644 to 100755
%% Cell type:markdown id: tags:
![win logo](win.png)
# `funpack`
> Paul McCarthy
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data.
You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules are built into `funpack` which are specific to the UK
BioBank data set. But you can control and customise everything that `funpack`
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.
`funpack` comes installed with recent versions of
[FSL](https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/). You can also install `funpack`
via `conda`:
> ```
> conda install -c conda-forge fmrib-unpack
> ```
Or using `pip`:
> ```
> pip install fmrib-unpack
> ```
Get command-line help by typing:
> ```
> funpack -h
> ```
**Important** The examples in this notebook assume that you have installed `funpack`
2.2.0 or newer.
2.2.1 or newer.
%% Cell type:code id: tags:
``` bash
funpack -V
%% Cell type:markdown id: tags:
### Contents
1. [Overview](#Overview)
1. [Import](#1.-Import)
2. [Cleaning](#2.-Cleaning)
3. [Processing](#3.-Processing)
4. [Export](#4.-Export)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
1. [Selecting variables (columns)](#Selecting-variables-(columns))
1. [Selecting individual variables](#Selecting-individual-variables)
2. [Selecting variable ranges](#Selecting-variable-ranges)
3. [Selecting variables with a file](#Selecting-variables-with-a-file)
4. [Selecting variables from pre-defined categories](#Selecting-variables-from-pre-defined-categories)
2. [Selecting subjects (rows)](#Selecting-subjects-(rows))
1. [Selecting individual subjects](#Selecting-individual-subjects)
2. [Selecting subject ranges](#Selecting-subject-ranges)
3. [Selecting subjects from a file](#Selecting-subjects-from-a-file)
4. [Selecting subjects by variable value](#Selecting-subjects-by-variable-value)
5. [Excluding subjects](#Excluding-subjects)
3. [Selecting visits](#Selecting-visits)
1. [Evaluating expressions across visits](#Evaluating-expressions-across-visits)
4. [Merging multiple input files](#Merging-multiple-input-files)
1. [Merging by subject](#Merging-by-subject)
2. [Merging by column](#Merging-by-column)
3. [Naive merging](#Naive-merging)
4. [Cleaning examples](#Cleaning-examples)
1. [NA insertion](#NA-insertion)
2. [Variable-specific cleaning functions](#Variable-specific-cleaning-functions)
3. [Categorical recoding](#Categorical-recoding)
4. [Child value replacement](#Child-value-replacement)
5. [Processing examples](#Processing-examples)
1. [Sparsity check](#Sparsity-check)
2. [Redundancy check](#Redundancy-check)
3. [Categorical binarisation](#Categorical-binarisation)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading---funpack-plugins)
1. [Custom cleaning functions](#Custom-cleaning-functions)
2. [Custom processing functions](#Custom-processing-functions)
3. [Custom file loaders](#Custom-file-loaders)
7. [Miscellaneous topics](#Miscellaneous-topics)
1. [Non-numeric data](#Non-numeric-data)
2. [Dry run](#Dry-run)
3. [Built-in rules](#Built-in-rules)
4. [Using a configuration file](#Using-a-configuration-file)
5. [Working with unknown/uncategorised variables](#Working-with-unknown/uncategorised-variables)
# Overview
`funpack` performs the following steps:
## 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
## 2. Cleaning
The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced
with NA, for example, variables where a value of `-1` indicates *Do not know*.
2. **Variable-specific cleaning functions:** Certain columns are
re-formatted; for example, the ICD10
disease codes can be converted to integer representations.
3. **Categorical recoding:** Certain categorical columns are re-coded.
4. **Child value replacement:** NA values within some columns which are
dependent upon other columns may have values inserted based on the values
of their parent columns.
## 3. Processing
During the processing stage, columns may be removed, merged, or expanded into
additional columns. For example, a categorical column may be expanded into a set
of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being
redundant with respect to another column.
## 4. Export
The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
# Examples
Throughout these examples, we are going to use a few command line
options, which you will probably **not** normally want to use:
- `-ow` (short for `--overwrite`): This tells `funpack` not to complain if
the output file already exists.
- `-q` (short for `--quiet`): This tells `funpack` to be quiet. Without the
`-q` option, `funpack` can be quite verbose, which can be annoying, but is
very useful when things go wrong. A good strategy is to tell `funpack` to
produce verbose output using the `--noisy` (`-n` for short) option, and to
send all of its output to a log file with the `--log_file` (or `-lf`)
option. For example:
> ```
> funpack -n -n -n -lf log.txt out.tsv in.tsv
> ```
Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags:
``` bash
cat data_01.tsv
%% Cell type:markdown id: tags:
The numbers in each column name typically represent:
1. The variable ID
2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables.
Note that one **variable** is typically associated with several **columns**,
although we're keeping things simple for this first example - there is only
one visit for each variable, and there are no multi-valued variables.
> _Most but not all_ variables in the UK BioBank contain data collected at
> different visits, the times that the participants visited a UK BioBank
> assessment centre. However there are some variables (e.g. ICD10 diagnosis
> codes) for which
> this is not the case.
# Import examples
## Selecting variables (columns)
You can specify which variables you want to load in the following ways, using
the `--variable` (`-v` for short), `--category` (`-c` for short) and
`--column` (`-co` for short) command line options:
* By variable ID
* By variable ranges
* By a text file which contains the IDs you want to keep.
* By pre-defined variable categories
* By column name
### Selecting individual variables
Simply provide the IDs of the variables you want to extract:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -v 1 -v 5 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting variable ranges
The `--variable`/`-v` option accepts MATLAB-style ranges of the form
`start:step:stop` (where the `stop` is inclusive):
%% Cell type:code id: tags:
``` bash
funpack -q -ow -v 1:3:10 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting variables with a file
If your variables of interest are listed in a plain-text file, you can simply
pass that file:
%% Cell type:code id: tags:
``` bash
echo -e "1\n6\n9" > vars.txt
funpack -q -ow -v vars.txt out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting variables from pre-defined categories
Some UK BioBank-specific categories are [built into
`funpack`](#Built-in-rules), but you can also define your own categories - you
just need to create a `.tsv` file, and pass it to `funpack` via the
`--category_file` (`-cf` for short) option:
%% Cell type:code id: tags:
``` bash
echo -e "ID\tCategory\tVariables" > custom_categories.tsv
echo -e "1\tCool variables\t1:5,7" >> custom_categories.tsv
echo -e "2\tUncool variables\t6,8:10" >> custom_categories.tsv
cat custom_categories.tsv
%% Cell type:markdown id: tags:
Use the `--category` (`-c` for short) option to select categories to output. You can
refer to categories by their ID:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -cf custom_categories.tsv -c 1 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
Or by name:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -cf custom_categories.tsv -c uncool out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting column names
If you are working with data that has non-UK BioBank style column names, you
can use the `--column` (`-co` for short) option to select individual columns by their
name, rather than the variable with which they are associated. The `--column`
option accepts full column names, and also shell-style wildcard patterns:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -co 4-0.0 -co "??-0.0" out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
## Selecting subjects (rows)
`funpack` assumes that the first column in every input file is a subject
ID. You can specify which subjects you want to load via the `--subject` (`-s`
for short) option. You can specify subjects in the same way that you specified
variables above, and also:
* By specifying a conditional expression on variable values - only subjects
for which the expression evaluates to true will be imported
* By specifying subjects to exclude
### Selecting individual subjects
%% Cell type:code id: tags:
``` bash
funpack -q -ow -s 1 -s 3 -s 5 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting subject ranges
%% Cell type:code id: tags:
``` bash
funpack -q -ow -s 2:2:10 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting subjects from a file
%% Cell type:code id: tags:
``` bash
echo -e "5\n6\n7\n8\n9\n10" > subjects.txt
funpack -q -ow -s subjects.txt out.tsv data_01.tsv