Commit de0bf7ed authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'mnt/warning' into 'master'


See merge request !76
parents 9a7fdc99 552cc89d
......@@ -2,6 +2,28 @@ FUNPACK changelog
2.5.2 (Monday 15th March 2021)
* A warning message is now emitted if a processing step is requested for a
variable that is not present in the input data.
* Fixed an issue with the ``binariseCategorical`` processing function where
applying it to multiple variables, each with a separate ``take``
variable (as is the case in the ``fmrib`` configuration), would cause an
error if any of the ``take`` variables were not present.
2.5.1 (Wednesday 3rd March 2021)
......@@ -6,7 +6,7 @@
__version__ = '2.5.1'
__version__ = '2.5.2'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
......@@ -144,8 +144,8 @@ def processData(dtable):
for i in dtable.proctable.index:
procs, vids, parallel, bcastIdxs = retrieveProcess(dtable, i)
allvids = list(it.chain(*vids))
procs, vids, parallel = retrieveProcess(dtable, i)
allvids = list(it.chain(*vids))
if len(allvids) == 0:
......@@ -163,7 +163,7 @@ def processData(dtable):
with util.timed(, log, logging.DEBUG, fmt=fmt), \
tempfile.TemporaryDirectory() as workDir:
runProcess(proc, dtable, vids, workDir, parallel, bcastIdxs)
runProcess(proc, dtable, vids, workDir, parallel)
def retrieveProcess(dtable, procIdx):
......@@ -172,7 +172,7 @@ def retrieveProcess(dtable, procIdx):
process should be applied to.
:arg dtable: The :class:`.DataTable`
:arg procIdx: Index into the processing table
:arg procIdx: Index into the processing table
:returns: A tuple containing:
- A dict of ``{ name : Process }`` mappings containing
......@@ -183,17 +183,12 @@ def retrieveProcess(dtable, procIdx):
- ``True`` if the process can be applied in parallel
across the variable groups, ``False`` otherwise.
- A list of indices specifying the broadcast arguments
for each variable group, or ``None`` if no broadcasting
can take place.
i = procIdx
ptable = dtable.proctable
all_vids = dtable.variables
all_vids = [v for v in all_vids if v != 0]
bcastIdxs = None
# For each process, the processing table
# contains a "process variable type",
......@@ -230,18 +225,16 @@ def retrieveProcess(dtable, procIdx):
# apply independently to specified variables
elif pvtype == 'independent':
vids = [([v], i) for i, v in enumerate(vids) if dtable.present(v)]
bcastIdxs = [v[1] for v in vids]
vids = [v[0] for v in vids]
vids = [[v] for v in vids]
# apply simultaneously to specified variables
else: # 'vids'
vids = [[v for v in vids if dtable.present(v)]]
else: # ptype == 'vids'
vids = [vids]
return procs, vids, 'independent' in pvtype, bcastIdxs
return procs, vids, 'independent' in pvtype
def runProcess(proc, dtable, vids, workDir, parallel, bcastIdxs):
def runProcess(proc, dtable, vids, workDir, parallel):
"""Called by :func:`processData`. Runs the given process, and updates
the :class:`.DataTable` as needed.
......@@ -253,29 +246,64 @@ def runProcess(proc, dtable, vids, workDir, parallel, bcastIdxs):
:arg parallel: If ``True``, each variable group is processed in parallel.
Otherwise they are processed sequentially.
:arg bcastIdxs: Indices for broadcast arguments, for each variable group.
results = []
if bcastIdxs is None:
bcastIdxs = [None] * len(vids)
def filterMissing(vids):
"""Takes a list of variable IDs and removes those that are not
present in the data set, emitting a warning for each ID that
is removed.
if not proc.filterMissing:
return vids
filtered = []
for vid in vids:
if dtable.present(vid):
log.warning('Process %s refers to missing variable %u! '
'(%s)',, vid, proc.processString)
return filtered
# run process serially
if not parallel:
for vg in vids:
vg = filterMissing(vg)
results.append(, vg))
# or run in parallel across vid groups
with dtable.pool() as pool:
# Note. This code is horrible for a number of
# reasons, including that parallelisation in
# older versions of FUNPACK worked differently,
# and for the sake of preserving backwards
# compatibility with respect to the use of the
# argument broadcast feature (although I
# seriously doubt that anybody is even using
# broadcasting).
# gather all variables required by this process -
# the ones which are specified in the processing
# table, along with any auxillary ones specified
# as arguments to the process.
allvids = [vg + proc.auxillaryVariables(bi)
for vg, bi in zip(vids, bcastIdxs)]
# as arguments to the process. "allvids" is used
# to create the sub-table, and "vids" is the list
# of vids that we ask the processing function to
# process.
unfiltered = vids
vids = []
allvids = []
bcastIdxs = []
for i, vg in enumerate(unfiltered):
vg = filterMissing(vg)
auxvids = filterMissing(proc.auxillaryVariables(i))
if len(vg) > 0:
vids .append(vg)
allvids .append(vg + auxvids)
# generate a subtable for each variable group -
# this is to minimise the amount of data that
......@@ -449,14 +477,15 @@ class Process:
def __init__(self, ptype, name, args, kwargs):
def __init__(self, ptype, name, args, kwargs, procstr):
"""Create a ``Process``.
:arg ptype: Process type - either ``cleaner`` or ``processor``
(see the :mod:`.custom` module).
:arg name: Process name
:arg args: Positional arguments to pass to the process function.
:arg kwargs: Keyword arguments to pass to the process function.
:arg ptype: Process type - either ``cleaner`` or ``processor``
(see the :mod:`.custom` module).
:arg name: Process name
:arg args: Positional arguments to pass to the process function.
:arg kwargs: Keyword arguments to pass to the process function.
:arg procstr: Input string containing the process specification.
Any keyword arguments which begin with ``'broadcast_'`` are separated
out the other keyword arguments - see the :meth:`run` method for more
......@@ -476,12 +505,13 @@ class Process:
# cleaner functions are not
# defined in processing_functions,
# so in this case func will be None.
self.__ptype = ptype
self.__name = name
self.__args = args
self.__kwargs = normalKwargs
self.__bcastKwargs = bcastKwargs
self.__metaproc = normalKwargs.pop('metaproc', None)
self.__ptype = ptype
self.__name = name
self.__args = args
self.__kwargs = normalKwargs
self.__procstr = procstr
self.__bcastKwargs = bcastKwargs
self.__metaproc = normalKwargs.pop('metaproc', None)
def __repr__(self):
......@@ -515,6 +545,14 @@ class Process:
return self.__kwargs
def processString(self):
"""Returns the original string, from the processing table/
command-line, which defines this ``Process``.
return self.__procstr
def broadcastKwargs(self):
"""Returns the keyword arguments for this ``Process`` which
......@@ -524,6 +562,22 @@ class Process:
return self.__bcastKwargs
def filterMissing(self):
"""Return ``True`` if this processing function expects that the list of
variable IDs which it is given will not contain the IDs of variables
which are not present in the data.
This property is set via a ``filterMissing`` argument passed to the
processor decorator function. Its default value is ``True``.
Note: This is a hack which is only used by the
:func:`.binariseCategorical` function, and which is in place because
that function used to be parallelised differently.
return custom.args(self.__ptype, self.__name).get('filterMissing', True)
def auxillaryVariables(self, broadcastIndex=None):
"""Returns a list of "auxillary" variables for this process. Auxillary
variables are variables which a process is not being applied to, but
......@@ -533,6 +587,10 @@ class Process:
The names of any arguments which contain auxillary variables are
specified via the ``auxvids`` argument to the processor decorator
Note: This is a hack which is only used by the
:func:`.binariseCategorical` function, and which is in place because
that function used to be parallelised differently.
auxargs = custom.args(self.__ptype, self.__name).get('auxvids', [])
......@@ -648,7 +706,7 @@ def parseProcesses(procs, ptype):
if not custom.exists(ptype, name):
raise NoSuchProcessError(name)
return Process(ptype, name, args, kwargs)
return Process(ptype, name, args, kwargs, procs)
parser = pp.delimitedList(makeParser().setParseAction(makeProcess))
......@@ -286,7 +286,23 @@ def _pairwiseRemoveIfRedundant(dtable, data, corrthres, nathres=None):
return redundant
# auxvids tells the processing runner the
# "take" argument refers to other variables
# which are not processed, but are needed
# to perform the processing.
# "filterMissing" tells the processing
# runner *not* to remove variables which
# are not present in the data from the list
# of vids that are passed in - we do our
# own check here.
# Both of the above are ridiculous hacks
# which are in place because this function,
# and FUNPACK, used to parallelise things
# differently, and to preserve backwards
# compatibility.
@custom.processor(auxvids=['take'], filterMissing=False)
def binariseCategorical(dtable,
......@@ -391,6 +407,8 @@ def binariseCategorical(dtable,
if nameFormat is None:
nameFormat = defaultNameFormat[acrossVisits, acrossInstances]
# if take is a single vid or None,
# we turn it into [take] * len(vids)
if not isinstance(take, collections.Sequence):
take = [take] * len(vids)
......@@ -406,6 +424,13 @@ def binariseCategorical(dtable,
for vid, takevid in zip(vids, take):
if (not dtable.present(vid)) or \
(takevid is not None and not dtable.present(takevid)):
log.warning('Variable %u (or take: %s) is not present in the '
'data set - skipping the binariseCategorical step',
vid, takevid)
colgrps = gatherColumnGroups(vid)
if takevid is None: takegrps = [None] * len(colgrps)
......@@ -85,7 +85,7 @@ def isSparse(
:arg minstd: Minimum standard deviation, for numeric/categorical types.
:arg mincat: Minimum size/proportion of largest category,
:arg mincat: Minimum size/proportion of smallest category,
for integer/categorical types.
:arg maxcat: Maximum size/proportion of largest category,
%% Cell type:markdown id: tags:
![win logo](attachment:win.png)
# `funpack` (
> Paul McCarthy <>
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data.
You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules are built into `funpack` which are specific to the UK
BioBank data set. But you can control and customise everything that `funpack`
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.
`funpack` comes installed with recent versions of
[FSL]( You can also install `funpack`
via `conda`:
> ```
> conda install -c conda-forge fmrib-unpack
> ```
Or using `pip`:
> ```
> pip install fmrib-unpack
> ```
Get command-line help by typing:
> ```
> funpack -h
> ```
**Important** The examples in this notebook assume that you have installed `funpack`
2.5.1 or newer.
2.5.2 or newer.
%% Cell type:code id: tags:
``` bash
funpack -V
%% Cell type:markdown id: tags:
> _Note:_ If the above command produces a `NameError`, you may need to change
> the Jupyter Notebook kernel type to **Bash** - you can do so via the
> **Kernel -> Change Kernel** menu option.
### Contents
1. [Overview](#Overview)
1. [Import](#1.-Import)
2. [Cleaning](#2.-Cleaning)
3. [Processing](#3.-Processing)
4. [Export](#4.-Export)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
1. [Selecting variables (columns)](#Selecting-variables-(columns))
1. [Selecting individual variables](#Selecting-individual-variables)
2. [Selecting variable ranges](#Selecting-variable-ranges)
3. [Selecting variables with a file](#Selecting-variables-with-a-file)
4. [Selecting variables from pre-defined categories](#Selecting-variables-from-pre-defined-categories)
2. [Selecting subjects (rows)](#Selecting-subjects-(rows))
1. [Selecting individual subjects](#Selecting-individual-subjects)
2. [Selecting subject ranges](#Selecting-subject-ranges)
3. [Selecting subjects from a file](#Selecting-subjects-from-a-file)
4. [Selecting subjects by variable value](#Selecting-subjects-by-variable-value)
5. [Excluding subjects](#Excluding-subjects)
3. [Selecting visits](#Selecting-visits)
1. [Evaluating expressions across visits](#Evaluating-expressions-across-visits)
4. [Merging multiple input files](#Merging-multiple-input-files)
1. [Merging by subject](#Merging-by-subject)
2. [Merging by column](#Merging-by-column)
3. [Naive merging](#Merging-by-column)
4. [Cleaning examples](#Cleaning-examples)
1. [NA insertion](#NA-insertion)
2. [Variable-specific cleaning functions](#Variable-specific-cleaning-functions)
3. [Categorical recoding](#Categorical-recoding)
4. [Child value replacement](#Child-value-replacement)
5. [Processing examples](#Processing-examples)
1. [Sparsity check](#Sparsity-check)
2. [Redundancy check](#Redundancy-check)
3. [Categorical binarisation](#Categorical-binarisation)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading---funpack-plugins)
1. [Custom cleaning functions](#Custom-cleaning-functions)
2. [Custom processing functions](#Custom-processing-functions)
3. [Custom file loaders](#Custom-file-loaders)
7. [Miscellaneous topics](#Miscellaneous-topics)
1. [Non-numeric data](#Non-numeric-data)
2. [Dry run](#Dry-run)
3. [Built-in rules](#Built-in-rules)
4. [Using a configuration file](#Using-a-configuration-file)
5. [Working with unknown/uncategorised variables](#Working-with-unknown/uncategorised-variables)
# Overview
`funpack` performs the following steps:
## 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
> _Note:_ FUNPACK refers to UK Biobank **Data fields** as **variables**. The
> two terms can be considered equivalent.
## 2. Cleaning
The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced
with NA, for example, variables where a value of `-1` indicates *Do not
2. **Variable-specific cleaning functions:** Certain columns are
re-formatted; for example, the [ICD10](
disease codes can be converted to integer representations.
3. **Categorical recoding:** Certain categorical columns are re-coded.
4. **Child value replacement:** NA values within some columns which are
dependent upon other columns may have values inserted based on the values
of their parent columns.
## 3. Processing
During the processing stage, columns may be removed, merged, or expanded into
additional columns. For example, a categorical column may be expanded into a set
of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being
redundant with respect to another column.
## 4. Export
The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
# Examples
Throughout these examples, we are going to use a few command line
options, which you will probably **not** normally want to use:
- `-ow` (short for `--overwrite`): This tells `funpack` not to complain if
the output file already exists.
- `-q` (short for `--quiet`): This tells `funpack` to be quiet. Without the
`-q` option, `funpack` can be quite verbose, which can be annoying, but is
very useful when things go wrong. A good strategy is to tell `funpack` to
produce verbose output using the `--noisy` (`-n` for short) option, and to
send all of its output to a log file with the `--log_file` (or `-lf`)
option. For example:
> ```
> funpack -n -n -n -lf log.txt out.tsv in.tsv
> ```
%% Cell type:code id: tags:
``` bash
alias funpack="funpack -ow -q"
%% Cell type:markdown id: tags:
Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags:
``` bash
cat data_01.tsv
%% Cell type:markdown id: tags:
The numbers in each column name typically represent:
1. The variable ID
2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables.
Note that one **variable** is typically associated with several **columns**,
although we're keeping things simple for this first example - there is only
one visit for each variable, and there are no multi-valued variables.
> _Most but not all_ variables in the UK BioBank contain data collected at
> different visits, the times that the participants visited a UK BioBank
> assessment centre. However there are some variables (e.g. [ICD10 diagnosis
> codes]( for which
> this is not the case.
# Import examples
## Selecting variables (columns)
You can specify which variables you want to load in the following ways, using
the `--variable` (`-v` for short), `--category` (`-c` for short) and
`--column` (`-co` for short) command line options:
* By variable ID
* By variable ranges
* By a text file which contains the IDs you want to keep.
* By pre-defined variable categories
* By column name
### Selecting individual variables
Simply provide the IDs of the variables you want to extract:
%% Cell type:code id: tags:
``` bash
funpack -v 1 -v 5 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting variable ranges
The `--variable`/`-v` option accepts MATLAB-style ranges of the form
`start:step:stop` (where the `stop` is inclusive):
%% Cell type:code id: tags:
``` bash
funpack -v 1:3:10 out.tsv data_01.tsv
cat out.tsv
%% Cell type:markdown id: tags:
### Selecting variables with a file
If your variables of interest are listed in a plain-text file, you can simply
pass that file:
%% Cell type:code id: tags:
``` bash
echo -e "1\n6\n9" > vars.txt
funpack -v vars.txt out.tsv data_01.tsv
cat out.tsv