Commit 333591f5 authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'rf/date-norm-in-cleaning' into 'master'

Rf/date norm in cleaning

See merge request !61
parents 8caed2de 37d137d3
......@@ -2,8 +2,8 @@ FUNPACK changelog
=================
2.3.0 (Under development)
-------------------------
2.3.0 (Tuesday 12th May 2020)
-----------------------------
Changed
......@@ -15,7 +15,11 @@ Changed
variables. This should give superior performance.
* Revisited the :meth:`.DataTable.merge` method to optimise performance in
all scenarios.
* Improved performance of the :mod:`.fmrib` date/time normalisation routines.
* Improved performance of the :mod:`.fmrib` date/time normalisation routines,
and changed their usage so they are now applied as "cleaning" functions
after data import, rather than just before export. This means that
date/time columns can be subjected to the redundancy check (as they will
have a numeric type), and that data export performance is improved.
2.2.1 (Monday 4th May 2020)
......
......@@ -6,7 +6,7 @@
#
__version__ = '2.3.0.dev0'
__version__ = '2.3.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
......@@ -8,9 +8,20 @@
# Use local settings
config_file local
# Contains some FMRIB-specific plugin functions,
# including date/time normalisation.
plugin_file fmrib
# Drop non-numeric columns - the main output
# file only contains numeric data.
suppress_non_numerics
# Only import variables from FMRIB-curated categories,
# largely drawn from showcase categories
category_file fmrib/categories.tsv
#
# FUNPACK processing stages
# FUNPACK cleaning/processing stages
#
# - NA insertion
# - Categorical recoding
......@@ -38,33 +49,22 @@ config_file local
# - NA insertion
datacoding_file fmrib/datacodings_navalues.tsv
# - Categorical recoding
datacoding_file fmrib/datacodings_recoding.tsv
# - Cleaning
variable_file fmrib/variables_clean.tsv
# - Child value replacement
variable_file fmrib/variables_parentvalues.tsv
# - Processing
processing_file fmrib/processing.tsv
# FMRIB-curated categories, largely drawn from showcase categories
category_file fmrib/categories.tsv
#
# FMRIB processing of dates
#
# Date/timestamp normalisation (performed in the FUNPACK cleaning stage)
# Converts a date or date+time into a single value x, where floor(x) is the
# calendar year and the fraction is the day/time within the year, *except*
# that 'a day' is redefined as the time between 7am and 8pm (scanning only
# takes place within these hours).
#
plugin_file fmrib
date_format FMRIBImagingDate
time_format FMRIBImagingTime
type_file fmrib/datetime_formatting.tsv
# - Child value replacement
variable_file fmrib/variables_parentvalues.tsv
# Drop non-numeric columns - the main output file only contains numeric data.
suppress_non_numerics
# - Processing -
processing_file fmrib/processing.tsv
Type Clean
date normalisedDate
time normalisedAcquisitionTime
\ No newline at end of file
......@@ -89,12 +89,13 @@ def formatColumn(col,
# fall back to date/time formatting
# if relevant for this column
if formatter is None:
if vtype == util.CTYPES.date:
formatter = dateFormat
elif vtype == util.CTYPES.time or \
pdtypes.is_datetime64_any_dtype(series):
formatter = timeFormat
if formatter is None and pdtypes.is_datetime64_any_dtype(series):
# use dateFormat if we know the column
# is a date (and not datetime), otherwise
# use timeFormat if the column is a
# datetime, or unknown type.
if vtype == util.CTYPES.date: formatter = dateFormat
else: formatter = timeFormat
if formatter is not None:
log.debug('Formatting column %s%s with %s formatter',
......
......@@ -88,30 +88,33 @@ def load_FMRIBImaging(infile):
return df
@funpack.formatter('FMRIBImagingDate')
def normalisedDate(dtable, column, series):
@funpack.cleaner()
def normalisedDate(dtable, vid):
"""Converts date values into a numeric fractional year representation.
Converts a date into a single value x, where ``floor(x)`` is the calendar
year and the ``x mod 1`` is the fractional day within the year. The
conversion takes leap years into account.
"""
datetimes = series.to_numpy()
years = datetimes.astype('datetime64[Y]')
days = datetimes.astype('datetime64[D]')
# convert to day of year
# calculate fraction of day
days = (days - years).astype(np.float32)
years = (years + 1970) .astype(np.float32)
leaps = pd.DatetimeIndex(datetimes).is_leap_year + 365
for col in dtable.columns(vid):
series = dtable[:, col.name]
datetimes = series.to_numpy()
years = datetimes.astype('datetime64[Y]')
days = datetimes.astype('datetime64[D]')
# calculate and return fraction of year
return pd.Series(years + (days / leaps), name=series.name)
# convert to day of year
# calculate fraction of day
days = (days - years).astype(np.float32)
years = (years + 1970) .astype(np.float32)
leaps = pd.DatetimeIndex(datetimes).is_leap_year + 365
# calculate fraction of year
dtable[:, col.name] = years + (days / leaps)
@funpack.formatter('FMRIBImagingTime')
def normalisedAcquisitionTime(dtable, column, series):
@funpack.cleaner()
def normalisedAcquisitionTime(dtable, vid):
"""Converts timestamps into a numeric fractional year representation.
Converts a date or date+time into a single value x, where `floor(x)` is the
......@@ -119,23 +122,25 @@ def normalisedAcquisitionTime(dtable, column, series):
redefined as the time between 7am and 8pm (UK BioBank scanning only takes
place within these hours).
"""
datetimes = series.to_numpy()
years = datetimes.astype('datetime64[Y]')
days = datetimes.astype('datetime64[D]')
hours = datetimes.astype('datetime64[h]')
mins = datetimes.astype('datetime64[m]')
secs = datetimes.astype('datetime64[s]')
# convert to day of year, hour
# of day, second of hour, then
# calculate fraction of day
secs = (secs - mins) .astype(np.float32)
mins = (mins - hours).astype(np.float32)
hours = (hours - days) .astype(np.float32)
days = (days - years).astype(np.float32)
years = (years + 1970) .astype(np.float32)
dayfracs = ((hours - 7) + (mins / 60) + (secs / 3600)) / 13
leaps = pd.DatetimeIndex(datetimes).is_leap_year + 365
# calculate and return fraction of year
return pd.Series(years + (days + dayfracs) / leaps, name=series.name)
for col in dtable.columns(vid):
series = dtable[:, col.name]
datetimes = series.to_numpy()
years = datetimes.astype('datetime64[Y]')
days = datetimes.astype('datetime64[D]')
hours = datetimes.astype('datetime64[h]')
mins = datetimes.astype('datetime64[m]')
secs = datetimes.astype('datetime64[s]')
# convert to day of year, hour
# of day, second of hour, then
# calculate fraction of day
secs = (secs - mins) .astype(np.float32)
mins = (mins - hours).astype(np.float32)
hours = (hours - days) .astype(np.float32)
days = (days - years).astype(np.float32)
years = (years + 1970) .astype(np.float32)
dayfracs = ((hours - 7) + (mins / 60) + (secs / 3600)) / 13
leaps = pd.DatetimeIndex(datetimes).is_leap_year + 365
# calculate fraction of year
dtable[:, col.name] = years + (days + dayfracs) / leaps
%% Cell type:markdown id: tags:
![win logo](win.png)
# `funpack` (https://git.fmrib.ox.ac.uk/fsl/funpack)
> Paul McCarthy <paul.mccarthy@ndcn.ox.ac.uk>
> ([WIN@FMRIB](https://www.win.ox.ac.uk/))
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data sets.
You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules are built into `funpack` which are specific to the UK
BioBank data set. But you can control and customise everything that `funpack`
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.
`funpack` comes installed with recent versions of
[FSL](https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/). You can also install `funpack`
via `conda`:
> ```
> conda install -c conda-forge fmrib-unpack
> ```
Or using `pip`:
> ```
> pip install fmrib-unpack
> ```
Get command-line help by typing:
> ```
> funpack -h
> ```
**Important** The examples in this notebook assume that you have installed `funpack`
2.3.0.dev0 or newer.
2.3.0 or newer.
%% Cell type:code id: tags:
``` bash
funpack -V
```
%% Cell type:markdown id: tags:
### Contents
1. [Overview](#Overview)
1. [Import](#1.-Import)
2. [Cleaning](#2.-Cleaning)
3. [Processing](#3.-Processing)
4. [Export](#4.-Export)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
1. [Selecting variables (columns)](#Selecting-variables-(columns))
1. [Selecting individual variables](#Selecting-individual-variables)
2. [Selecting variable ranges](#Selecting-variable-ranges)
3. [Selecting variables with a file](#Selecting-variables-with-a-file)
4. [Selecting variables from pre-defined categories](#Selecting-variables-from-pre-defined-categories)
2. [Selecting subjects (rows)](#Selecting-subjects-(rows))
1. [Selecting individual subjects](#Selecting-individual-subjects)
2. [Selecting subject ranges](#Selecting-subject-ranges)
3. [Selecting subjects from a file](#Selecting-subjects-from-a-file)
4. [Selecting subjects by variable value](#Selecting-subjects-by-variable-value)
5. [Excluding subjects](#Excluding-subjects)
3. [Selecting visits](#Selecting-visits)
1. [Evaluating expressions across visits](#Evaluating-expressions-across-visits)
4. [Merging multiple input files](#Merging-multiple-input-files)
1. [Merging by subject](#Merging-by-subject)
2. [Merging by column](#Merging-by-column)
3. [Naive merging](#Naive-merging)
4. [Cleaning examples](#Cleaning-examples)
1. [NA insertion](#NA-insertion)
2. [Variable-specific cleaning functions](#Variable-specific-cleaning-functions)
3. [Categorical recoding](#Categorical-recoding)
4. [Child value replacement](#Child-value-replacement)
5. [Processing examples](#Processing-examples)
1. [Sparsity check](#Sparsity-check)
2. [Redundancy check](#Redundancy-check)
3. [Categorical binarisation](#Categorical-binarisation)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading---funpack-plugins)
1. [Custom cleaning functions](#Custom-cleaning-functions)
2. [Custom processing functions](#Custom-processing-functions)
3. [Custom file loaders](#Custom-file-loaders)
7. [Miscellaneous topics](#Miscellaneous-topics)
1. [Non-numeric data](#Non-numeric-data)
2. [Dry run](#Dry-run)
3. [Built-in rules](#Built-in-rules)
4. [Using a configuration file](#Using-a-configuration-file)
5. [Working with unknown/uncategorised variables](#Working-with-unknown/uncategorised-variables)
# Overview
`funpack` performs the following steps:
## 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
## 2. Cleaning
The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced
with NA, for example, variables where a value of `-1` indicates *Do not
know*.
2. **Variable-specific cleaning functions:** Certain columns are
re-formatted; for example, the [ICD10](https://en.wikipedia.org/wiki/ICD-10)
disease codes can be converted to integer representations.
3. **Categorical recoding:** Certain categorical columns are re-coded.
4. **Child value replacement:** NA values within some columns which are
dependent upon other columns may have values inserted based on the values
of their parent columns.
## 3. Processing
During the processing stage, columns may be removed, merged, or expanded into
additional columns. For example, a categorical column may be expanded into a set
of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being
redundant with respect to another column.
## 4. Export
The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
# Examples
Throughout these examples, we are going to use a few command line
options, which you will probably **not** normally want to use:
- `-ow` (short for `--overwrite`): This tells `funpack` not to complain if
the output file already exists.
- `-q` (short for `--quiet`): This tells `funpack` to be quiet. Without the
`-q` option, `funpack` can be quite verbose, which can be annoying, but is
very useful when things go wrong. A good strategy is to tell `funpack` to
produce verbose output using the `--noisy` (`-n` for short) option, and to
send all of its output to a log file with the `--log_file` (or `-lf`)
option. For example:
> ```
> funpack -n -n -n -lf log.txt out.tsv in.tsv
> ```
Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags:
``` bash
cat data_01.tsv
```
%% Cell type:markdown id: tags:
The numbers in each column name typically represent:
1. The variable ID
2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables.
Note that one **variable** is typically associated with several **columns**,
although we're keeping things simple for this first example - there is only
one visit for each variable, and there are no multi-valued variables.
> _Most but not all_ variables in the UK BioBank contain data collected at
> different visits, the times that the participants visited a UK BioBank
> assessment centre. However there are some variables (e.g. [ICD10 diagnosis
> codes](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=41202)) for which
> this is not the case.
# Import examples
## Selecting variables (columns)
You can specify which variables you want to load in the following ways, using
the `--variable` (`-v` for short), `--category` (`-c` for short) and
`--column` (`-co` for short) command line options:
* By variable ID
* By variable ranges
* By a text file which contains the IDs you want to keep.
* By pre-defined variable categories
* By column name
### Selecting individual variables
Simply provide the IDs of the variables you want to extract:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -v 1 -v 5 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting variable ranges
The `--variable`/`-v` option accepts MATLAB-style ranges of the form
`start:step:stop` (where the `stop` is inclusive):
%% Cell type:code id: tags:
``` bash
funpack -q -ow -v 1:3:10 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting variables with a file
If your variables of interest are listed in a plain-text file, you can simply
pass that file:
%% Cell type:code id: tags:
``` bash
echo -e "1\n6\n9" > vars.txt
funpack -q -ow -v vars.txt out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting variables from pre-defined categories
Some UK BioBank-specific categories are [built into
`funpack`](#Built-in-rules), but you can also define your own categories - you
just need to create a `.tsv` file, and pass it to `funpack` via the
`--category_file` (`-cf` for short) option:
%% Cell type:code id: tags:
``` bash
echo -e "ID\tCategory\tVariables" > custom_categories.tsv
echo -e "1\tCool variables\t1:5,7" >> custom_categories.tsv
echo -e "2\tUncool variables\t6,8:10" >> custom_categories.tsv
cat custom_categories.tsv
```
%% Cell type:markdown id: tags:
Use the `--category` (`-c` for short) option to select categories to output.
You can refer to categories by their ID:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -cf custom_categories.tsv -c 1 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
Or by name:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -cf custom_categories.tsv -c uncool out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting column names
If you are working with data that has non-UK BioBank style column names, you
can use the `--column` (`-co` for short) option to select individual columns
by their name, rather than the variable with which they are associated. The
`--column` option accepts full column names, and also shell-style wildcard
patterns:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -co 4-0.0 -co "??-0.0" out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
## Selecting subjects (rows)
`funpack` assumes that the first column in every input file is a subject
ID. You can specify which subjects you want to load via the `--subject` (`-s`
for short) option. You can specify subjects in the same way that you specified
variables above, and also:
* By specifying a conditional expression on variable values - only subjects
for which the expression evaluates to true will be imported
* By specifying subjects to exclude
### Selecting individual subjects
%% Cell type:code id: tags:
``` bash
funpack -q -ow -s 1 -s 3 -s 5 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting subject ranges
%% Cell type:code id: tags:
``` bash
funpack -q -ow -s 2:2:10 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting subjects from a file
%% Cell type:code id: tags:
``` bash
echo -e "5\n6\n7\n8\n9\n10" > subjects.txt
funpack -q -ow -s subjects.txt out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
### Selecting subjects by variable value
The `--subject` option accepts *variable expressions* - you can write an
expression performing numerical comparisons on variables (denoted with a
leading `v`) and combine these expressions using boolean algebra. Only
subjects for which the expression evaluates to true will be imported. For
example, to only import subjects where variable 1 is greater than 10, and
variable 2 is less than 70, you can type:
%% Cell type:code id: tags:
``` bash
funpack -q -ow -sp -s "v1 > 10 && v2 < 70" out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags: