Commit d3786a2f authored by Paul McCarthy

Merge branch 'enh/remove_na_rows' into 'master'

Enh/remove na rows

See merge request !54
parents 302a5f65 2b72fe60
@@ -2,8 +2,16 @@ FUNPACK changelog
=================
2.1.0 (Tuesday 21st April 2020)
-------------------------------
Added
^^^^^
* New ``--drop_na_rows`` option, which tells ``funpack`` to drop rows which do
not contain a value for any column.
Changed
@@ -6,7 +6,7 @@
#
__version__ = '2.1.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
@@ -143,6 +143,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
(('V', 'version'), {'action' : 'version',
'version' :
'%(prog)s {}'.format(VERSION)}),
(('dn', 'drop_na_rows'), {'action' : 'store_true'}),
(('d', 'dry_run'), {'action' : 'store_true'}),
(('nb', 'no_builtins'), {'action' : 'store_true'}),
(('nj', 'num_jobs'), {'type' : int,
@@ -162,8 +163,8 @@ CLI_DESCRIPTIONS = {
'variable/data coding IDs.',
'Export options' :
'Non-numeric columns are exported to the main output file by default,\n'
'but you can control this behaviour using one of the following options:\n'
'\n'
' - The --suppress_non_numerics option tells FUNPACK to only save\n'
' numeric columns to the main output file.\n'
@@ -381,6 +382,8 @@ CLI_ARGUMENT_HELP = {
# Miscellaneous options
'version' : 'Print version and exit.',
'dry_run' : 'Print a summary of what would happen and exit.',
'drop_na_rows' : 'Drop rows which do not contain data for any column. '
'Dropping will take place on both import and export.',
'no_builtins' : 'Do not use the built in variable or data coding tables.',
'num_jobs' : 'Maximum number of jobs to run in parallel. Set to 1 '
'(the default) to disable parallelisation. Set to -1 '
@@ -45,7 +45,8 @@ def formatColumn(col,
dtable,
dateFormat,
timeFormat,
formatters,
logmsg=None):
"""Formats the data for the specified column. The ``dtable`` is updated
in-place.
@@ -61,9 +62,18 @@
specifying custom formatters to use for specific
variables.
:arg logmsg: Message to include in log messages.
:returns: A reference to ``dtable``.
"""
if dateFormat is None: dateFormat = 'default'
if timeFormat is None: timeFormat = 'default'
if formatters is None: formatters = {}
if logmsg is None: logmsg = ''
else: logmsg = ' [{}]'.format(logmsg)
vid = col.basevid
vartable = dtable.vartable
series = dtable[:, col.name]
@@ -87,16 +97,20 @@
formatter = timeFormat
if formatter is not None:
log.debug('Formatting column %s%s with %s formatter',
col.name, logmsg, formatter)
series = custom.runFormatter(formatter, dtable, col, series)
# apply column-specific fill
# value, if there is one
if col.fillval is not None:
series = series.fillna(value=col.fillval)
# update datatable if any
# formatting took place
if (formatter is not None) or (col.fillval is not None):
dtable[:, col.name] = series
return dtable
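A note on the ``fillna`` change above: `dtable[:, col.name]` may hand back a copy of the underlying data, so filling it in place can silently fail to propagate; filling into a new Series and assigning it back is the safe pattern. A minimal sketch in plain pandas (hypothetical column name, not FUNPACK's `DataTable` API):

``` python
import numpy as np
import pandas as pd

df     = pd.DataFrame({'1-0.0': [1.0, np.nan, 3.0]})
series = df['1-0.0']

# Fill missing values into a new Series, then write it back
# explicitly, mirroring the pattern used above. Relying on
# inplace=True here could modify a copy that is then discarded.
series = series.fillna(value=-1)
df['1-0.0'] = series

print(df['1-0.0'].tolist())   # [1.0, -1.0, 3.0]
```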
@@ -15,6 +15,7 @@ import pandas as pd
import pandas.api.types as pdtypes
from . import custom
from . import exporting
log = logging.getLogger(__name__)
@@ -39,6 +40,10 @@ def exportHDF5(dtable,
outfile,
key=None,
style=None,
dropNaRows=False,
dateFormat=None,
timeFormat=None,
formatters=None,
**kwargs):
"""Export data to a HDF5 file.
@@ -59,6 +64,17 @@
:arg style: HDF5 style to use (see above). Defaults to
:attr:`HDF5_STYLE`.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not exported.
:arg dateFormat: Name of formatter to use for date columns.
:arg timeFormat: Name of formatter to use for time columns.
:arg formatters: Dict of ``{ [vid|column] : formatter }`` mappings,
specifying custom formatters to use for specific
variables.
"""
if key is None: key = HDF5_KEY
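For orientation, a 'pandas'-style HDF5 export boils down to the standard pandas call below (a sketch with a hypothetical output file and key, not FUNPACK's `exportPandasStyle`; requires the `tables` package to be installed):

``` python
import pandas as pd

df = pd.DataFrame({'1-0.0': [1.0, 2.0]}, index=[1001, 1002])

# Write the frame under a named key; it can be read back
# later with pd.read_hdf('out.h5', key='funpack').
df.to_hdf('out.h5', key='funpack', mode='w')
```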
@@ -70,6 +86,13 @@
# drop non-[numeric/string] types
cols = []
for col in dtable.dataColumns:
exporting.formatColumn(col,
dtable,
dateFormat=dateFormat,
timeFormat=timeFormat,
formatters=formatters)
series = dtable[:, col.name]
if pdtypes.is_numeric_dtype(series) or \
pdtypes.is_string_dtype( series):
......@@ -79,7 +102,10 @@ def exportHDF5(dtable,
'numeric/string columns can be exported to HDF5.',
col.name, outfile)
if dropNaRows: rows = dtable[:, :].notna().any('columns')
else: rows = None
dtable = dtable.subtable(cols, rows)
if style == 'pandas':
exportPandasStyle(dtable,
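The `rows` mask built above keeps any row with at least one non-NA value. In plain pandas terms (a standalone sketch, not the `DataTable`/`subtable` API):

``` python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, np.nan],
                   'b': [np.nan, np.nan, 2.0]},
                  index=[1001, 1002, 1003])

# True for rows with at least one value; the all-NA middle
# row (1002) is False, and is dropped by the selection.
keep = df.notna().any(axis='columns')
print(df[keep].index.tolist())   # [1001, 1003]
```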
@@ -14,8 +14,7 @@ import os.path as op
import os
import logging
import numpy as np
from . import util
from . import custom
@@ -52,6 +51,7 @@ def exportTSV(dtable,
sep=None,
missingValues=None,
numRows=None,
dropNaRows=False,
dateFormat=None,
timeFormat=None,
formatters=None,
@@ -74,6 +74,9 @@
:arg numRows: Number of rows to write at a time. Defaults to
:attr:`NUM_ROWS`.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not exported.
:arg dateFormat: Name of formatter to use for date columns.
:arg timeFormat: Name of formatter to use for time columns.
@@ -86,9 +89,6 @@
if sep is None: sep = '\t'
if missingValues is None: missingValues = ''
if numRows is None: numRows = NUM_ROWS
# We're going to output each chunk of
# subjects to a separate file (in
@@ -109,6 +109,7 @@
func = ft.partial(writeDataFrame,
sep=sep,
missingValues=missingValues,
dropNaRows=dropNaRows,
dateFormat=dateFormat,
timeFormat=timeFormat,
formatters=formatters)
@@ -135,6 +136,7 @@ def writeDataFrame(dtable,
chunki,
sep,
missingValues,
dropNaRows,
dateFormat,
timeFormat,
formatters):
@@ -154,6 +156,9 @@
:arg missingValues: String to use for missing/NA values.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not exported.
:arg dateFormat: Name of formatter to use for date columns.
:arg timeFormat: Name of formatter to use for time columns.
@@ -163,30 +168,24 @@
variables.
"""
for coli, col in enumerate(dtable.dataColumns):
vid = col.vid
name = col.name
series = dtable[:, name]
logmsg = 'chunk {}, col {} / {}'.format(
chunki, coli, len(dtable.dataColumns))
exporting.formatColumn(
col,
dtable,
dateFormat=dateFormat,
timeFormat=timeFormat,
formatters=formatters,
logmsg=logmsg)
towrite = dtable[:, :]
if dropNaRows:
towrite.dropna(how='all', inplace=True)
log.info('Writing %u columns and %u rows [chunk %u] to %s ...',
len(dtable.dataColumns), len(dtable), chunki, outfile)
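On the export side, the behaviour corresponds to dropping all-NA rows from each chunk just before it is written out. A standalone approximation in plain pandas (hypothetical file name, not the actual chunked writer):

``` python
import numpy as np
import pandas as pd

towrite = pd.DataFrame({'a': [1.0, np.nan], 'b': [np.nan, np.nan]},
                       index=[1001, 1002])

# how='all' drops only rows where every column is NA, so
# subject 1002 is removed while partial rows are retained.
towrite = towrite.dropna(how='all')
towrite.to_csv('chunk0.tsv', sep='\t', na_rep='')
```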
@@ -78,6 +78,7 @@ def importData(datafiles,
mergeAxis=None,
mergeStrategy=None,
indexVisits=False,
dropNaRows=False,
loaders=None,
njobs=1,
mgr=None,
@@ -93,69 +94,72 @@ def importData(datafiles,
3. Creates and returns a :class:`DataTable`.
:arg datafiles: Path to the data file(s)
:arg vartable: The data coding table
:arg proctable: The processing table
:arg cattable: The category table
:arg variables: List of variable IDs to import
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg categories: List of category names to import
:arg subjects: List of subjects to include
:arg subjectExprs: List of subject inclusion expressions
:arg exclude: List of subjects to exclude
:arg encoding: Character encoding(s) for data file(s). See
:func:`loadData`.
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg indexes: Dict of ``{filename : [index]}`` mappings,
specifying the position of the column(s) to use as
the index. Defaults to 0 (the first column).
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg loaders: Dict of ``{ file : loaderName }`` mappings
containing custom sniffers/loaders to be used for
specific files. See the :mod:`.custom` module.
:arg njobs: Number of processes to use for parallelising tasks.
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
:arg dryrun: If ``True`` the data is not loaded.
:returns: A tuple containing:
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
- A list of :class:`.Column` objects that were not
loaded from each input file.
"""
variables = filter.restrictVariables(cattable, variables, categories)
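For reference, the dict-valued arguments described in the docstring above take shapes along these lines (hypothetical file and loader names):

``` python
# indexes is {filename : [index]}, giving the position of the
# column(s) to use as the index for each file ...
indexes = {'data1.tsv': [0]}

# ... and loaders is {file : loaderName}, naming a custom
# sniffer/loader to use for specific files.
loaders = {'data2.txt': 'my_custom_loader'}
```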
@@ -185,6 +189,7 @@ def importData(datafiles,
mergeAxis=mergeAxis,
mergeStrategy=mergeStrategy,
indexVisits=indexVisits,
dropNaRows=dropNaRows,
loaders=loaders,
trustTypes=trustTypes,
pool=pool,
@@ -239,6 +244,7 @@ def loadFiles(datafiles,
mergeAxis=None,
mergeStrategy=None,
indexVisits=False,
dropNaRows=False,
loaders=None,
pool=None,
dryrun=False):
@@ -285,6 +291,9 @@
separate columns. Only applied to variables which are
labelled with Instancing 2.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg loaders: Dict of ``{ file : loaderName }`` mappings containing
custom loaders/sniffers to be used for specific files.
See the :mod:`.custom` module.
@@ -347,6 +356,7 @@
subjectExprs=subjectExprs,
exclude=exclude,
indexVisits=indexVisits,
dropNaRows=dropNaRows,
encoding=fencoding,
trustTypes=trustTypes,
pool=pool)
@@ -395,6 +405,7 @@ def loadFile(fname,
subjectExprs=None,
exclude=None,
indexVisits=False,
dropNaRows=False,
encoding=None,
trustTypes=False,
pool=None):
@@ -434,6 +445,9 @@
separate columns. Only applied to variables which are
labelled with Instancing 2.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg encoding: Character encoding (or sequence of encodings, one
for each data file). Defaults to ``latin1``.
@@ -518,6 +532,7 @@
'subjectExprs' : subjectExprs,
'exclude' : exclude,
'indexByVisit' : indexVisits,
'dropNaRows' : dropNaRows,
'encoding' : encoding,
'trustTypes' : trustTypes,
'dtypes' : dtypes,
@@ -578,6 +593,7 @@ def loadChunk(i,
subjectExprs,
exclude,
indexByVisit,
dropNaRows,
encoding,
trustTypes,
dtypes,
@@ -620,6 +636,9 @@
to re-arrange the loaded data so it is indexed by
visit number, in addition to row ID.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg encoding: Character encoding (or sequence of encodings, one
for each data file). Defaults to ``latin1``.
@@ -663,6 +682,10 @@
**dlargs)
gotrows = len(df)
# drop NA rows if requested
if dropNaRows:
df.dropna(how='all', inplace=True)
# If a subject/expression/exclude list
# is provided, filter the rows accordingly
df = filter.filterSubjects(df, toload, subjects, subjectExprs, exclude)
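An approximate standalone equivalent of the import-side behaviour, reading a file in chunks and applying the same all-NA rule before any subject filtering (plain pandas, hypothetical file name; the real loader also handles encodings, dtypes and parallelism):

``` python
import pandas as pd

chunks = []
for df in pd.read_csv('data.tsv', sep='\t', index_col=0,
                      chunksize=10000):
    # Drop rows with no data in any column before further
    # filtering, mirroring the order used in loadChunk.
    df = df.dropna(how='all')
    chunks.append(df)

data = pd.concat(chunks)
```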
@@ -197,6 +197,7 @@ def doImport(args, mgr):
mergeAxis=args.merge_axis,
mergeStrategy=args.merge_strategy,
indexVisits=args.index_visits,
dropNaRows=args.drop_na_rows,
loaders=args.loader,
njobs=args.num_jobs,
mgr=mgr,
@@ -359,6 +360,7 @@ def doExport(dtable, args):
# General export options
fileFormat=args.format,
numRows=args.num_rows,
dropNaRows=args.drop_na_rows,
dateFormat=args.date_format,
timeFormat=args.time_format,
formatters=args.var_format,
%% Cell type:markdown id: tags:
![win logo](win.png)
# `funpack` (https://git.fmrib.ox.ac.uk/fsl/funpack)
> Paul McCarthy <paul.mccarthy@ndcn.ox.ac.uk>
> ([WIN@FMRIB](https://www.win.ox.ac.uk/))
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data sets.
You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules specific to the UK BioBank data set are built into
`funpack`, but you can control and customise everything that `funpack` does to
your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.
`funpack` comes installed with recent versions of
[FSL](https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/). You can also install `funpack`
via `conda`:
> ```
> conda install -c conda-forge fmrib-unpack
> ```
Or using `pip`:
> ```
> pip install fmrib-unpack
> ```
Get command-line help by typing:
> ```
> funpack -h
> ```
**Important** The examples in this notebook assume that you have installed `funpack`
2.1.0 or newer.
%% Cell type:code id: tags:
``` bash
funpack -V
```
%% Cell type:markdown id: tags:
### Contents
1. [Overview](#Overview)
1. [Import](#1.-Import)
2. [Cleaning](#2.-Cleaning)
3. [Processing](#3.-Processing)
4. [Export](#4.-Export)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
1. [Selecting variables (columns)](#Selecting-variables-(columns))
1. [Selecting individual variables](#Selecting-individual-variables)
2. [Selecting variable ranges](#Selecting-variable-ranges)
3. [Selecting variables with a file](#Selecting-variables-with-a-file)
4. [Selecting variables from pre-defined categories](#Selecting-variables-from-pre-defined-categories)
2. [Selecting subjects (rows)](#Selecting-subjects-(rows))
1. [Selecting individual subjects](#Selecting-individual-subjects)
2. [Selecting subject ranges](#Selecting-subject-ranges)
3. [Selecting subjects from a file](#Selecting-subjects-from-a-file)
4. [Selecting subjects by variable value](#Selecting-subjects-by-variable-value)
5. [Excluding subjects](#Excluding-subjects)
3. [Selecting visits](#Selecting-visits)
1. [Evaluating expressions across visits](#Evaluating-expressions-across-visits)
4. [Merging multiple input files](#Merging-multiple-input-files)
1. [Merging by subject](#Merging-by-subject)
2. [Merging by column](#Merging-by-column)
3. [Naive merging](#Naive-merging)
4. [Cleaning examples](#Cleaning-examples)
1. [NA insertion](#NA-insertion)
2. [Variable-specific cleaning functions](#Variable-specific-cleaning-functions)
3. [Categorical recoding](#Categorical-recoding)
4. [Child value replacement](#Child-value-replacement)
5. [Processing examples](#Processing-examples)
1. [Sparsity check](#Sparsity-check)
2. [Redundancy check](#Redundancy-check)
3. [Categorical binarisation](#Categorical-binarisation)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading---funpack-plugins)
1. [Custom cleaning functions](#Custom-cleaning-functions)
2. [Custom processing functions](#Custom-processing-functions)
3. [Custom file loaders](#Custom-file-loaders)
7. [Miscellaneous topics](#Miscellaneous-topics)
1. [Non-numeric data](#Non-numeric-data)
2. [Dry run](#Dry-run)
3. [Built-in rules](#Built-in-rules)
4. [Using a configuration file](#Using-a-configuration-file)
5. [Working with unknown/uncategorised variables](#Working-with-unknown/uncategorised-variables)
# Overview
`funpack` performs the following steps:
## 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
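In pandas terms, the two merge modes look roughly like this (an illustrative sketch with made-up frames, not `funpack` internals):

``` python
import pandas as pd

a = pd.DataFrame({'1-0.0': [1, 2]}, index=[1001, 1002])
b = pd.DataFrame({'2-0.0': [3, 4]}, index=[1001, 1002])

# Merge according to an index column (e.g. subject ID):
merged = a.join(b)

# Or naively concatenate - here along columns, for files
# containing the same subjects:
stacked = pd.concat([a, b], axis='columns')
```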
## 2. Cleaning
The following cleaning steps are applied to each column: