Commit 27aa52d9 authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'enh/remove-duplicates' into 'master'

ENH: New --remove_duplicates cli option

See merge request !84
parents d0eeace0 a95a08ca
......@@ -2,8 +2,16 @@ FUNPACK changelog
=================
2.9.0 (Under development)
-------------------------
2.9.0 (Tuesday 28th December 2021)
----------------------------------
Added
^^^^^
* New ``--remove_duplicates`` option, which causes columns with duplicate
names to be removed, with only the first retained.
Changed
......
......@@ -6,7 +6,7 @@
#
__version__ = '2.8.0'
__version__ = '2.9.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
......@@ -54,6 +54,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
'default' : DEFAULT_MERGE_AXIS}),
(('ms', 'merge_strategy'), {'choices' : AVAILABLE_MERGE_STRATEGIES,
'default' : DEFAULT_MERGE_STRATEGY}),
(('rm', 'remove_duplicates'), {'action' : 'store_true'}),
(('rd', 'rename_duplicates'), {'action' : 'store_true'}),
(('cfg', 'config_file'), {'action' : 'append'}),
(('vf', 'variable_file'), {'action' : 'append'}),
......@@ -224,6 +225,9 @@ CLI_ARGUMENT_HELP = {
'Options are "naive", "intersection"/"inner", or "union"/'
'"outer".'.format(DEFAULT_MERGE_STRATEGY),
'remove_duplicates' :
'Remove duplicate columns, only retaining the first.',
'rename_duplicates' :
'Rename any duplicate columns so that all columns have a unique name.',
......@@ -686,6 +690,11 @@ def _prepareInputAndOutputFiles(args):
if args.loader is not None:
args.loader = {op.realpath(f) : n for f, n in args.loader}
# Remove/rename duplicates options are mutually exclusive
if args.remove_duplicates and args.rename_duplicates:
        raise ValueError('Only one of --remove_duplicates and '
'--rename_duplicates may be used.')
# turn index indices into dict of
# { file : [index] } mappings
if args.index is not None:
......
......@@ -298,7 +298,8 @@ class FileInfo:
indexes=None,
loaders=None,
encodings=None,
renameDuplicates=False):
renameDuplicates=False,
renameSuffix=None):
"""Create a ``FileInfo`` object.
:arg datafiles: Path to input file, or sequence of paths.
......@@ -311,6 +312,9 @@ class FileInfo:
specifying non-standard file encodings.
:arg renameDuplicates: If ``True``, duplicate columns are re-named -
see :func:`renameDuplicateColumns`.
:arg renameSuffix: Passed as ``suffix`` to
:func:`renameDuplicateColumns`, if
``renameDuplicates is True``.
"""
if isinstance(datafiles, str): datafiles = [datafiles]
......@@ -322,7 +326,8 @@ class FileInfo:
indexes,
loaders,
encodings,
renameDuplicates)
renameDuplicates,
renameSuffix=renameSuffix)
self.__datafiles = list(datafiles)
self.__indexes = dict(indexes)
......@@ -388,7 +393,8 @@ def fileinfo(datafiles,
indexes=None,
sniffers=None,
encodings=None,
renameDuplicates=False):
renameDuplicates=False,
renameSuffix=None):
"""Identifies the format of each input data file, and extracts/generates
column names and variable IDs for every column.
......@@ -410,6 +416,10 @@ def fileinfo(datafiles,
which have the same name are renamed - see
:func:`renameDuplicateColumns`.
:arg renameSuffix: Passed as ``suffix`` to
:func:`renameDuplicateColumns`, if
``renameDuplicates is True``.
:returns: A tuple containing:
- List of ``csv`` dialect types
......@@ -517,22 +527,28 @@ def fileinfo(datafiles,
col.name = util.generateColumnName(vid, 0, 0)
if renameDuplicates:
renameDuplicateColumns(it.chain(*cols))
renameDuplicateColumns(it.chain(*cols), suffix=renameSuffix)
return dialects, headers, cols
def renameDuplicateColumns(cols):
def renameDuplicateColumns(cols, suffix=None):
"""Identifies any columns which have the same name, and re-names the
subsequent ones. If ``N`` columns have the same name ``X``, they are
renamed ``X``, ``X.1``, ``X.2``, ``...``, ``X.<N-1>``.
renamed ``X``, ``X.1<suffix>``, ``X.2<suffix>``, ``...``,
``X.<N-1><suffix>``.
The ``name`` attribute of each :class:`.Column` object is modified
in-place.
:arg cols: Sequence of :class:`.Column` objects.
:arg cols: Sequence of :class:`.Column` objects.
:arg suffix: String to append to the name of all renamed columns.
Defaults to an empty string.
"""
if suffix is None:
suffix = ''
counts = collections.defaultdict(list)
for col in cols:
......@@ -544,7 +560,7 @@ def renameDuplicateColumns(cols):
counts[col.name].append(col)
count = len(counts[col.name])
if count > 1:
newname = '{}.{}'.format(col.name, count - 1)
newname = '{}.{}{}'.format(col.name, count - 1, suffix)
col.name = newname
log.warning('Duplicate column detected (%s: %s) - renamed to %s',
......
......@@ -17,5 +17,6 @@ from .core import (importData,
MERGE_AXIS,
MERGE_STRATEGY,
MERGE_AXIS_OPTIONS,
MERGE_STRATEGY_OPTIONS) # noqa
from .filter import (restrictVariables,) # noqa
MERGE_STRATEGY_OPTIONS) # noqa
from .filter import (restrictVariables,
REMOVE_DUPLICATE_COLUMN_IDENTIFIER) # noqa
......@@ -66,6 +66,7 @@ def importData(fileinfo,
cattable,
variables=None,
colnames=None,
excludeColnames=None,
categories=None,
subjects=None,
subjectExprs=None,
......@@ -89,61 +90,65 @@ def importData(fileinfo,
3. Creates and returns a :class:`DataTable`.
:arg fileinfo: :class:`.FileInfo` object describing the input file(s).
:arg fileinfo: :class:`.FileInfo` object describing the input
file(s).
:arg vartable: The data coding table
:arg vartable: The data coding table
:arg proctable: The processing table
:arg proctable: The processing table
:arg cattable: The category table
:arg cattable: The category table
:arg variables: List of variable IDs to import
:arg variables: List of variable IDs to import
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg categories: List of category names to import
:arg excludeColnames: List of column name suffixes specifying columns
to exclude.
:arg subjects: List of subjects to include
:arg categories: List of category names to import
:arg subjectExprs: List of subject inclusion expressions
:arg subjects: List of subjects to include
:arg exclude: List of subjects to exclude
:arg subjectExprs: List of subject inclusion expressions
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg exclude: List of subjects to exclude
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg njobs: Number of processes to use for parallelising tasks.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
:arg njobs: Number of processes to use for parallelising tasks.
:arg dryrun: If ``True`` the data is not loaded.
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
:returns: A tuple containing:
:arg dryrun: If ``True`` the data is not loaded.
:returns: A tuple containing:
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
- A list of :class:`.Column` objects that were not
loaded from each input file.
- A list of :class:`.Column` objects that were not
loaded from each input file.
"""
variables = filter.restrictVariables(cattable, variables, categories)
......@@ -152,7 +157,8 @@ def importData(fileinfo,
cols, drop = filter.columnsToLoad(fileinfo,
vartable,
variables,
colnames)
colnames,
excludeColnames)
# Load those columns, merging
# multiple input files.
......
......@@ -22,6 +22,14 @@ import funpack.loadtables as loadtables
log = logging.getLogger(__name__)
REMOVE_DUPLICATE_COLUMN_IDENTIFIER = '.REMOVE_DUPLICATE'
"""Identifier which is appended to the names of duplicate columns that
are to be removed. Use of this identifier is not hard-coded anywhere -
this module is just a convenient location for its definition. See
the :func:`funpack.main.doImport` function.
"""
def _ispattern(s):
"""Returns ``True`` if ``s`` looks like a ``fnmatch``-style pattern,
``False`` otherwise.
......@@ -58,20 +66,28 @@ def restrictVariables(cattable, variables, categories):
return variables
def columnsToLoad(fileinfo, vartable, variables, colnames):
def columnsToLoad(fileinfo,
vartable,
variables,
colnames=None,
excludeColnames=None):
"""Determines which columns should be loaded from ``datafiles``.
Peeks at the first line of the data file (assumed to contain column names),
then uses the variable table to determine which of them should be loaded.
:arg fileinfo: :class:`.FileInfo` object describing the input file(s).
:arg fileinfo: :class:`.FileInfo` object describing the input
file(s).
:arg vartable: Variable table
:arg vartable: Variable table
:arg variables: List of variables to load.
:arg variables: List of variables to load.
:arg colnames: List of column names/glob-style wildcard patterns,
specifying columns to load.
:arg colnames: List of column names/glob-style wildcard patterns,
specifying columns to load.
:arg excludeColnames: List of column name suffixes specifying columns to
exclude. This overrides ``colnames``.
:returns: A tuple containing:
......@@ -84,6 +100,9 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
*ignore*.
"""
if excludeColnames is None:
excludeColnames = []
# We apply these cleaning steps by
# omitting the relevant columns.
loadFuncNames = ['remove', 'keepVisits']
......@@ -131,6 +150,15 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
load[col.datafile].append(col)
continue
# excludeColnames takes precedence
# over all other column selection
# mechanisms
for suf in excludeColnames:
for col in list(cols):
if col.name.endswith(suf):
cols.remove(col)
drop.append(col)
# Figure out whether each
# column should be loaded.
# We load all columns which
......@@ -155,7 +183,7 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
# if there are any glob patterns, do
# an exhaustive search (*very* slow)
if any([_ispattern(c) for c in colnames]):
if any(_ispattern(c) for c in colnames):
for i, col in enumerate(cols):
hits = [fnmatch.fnmatch(col.name, pat) for pat in colnames]
loadflags[i] = loadflags[i] or any(hits)
......
......@@ -168,11 +168,25 @@ def doImport(args, mgr):
each input file.
"""
# if --remove_duplicates, we append
# an identifying suffix to the names
# of columns to be removed. This is
# then passed through as an exclusion
# pattern to the importData function
# via its excludeColnames option.
if args.remove_duplicates:
suffix = importing.REMOVE_DUPLICATE_COLUMN_IDENTIFIER
renameDuplicates = True
else:
suffix = None
renameDuplicates = args.rename_duplicates
finfo = fileinfo.FileInfo(args.infile,
indexes=args.index,
loaders=args.loader,
encodings=args.encoding,
renameDuplicates=args.rename_duplicates)
renameDuplicates=renameDuplicates,
renameSuffix=suffix)
with util.timed('Table import', log):
vartable, proctable, cattable, unknowns, uncategorised = \
......@@ -198,6 +212,8 @@ def doImport(args, mgr):
variables = args.variable
categories = args.category
columns = args.column
if suffix is None: excludeColnames = []
else: excludeColnames = [suffix]
# Import data
with util.timed('Data import', log):
......@@ -208,6 +224,7 @@ def doImport(args, mgr):
cattable=cattable,
variables=variables,
colnames=columns,
excludeColnames=excludeColnames,
categories=categories,
subjects=subjects,
subjectExprs=exprs,
......
%% Cell type:markdown id: tags:
# FUNPACK overview
![win logo](attachment:win.png)
> **Note:** If you have `funpack` installed, you can start an interactive
> version of this page by running `funpack_demo`.
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data.
You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules are built into `funpack` which are specific to the UK
BioBank data set. But you can control and customise everything that `funpack`
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.
**Important** The examples in this notebook assume that you have installed
`funpack` 2.8.0 or newer.
`funpack` 2.9.0 or newer.
%% Cell type:code id: tags:
``` bash
funpack -V
```
%% Cell type:markdown id: tags:
> _Note:_ If the above command produces a `NameError`, you may need to change
> the Jupyter Notebook kernel type to **Bash** - you can do so via the
> **Kernel -> Change Kernel** menu option.
## Contents
1. [Overview](#Overview)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
4. [Cleaning examples](#Cleaning-examples)
5. [Processing examples](#Processing-examples)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading---funpack-plugins)
7. [Miscellaneous topics](#Miscellaneous-topics)
## Overview
`funpack` performs the following steps:
### 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
> _Note:_ FUNPACK refers to UK Biobank **Data fields** as **variables**. The
> two terms can be considered equivalent.
### 2. Cleaning
The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced
with NA, for example, variables where a value of `-1` indicates *Do not
know*.
2. **Variable-specific cleaning functions:** Certain columns are
re-formatted; for example, the [ICD10](https://en.wikipedia.org/wiki/ICD-10)
disease codes can be converted to integer representations.
3. **Categorical recoding:** Certain categorical columns are re-coded.
4. **Child value replacement:** NA values within some columns which are
dependent upon other columns may have values inserted based on the values
of their parent columns.
### 3. Processing
During the processing stage, columns may be removed, merged, or expanded into
additional columns. For example, a categorical column may be expanded into a set
of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being
redundant with respect to another column.
### 4. Export
The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
## Examples
Throughout these examples, we are going to use a few command line
options, which you will probably **not** normally want to use:
- `-ow` (short for `--overwrite`): This tells `funpack` not to complain if
the output file already exists.
- `-q` (short for `--quiet`): This tells `funpack` to be quiet. Without the
`-q` option, `funpack` can be quite verbose, which can be annoying, but is
very useful when things go wrong. A good strategy is to tell `funpack` to
produce verbose output using the `--noisy` (`-n` for short) option, and to
send all of its output to a log file with the `--log_file` (or `-lf`)
option. For example:
> ```
> funpack -n -n -n -lf log.txt out.tsv in.tsv
> ```
%% Cell type:code id: tags:
``` bash
alias funpack="funpack -ow -q"
```
%% Cell type:markdown id: tags:
Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags:
``` bash
cat data_01.tsv
```
%% Cell type:markdown id: tags:
The numbers in each column name typically represent:
1. The variable ID
2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables.
Note that one **variable** is typically associated with several **columns**,
although we're keeping things simple for this first example - there is only
one visit for each variable, and there are no multi-valued variables.
> _Most but not all_ variables in the UK BioBank contain data collected at
> different visits, the times that the participants visited a UK BioBank
> assessment centre. However there are some variables (e.g. [ICD10 diagnosis
> codes](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=41202)) for which
> this is not the case.
## Import examples
### Selecting variables (columns)
You can specify which variables you want to load in the following ways, using
the `--variable` (`-v` for short), `--category` (`-c` for short) and
`--column` (`-co` for short) command line options:
* By variable ID
* By variable ranges
* By a text file which contains the IDs you want to keep.
* By pre-defined variable categories
* By column name
#### Selecting individual variables
Simply provide the IDs of the variables you want to extract:
%% Cell type:code id: tags:
``` bash
funpack -v 1 -v 5 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
#### Selecting variable ranges
The `--variable`/`-v` option accepts MATLAB-style ranges of the form
`start:step:stop` (where the `stop` is inclusive):
%% Cell type:code id: tags:
``` bash
funpack -v 1:3:10 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
#### Selecting variables with a file
If your variables of interest are listed in a plain-text file, you can simply
pass that file:
%% Cell type:code id: tags:
``` bash