FSL / funpack: commit 27aa52d9
authored Dec 28, 2021 by Paul McCarthy

Merge branch 'enh/remove-duplicates' into 'master'

ENH: New --remove_duplicates cli option

See merge request !84

parents d0eeace0 a95a08ca
CHANGELOG.rst

@@ -2,8 +2,16 @@ FUNPACK changelog
 =================

-2.9.0 (Under development)
--------------------------
+2.9.0 (Tuesday 28th December 2021)
+----------------------------------

 Added
 ^^^^^

+* New ``--remove_duplicates`` option, which causes columns with duplicate
+  names to be removed, with only the first retained.

 Changed

...
funpack/__init__.py

@@ -6,7 +6,7 @@
 #
-__version__ = '2.8.0'
+__version__ = '2.9.0'
 """The ``funpack`` versioning scheme roughly follows Semantic Versioning
 conventions.
 """

...
funpack/config.py

@@ -54,6 +54,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
                                         'default' : DEFAULT_MERGE_AXIS}),
         (('ms',  'merge_strategy'),    {'choices' : AVAILABLE_MERGE_STRATEGIES,
                                         'default' : DEFAULT_MERGE_STRATEGY}),
+        (('rm',  'remove_duplicates'), {'action'  : 'store_true'}),
         (('rd',  'rename_duplicates'), {'action'  : 'store_true'}),
         (('cfg', 'config_file'),       {'action'  : 'append'}),
         (('vf',  'variable_file'),     {'action'  : 'append'}),

...

@@ -224,6 +225,9 @@ CLI_ARGUMENT_HELP = {
         'Options are "naive", "intersection"/"inner", or "union"/'
         '"outer".'.format(DEFAULT_MERGE_STRATEGY),

+    'remove_duplicates' :
+    'Remove duplicate columns, only retaining the first.',

     'rename_duplicates' :
     'Rename any duplicate columns so that all columns have a unique name.',

...

@@ -686,6 +690,11 @@ def _prepareInputAndOutputFiles(args):
     if args.loader is not None:
         args.loader = {op.realpath(f) : n for f, n in args.loader}

+    # Remove/rename duplicates options are mutually exclusive
+    if args.remove_duplicates and args.rename_duplicates:
+        raise ValueError('Only one of --remove_duplicates and '
+                         '--rename_duplicates may be used.')
+
     # turn index indices into dict of
     # { file : [index] } mappings
     if args.index is not None:

...
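The two new flags are registered in `CLI_ARGUMENTS`/`CLI_ARGUMENT_HELP` and validated in `_prepareInputAndOutputFiles`, as shown above. Below is a minimal standalone sketch of the same behaviour using plain `argparse`; the parser here is illustrative only, and is not how FUNPACK actually builds its command line interface.

```python
# Illustrative sketch only: mirrors the two new boolean flags and the
# mutual-exclusion check added in this commit, using a throwaway parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-rm', '--remove_duplicates', action='store_true',
                    help='Remove duplicate columns, only retaining the first.')
parser.add_argument('-rd', '--rename_duplicates', action='store_true',
                    help='Rename any duplicate columns so that all columns '
                         'have a unique name.')

args = parser.parse_args(['--remove_duplicates'])

# Same check as the new code in _prepareInputAndOutputFiles: passing both
# flags (e.g. "-rm -rd") would trigger this ValueError.
if args.remove_duplicates and args.rename_duplicates:
    raise ValueError('Only one of --remove_duplicates and '
                     '--rename_duplicates may be used.')

print(args.remove_duplicates, args.rename_duplicates)   # True False
```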
funpack/fileinfo.py

@@ -298,7 +298,8 @@ class FileInfo:
                  indexes=None,
                  loaders=None,
                  encodings=None,
-                 renameDuplicates=False):
+                 renameDuplicates=False,
+                 renameSuffix=None):
        """Create a ``FileInfo`` object.

        :arg datafiles:        Path to input file, or sequence of paths.

...

@@ -311,6 +312,9 @@ class FileInfo:
                               specifying non-standard file encodings.

        :arg renameDuplicates: If ``True``, duplicate columns are re-named -
                               see :func:`renameDuplicateColumns`.
+
+       :arg renameSuffix:     Passed as ``suffix`` to
+                              :func:`renameDuplicateColumns`, if
+                              ``renameDuplicates is True``.
        """

        if isinstance(datafiles, str): datafiles = [datafiles]

...

@@ -322,7 +326,8 @@ class FileInfo:
                              indexes,
                              loaders,
                              encodings,
-                             renameDuplicates)
+                             renameDuplicates,
+                             renameSuffix=renameSuffix)

        self.__datafiles = list(datafiles)
        self.__indexes   = dict(indexes)

...

@@ -388,7 +393,8 @@ def fileinfo(datafiles,
             indexes=None,
             sniffers=None,
             encodings=None,
-             renameDuplicates=False):
+             renameDuplicates=False,
+             renameSuffix=None):
    """Identifies the format of each input data file, and extracts/generates
    column names and variable IDs for every column.

...

@@ -410,6 +416,10 @@ def fileinfo(datafiles,
                            which have the same name are renamed - see
                            :func:`renameDuplicateColumns`.

+   :arg renameSuffix:      Passed as ``suffix`` to
+                           :func:`renameDuplicateColumns`, if
+                           ``renameDuplicates is True``.
+
    :returns: A tuple containing:

               - List of ``csv`` dialect types

...

@@ -517,22 +527,28 @@ def fileinfo(datafiles,
                col.name = util.generateColumnName(vid, 0, 0)

    if renameDuplicates:
-        renameDuplicateColumns(it.chain(*cols))
+        renameDuplicateColumns(it.chain(*cols), suffix=renameSuffix)

    return dialects, headers, cols


-def renameDuplicateColumns(cols):
+def renameDuplicateColumns(cols, suffix=None):
    """Identifies any columns which have the same name, and re-names the
    subsequent ones. If ``N`` columns have the same name ``X``, they are
-   renamed ``X``, ``X.1``, ``X.2``, ``...``, ``X.<N-1>``.
+   renamed ``X``, ``X.1<suffix>``, ``X.2<suffix>``, ``...``,
+   ``X.<N-1><suffix>``.

    The ``name`` attribute of each :class:`.Column` object is modified
    in-place.

-   :arg cols: Sequence of :class:`.Column` objects.
+   :arg cols:   Sequence of :class:`.Column` objects.
+   :arg suffix: String to append to the name of all renamed columns.
+                Defaults to an empty string.
    """

+   if suffix is None:
+       suffix = ''
+
    counts = collections.defaultdict(list)

    for col in cols:

...

@@ -544,7 +560,7 @@ def renameDuplicateColumns(cols):
        counts[col.name].append(col)
        count = len(counts[col.name])

        if count > 1:
-           newname = '{}.{}'.format(col.name, count - 1)
+           newname = '{}.{}{}'.format(col.name, count - 1, suffix)
            col.name = newname
            log.warning('Duplicate column detected (%s: %s) - renamed to %s',

...
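To make the renaming scheme concrete, here is a standalone sketch of the behaviour documented above. `Column` below is a stand-in for FUNPACK's internal `.Column` class, and the body is a simplified copy of `renameDuplicateColumns`, not the real implementation.

```python
# Standalone sketch of the "X, X.1<suffix>, X.2<suffix>, ..." renaming scheme.
import collections

class Column:                      # stand-in for funpack's Column class
    def __init__(self, name):
        self.name = name

def rename_duplicate_columns(cols, suffix=None):
    if suffix is None:
        suffix = ''
    counts = collections.defaultdict(list)
    for col in cols:
        counts[col.name].append(col)
        count = len(counts[col.name])
        if count > 1:
            # the first occurrence keeps its name; later ones are renamed
            col.name = '{}.{}{}'.format(col.name, count - 1, suffix)

cols = [Column('1-0.0'), Column('1-0.0'), Column('1-0.0'), Column('2-0.0')]
rename_duplicate_columns(cols, suffix='.REMOVE_DUPLICATE')
print([c.name for c in cols])
# ['1-0.0', '1-0.0.1.REMOVE_DUPLICATE', '1-0.0.2.REMOVE_DUPLICATE', '2-0.0']
```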
funpack/importing/__init__.py

@@ -17,5 +17,6 @@ from .core import (importData,
                    MERGE_AXIS,
                    MERGE_STRATEGY,
                    MERGE_AXIS_OPTIONS,
-                   MERGE_STRATEGY_OPTIONS)                # noqa
-from .filter import (restrictVariables,)                  # noqa
+                   MERGE_STRATEGY_OPTIONS)                # noqa
+from .filter import (restrictVariables,
+                     REMOVE_DUPLICATE_COLUMN_IDENTIFIER)  # noqa
funpack/importing/core.py

@@ -66,6 +66,7 @@ def importData(fileinfo,
                cattable,
                variables=None,
                colnames=None,
+               excludeColnames=None,
                categories=None,
                subjects=None,
                subjectExprs=None,

...
@@ -89,61 +90,65 @@ def importData(fileinfo,
     3. Creates and returns a :class:`DataTable`.

-    :arg fileinfo:      :class:`.FileInfo` object describing the input file(s).
-    :arg vartable:      The data coding table
-    :arg proctable:     The processing table
-    :arg cattable:      The category table
-    :arg variables:     List of variable IDs to import
-    :arg colnames:      List of names/glob-style wildcard patterns
-                        specifying columns to import.
-    :arg categories:    List of category names to import
-    :arg subjects:      List of subjects to include
-    :arg subjectExprs:  List of subject inclusion expressions
-    :arg exclude:       List of subjects to exclude
-    :arg trustTypes:    If ``True``, it is assumed that columns with a
-                        known data type do not contain any bad/unparseable
-                        values. This improves performance, but will cause
-                        an error if the assumption does not hold.
-    :arg mergeAxis:     Merging axis to use when loading multiple data
-                        files - see the :func:`mergeData` function.
-    :arg mergeStrategy: Merging strategy to use when loading multiple
-                        data files - see the :func:`mergeData` function.
-    :arg indexVisits:   Re-arrange the data so that rows are indexed by
-                        subject ID and visit, rather than visits being
-                        split into separate columns. Only applied to
-                        variables which are labelled with Instancing 2.
-    :arg dropNaRows:    If ``True``, rows which do not contain data for any
-                        columns are not loaded.
-    :arg njobs:         Number of processes to use for parallelising tasks.
-    :arg mgr:           :class:`multiprocessing.Manager` object for
-                        parallelisation
-    :arg dryrun:        If ``True`` the data is not loaded.
-    :returns: A tuple containing:
-               - A :class:`DataTable`, which contains references
-                 to the data, and the variable and processing
-                 tables.
-               - A list of :class:`.Column` objects that were not
-                 loaded from each input file.
+    :arg fileinfo:        :class:`.FileInfo` object describing the input
+                          file(s).
+    :arg vartable:        The data coding table
+    :arg proctable:       The processing table
+    :arg cattable:        The category table
+    :arg variables:       List of variable IDs to import
+    :arg colnames:        List of names/glob-style wildcard patterns
+                          specifying columns to import.
+    :arg excludeColnames: List of column name suffixes specifying columns
+                          to exclude.
+    :arg categories:      List of category names to import
+    :arg subjects:        List of subjects to include
+    :arg subjectExprs:    List of subject inclusion expressions
+    :arg exclude:         List of subjects to exclude
+    :arg trustTypes:      If ``True``, it is assumed that columns with a
+                          known data type do not contain any bad/unparseable
+                          values. This improves performance, but will cause
+                          an error if the assumption does not hold.
+    :arg mergeAxis:       Merging axis to use when loading multiple data
+                          files - see the :func:`mergeData` function.
+    :arg mergeStrategy:   Merging strategy to use when loading multiple
+                          data files - see the :func:`mergeData` function.
+    :arg indexVisits:     Re-arrange the data so that rows are indexed by
+                          subject ID and visit, rather than visits being
+                          split into separate columns. Only applied to
+                          variables which are labelled with Instancing 2.
+    :arg dropNaRows:      If ``True``, rows which do not contain data for any
+                          columns are not loaded.
+    :arg njobs:           Number of processes to use for parallelising tasks.
+    :arg mgr:             :class:`multiprocessing.Manager` object for
+                          parallelisation
+    :arg dryrun:          If ``True`` the data is not loaded.
+    :returns: A tuple containing:
+               - A :class:`DataTable`, which contains references
+                 to the data, and the variable and processing
+                 tables.
+               - A list of :class:`.Column` objects that were not
+                 loaded from each input file.
     """
     variables = filter.restrictVariables(cattable, variables, categories)

...

@@ -152,7 +157,8 @@ def importData(fileinfo,
     cols, drop = filter.columnsToLoad(fileinfo,
                                       vartable,
                                       variables,
-                                      colnames)
+                                      colnames,
+                                      excludeColnames)

     # Load those columns, merging
     # multiple input files.

...
funpack/importing/filter.py

@@ -22,6 +22,14 @@ import funpack.loadtables as loadtables
 log = logging.getLogger(__name__)


+REMOVE_DUPLICATE_COLUMN_IDENTIFIER = '.REMOVE_DUPLICATE'
+"""Identifier which is appended to the names of duplicate columns that
+are to be removed. Use of this identifier is not hard-coded anywhere -
+this module is just a convenient location for its definition. See
+the :func:`funpack.main.doImport` function.
+"""
+
+
 def _ispattern(s):
     """Returns ``True`` if ``s`` looks like a ``fnmatch``-style pattern,
     ``False`` otherwise.

...

@@ -58,20 +66,28 @@ def restrictVariables(cattable, variables, categories):
     return variables


-def columnsToLoad(fileinfo, vartable, variables, colnames):
+def columnsToLoad(fileinfo,
+                  vartable,
+                  variables,
+                  colnames=None,
+                  excludeColnames=None):
     """Determines which columns should be loaded from ``datafiles``.

     Peeks at the first line of the data file (assumed to contain column names),
     then uses the variable table to determine which of them should be loaded.

-    :arg fileinfo:  :class:`.FileInfo` object describing the input file(s).
-    :arg vartable:  Variable table
-    :arg variables: List of variables to load.
-    :arg colnames:  List of column names/glob-style wildcard patterns,
-                    specifying columns to load.
+    :arg fileinfo:        :class:`.FileInfo` object describing the input
+                          file(s).
+    :arg vartable:        Variable table
+    :arg variables:       List of variables to load.
+    :arg colnames:        List of column names/glob-style wildcard patterns,
+                          specifying columns to load.
+    :arg excludeColnames: List of column name suffixes specifying columns to
+                          exclude. This overrides ``colnames``.

     :returns: A tuple containing:

...

@@ -84,6 +100,9 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
                *ignore*.
     """

+    if excludeColnames is None:
+        excludeColnames = []
+
     # We apply these cleaning steps by
     # omitting the relevant columns.
     loadFuncNames = ['remove', 'keepVisits']

...

@@ -131,6 +150,15 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
                 load[col.datafile].append(col)
             continue

+        # excludeColnames takes precedence
+        # over all other column selection
+        # mechanisms
+        for suf in excludeColnames:
+            for col in list(cols):
+                if col.name.endswith(suf):
+                    cols.remove(col)
+                    drop.append(col)
+
         # Figure out whether each
         # column should be loaded.
         # We load all columns which

...

@@ -155,7 +183,7 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
         # if there are any glob patterns, do
         # an exhaustive search (*very* slow)
-        if any([_ispattern(c) for c in colnames]):
+        if any(_ispattern(c) for c in colnames):
             for i, col in enumerate(cols):
                 hits = [fnmatch.fnmatch(col.name, pat) for pat in colnames]
                 loadflags[i] = loadflags[i] or any(hits)

...
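The effect of `excludeColnames` on column selection can be illustrated with a small standalone sketch. The real `columnsToLoad` also consults the variable table and per-file `Column` objects, so this only shows the selection order (suffix-based exclusion first, then `fnmatch`-style patterns); the column names are made up.

```python
# Simplified sketch of the selection order in columnsToLoad: suffix-based
# exclusion takes precedence, then glob-style colnames patterns are matched.
import fnmatch

available       = ['1-0.0', '1-1.0', '2-0.0',
                   '2-0.0.1.REMOVE_DUPLICATE', '3-0.0']
colnames        = ['1-*', '2-0.0']        # patterns of columns to load
excludeColnames = ['.REMOVE_DUPLICATE']   # name suffixes to drop

# excludeColnames overrides everything else
candidates = [c for c in available
              if not any(c.endswith(suf) for suf in excludeColnames)]

# glob matching (the real code only does this exhaustive search when at
# least one pattern is present, as it is slow on wide files)
load = [c for c in candidates
        if any(fnmatch.fnmatch(c, pat) for pat in colnames)]

print(load)   # ['1-0.0', '1-1.0', '2-0.0']
```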
funpack/main.py

@@ -168,11 +168,25 @@ def doImport(args, mgr):
               each input file.
     """

+    # if --remove_duplicates, we append
+    # an identifying suffix to the names
+    # of columns to be removed. This is
+    # then passed through as an exclusion
+    # pattern to the importData function
+    # via its excludeColnames option.
+    if args.remove_duplicates:
+        suffix           = importing.REMOVE_DUPLICATE_COLUMN_IDENTIFIER
+        renameDuplicates = True
+    else:
+        suffix           = None
+        renameDuplicates = args.rename_duplicates
+
     finfo = fileinfo.FileInfo(args.infile,
                               indexes=args.index,
                               loaders=args.loader,
                               encodings=args.encoding,
-                              renameDuplicates=args.rename_duplicates)
+                              renameDuplicates=renameDuplicates,
+                              renameSuffix=suffix)

     with util.timed('Table import', log):
         vartable, proctable, cattable, unknowns, uncategorised = \

...

@@ -198,6 +212,8 @@ def doImport(args, mgr):
     variables  = args.variable
     categories = args.category
     columns    = args.column

+    if suffix is None: excludeColnames = []
+    else:              excludeColnames = [suffix]
+
     # Import data
     with util.timed('Data import', log):

...

@@ -208,6 +224,7 @@ def doImport(args, mgr):
                               cattable=cattable,
                               variables=variables,
                               colnames=columns,
+                              excludeColnames=excludeColnames,
                               categories=categories,
                               subjects=subjects,
                               subjectExprs=exprs,

...
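Putting the pieces together: `--remove_duplicates` works by renaming duplicate columns with the `.REMOVE_DUPLICATE` suffix and then excluding every column carrying that suffix, so only the first occurrence of each name is loaded. A standalone sketch of that net effect, using plain strings rather than FUNPACK's `Column` objects:

```python
# End-to-end sketch of the --remove_duplicates behaviour: tag duplicates
# with the identifying suffix, then drop everything carrying that suffix.
import collections

REMOVE_DUPLICATE_COLUMN_IDENTIFIER = '.REMOVE_DUPLICATE'

names  = ['eid', '1-0.0', '1-0.0', '2-0.0', '1-0.0']
counts = collections.defaultdict(int)
tagged = []

for name in names:
    counts[name] += 1
    if counts[name] > 1:
        # same scheme as renameDuplicateColumns: X, X.1<suffix>, X.2<suffix>, ...
        name = '{}.{}{}'.format(name, counts[name] - 1,
                                REMOVE_DUPLICATE_COLUMN_IDENTIFIER)
    tagged.append(name)

kept = [n for n in tagged
        if not n.endswith(REMOVE_DUPLICATE_COLUMN_IDENTIFIER)]

print(kept)   # ['eid', '1-0.0', '2-0.0'] - only the first occurrences remain
```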
funpack/scripts/demo/funpack_demonstration.ipynb
%% Cell type:markdown id: tags:
# FUNPACK overview

> **Note:** If you have `funpack` installed, you can start an interactive
> version of this page by running `funpack_demo`.
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data sets.

You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.

A large number of rules are built into `funpack` which are specific to the UK
BioBank data set. But you can control and customise everything that `funpack`
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.

**Important** The examples in this notebook assume that you have installed
-`funpack` 2.8.0 or newer.
+`funpack` 2.9.0 or newer.
%% Cell type:code id: tags:
```bash
funpack -V
```
%% Cell type:markdown id: tags:
> _Note:_ If the above command produces a `NameError`, you may need to change
> the Jupyter Notebook kernel type to **Bash** - you can do so via the
> **Kernel -> Change Kernel** menu option.
## Contents

1. [Overview](#Overview)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
4. [Cleaning examples](#Cleaning-examples)
5. [Processing examples](#Processing-examples)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading-funpack---plugins)
7. [Miscellaneous topics](#Miscellaneous-topics)
## Overview

`funpack` performs the following steps:
### 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
> _Note:_ FUNPACK refers to UK Biobank **Data fields** as **variables**. The
> two terms can be considered equivalent.
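To give a feel for the import step described above, here is a minimal pandas sketch of index-based merging (illustrative only; `funpack` performs the merge for you, and you do not need to use pandas). Two subject-indexed input files can be merged on the subject-ID index, keeping either the intersection or the union of subjects:

```python
# Illustrative only: how index-based merging of two input files behaves.
import pandas as pd

a = pd.DataFrame({'eid': [1, 2, 3], '1-0.0': [10, 20, 30]}).set_index('eid')
b = pd.DataFrame({'eid': [2, 3, 4], '2-0.0': [0.1, 0.2, 0.3]}).set_index('eid')

intersection = a.join(b, how='inner')   # subjects present in both files
union        = a.join(b, how='outer')   # all subjects, NaN where data missing

print(intersection)
print(union)
```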
### 2. Cleaning
The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced
   with NA, for example, variables where a value of `-1` indicates *Do not
   know*.

2. **Variable-specific cleaning functions:** Certain columns are
   re-formatted; for example, the [ICD10](https://en.wikipedia.org/wiki/ICD-10)
   disease codes can be converted to integer representations.

3. **Categorical recoding:** Certain categorical columns are re-coded.

4. **Child value replacement:** NA values within some columns which are
   dependent upon other columns may have values inserted based on the values
   of their parent columns.
### 3. Processing
During the processing stage, columns may be removed, merged, or expanded into
additional columns. For example, a categorical column may be expanded into a set
of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being
redundant with respect to another column.
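As an illustration of the categorical expansion mentioned above, here is a minimal pandas sketch (again, this is not how you use `funpack`; its processing functions do this internally, and the column name below is made up):

```python
# Illustrative only: expanding a categorical column into binary columns.
import pandas as pd

df = pd.DataFrame({'eid':    [1, 2, 3, 4],
                   '10-0.0': ['a', 'b', 'a', 'c']}).set_index('eid')

expanded = pd.get_dummies(df['10-0.0'], prefix='10-0.0')
print(expanded)   # one 0/1 column per category: 10-0.0_a, 10-0.0_b, 10-0.0_c
```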
### 4. Export

The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
## Examples

Throughout these examples, we are going to use a few command line options,
which you will probably **not** normally want to use:

- `-ow` (short for `--overwrite`): This tells `funpack` not to complain if
  the output file already exists.
- `-q` (short for `--quiet`): This tells `funpack` to be quiet. Without the
  `-q` option, `funpack` can be quite verbose, which can be annoying, but is
  very useful when things go wrong. A good strategy is to tell `funpack` to
  produce verbose output using the `--noisy` (`-n` for short) option, and to
  send all of its output to a log file with the `--log_file` (or `-lf`)
  option. For example:

> ```
> funpack -n -n -n -lf log.txt out.tsv in.tsv
> ```
%% Cell type:code id: tags:
```bash
alias funpack="funpack -ow -q"
```
%% Cell type:markdown id: tags:
Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags:
```bash
cat data_01.tsv
```
%% Cell type:markdown id: tags:
The numbers in each column name typically represent:

1. The variable ID
2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables.

Note that one **variable** is typically associated with several **columns**,
although we're keeping things simple for this first example - there is only
one visit for each variable, and there are no multi-valued variables (a small
sketch of how these names break down follows the note below).
> _Most but not all_ variables in the UK BioBank contain data collected at
> different visits, the times that the participants visited a UK BioBank
> assessment centre. However there are some variables (e.g. [ICD10 diagnosis
> codes](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=41202)) for which
> this is not the case.
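Here is the small sketch promised above, showing how a UK BioBank-style column name breaks down into variable ID, visit, and instance. The exact `"<variable>-<visit>.<instance>"` format is assumed from the description above; FUNPACK has its own internal handling of column names.

```python
# Illustrative parser for column names of the assumed form
# "<variable>-<visit>.<instance>", e.g. "1-0.0".
def parse_column_name(name):
    variable, rest  = name.split('-')
    visit, instance = rest.split('.')
    return int(variable), int(visit), int(instance)

print(parse_column_name('1-0.0'))   # (1, 0, 0)
print(parse_column_name('4-2.1'))   # (4, 2, 1)
```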
## Import examples
### Selecting variables (columns)
You can specify which variables you want to load in the following ways, using
the `--variable` (`-v` for short), `--category` (`-c` for short) and
`--column` (`-co` for short) command line options:

* By variable ID
* By variable ranges
* By a text file which contains the IDs you want to keep.
* By pre-defined variable categories
* By column name

#### Selecting individual variables

Simply provide the IDs of the variables you want to extract:
%% Cell type:code id: tags:
```bash
funpack -v 1 -v 5 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
#### Selecting variable ranges

The `--variable`/`-v` option accepts MATLAB-style ranges of the form
`start:step:stop` (where the `stop` is inclusive):
%% Cell type:code id: tags:
```bash
funpack -v 1:3:10 out.tsv data_01.tsv
cat out.tsv
```
%% Cell type:markdown id: tags:
#### Selecting variables with a file
If your variables of interest are listed in a plain-text file, you can simply
pass that file:
%% Cell type:code id: tags:
```bash