Commit e29b3ae4 authored by Paul McCarthy 🚵

Merge branch 'rf/redundant-discount-unselected-datafields' into 'master'

ENH: Option to ignore redundancy check for uncategorised/unknown columns

See merge request !82
parents 2bd56dd4 f3d362ab
FUNPACK changelog
=================
2.8.0 (Thursday 19th August 2021)
---------------------------------
Added
^^^^^
* New ``skipUnknowns`` option to the ``removeIfRedundant`` processing function,
whereby if a column is found to be redundant with respect to a column of an
unknown or uncategorised data field, it is *not* dropped. This option is
used in the ``fmrib`` configuration.
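For example (this is the rule used in the ``fmrib`` processing table)::

    all removeIfRedundant(0.99, 0.2, skipUnknowns=True)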
Changed
^^^^^^^
* Added a recoding rule for data coding `100348
<https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=100348>`_ to the
``fmrib`` configuration.
2.7.1 (Tuesday 22nd June 2021)
......
......@@ -257,8 +257,8 @@ names from the ``table`` object like so::
colnames = vertcat(colnames{:});
If you have used the ``--write_description`` or``--description_file`` options,
you can load in the descriptions for each column as follows::
If you have used the ``--write_description`` or ``--description_file``
options, you can load in the descriptions for each column as follows::
descs = readtable('out_descriptions.tsv', ...
'FileType', 'text', ...
......
......@@ -6,7 +6,7 @@
#
__version__ = '2.7.1'
__version__ = '2.8.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
ID Category Variables
1 age, sex, brain MRI protocol, Phase 31,34,21022,22200,25780
2 genetics 21000,22000:22125,22190,22191,22194,22201:22325,22182,22800:22823
2 genetics 21000,22000:22125,22190:22194,22201:22325,22182,22800:22823
3 early life factors 52,129,130,1677,1687,1697,1737,1767,1777,1787,21066,20022
10 lifestyle and environment - general 3:6,132,189,670,680,699,709,728,738,767,777,1031,1797,1807,1835,1845,1873,1883,2139,2149,2159,2237,2375,2385,2395,2405,2267,2277,2714:10:2834,2946,3526,3536,3546,3581,3591,3659,3669,3700,3710,3720,3829,3839,3849,3872,3882,3912,3942,3972,3982,4501,4674,4825,4836,5057,6138,6142,6139:6141,6145:6146,6160,10016,10105,10114,10721,10722,10740,10749,10860,10877,10886,20074:20075,20107,20110:20113,20118:20119,20121,22501,22599,22606,22700,22702,22704,24003:24021,24024,24500:24508,26410:26434
11 lifestyle and environment - exercise and work 1001,1011,796,806,816,826,845,864,874,884,894,904,914,924,943,971,981,991,1021,1050:10:1220,2624,2634,3426,3637,3647,6143,6162,6164,10953,10962,10971,22604,22605,22607:22615,22620,22630,22631,22640:22655,104900,104910,104920
......
......@@ -24,6 +24,7 @@ ID RawLevels NewLevels
100338 1,0,2 0,1,2
100345 1,2 2,1
100347 0,2,1 0,1,2
100348 4,3,2,1 1,2,3,4
100402 1,2,3,4,5,6 5,4,3,2,1,0
100499 11,12,13 1,2,3
100605 -7 0
......
......@@ -33,5 +33,7 @@ all_except,6150,6155,20001,20002,20003,20004,20199,40001,40002,40006,40011,40012
41202,41203,41270,41271 removeIfSparse(minpres=10)
# Drop columns which are correlated with other columns (the one
# with more missing values is dropped).
all removeIfRedundant(0.99, 0.2)
# with more missing values is dropped). Columns which are
# redundant w.r.t. unknown/uncategorised data fields are
# not removed (skipUnknowns=True).
all removeIfRedundant(0.99, 0.2, skipUnknowns=True)
......@@ -157,7 +157,7 @@ def formatCompound(dtable, column, series, delim=','):
if isinstance(sample, str):
return series
if not isinstance(sample, (np.ndarray, collections.Sequence)):
if not isinstance(sample, (np.ndarray, collections.abc.Sequence)):
return series
def fmt(val):
......
......@@ -174,7 +174,7 @@ def convert_Process_Variable(val : str) -> Tuple[str, List[int]]:
if len(tokens) == 1 and \
tokens[0] in ('all', 'all_independent',
'all_except', 'all_independent_except'):
return tokens[0], []
return np.array((tokens[0], []), dtype=object)
if tokens[0] in ('independent', 'all_except', 'all_independent_except'):
ptype = tokens[0]
......@@ -182,7 +182,9 @@ def convert_Process_Variable(val : str) -> Tuple[str, List[int]]:
else:
ptype = 'vids'
return ptype, list(it.chain(*[util.parseMatlabRange(t) for t in tokens]))
vids = list(it.chain(*[util.parseMatlabRange(t) for t in tokens]))
return np.array((ptype, vids), dtype=object)
def convert_Process(
......@@ -591,9 +593,10 @@ def loadVariableTable(
# interfere with subsequent rule assignments (as
# these series may contain numpy arrays or
# list-likes)
for col in ('NAValues', 'RawLevels', 'NewLevels', 'ChildValues'):
newseries = vartable.apply(convert, axis=1, args=(col,))
vartable[col] = newseries.astype(object)
if len(vartable) > 0:
for col in ('NAValues', 'RawLevels', 'NewLevels', 'ChildValues'):
newseries = vartable.apply(convert, axis=1, args=(col,))
vartable[col] = newseries.astype(object)
# Before merging the cleaning functions
# in, we generate a list of variables
......@@ -716,7 +719,7 @@ def loadTableBases() -> Tuple[pd.DataFrame, pd.DataFrame]:
dcbase = pd.DataFrame({'ID' : encodings['encoding_id']}).set_index('ID')
varbase = pd.DataFrame({
'ID' : fields['field_id'],
'ID' : fields['field_id'].astype(np.uint64),
'Type' : fields['value_type'].combine(fields['base_type'],
settype),
'Description' : fields['title'],
......@@ -782,7 +785,8 @@ def mergeTableFiles(
log.debug('Loading %s table from %s', what, f)
table = pd.read_csv(f, '\t',
table = pd.read_csv(f,
sep='\t',
index_col=0,
dtype=dtypes,
converters=converters)
......@@ -805,14 +809,12 @@ def mergeTableFiles(
# non-na values in later files take
# precedence.
for c in [c[:-2] for c in merged.columns if c.endswith('_x')]:
bcolname = c + '_x'
tcolname = c + '_y'
bcol = merged[bcolname]
tcol = merged[tcolname]
notna = tcol.notna()
bcol.loc[notna] = tcol[notna]
merged[c] = bcol
merged = merged.drop(columns=[bcolname, tcolname])
bname = c + '_x'
tname = c + '_y'
merged[c] = merged[bname]
notna = merged[tname].notna()
merged.loc[notna, c] = merged.loc[notna, tname]
merged = merged.drop(columns=[bname, tname])
base = merged
# no base, and no files
......@@ -1117,7 +1119,8 @@ def loadProcessingTable(
if (procfile is not None) and (not skipProcessing):
log.debug('Loading processing table from %s', procfile)
proctable = pd.read_csv(procfile, '\t',
proctable = pd.read_csv(procfile,
sep='\t',
index_col=False,
skip_blank_lines=True,
comment='#',
......@@ -1132,12 +1135,14 @@ def loadProcessingTable(
for i, (vids, procs) in enumerate(prependProcess):
vids = convert_Process_Variable(vids)
procs = convert_Process('processor', procs)
proctable.loc[i, ['Variable', 'Process']] = [vids, procs]
proctable.at[i, 'Variable'] = np.array(vids, dtype=object)
proctable.at[i, 'Process'] = procs
for i, (vids, procs) in enumerate(appendProcess, len(proctable.index)):
vids = convert_Process_Variable(vids)
procs = convert_Process('processor', procs)
proctable.loc[i, ['Variable', 'Process']] = [vids, procs]
proctable.at[i, 'Variable'] = np.array(vids, dtype=object)
proctable.at[i, 'Process'] = procs
proctable.sort_index(inplace=True)
......@@ -1152,7 +1157,7 @@ def loadCategoryTable(catfile : str = None) -> pd.DataFrame:
if catfile is not None:
log.debug('Loading category table from %s', catfile)
cattable = pd.read_csv(catfile,
'\t',
sep='\t',
index_col=0,
dtype=CATTABLE_DTYPES,
converters=CATTABLE_CONVERTERS)
......
......@@ -319,7 +319,8 @@ def splitDataTable(dtable, args):
# if suppress, only numeric columns
# are saved to main output file
if args.suppress_non_numerics:
log.debug('Separating out %u numeric columns for export', len(ncols))
log.debug('Separating out %u / %u numeric columns for export',
len(ncols), len(dtable.dataColumns))
dtables.append((dtable.subtable(ncols), args.outfile))
else:
dtables.append((dtable, args.outfile))
......@@ -327,7 +328,8 @@ def splitDataTable(dtable, args):
# if write, non-numeric columns
# are saved to an auxiliary file
if args.write_non_numerics:
log.debug('Separating out %u non-numeric columns for export', len(nncols))
log.debug('Separating out %u / %u non-numeric columns for export',
len(nncols), len(dtable.dataColumns))
dtables.append((dtable.subtable(nncols), args.non_numerics_file))
return dtables
......@@ -519,10 +521,10 @@ def doSummaryExport(dtable, args):
vartable = dtable.vartable
vids = sorted(dtable.variables)[1:]
sumdf = pd.DataFrame(columns=['ID', 'NAValues',
'RawLevels', 'NewLevels',
sumdf = pd.DataFrame(columns=['NAValues', 'RawLevels', 'NewLevels',
'ParentValues', 'ChildValues',
'Clean', 'Flags']).set_index('ID')
'Clean', 'Flags'], index=vids)
sumdf.index.name = 'ID'
with util.timed('Summary export', log):
for vid in vids:
......
......@@ -76,6 +76,8 @@ from . import processing_functions_core as core
from . import util
from . import custom
from . import datatable
from . import loadtables
log = logging.getLogger(__name__)
......@@ -150,17 +152,29 @@ def removeIfSparse(
@custom.processor()
def removeIfRedundant(dtable, vids, corrthres, nathres=None, pairwise=False):
def removeIfRedundant(dtable : datatable.DataTable,
vids : List[int],
corrthres : float,
nathres : Optional[float] = None,
pairwise : Optional[bool] = False,
skipUnknowns : Optional[bool] = False):
"""removeIfRedundant(corrthres, [nathres], [pairwise])
Removes columns deemed to be redundant.
Removes columns from the variables in ``vids`` if they are redundant.
Redundancy is determined by calculating the correlation between pairs
of columns - see the :func:`.isRedundant` function.
of columns using ``corrthres`` and ``nathres`` - see the :func:`.isRedundant`
function.
If ``pairwise`` is ``True``, an alternative implementation is used which
may be faster on data sets with high missingness correlation.
:arg pairwise: Use alternative pairwise implementation.
:arg skipUnknowns: If ``True``, columns which are deemed to be redundant
with respect to an unknown or uncategorised column
are *not* dropped.
"""
# Ignore non-numeric columns
......@@ -169,19 +183,37 @@ def removeIfRedundant(dtable, vids, corrthres, nathres=None, pairwise=False):
colnames = [c.name for c in cols]
data = dtable[:, colnames]
if pairwise:
redundant = _pairwiseRemoveIfRedundant(
dtable, data, corrthres, nathres)
else:
redundant = _removeIfRedundant(dtable, data, corrthres, nathres)
with np.errstate(divide='ignore'):
if pairwise:
redundant = _pairwiseRemoveIfRedundant(
dtable, data, corrthres, nathres)
else:
redundant = _removeIfRedundant(
dtable, data, corrthres, nathres)
redundant = util.dedup(sorted(redundant))
if skipUnknowns:
copy = []
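# Drop column a only if the column b, with respect
# to which it is redundant, is not an unknown or
# uncategorised data field.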
for idxa, idxb in redundant:
colb = cols[idxb]
bvid = colb.vid
cats = loadtables.variableCategories(dtable.cattable, [bvid])[bvid]
if 'unknown' in cats or 'uncategorised' in cats:
namea = colnames[idxa]
nameb = colnames[idxb]
log.debug('Column %s is redundant with %s, but %s is '
'unknown / uncategorised, so %s will not be '
'dropped', namea, nameb, nameb, namea)
else:
copy.append((idxa, idxb))
redundant = copy
if len(redundant) > 0:
log.debug('Dropping %u redundant columns: %s ...',
len(redundant), redundant[:5])
log.debug('Dropping %u redundant columns', len(redundant))
return [cols[r] for r in redundant]
return [cols[r[0]] for r in redundant]
def _removeIfRedundant(dtable, data, corrthres, nathres=None):
......@@ -193,7 +225,9 @@ def _removeIfRedundant(dtable, data, corrthres, nathres=None):
:arg corrthres: Correlation threshold - see :func:`.redundantColumns`.
:arg nathres: Missingness correlation threshold - see
:func:`.redundantColumns`.
:returns: A sequence of indices denoting the redundant columns.
:returns: Sequence of tuples of column indices, where each tuple
``(a, b)`` indicates that column ``a`` is redundant with
respect to column ``b``.
"""
return core.matrixRedundantColumns(data, corrthres, nathres)
......@@ -208,7 +242,9 @@ def _pairwiseRemoveIfRedundant(dtable, data, corrthres, nathres=None):
:arg corrthres: Correlation threshold - see :func:`.redundantColumns`.
:arg nathres: Missingness correlation threshold - see
:func:`.redundantColumns`.
:returns: A sequence of indices denoting the redundant columns.
:returns: Sequence of tuples of column indices, where each tuple
``(a, b)`` indicates that column ``a`` is redundant with
respect to column ``b``.
"""
ncols = len(data.columns)
......@@ -409,7 +445,7 @@ def binariseCategorical(dtable,
# if take is a single vid or None,
# we turn it into [take] * len(vids)
if not isinstance(take, collections.Sequence):
if not isinstance(take, collections.abc.Sequence):
take = [take] * len(vids)
if len(take) != len(vids):
......
......@@ -248,7 +248,7 @@ def pairwiseRedundantColumns(
colpairs : np.ndarray,
corrthres : float,
token : Optional[str] = None
) -> List[int]:
) -> List[Tuple[int, int]]:
"""Identifies redundant columns based on their correlation with each
other by comparing each pair of columns one by one.
......@@ -262,7 +262,9 @@ def pairwiseRedundantColumns(
:arg token: Identifier string for log messages.
:returns: Sequence of redundant column indices.
:returns: Sequence of tuples of column indices, where each tuple
``(a, b)`` indicates that column ``a`` is redundant with
respect to column ``b``.
"""
if len(colpairs) == 0:
......@@ -271,7 +273,7 @@ def pairwiseRedundantColumns(
if token is None: token = ''
else: token = '[{}] '.format(token)
redundant = set()
redundant = {}
nacounts = data.isna().sum(axis=0).to_numpy()
# calculate correlation between column pairs
......@@ -292,17 +294,18 @@ def pairwiseRedundantColumns(
if nacounts[coli] > nacounts[colj]: drop, keep = coli, colj
else: drop, keep = colj, coli
log.debug('%sColumn %s is redundant (correlation with %s: %f)',
token, data.columns[drop], data.columns[keep], corr)
redundant.add(drop)
if drop not in redundant:
log.debug('%sColumn %s is redundant (correlation with %s: %f)',
token, data.columns[drop], data.columns[keep], corr)
redundant[drop] = keep
return list(redundant)
return list(redundant.items())
def matrixRedundantColumns(
data : pd.DataFrame,
corrthres : float,
nathres : Optional[float] = None) -> np.ndarray:
nathres : Optional[float] = None) -> List[Tuple[int, int]]:
"""Identifies redundant columns based on their correlation with each
other using dot products to calculate a correlation matrix.
......@@ -316,7 +319,9 @@ def matrixRedundantColumns(
``corrthres`` *and* a missing-value correlation greater
than ``nathres`` to be identified as redundant.
:returns: Sequence of redundant column indices.
:returns: Sequence of tuples of column indices, where each tuple
``(a, b)`` indicates that column ``a`` is redundant with
respect to column ``b``.
"""
if len(data.columns) < 2:
......@@ -396,7 +401,7 @@ def matrixRedundantColumns(
# for each correlated pair, we flag the
# one with more missing values as redundant
def mostna(coli, colj):
def correlatedPairs(coli, colj):
if nacounts[coli] > nacounts[colj]: drop, keep = coli, colj
else: drop, keep = colj, coli
......@@ -407,12 +412,25 @@ def matrixRedundantColumns(
log.debug('Column %s is redundant (correlation with %s: %0.6f)',
columns[drop], columns[keep], corr)
return drop
return drop, keep
mostna = np.vectorize(mostna, [np.uint32])
redundant = mostna(colpairs[0], colpairs[1])
# Generate a sequence of pairs, where the first
# element is the column to drop, and the second
# element is the column it is redundant w.r.t.
correlatedPairs = np.vectorize(correlatedPairs, [np.uint32, np.uint32])
colsa, colsb = correlatedPairs(colpairs[0], colpairs[1])
return np.unique(redundant)
if len(colsa) == 0:
return []
# Return only one pair for each column
# that is to be dropped (the first
# pair in the natural column ordering).
idxs = np.unique(colsa, return_index=True)[1]
colsa = colsa[idxs]
colsb = colsb[idxs]
return list(zip(colsa, colsb))
def binariseCategorical(
......
%% Cell type:markdown id: tags:
# FUNPACK overview
![win logo](attachment:win.png)
> **Note:** If you have `funpack` installed, you can start an interactive
> version of this page by running `funpack_demo`.
`funpack` is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data.
You can give `funpack` one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules are built into `funpack` which are specific to the UK
BioBank data set. But you can control and customise everything that `funpack`
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.
**Important** The examples in this notebook assume that you have installed
`funpack` 2.7.1 or newer.
`funpack` 2.8.0 or newer.
%% Cell type:code id: tags:
``` bash
funpack -V
```
%% Cell type:markdown id: tags:
> _Note:_ If the above command produces a `NameError`, you may need to change
> the Jupyter Notebook kernel type to **Bash** - you can do so via the
> **Kernel -> Change Kernel** menu option.
## Contents
1. [Overview](#Overview)
2. [Examples](#Examples)
3. [Import examples](#Import-examples)
4. [Cleaning examples](#Cleaning-examples)
5. [Processing examples](#Processing-examples)
6. [Custom cleaning, processing and loading - funpack plugins](#Custom-cleaning,-processing-and-loading---funpack-plugins)
7. [Miscellaneous topics](#Miscellaneous-topics)
## Overview
`funpack` performs the following steps:
### 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and
the data files are merged into a single table (a.k.a. data frame). Multiple
files can be merged according to an index column (e.g. subject ID). Or, if the
input files contain the same columns/subjects, they can be naively
concatenated along rows or columns.
> _Note:_ FUNPACK refers to UK Biobank **Data fields** as **variables**. The
> two terms can be considered equivalent.
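As a rough illustration of the index-based merge, here is a minimal pandas
sketch (not `funpack`'s actual implementation; `data_02.tsv` is a hypothetical
second input file):

``` python
import pandas as pd

# Load two input files, using the first column
# (the subject ID) as the index.
a = pd.read_csv('data_01.tsv', sep='\t', index_col=0)
b = pd.read_csv('data_02.tsv', sep='\t', index_col=0)

# Merge on the index, keeping all subjects that
# appear in either file (assumes the two files
# contain different columns).
merged = a.join(b, how='outer')
```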
### 2. Cleaning
The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced
with NA, for example, variables where a value of `-1` indicates *Do not
know*.
2. **Variable-specific cleaning functions:** Certain columns are
re-formatted; for example, the [ICD10](https://en.wikipedia.org/wiki/ICD-10)
disease codes can be converted to integer representations.
3. **Categorical recoding:** Certain categorical columns are re-coded (see
   the sketch after this list).
4. **Child value replacement:** NA values within some columns which are
dependent upon other columns may have values inserted based on the values
of their parent columns.
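As a sketch of what a categorical recoding rule does, here is the rule for
data coding [100348](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=100348)
(added in 2.8.0), written out in plain pandas (an illustration, not
`funpack`'s internal code):

``` python
import pandas as pd

# Data coding 100348: raw levels 4,3,2,1
# are recoded to new levels 1,2,3,4.
raw     = pd.Series([4, 3, 2, 1, 3])
recoded = raw.map({4: 1, 3: 2, 2: 3, 1: 4})
```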
### 3. Processing
During the processing stage, columns may be removed, merged, or expanded into
additional columns. For example, a categorical column may be expanded into a set
of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being
redundant with respect to another column.
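The redundancy check can be sketched roughly as follows (a simplified
illustration of the idea behind `removeIfRedundant`, not its actual
implementation):

``` python
import pandas as pd

df        = pd.DataFrame({'a' : [1, 2, 3, 4, None],
                          'b' : [1, 2, 3, 4, 5]})
corrthres = 0.99

# If two columns are correlated above the threshold,
# the one with more missing values is flagged as
# redundant ('a' in this example).
if df['a'].corr(df['b']) > corrthres:
    if df['a'].isna().sum() > df['b'].isna().sum():
        redundant = 'a'
    else:
        redundant = 'b'
```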
### 4. Export
The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
## Examples
Throughout these examples, we are going to use a few command line
options, which you will probably **not** normally want to use:
- `-ow` (short for `--overwrite`): This tells `funpack` not to complain if
the output file already exists.
- `-q` (short for `--quiet`): This tells `funpack` to be quiet. Without the
`-q` option, `funpack` can be quite verbose, which can be annoying, but is
very useful when things go wrong. A good strategy is to tell `funpack` to
produce verbose output using the `--noisy` (`-n` for short) option, and to
send all of its output to a log file with the `--log_file` (or `-lf`)
option. For example:
> ```
> funpack -n -n -n -lf log.txt out.tsv in.tsv
> ```
%% Cell type:code id: tags:
``` bash
alias funpack="funpack -ow -q"
```
%% Cell type:markdown id: tags:
Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags:
``` bash
cat data_01.tsv
```
%% Cell type:markdown id: tags:
The numbers in each column name typically represent:
1. The variable ID
2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables.
Note that one **variable** is typically associated with several **columns**,
although we're keeping things simple for this first example - there is only
one visit for each variable, and there are no multi-valued variables.
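For example, a (made-up) column named `123-1.2` would contain data for
variable `123`, collected at visit `1`, instance `2`.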
> _Most but not all_ variables in the UK BioBank contain data collected at
> different visits, the times that the participants visited a UK BioBank
> assessment centre. However there are some variables (e.g. [ICD10 diagnosis
> codes](https://biobank.ctsu.ox.ac.uk/crystal/field.cgi?id=41202)) for which
> this is not the case.