Commit e5df476b authored by Paul McCarthy

Merge branch 'enh/select_by_column_name' into 'master'

Enh/select by column name

See merge request fsl/ukbparse!98
parents d34aad97 3d058619
Pipeline #3194 passed with stages in 8 minutes and 23 seconds
@@ -2,7 +2,7 @@
======================
0.13.1 (Thursday 20th Deember 2018)
0.14.0 (Tuesday 25th December 2018)
-----------------------------------
@@ -10,6 +10,37 @@ Added
^^^^^
* New ``--column`` option, allowing columns to be selected by name or
  glob-style name pattern (example below).
* ``ukbparse`` can now be installed from `conda-forge
<https://anaconda.org/conda-forge/ukbparse>`_.
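For example, a hypothetical ``--column`` invocation (assuming ``ukbparse``'s
usual ``ukbparse [options] outfile datafile`` calling convention, with made-up
file names), importing every column whose name starts with ``41202``::

    ukbparse --column '41202*' output.tsv data.csv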
Changed
^^^^^^^
* The index column in the output file no longer defaults to being named
``'eid'``. It defaults to the name of the index in the input file, but
can still be overridden by the ``--output_id_column`` option.
Fixed
^^^^^
* Blank lines are now allowed in configuration files (#2)
* Fix to derived column names for ICD10 variables in default processing rules.
0.13.1 (Thursday 20th December 2018)
------------------------------------
Added
^^^^^
* Unit test to make sure that ``ukbparse`` crashes if given bad input
arguments.
@@ -5,6 +5,9 @@
.. image:: https://img.shields.io/pypi/v/ukbparse.svg
:target: https://pypi.python.org/pypi/ukbparse/
.. image:: https://anaconda.org/conda-forge/ukbparse/badges/version.svg
:target: https://anaconda.org/conda-forge/ukbparse
.. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.1997626.svg
:target: https://doi.org/10.5281/zenodo.1997626
@@ -25,6 +28,11 @@ Install ``ukbparse`` via pip::
pip install ukbparse
Or from ``conda-forge``::
conda install -c conda-forge ukbparse
Comprehensive documentation does not yet exist.
@@ -6,7 +6,7 @@
#
__version__ = '0.13.1'
__version__ = '0.14.0'
"""The ``ukbparse`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
@@ -44,7 +44,6 @@ DEFAULT_MERGE_STRATEGY = importing.MERGE_STRATEGY
DEFAULT_EXPORT_FORMAT = exporting.EXPORT_FORMAT
AVAILABLE_MERGE_AXES = importing.MERGE_AXIS_OPTIONS
AVAILABLE_MERGE_STRATEGIES = importing.MERGE_STRATEGY_OPTIONS
DEFAULT_OUTPUT_ID_COLUMN = exporting.ID_COLUMN
DEFAULT_COLUMN_PATTERN = exporting.COLUMN_PATTERN
DEFAULT_TSV_SEP = exporting_tsv.TSV_SEP
DEFAULT_HDF5_KEY = exporting_hdf5.HDF5_KEY
@@ -79,6 +78,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
(('r', 'remove_unknown'), {'action' : 'store_true'}),
(('s', 'subject'), {'action' : 'append'}),
(('v', 'variable'), {'action' : 'append'}),
(('co', 'column'), {'action' : 'append'}),
(('c', 'category'), {'action' : 'append'}),
(('vi', 'visit'), {'action' : 'append'}),
(('ex', 'exclude'), {'action' : 'append'})]),
@@ -124,7 +124,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
(('rc', 'rename_column'), {'action' : 'append',
'nargs' : 2,
'metavar' : ('OLD_NAME', 'NEW_NAME')}), # noqa
(('oi', 'output_id_column'), {'default' : DEFAULT_OUTPUT_ID_COLUMN}), # noqa
(('oi', 'output_id_column'), {}),
(('edf', 'date_format'), {'default' : 'default'}),
(('etf', 'time_format'), {'default' : 'default'}),
(('nr', 'num_rows'), {'type' : int}),
@@ -256,6 +256,11 @@ CLI_ARGUMENT_HELP = {
'variable IDs, to import. Can be used multiple times. Implies '
'--remove_unknown.',
'column' :
'Name of column to import. Can also be a glob-style wildcard pattern - '
'columns with a name matching the pattern will be imported. Can be used '
'multiple times. Implies --remove_unknown.',
'category' :
'Category ID or label to import. Can be used multiple times. Implies '
'--remove_unknown.',
@@ -330,8 +335,7 @@ CLI_ARGUMENT_HELP = {
'be used multiple times',
'output_id_column' :
'Name of ID column in output file '
'(default: \'{}\')'.format(DEFAULT_OUTPUT_ID_COLUMN),
'Name of ID column in output file.',
'date_format' :
'Formatter to use for date variables (default: "default").',
@@ -661,9 +665,12 @@ def parseArgs(argv=None, namespace=None):
if args.global_clean is None: args.global_clean = visit
else: args.global_clean += ',' + visit
# If variables/categories are explicitly
# specified, remove_unknown is implied.
if args.variable is not None or args.category is not None:
# If variables/categories/columns are
# explicitly specified, remove_unknown
# is implied.
if any((args.variable is not None,
args.category is not None,
args.column is not None)):
args.remove_unknown = True
# categories can be specified
@@ -725,8 +732,9 @@ def loadConfigFile(cfgfile):
log.debug('Loading arguments from configuration file %s', cfgfile)
with open(cfgfile, 'rt') as f:
lines = [l.strip() for l in f.readlines()]
lines = [l for l in lines if not l.startswith('#')]
lines = [line.strip() for line in f.readlines()]
lines = [line for line in lines if line != '']
lines = [line for line in lines if not line.startswith('#')]
for line in lines:
words = list(shlex.split(line))
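With blank lines now permitted, a configuration file sketch like the
following parses cleanly (hypothetical option values, using the one
``argument value`` pair per line format that the ``shlex.split`` parsing
above implies)::

    # variables to import
    variable 31
    variable 21003

    output_id_column subject_id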
Variable Process
40001 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value}')
40002 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value}')
40006 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value}')
41202 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}')
41204 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}')
40001 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value:0.0f}')
40002 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value:0.0f}')
40006 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value:0.0f}')
41202 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value:0.0f}-{visit}')
41204 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value:0.0f}-{visit}')
all_independent removeIfSparse(minpres=51, maxcat=0.99)
all removeIfRedundant(0.99, 0.2)
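The switch to ``{value:0.0f}`` matters because the values arrive as floats,
and the bare ``{value}`` formatting would otherwise leave a ``.0`` in the
derived column names. A quick sketch (plain Python, made-up values, no
ukbparse internals)::

    '{vid}{value}-{visit}'.format(vid=41202, value=2.0, visit=1)
    # '412022.0-1'
    '{vid}{value:0.0f}-{visit}'.format(vid=41202, value=2.0, visit=1)
    # '412022-1'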
@@ -37,10 +37,6 @@ Setting this variable to ``None`` has the same effect as setting it to
"""
ID_COLUMN = 'eid'
"""Default name for the output ID column. """
EXPORT_FORMAT = 'tsv'
"""Default export format."""
@@ -120,8 +116,8 @@ def exportData(dtable,
to be used as the ``variable`` in ``colpat`` when
generating output column names.
:arg idcol: Name to use for ID column. Defaults to
:attr:`ID_COLUMN`.
:arg idcol: Name to use for ID column. Defaults to the original
index column name (``pandas.DataFrame.index.name``).
:arg fileFormat: File format to export to - the name of a ``@exporter``
plugin. If not provided, defaults to
@@ -130,7 +126,7 @@
if colpat is None: colpat = COLUMN_PATTERN
if fileFormat is None: fileFormat = EXPORT_FORMAT
if idcol is None: idcol = ID_COLUMN
if idcol is None: idcol = dtable.index.name
if subjects is None:
subjects = dtable.index
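The original index name is whatever pandas picked up when the input file was
read; a minimal illustration (hypothetical file name)::

    import pandas as pd

    df = pd.read_csv('data.csv', index_col=0)
    # the index keeps the input file's ID column name,
    # e.g. 'f.eid', rather than a hard-coded 'eid'
    print(df.index.name)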
@@ -83,7 +83,7 @@ def exportTSV(dtable,
cstart = chunki * numRows
cend = cstart + numRows
csubjs = subjects[cstart:cend]
towrite = pd.DataFrame()
towrite = pd.DataFrame(index=csubjs)
for col in dtable.allColumns:
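Passing ``index=csubjs`` matters because pandas aligns on the index when a
Series is assigned into a DataFrame; with a default empty index, the first
assignment would adopt the full Series index instead of the chunk's subjects.
A minimal sketch of the difference::

    import pandas as pd

    col   = pd.Series([10, 20, 30], index=[1, 2, 3])

    chunk = pd.DataFrame(index=[2, 3])  # subjects in this chunk
    chunk['x'] = col                    # aligned: rows 2 and 3 only

    empty = pd.DataFrame()
    empty['x'] = col                    # adopts col's full index: rows 1-3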
@@ -359,16 +359,20 @@ def fileinfo(datafiles, indexes=None, sniffers=None):
for ci, col in enumerate(fcols):
# UKB-style - all good
if col.vid is not None:
# Index columns always get
# a variable ID of 0
if ci == idxcol:
vid = 0
# UKB-style - we already
# have a variable id, visit,
# and instance.
elif col.vid is not None:
continue
# Non-UKB style file - assign
# a (vid, visit, instance) to
# each column, giving the index
# column vid 0
if ci == idxcol:
vid = 0
# each column
else:
vid = autovid
autovid += 1
@@ -13,6 +13,7 @@ import os.path as op
import itertools as it
import functools as ft
import multiprocessing.dummy as mpd
import fnmatch
import logging
import warnings
import collections
@@ -67,6 +68,7 @@ def importData(datafiles,
proctable,
cattable,
variables=None,
colnames=None,
categories=None,
subjects=None,
encoding=None,
@@ -102,6 +104,9 @@
:arg variables: List of variable IDs to import
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg categories: List of category names to import
:arg subjects: List of subjects to include.
@@ -169,6 +174,7 @@
cols, drop = columnsToLoad(datafiles,
vartable,
variables,
colnames,
unknownVars,
removeUnknown,
indexes=indexes,
@@ -335,6 +341,7 @@ def restrictVariables(cattable, variables, categories):
def columnsToLoad(datafiles,
vartable,
variables,
colnames,
unknownVars,
removeUnknown,
indexes=None,
@@ -351,6 +358,10 @@ def columnsToLoad(datafiles,
:arg variables: List of variables to load. If provided,
``removeUnknown`` is ignored.
:arg colnames: List of column names/glob-style wildcard patterns,
specifying columns to load. If provided,
``removeUnknown`` is ignored.
:arg unknownVars: List of :class:`.Column` objects representing unknown
variables
@@ -369,11 +380,8 @@ def columnsToLoad(datafiles,
- A dict of ``{ file : [Column] }`` mappings, the
:class:`.Column` objects to *load* from each input
file.
Note that the columns are not necessarily ordered
in the same way that they are in the input files -
the header column will always be first in each list.
file. The columns (including the index column) are
ordered as they appear in the file.
- A list containing the :class:`.Column` objects to
*ignore*.
@@ -382,7 +390,10 @@
if sniffers is None: sniffers = {}
if indexes is None: indexes = {}
if variables is not None: removeUnknown = False
if colnames is not None: removeUnknown = False
# Turn the unknownVars list
# into a list of variable IDs
unknownVids = list(sorted(set([c.vid for c in unknownVars])))
if isinstance(datafiles, six.string_types):
@@ -392,24 +403,26 @@
# omitting the relevant columns.
loadFuncNames = ['remove', 'keepVisits']
# Peek at the columns that are
# in the input files. Save a ref
# to the first column (assumed
# to be the ID column)
# Peek at the columns that
# are in the input files.
allcols = fileinfo.fileinfo(datafiles,
indexes=indexes,
sniffers=sniffers)[2]
indexes = [indexes.get(f, 0) for f in datafiles]
hdrcols = [c[ i] for c, i in zip(allcols, indexes)]
allcols = [c[:i] + c[i + 1:] for c, i in zip(allcols, indexes)]
allcols = list(it.chain(*allcols))
ncols = len(list(it.chain(*allcols)))
# re-organise them - a list of
# columns for each variable ID
# re-organise the columns - a list of
# columns for each variable ID. We do
# this because, for a given VID, we
# want to pass all columns at once to
# the cleaning function(s) below.
byvid = collections.defaultdict(list)
for col in allcols:
for col in it.chain(*allcols):
byvid[col.vid].append(col)
# Build a full list of index
# columns for each data file.
indexes = [indexes.get(f, 0) for f in datafiles]
# retrieve all cleaning steps -
# we are only going to apply the
# cleaning steps that will
@@ -422,30 +435,50 @@
# Loop through all columns in
# the data, and build a list of
# the ones we want to load. The
# end result will be organised
# by the data files.
#
# We load the ID column for every
# file - it will appear first in
# the list for each input file.
# end result will be an ordered
# dict of { file : [column] }
# mappings, and a list of columns
# to drop.
drop = []
load = collections.OrderedDict(
[(f, [hc]) for f, hc in zip(datafiles, hdrcols)])
load = collections.OrderedDict([(f, []) for f in datafiles])
for vid, cols in byvid.items():
# index column - load it!
# (the fileinfo function gives
# index columns a variable ID
# of 0).
if vid == 0:
for col in cols:
load[col.datafile].append(col)
continue
# variable list provided, but this
# variable is not in it - don't load.
if variables is not None and vid not in variables:
drop.extend(cols)
continue
# column is flagged as unknown,
# and we have been told to
# ignore unknown columns
# variable is flagged as unknown,
# and we have been told to ignore
# unknown variables
if removeUnknown and vid in unknownVids:
drop.extend(cols)
continue
# column names/patterns specified -
# filter the list of columns based
# on whether they match any of the
# patterns specified.
if colnames is not None:
for col in list(cols):
hits = [fnmatch.fnmatch(col.name, pat) for pat in colnames]
if not any(hits):
cols.remove(col)
drop.append(col)
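``fnmatch`` implements shell-style wildcards, so exact names and patterns go
through the same test; a quick sketch with made-up column names::

    import fnmatch

    patterns = ['eid', '41202*']
    names    = ['eid', '41202-0.0', '31-0.0']

    [n for n in names if any(fnmatch.fnmatch(n, p) for p in patterns)]
    # ['eid', '41202-0.0']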
# cleaning specified for this variable
if vid in ppvids:
@@ -470,9 +503,15 @@
for col in cols:
load[col.datafile].append(col)
# Final step - the column lists for each
# file are not necessarily ordered by
# their position in the file. Re-order
# them so they are.
for fname, cols in list(load.items()):
load[fname].sort(key=lambda c: c.index)
log.debug('Identified %i / %i columns to be loaded',
sum([len(c) for c in load.values()]),
len(allcols) + len(datafiles))
sum([len(c) for c in load.values()]), ncols)
return load, drop
@@ -501,8 +540,7 @@ def loadData(datafiles,
:arg columns: Dict of ``{ file : [Column] }`` mappings,
defining the columns to load, as returned by
:func:`columnsToLoad`. It is assumed that the
first column in each list is the index column.
:func:`columnsToLoad`.
:arg nrows: Number of rows to read at a time. Defaults to
:attr:`NUM_ROWS`.
@@ -654,8 +692,8 @@
in the file.
:arg toload: Sequence of :class:`.Column` objects describing the columns
that should be loaded. It is assumed that the first column
in this list is the index column.
that should be loaded, as generated by
:func:`columnsToLoad`.
:arg index: Column position of index column (starting from 0). Defaults
to 0.
@@ -701,6 +739,26 @@
def shouldLoad(c):
return c in toloadnames
# The read_csv function requires the
# index argument to be specified
# relative to the usecols argument:
#
# - https://stackoverflow.com/a/45943627
# - https://github.com/pandas-dev/pandas/issues/9098
# - https://github.com/pandas-dev/pandas/issues/2654
#
# So here we make index relative to
# toloadnames.
#
# We also drop the index column from
# the toload list - after the call to
# read_csv, we want our Column list
# to align with the pandas Series
# objects (which won't include the
# index).
index = [i for i, c in enumerate(toload) if c.index == index][0]
toload.pop(index)
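A minimal, standalone illustration of the ``read_csv`` behaviour described in
the links above::

    import io
    import pandas as pd

    data = io.StringIO('a,b,c,d\n1,2,3,4\n5,6,7,8\n')

    # usecols keeps file columns 1 and 3 ('b' and 'd');
    # index_col counts within that filtered set, so 0 -> 'b'
    df = pd.read_csv(data, usecols=[1, 3], index_col=0)
    print(df.index.name)  # 'b'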
# Figure out suitable data types to
# store the data for each column.
# Only date/time columns are converted
@@ -718,11 +776,7 @@
else: header = None
log.debug('Loading %u columns from %s: %s ...',
len(toload), fname, toloadnames[:5])
# we can discard the index column
# from the toload list now
toload = [c for c in toload if c.index != index]
len(toload) + 1, fname, toloadnames[:5])
if dialect == 'whitespace': dlargs = {'delim_whitespace' : True}
else: dlargs = {'dialect' : dialect}
@@ -737,6 +791,7 @@
with warnings.catch_warnings():
warnings.filterwarnings('ignore', module='pandas.io.parsers')
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)
dfiter = pd.read_csv(fname,
header=header,
names=allcolnames,
@@ -415,34 +415,43 @@ def loadVariableTable(datafiles,
if sniffers is None: sniffers = {}
if indexes is None: indexes = {}
if varfile is not None:
log.debug('Loading variable table from %s', varfile)
vartable = pd.read_csv(varfile, '\t',
index_col=0,
dtype=VARTABLE_DTYPES,
converters=VARTABLE_CONVERTERS)
else:
vartable = pd.DataFrame(columns=VARTABLE_COLUMNS[1:])
vartable.index.name = VARTABLE_COLUMNS[0]
if dcfile is not None:
log.debug('Loading data coding table from %s', dcfile)
dctable = pd.read_csv(dcfile, '\t',
index_col=0,
dtype=DCTABLE_DTYPES,
converters=DCTABLE_CONVERTERS)
else:
dctable = pd.DataFrame(columns=DCTABLE_COLUMNS[1:])
dctable.index.name = DCTABLE_COLUMNS[0]
if typefile is not None:
log.debug('Loading type table from %s', typefile)
tytable = pd.read_csv(typefile, '\t',
index_col=0,
converters=TYPETABLE_CONVERTERS)
else:
tytable = pd.DataFrame(columns=TYPETABLE_COLUMNS[1:])
tytable.index.name = TYPETABLE_COLUMNS[0]
def load_table_file(fname, what, dtypes, converters, columns):
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=pd.errors.ParserWarning)
if fname is not None:
log.debug('Loading %s table from %s', what, fname)
table = pd.read_csv(fname, '\t',
index_col=0,
dtype=dtypes,
converters=converters)
else:
table = pd.DataFrame(columns=columns[1:])
table.index.name = columns[0]
if list(sorted(table.columns)) != sorted(columns[1:]):
raise ValueError('Missing/unrecognised columns in table file {} - '
'should be {}, but file contained {}.'.format(
fname, columns, table.columns))
return table
vartable = load_table_file(varfile,
'variable',
VARTABLE_DTYPES,
VARTABLE_CONVERTERS,
VARTABLE_COLUMNS)
dctable = load_table_file(dcfile,
'data coding',
DCTABLE_DTYPES,
DCTABLE_CONVERTERS,
DCTABLE_COLUMNS)
tytable = load_table_file(typefile,
'type',
TYPETABLE_DTYPES,
TYPETABLE_CONVERTERS,
TYPETABLE_COLUMNS)
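A standalone sketch of what the new column check catches (pandas only,
hypothetical file contents)::

    import io
    import pandas as pd

    expected = ['ID', 'Type', 'Description']
    # file is missing the 'Description' column
    table    = pd.read_csv(io.StringIO('ID\tType\n1\tinteger\n'),
                           sep='\t', index_col=0)

    # index_col consumed 'ID'; the remaining columns must
    # match expected[1:], which they do not here
    sorted(table.columns) != sorted(expected[1:])  # True -> ValueError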
# Make sure data types are aligned,
# otherwise we may run into problems
@@ -11,13 +11,12 @@ import multiprocessing as mp
import sys
import shutil
import logging
import fnmatch
import tempfile
import warnings
import datetime
import calendar
import pandas as pd
import ukbparse
import ukbparse.util as util
import ukbparse.icd10 as icd10
@@ -105,6 +104,7 @@ def main(argv=None):
try:
with util.timed(
None, log, fmt='Total time: %i minutes, %i seconds (%+iMB)'):
dtable, unknowns, drop = doImport(args, pool, mgr)
if args.dry_run:
@@ -175,10 +175,12 @@ def doImport(args, pool, mgr):
if not args.dry_run and args.import_all:
variables = None
categories = None
columns = None
removeUnknown = None
else:
variables = args.variable
categories = args.category
columns = args.column
removeUnknown = args.remove_unknown
# Import data
@@ -189,6 +191,7 @@ def doImport(args, pool, mgr):
proctable=proctable,
cattable=cattable,
variables=variables,
colnames=columns,
categories=categories,
subjects=subjects,
encoding=args.encoding,
@@ -265,11 +268,13 @@ def finaliseColumns(dtable, args, unknowns):
vids = importing.restrictVariables(
dtable.cattable, args.variable, args.category)
# args.remove_unknown is only applied if
# variables were not already restricted
# by args.variable and/or args.category
# args.remove_unknown is only applied
# if variables/columns were not already
# restricted by args.variable,
# args.category, and/or args.column
removeUnknown = all((vids is None,
args.remove_unknown,
args.column is None,
len(unknowns) > 0))
# apply removeUnknown
@@ -280,6 +285,15 @@ def finaliseColumns(dtable, args, unknowns):
if vid in uvids:
vids.remove(vid)
# apply column patterns
if args.column is not None:
remove = []
for col in list(dtable.allColumns[1:]):
hits = [fnmatch.fnmatch(col.name, pat) for pat in args.column]