Commit b122004e authored by Paul McCarthy 🚵
Browse files

ENH: If --remove_duplicates is given, duplicate columns are identified and
renamed. The new importData option excludeColnames will be used to remove
them (not implemented yet).
parent e3425a5d
......@@ -17,5 +17,6 @@ from .core import (importData,
MERGE_AXIS,
MERGE_STRATEGY,
MERGE_AXIS_OPTIONS,
MERGE_STRATEGY_OPTIONS) # noqa
from .filter import (restrictVariables,) # noqa
MERGE_STRATEGY_OPTIONS) # noqa
from .filter import (restrictVariables,
REMOVE_DUPLICATE_COLUMN_IDENTIFIER) # noqa
......@@ -66,6 +66,7 @@ def importData(fileinfo,
cattable,
variables=None,
colnames=None,
excludeColnames=None,
categories=None,
subjects=None,
subjectExprs=None,
......@@ -89,61 +90,65 @@ def importData(fileinfo,
3. Creates and returns a :class:`DataTable`.
:arg fileinfo: :class:`.FileInfo` object describing the input file(s).
:arg fileinfo: :class:`.FileInfo` object describing the input
file(s).
:arg vartable: The data coding table
:arg vartable: The data coding table
:arg proctable: The processing table
:arg proctable: The processing table
:arg cattable: The category table
:arg cattable: The category table
:arg variables: List of variable IDs to import
:arg variables: List of variable IDs to import
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg categories: List of category names to import
:arg excludeColnames: List of names/glob-style wildcard patterns
specifying columns to exclude.
:arg subjects: List of subjects to include
:arg categories: List of category names to import
:arg subjectExprs: List of subject inclusion expressions
:arg subjects: List of subjects to include
:arg exclude: List of subjects to exclude
:arg subjectExprs: List of subject inclusion expressions
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg exclude: List of subjects to exclude
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg njobs: Number of processes to use for parallelising tasks.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
:arg njobs: Number of processes to use for parallelising tasks.
:arg dryrun: If ``True`` the data is not loaded.
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
:returns: A tuple containing:
:arg dryrun: If ``True`` the data is not loaded.
:returns: A tuple containing:
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
- A list of :class:`.Column` objects that were not
loaded from each input file.
- A list of :class:`.Column` objects that were not
loaded from each input file.
"""
variables = filter.restrictVariables(cattable, variables, categories)
......
......@@ -22,6 +22,14 @@ import funpack.loadtables as loadtables
log = logging.getLogger(__name__)
# Suffix marking duplicate columns for removal. doImport passes it to
# FileInfo as the rename suffix, and prefixes it with '*' to build the
# exclusion pattern handed to importData.
REMOVE_DUPLICATE_COLUMN_IDENTIFIER = '.REMOVE_DUPLICATE'
"""Identifier which is appended to the names of duplicate columns that
are to be removed. Use of this identifier is not hard-coded anywhere -
this module is just a convenient location for its definition. See
the :func:`funpack.main.doImport` function.
"""
def _ispattern(s):
"""Returns ``True`` if ``s`` looks like a ``fnmatch``-style pattern,
``False`` otherwise.
......
......@@ -168,11 +168,24 @@ def doImport(args, mgr):
each input file.
"""
# if --remove_duplicates, we append
# an identifying suffix to the names
# of columns to be removed. This is
# then passed through as an exclusion
# pattern to the importData function.
if args.remove_duplicates:
suffix = importing.REMOVE_DUPLICATE_COLUMN_IDENTIFIER
renameDuplicates = True
else:
suffix = None
renameDuplicates = args.rename_duplicates
finfo = fileinfo.FileInfo(args.infile,
indexes=args.index,
loaders=args.loader,
encodings=args.encoding,
renameDuplicates=args.rename_duplicates)
renameDuplicates=renameDuplicates,
renameSuffix=suffix)
with util.timed('Table import', log):
vartable, proctable, cattable, unknowns, uncategorised = \
......@@ -208,6 +221,7 @@ def doImport(args, mgr):
cattable=cattable,
variables=variables,
colnames=columns,
excludeColnames='*' + suffix,
categories=categories,
subjects=subjects,
subjectExprs=exprs,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment