Commit e3425a5d authored by Paul McCarthy 🚵
Browse files

ENH: New --remove_duplicates CLI option. Fileinfo has an option to append a suffix

to renamed duplicate columns. This will then be used to identify columns for
removal.
parent d0eeace0
......@@ -54,6 +54,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
'default' : DEFAULT_MERGE_AXIS}),
(('ms', 'merge_strategy'), {'choices' : AVAILABLE_MERGE_STRATEGIES,
'default' : DEFAULT_MERGE_STRATEGY}),
(('rm', 'remove_duplicates'), {'action' : 'store_true'}),
(('rd', 'rename_duplicates'), {'action' : 'store_true'}),
(('cfg', 'config_file'), {'action' : 'append'}),
(('vf', 'variable_file'), {'action' : 'append'}),
......@@ -224,6 +225,9 @@ CLI_ARGUMENT_HELP = {
'Options are "naive", "intersection"/"inner", or "union"/'
'"outer".'.format(DEFAULT_MERGE_STRATEGY),
'remove_duplicates' :
'Remove duplicate columns, only retaining the first.',
'rename_duplicates' :
'Rename any duplicate columns so that all columns have a unique name.',
......@@ -686,6 +690,11 @@ def _prepareInputAndOutputFiles(args):
if args.loader is not None:
args.loader = {op.realpath(f) : n for f, n in args.loader}
# Remove/rename duplicates options are mutually exclusive
if args.remove_duplicates and args.rename_duplicates:
raise ValueError('Only one of --remove_duplicates and '
'--rename_duplicates may be used.')
# turn index indices into dict of
# { file : [index] } mappings
if args.index is not None:
......
......@@ -298,7 +298,8 @@ class FileInfo:
indexes=None,
loaders=None,
encodings=None,
renameDuplicates=False):
renameDuplicates=False,
renameSuffix=None):
"""Create a ``FileInfo`` object.
:arg datafiles: Path to input file, or sequence of paths.
......@@ -311,6 +312,9 @@ class FileInfo:
specifying non-standard file encodings.
:arg renameDuplicates: If ``True``, duplicate columns are re-named -
see :func:`renameDuplicateColumns`.
:arg renameSuffix: Passed as ``suffix`` to
:func:`renameDuplicateColumns`, if
``renameDuplicates is True``.
"""
if isinstance(datafiles, str): datafiles = [datafiles]
......@@ -322,7 +326,8 @@ class FileInfo:
indexes,
loaders,
encodings,
renameDuplicates)
renameDuplicates,
renameSuffix=renameSuffix)
self.__datafiles = list(datafiles)
self.__indexes = dict(indexes)
......@@ -388,7 +393,8 @@ def fileinfo(datafiles,
indexes=None,
sniffers=None,
encodings=None,
renameDuplicates=False):
renameDuplicates=False,
renameSuffix=None):
"""Identifies the format of each input data file, and extracts/generates
column names and variable IDs for every column.
......@@ -410,6 +416,10 @@ def fileinfo(datafiles,
which have the same name are renamed - see
:func:`renameDuplicateColumns`.
:arg renameSuffix: Passed as ``suffix`` to
:func:`renameDuplicateColumns`, if
``renameDuplicates is True``.
:returns: A tuple containing:
- List of ``csv`` dialect types
......@@ -517,22 +527,28 @@ def fileinfo(datafiles,
col.name = util.generateColumnName(vid, 0, 0)
if renameDuplicates:
renameDuplicateColumns(it.chain(*cols))
renameDuplicateColumns(it.chain(*cols), suffix=renameSuffix)
return dialects, headers, cols
def renameDuplicateColumns(cols):
def renameDuplicateColumns(cols, suffix=None):
"""Identifies any columns which have the same name, and re-names the
subsequent ones. If ``N`` columns have the same name ``X``, they are
renamed ``X``, ``X.1``, ``X.2``, ``...``, ``X.<N-1>``.
renamed ``X``, ``X.1<suffix>``, ``X.2<suffix>``, ``...``,
``X.<N-1><suffix>``.
The ``name`` attribute of each :class:`.Column` object is modified
in-place.
:arg cols: Sequence of :class:`.Column` objects.
:arg cols: Sequence of :class:`.Column` objects.
:arg suffix: String to append to the name of all renamed columns.
Defaults to an empty string.
"""
if suffix is None:
suffix = ''
counts = collections.defaultdict(list)
for col in cols:
......@@ -544,7 +560,7 @@ def renameDuplicateColumns(cols):
counts[col.name].append(col)
count = len(counts[col.name])
if count > 1:
newname = '{}.{}'.format(col.name, count - 1)
newname = '{}.{}{}'.format(col.name, count - 1, suffix)
col.name = newname
log.warning('Duplicate column detected (%s: %s) - renamed to %s',
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment