Commit a460a146 authored by Paul McCarthy 🚵
Browse files

ENH,RF: Implement excludeColnames - only support suffixes to avoid fnmatch

performance hit.
parent b122004e
......@@ -104,8 +104,8 @@ def importData(fileinfo,
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg excludeColnames: List of names/glob-style wildcard patterns
specifying columns to exclude.
:arg excludeColnames: List of column name suffixes specifying columns
to exclude.
:arg categories: List of category names to import
......@@ -157,7 +157,8 @@ def importData(fileinfo,
cols, drop = filter.columnsToLoad(fileinfo,
vartable,
variables,
colnames)
colnames,
excludeColnames)
# Load those columns, merging
# multiple input files.
......
......@@ -66,20 +66,28 @@ def restrictVariables(cattable, variables, categories):
return variables
def columnsToLoad(fileinfo, vartable, variables, colnames):
def columnsToLoad(fileinfo,
vartable,
variables,
colnames=None,
excludeColnames=None):
"""Determines which columns should be loaded from ``datafiles``.
Peeks at the first line of the data file (assumed to contain column names),
then uses the variable table to determine which of them should be loaded.
:arg fileinfo: :class:`.FileInfo` object describing the input file(s).
:arg fileinfo: :class:`.FileInfo` object describing the input
file(s).
:arg vartable: Variable table
:arg vartable: Variable table
:arg variables: List of variables to load.
:arg variables: List of variables to load.
:arg colnames: List of column names/glob-style wildcard patterns,
specifying columns to load.
:arg colnames: List of column names/glob-style wildcard patterns,
specifying columns to load.
:arg excludeColnames: List of column name suffixes specifying columns to
exclude. This overrides ``colnames``.
:returns: A tuple containing:
......@@ -92,6 +100,9 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
*ignore*.
"""
if excludeColnames is None:
excludeColnames = []
# We apply these cleaning steps by
# omitting the relevant columns.
loadFuncNames = ['remove', 'keepVisits']
......@@ -139,6 +150,15 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
load[col.datafile].append(col)
continue
# excludeColnames takes precedence
# over all other column selection
# mechanisms
for suf in excludeColnames:
for col in list(cols):
if col.name.endswith(suf):
cols.remove(col)
drop.append(col)
# Figure out whether each
# column should be loaded.
# We load all columns which
......@@ -163,7 +183,7 @@ def columnsToLoad(fileinfo, vartable, variables, colnames):
# if there are any glob patterns, do
# an exhaustive search (*very* slow)
if any([_ispattern(c) for c in colnames]):
if any(_ispattern(c) for c in colnames):
for i, col in enumerate(cols):
hits = [fnmatch.fnmatch(col.name, pat) for pat in colnames]
loadflags[i] = loadflags[i] or any(hits)
......
......@@ -221,7 +221,7 @@ def doImport(args, mgr):
cattable=cattable,
variables=variables,
colnames=columns,
excludeColnames='*' + suffix,
excludeColnames=[suffix],
categories=categories,
subjects=subjects,
subjectExprs=exprs,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment