Commit 90409c1b authored by Paul McCarthy 🚵
Browse files

RF: Change isSparse so it returns the reason and value which caused a column to fail the test
parent a0c4ab45
......@@ -94,13 +94,16 @@ def removeIfSparse(dtable,
log.debug('Checking column %s for sparsity', col.name)
if core.isSparse(dtable[:, col.name],
vtype,
minpres=minpres,
minstd=minstd,
maxcat=maxcat,
absolute=absolute):
log.debug('Dropping sparse column %s', col.name)
isSparse, test, val = core.isSparse(dtable[:, col.name],
vtype,
minpres=minpres,
minstd=minstd,
maxcat=maxcat,
absolute=absolute)
if isSparse:
log.debug('Dropping sparse column %s (%s: %f)',
col.name, test, val)
remove.append(col)
return remove
......
......@@ -70,7 +70,17 @@ def isSparse(data,
an absolute count. Otherwise ``minpres`` is interpreted
as a proportion.
:returns: ``True`` if the data is sparse, ``False`` otherwise.
:returns: A tuple containing:
- ``True`` if the data is sparse, ``False`` otherwise.
- If the data is sparse, one of ``'minpres'``,
``'minstd'``, or ``'maxcat'``, indicating the cause of
its sparsity. ``None`` if the data is not sparse.
- If the data is sparse, the value of the criteria which
caused the data to fail the test. ``None`` if the data
is not sparse.
"""
presmask = data.notnull()
......@@ -95,13 +105,14 @@ def isSparse(data,
pres = float(len(present)) / len(data)
if pres < minpres:
return True
return True, 'minpres', pres
# stddev is not large enough (for
# numerical/categorical types)
if isnumeric and minstd is not None:
if (present - present.mean()).std() <= minstd:
return True
std = (present - present.mean()).std()
if std <= minstd:
return True, 'minstd', std
# for categorical types
if iscategorical and maxcat is not None:
......@@ -109,11 +120,11 @@ def isSparse(data,
# one category is too dominant
uniqvals = np.unique(present)
uniqcounts = [sum(present == u) for u in uniqvals]
catcount = float(max(uniqcounts)) / len(present)
if catcount >= maxcat:
return True, 'maxcat', catcount
if float(max(uniqcounts)) / len(present) >= maxcat:
return True
return False
return False, None, None
def redundantColumns(data, columns, corrthres, nathres=None):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment