Commit 5a5cf7d5 authored by Paul McCarthy 🚵
Browse files

ENH,RF: isSparse function has a new "mincat" test, which tests the size of the smallest category
parent c0efc9b5
......@@ -74,15 +74,17 @@ def removeIfSparse(dtable,
vids,
minpres=None,
minstd=None,
mincat=None,
maxcat=None,
absolute=True):
"""removeIfSparse([minpres], [minstd], [maxcat], [absolute])
abspres=True,
abscat=True):
"""removeIfSparse([minpres], [minstd], [mincat], [maxcat], [abspres], [abscat])
Removes columns deemed to be sparse.
Removes columns for the variables in ``vids`` if they are sparse.
See the :func:`.isSparse` function.
"""
""" # noqa
remove = []
......@@ -98,8 +100,10 @@ def removeIfSparse(dtable,
vtype,
minpres=minpres,
minstd=minstd,
mincat=mincat,
maxcat=maxcat,
absolute=absolute)
abspres=abspres,
abscat=abscat)
if isSparse:
log.debug('Dropping sparse column %s (%s: %f)',
......
......@@ -35,22 +35,29 @@ def isSparse(data,
ctype,
minpres=None,
minstd=None,
mincat=None,
maxcat=None,
absolute=True):
abspres=True,
abscat=True):
"""Returns ``True`` if the given data looks sparse, ``False`` otherwise.
Used by :func:`removeIfSparse`.
The check is based on up to three criteria:
The check is based on the following criteria:
- The number/proportion of non-NA values must be greater than
or equal to ``minpres``.
- The standard deviation of the data must be greater than ``minstd``.
- For integer and categorical types, the proportion of the largest
- For integer and categorical types, the number/proportion of the largest
category must be less than ``maxcat``.
- For integer and categorical types, the number/proportion of the smallest
category must be greater than or equal to ``mincat``.
If any of these criteria are not met, the data is considered to be sparse.
Each criterion can be disabled by passing in ``None`` for the relevant
parameter.
......@@ -63,20 +70,28 @@ def isSparse(data,
:arg minstd: Minimum standard deviation, for numeric/categorical types.
:arg maxcat: Maximum size (as a proportion) of largest category,
:arg mincat: Minimum size/proportion of smallest category,
for integer/categorical types.
:arg maxcat: Maximum size/proportion of largest category,
for integer/categorical types.
:arg absolute: If ``True`` (the default), ``minpres`` is interpreted as
:arg abspres: If ``True`` (the default), ``minpres`` is interpreted as
an absolute count. Otherwise ``minpres`` is interpreted
as a proportion.
:arg abscat: If ``True`` (the default), ``mincat`` and ``maxcat`` are
interpreted as absolute counts. Otherwise ``mincat`` and
``maxcat`` are interpreted as proportions.
:returns: A tuple containing:
- ``True`` if the data is sparse, ``False`` otherwise.
- If the data is sparse, one of ``'minpres'``,
``'minstd'``, or ``'maxcat'``, indicating the cause of
its sparsity. ``None`` if the data is not sparse.
``'minstd'``, ``'mincat'``, or ``'maxcat'``, indicating
the cause of its sparsity. ``None`` if the data is not
sparse.
- If the data is sparse, the value of the criteria which
caused the data to fail the test. ``None`` if the data
......@@ -85,6 +100,22 @@ def isSparse(data,
presmask = data.notnull()
present = data[presmask]
ntotal = len(data)
npresent = len(present)
def fixabs(val, isabs):
# Turn proportion into
# an absolute count
if not isabs:
return np.round(val * ntotal)
# ignore absolute thresholds if
# total data length is less than it
elif len(data) < val:
return npresent
else:
return val
iscategorical = ctype in (util.CTYPES.integer,
util.CTYPES.categorical_single,
......@@ -93,19 +124,8 @@ def isSparse(data,
# not enough values
if minpres is not None:
if absolute:
pres = len(present)
# ignore absolute minpres threshold if
# total data length is less than it
if len(data) < minpres:
minpres = pres
else:
pres = float(len(present)) / len(data)
if pres < minpres:
return True, 'minpres', pres
if npresent < fixabs(minpres, abspres):
return True, 'minpres', npresent
# stddev is not large enough (for
# numerical/categorical types)
......@@ -115,14 +135,25 @@ def isSparse(data,
return True, 'minstd', std
# for categorical types
if iscategorical and maxcat is not None:
if iscategorical and ((maxcat is not None) or (mincat is not None)):
if maxcat is not None: maxcat = fixabs(maxcat, abscat)
if mincat is not None: mincat = fixabs(mincat, abscat)
# one category is too dominant
# mincat - smallest category is too small
# maxcat - one category is too dominant
uniqvals = np.unique(present)
uniqcounts = [sum(present == u) for u in uniqvals]
catcount = float(max(uniqcounts)) / len(present)
if catcount >= maxcat:
return True, 'maxcat', catcount
nmincat = min(uniqcounts)
nmaxcat = max(uniqcounts)
if mincat is not None:
if nmincat < mincat:
return True, 'mincat', nmincat
if maxcat is not None:
if nmaxcat >= maxcat:
return True, 'maxcat', nmaxcat
return False, None, None
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment