Commit 1c201239 authored by Paul McCarthy 🚵
Browse files

RF: Refactor removeIfRedundant to use new redundantColumns signature,

and (more importantly) avoid evaluating column pairs more than once
parent 689ed4bf
......@@ -72,6 +72,7 @@ import pandas as pd
import pandas.api.types as pdtypes
from . import processing_functions_core as core
from . import util
from . import custom
......@@ -138,53 +139,74 @@ def removeIfRedundant(dtable, vids, corrthres, nathres=None):
of columns - see the :func:`.isRedundant` function.
"""
def dedup(seq):
"""Remove duplicates from a sequence, preserving order.
Returns a list.
"""
newseq = []
for i in seq:
if i not in newseq:
newseq.append(i)
return newseq
# Ignore non-numeric columns
cols = list(it.chain(*[dtable.columns(v) for v in vids]))
cols = [c for c in cols if pdtypes.is_numeric_dtype(dtable[:, c.name])]
# We need to compare every pair of
# columns. We parallelise this by
# splitting the columns into chunks,
# and then passing pairs of chunks
# to the redundantColumns function.
ncols = len(cols)
chunksize = max(2, int(np.ceil(ncols / mp.cpu_count())))
chunks = [cols[i:i + chunksize] for i in range(0, ncols, chunksize)]
chunkpairs = list(it.combinations_with_replacement(chunks, 2))
# gather all of the columns for each
# pair of chunks, and extract a view
# of the data for each chunk pair,
# containing only the relevant columns.
chunkcols = [dedup(c1 + c2) for c1, c2 in chunkpairs]
chunkcols = [[c.name for c in ccs] for ccs in chunkcols]
chunkdata = [dtable[:, ccs] for ccs in chunkcols]
log.debug('Checking %u columns for redundancy (%u tasks)',
ncols, len(chunkdata))
func = ft.partial(core.redundantColumns,
corrthres=corrthres, nathres=nathres)
with dtable.pool() as pool:
redundant = pool.starmap(func, zip(chunkdata, chunkcols))
toremove = dedup(it.chain(*redundant))
if len(toremove) > 0:
cols = list(it.chain(*[dtable.columns(v) for v in vids]))
cols = [c for c in cols if pdtypes.is_numeric_dtype(dtable[:, c.name])]
colnames = [c.name for c in cols]
# evaluate all pairs at once
if not dtable.parallel:
colpairs = list(it.combinations(colnames, 2))
log.debug('Checking %u columns for redundancy', len(cols))
redundant = core.redundantColumns(
dtable[:, :], colpairs, corrthres=corrthres, nathres=nathres)
# evaluate in parallel
else:
# We need to compare every pair of
# columns. We parallelise this by
# splitting the columns into chunks,
# and then passing pairs of chunks
# to the redundantColumns function.
ncols = len(cols)
chunksize = max(2, int(np.ceil(ncols / mp.cpu_count())))
chunks = [colnames[i:i + chunksize]
for i in range(0, ncols, chunksize)]
# we build a list of column
# pairs, and views into the
# dataframe, for each chunk
chunkpairs = []
chunkdata = []
# while doing so, we keep track
# of column pairs that have been
# assigned to a chunk, so we don't
# evaluate any pair more than once
assignedcolpairs = set()
# for every chunk, and
# every pair of chunks
for chunk1, chunk2 in it.combinations_with_replacement(chunks, 2):
chunkcols = util.dedup(chunk1 + chunk2)
ichunkpairs = []
for colpair in it.combinations(chunkcols, 2):
if colpair not in assignedcolpairs:
ichunkpairs .append(colpair)
assignedcolpairs.add( colpair)
if len(ichunkpairs) > 0:
ichunkcols = util.dedup(it.chain(*ichunkpairs))
ichunkdata = dtable[:, ichunkcols]
chunkpairs.append(ichunkpairs)
chunkdata .append(ichunkdata)
log.debug('Checking %u columns for redundancy (%u tasks)',
len(cols), len(chunkdata))
func = ft.partial(core.redundantColumns,
corrthres=corrthres, nathres=nathres)
with dtable.pool() as pool:
redundant = pool.starmap(func, zip(chunkdata, chunkpairs))
redundant = list(it.chain(*redundant))
redundant = util.dedup(sorted(redundant))
if len(redundant) > 0:
log.debug('Dropping %u redundant columns: %s ...',
len(toremove), toremove[:5])
len(redundant), redundant[:5])
cols = collections.OrderedDict([(c.name, c) for c in cols])
return [cols[tr] for tr in toremove]
cols = collections.OrderedDict(zip(colnames, cols))
return [cols[r] for r in redundant]
@custom.processor()
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment