Commit 3b942cee authored by Paul McCarthy 🚵
Browse files

RF: Fix redundant logging (was printing binarised correlation values)

parent 412a4d2b
......@@ -312,8 +312,11 @@ def matrixRedundantColumns(
if len(data.columns) < 2:
return []
df = data
data = data.to_numpy(dtype=np.float32, copy=True)
# Keep a copy of the column names for logging.
# Create a 2D matrix containing all data, using
# float32 to limit memory consumption.
columns = data.columns
data = data.to_numpy(dtype=np.float32, copy=True)
namask = np.isnan(data)
nacounts = namask.sum(axis=0)
......@@ -363,7 +366,9 @@ def matrixRedundantColumns(
# correlation coefficient
coef = (xy - n * mx * my) / (sx * sy)
# ignore nans/infs, binarise
# ignore nans/infs, binarise. Keep a copy
# of the raw correlations for logging.
rawcoef = coef
coef[~np.isfinite(coef)] = 0
coef = np.abs(coef) > corrthres
......@@ -386,11 +391,11 @@ def matrixRedundantColumns(
else: drop, keep = colj, coli
# coef is only the upper triangle
if coli <= colj: corr = coef[coli, colj]
else: corr = coef[colj, coli]
if coli <= colj: corr = rawcoef[coli, colj]
else: corr = rawcoef[colj, coli]
log.debug('Column %s is redundant (correlation with %s: %f)',
df.columns[drop], df.columns[keep], corr)
log.debug('Column %s is redundant (correlation with %s: %0.6f)',
columns[drop], columns[keep], corr)
return drop
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment