Commit eb230b75 authored by Paul McCarthy 🚵
Browse files

RF: Converting unknown columns to a default type is too problematic - it
changes the default behaviour of inferring column types when working with
non-UKB data. Instead, ensure that navalues/rawlevels/etc are coerced to
the appropriate type when applied. Also, only calculate the raw/new level
correlation on numeric columns.
parent 40ae569c
......@@ -92,10 +92,12 @@ def applyNAInsertion(dtable):
if not dtable.present(vid):
continue
columns = dtable.columns(vid)
dtype = dtable[:, columns[0].name].dtype
values = np.array(vtable['NAValues'][vid]).astype(dtype)
navals = {v : np.nan for v in values}
for col in dtable.columns(vid):
series = dtable[:, col.name]
values = vtable['NAValues'][vid].astype(series.dtype)
navals = {v : np.nan for v in values}
dtable[:, col.name] = dtable[:, col.name].replace(navals)
......@@ -160,10 +162,10 @@ def _runChildValues(dtable, exprs, cvals, vid):
# present for dependent and parent variables.
# Replacement on child variables for which
# this assumption does not hold is skipped.
expr = exprs[ vid]
cval = cvals[ vid]
visits = dtable.visits( vid)
instances = dtable.instances(vid)
expr = exprs[ vid]
cval = np.array(cvals[ vid])
visits = dtable.visits( vid)
instances = dtable.instances(vid)
for visit, instance in it.product(visits, instances):
......@@ -186,7 +188,7 @@ def _runChildValues(dtable, exprs, cvals, vid):
# And there should only be one
# variable for a given
# (vid, visit, instance)
if any([len(pc) != 1 for pc in pcols]):
if any(len(pc) != 1 for pc in pcols):
continue
except KeyError:
......@@ -221,7 +223,8 @@ def _runChildValues(dtable, exprs, cvals, vid):
mask = mask & dtable[:, colname].isna()
# Finally we apply it to the data.
dtable[mask, colname] = cval[idxs[mask]]
dtype = dtable[:, colname].dtype
dtable[mask, colname] = cval[idxs[mask]].astype(dtype)
def applyChildValues(dtable):
......@@ -283,12 +286,13 @@ def applyNewLevels(dtable):
continue
for col in dtable.columns(vid):
old = dtable[:, col.name]
valmap = dict(zip(rawlevels[vid].astype(old.dtype),
newlevels[vid]))
new = old.replace(valmap)
corr = old.corr(new)
dtable[:, col.name] = new
old = dtable[:, col.name]
valmap = dict(zip(np.array(rawlevels[vid]).astype(old.dtype),
np.array(newlevels[vid]).astype(old.dtype)))
new = old.replace(valmap)
if corr < 0:
dtable.addFlag(col, 'inverted')
if np.issubdtype(old.dtype, np.number):
if old.corr(new) < 0:
dtable.addFlag(col, 'inverted')
dtable[:, col.name] = new
......@@ -66,7 +66,6 @@ DATA_TYPES = {
CTYPES.categorical_multiple_non_numeric : str,
CTYPES.text : str,
CTYPES.compound : str,
CTYPES.unknown : np.float32,
}
"""Default internal data type to use for the different variable types.
Used by the :func:`columnTypes` function. These types may be overridden
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment