Commit 33103aca authored by Paul McCarthy 🚵
Browse files

ENH: --trust_types implementation

parent b01c3f8d
......@@ -276,7 +276,7 @@ def removeSubjects(dtable, exclude=None, exprs=None):
instances = [dtable.instances(v) for v in vids]
except KeyError as e:
raise RuntimeError('Unknown variable used in exclude expression: '
'{}'.format(exprs))
'{} ({})'.format(exprs, e))
# Calculate the intersection of visits/
# instances across all variables - we
......@@ -803,15 +803,21 @@ def loadFile(fname,
# Figure out suitable data types to
# store the data for each column.
# Only date/time columns are converted
# during load - this is done for us
# by Pandas. We perform numeric
# conversion after load, via the
# coerceToNumeric function.
# pd.read_csv wants the date columns
# to be specified separately.
vttypes, dtypes = loadtables.columnTypes(vartable, toload)
datecols = [c.name for c, t in zip(toload, vttypes)
if t in (util.CTYPES.date, util.CTYPES.time)]
# If we think there might be bad data
# in the input, only date/time columns
# are converted during load, and we
# perform numeric conversion after
# load, via the coerceToNumeric
# function.
if not trustTypes:
dtypes = None
# input may or may not
# have a header row
if header: header = 0
......@@ -838,6 +844,7 @@ def loadFile(fname,
header=header,
names=allcolnames,
index_col=index,
dtype=dtypes,
usecols=shouldLoad,
parse_dates=datecols,
infer_datetime_format=True,
......@@ -859,18 +866,23 @@ def loadFile(fname,
log.debug('Processing chunk %i (kept %i / %i rows)',
i + 1, len(df), nrows)
# If not trustTypes, we manually convert
# each column to its correct type.
#
# We have to do this after load, as
# pd.read_csv will raise an error if
# a column that is specified as
# numeric contains non-numeric data.
# So we coerce data types after the
# data has been loaded. This causes
# non-numeric data to be set to nan.
cfunc = ft.partial(coerceToNumeric, vartable)
series = [df[c.name] for c in toload]
series = pool.starmap(cfunc, zip(series, toload))
if not trustTypes:
cfunc = ft.partial(coerceToNumeric, vartable)
series = [df[c.name] for c in toload]
series = pool.starmap(cfunc, zip(series, toload))
for col, s in zip(toload, series):
df.loc[:, col.name] = s
for col, s in zip(toload, series):
df.loc[:, col.name] = s
fdata.append(df)
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment