Commit b01c3f8d authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

ENH: New --trust_types argument to choose between trusting the input, and

manually casting/coercing it all. Not implemented yet
parent d15f096f
......@@ -69,6 +69,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
('Import options', [
(('ia', 'import_all'), {'action' : 'store_true'}),
(('r', 'remove_unknown'), {'action' : 'store_true'}),
(('tt', 'trust_types'), {'action' : 'store_true'}),
(('s', 'subject'), {'action' : 'append'}),
(('v', 'variable'), {'action' : 'append'}),
(('co', 'column'), {'action' : 'append'}),
......@@ -241,6 +242,12 @@ CLI_ARGUMENT_HELP = {
'Remove variables which are not listed in variable table. Implied if '
'variables are specified using --variable or --category.',
'trust_types' :
'Assume that columns in the input data with a known numeric type do not '
'contain any bad/unparseable values. Using this flag will improve import '
'performance, but will cause funpack to crash if the input file(s) do '
'contain bad values.',
'subject' :
'Subject ID, range, comma-separated list, or file containing a list of '
'subject IDs, or variable expression, denoting subjects to include. Can '
......
......@@ -73,6 +73,7 @@ def importData(datafiles,
subjects=None,
encoding=None,
unknownVars=None,
trustTypes=False,
removeUnknown=True,
indexes=None,
mergeAxis=None,
......@@ -117,6 +118,11 @@ def importData(datafiles,
:arg unknownVars: List of :class:`.Column` objects representing
unknown variables
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg removeUnknown: If ``True`` (the default), any variables in
``datafile`` which are not in ``varfile`` are not
loaded. Ignored if ``variables`` or ``categories``
......@@ -191,6 +197,7 @@ def importData(datafiles,
mergeAxis=mergeAxis,
mergeStrategy=mergeStrategy,
loaders=loaders,
trustTypes=trustTypes,
lowMemory=lowMemory,
workDir=workDir,
pool=pool,
......@@ -548,6 +555,7 @@ def loadData(datafiles,
subjects=None,
encoding=None,
indexes=None,
trustTypes=False,
mergeAxis=None,
mergeStrategy=None,
loaders=None,
......@@ -579,6 +587,9 @@ def loadData(datafiles,
the position of the column to use as the index.
Defaults to 0 (the first column).
:arg trustTypes: Assume that columns with known data type do not contain
any bad/unparseable values.
:arg mergeAxis: Merging axis to use when loading multiple data files -
see the :func:`mergeData` function. Defaults to
:attr:`MERGE_AXIS`.
......@@ -661,6 +672,7 @@ def loadData(datafiles,
nrows=nrows,
subjects=subjects,
encoding=fencoding,
trustTypes=trustTypes,
lowMemory=lowMemory,
workDir=workDir,
pool=pool,
......@@ -699,53 +711,58 @@ def loadFile(fname,
nrows=None,
subjects=None,
encoding=None,
trustTypes=False,
lowMemory=False,
workDir=None,
pool=None,
mgr=None):
"""Loads data from the specified file.
:arg fname: Path to the data file
:arg fname: Path to the data file
:arg vartable: Variable table
:arg vartable: Variable table
:arg header: ``True`` if the file has a header row, ``False``
otherwise.
:arg header: ``True`` if the file has a header row, ``False`` otherwise.
:arg dialect: File dialect (see :func:`.fileinfo`).
:arg dialect: File dialect (see :func:`.fileinfo`).
:arg allcols: Sequence of :class:`.Column` objects describing all
columns in the file.
:arg allcols: Sequence of :class:`.Column` objects describing all columns
in the file.
:arg toload: Sequence of :class:`.Column` objects describing the
columns that should be loaded, as generated by
:func:`columnsToLoad`.
:arg toload: Sequence of :class:`.Column` objects describing the columns
that should be loaded, as generated by
:func:`columnsToLoad`.
:arg index: Column position of index column (starting from 0).
Defaults to 0.
:arg index: Column position of index column (starting from 0). Defaults
to 0.
:arg nrows: Number of rows to read at a time. Defaults to
:attr:`NUM_ROWS`.
:arg nrows: Number of rows to read at a time. Defaults to
:attr:`NUM_ROWS`.
:arg subjects: List of subjects to include.
:arg subjects: List of subjects to include.
:arg encoding: Character encoding (or sequence of encodings, one
for each data file). Defaults to ``latin1``.
:arg encoding: Character encoding (or sequence of encodings, one
for each data file). Defaults to ``latin1``.
:arg trustTypes: Assume that columns with known data type do not contain
any bad/unparseable values.
:arg lowMemory: Store intermediate results on disk to save RAM (see
:mod:`.storage`).
:arg lowMemory: Store intermediate results on disk to save RAM (see
:mod:`.storage`).
:arg workDir: Directory to store intermediate files (see
:mod:`.storage`). Only relevant when
``lowMemory is True``
:arg workDir: Directory to store intermediate files (see
:mod:`.storage`). Only relevant when
``lowMemory is True``
:arg pool: ``multiprocessing.Pool`` object for running tasks in
parallel. Only relevant when ``lowMemory is True``.
:arg pool: ``multiprocessing.Pool`` object for running tasks in
parallel. Only relevant when ``lowMemory is True``.
:arg mgr: ``multiprocessing.Manager`` to use for sharing state.
Only relevant when ``lowMemory is True``.
:arg mgr: ``multiprocessing.Manager`` to use for sharing state.
Only relevant when ``lowMemory is True``.
:returns: A ``pandas.DataFrame``, or a
:class:`.HDFStoreCollection`, containing the data.
:returns: A ``pandas.DataFrame``, or a
:class:`.HDFStoreCollection`, containing the data.
"""
ownPool = pool is None
......
......@@ -211,6 +211,7 @@ def doImport(args, pool, mgr):
encoding=args.encoding,
indexes=args.index,
unknownVars=unknowns,
trustTypes=args.trust_types,
removeUnknown=removeUnknown,
mergeAxis=args.merge_axis,
mergeStrategy=args.merge_strategy,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment