Commit 30dd19b1 authored by Paul McCarthy 🚵
Browse files

RF: Changed default tsv export behaviour so that it parallelises. Disabled if --num_jobs is set to 1.
parent 6be2002e
......@@ -121,7 +121,6 @@ CLI_ARGUMENTS = collections.OrderedDict((
(('oi', 'output_id_column'), {}),
(('edf', 'date_format'), {'default' : 'default'}),
(('etf', 'time_format'), {'default' : 'default'}),
(('nr', 'num_rows'), {'type' : int}),
(('uf', 'unknown_vars_file'), {}),
(('imf', 'icd10_map_file'), {}),
(('def', 'description_file'), {}),
......@@ -131,6 +130,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
(('ts', 'tsv_sep'), {'default' : DEFAULT_TSV_SEP}),
(('tm', 'tsv_missing_values'), {'default' : ''}),
(('nn', 'non_numeric_file'), {}),
(('nr', 'num_rows'), {'type' : int}),
(('tvf', 'tsv_var_format'), {'nargs' : 2,
'metavar' : ('VID', 'FORMATTER'),
'action' : 'append'})]),
......@@ -346,9 +346,6 @@ CLI_ARGUMENT_HELP = {
'time_format' :
'Formatter to use for time variables (default: "default").',
'num_rows' :
'Number of rows to write at a time.',
'unknown_vars_file' :
'Save list of unknown variables/columns to file. Only applicable if '
'--import_all is enabled.',
......@@ -369,6 +366,8 @@ CLI_ARGUMENT_HELP = {
'tsv_missing_values' :
'String to use for missing values in output file (default: empty '
'string).' ,
'num_rows' :
'Number of rows to write at a time. Ignored if --num_jobs is set to 1.',
'non_numeric_file' :
'Export all non-numeric columns (after formatting) to this file instead '
'of the primary output file.',
......@@ -389,7 +388,8 @@ CLI_ARGUMENT_HELP = {
'low_memory' : 'Deprecated, has no effect.',
'work_dir' : 'Deprecated, has no effect.',
'log_file' : 'Save log messages to file.',
'num_jobs' : 'Maximum number of jobs to run in parallel. '
'num_jobs' : 'Maximum number of jobs to run in parallel. Set to 1 '
'to disable parallelisation. '
'(default: number of available CPUs [{} on '
'this platform]).'.format(mp.cpu_count()),
'pass_through' : 'Do not perform any cleaning or processing on the data - '
......
......@@ -26,6 +26,12 @@ TSV_SEP = '\t'
"""Default separator string to use in TSV-style output files."""
NUM_ROWS = 10000
"""Default number of rows to export at a time by :func:`exportTSV` - the
default value for its ``numRows`` argument.
"""
@custom.exporter('tsv')
def exportTSV(dtable,
outfile,
......@@ -70,18 +76,19 @@ def exportTSV(dtable,
specifying custom formatters to use for specific
variables.
:arg numRows: Number of rows to write at a time. Defaults to writing
all rows in one go.
:arg numRows: Number of rows to write at a time. Defaults to
:attr:`NUM_ROWS`.
:arg nonNumericFile: If provided, non-numeric columns (after formatting)
are saved to this file instead of to ``outfile``
"""
if sep is None: sep = TSV_SEP
if missingValues is None: missingValues = ''
if dateFormat is None: dateFormat = 'default'
if timeFormat is None: timeFormat = 'default'
if formatters is None: formatters = {}
if numRows is None: numRows = len(dtable)
if numRows is None: numRows = NUM_ROWS
# We're going to output each chunk of
# subjects to a separate file (in
......
......@@ -104,7 +104,8 @@ def main(argv=None):
# We need to initialise icd10
# before the worker processes
# are created
# are created, so its state is
# shared by all processes.
icd10.initialise(mgr)
pool = mp.Pool(args.num_jobs)
......@@ -376,6 +377,13 @@ def doExport(dtable, args):
if exprs is not None:
subjects = None
# If exporting to TSV, and not parallelising,
# we export the entire file in one go. Because
# what's the point in chunked export if we're
# not parallelising across chunks?
if args.num_jobs <= 1:
args.num_rows = len(dtable)
with util.timed('Data export', log):
exporting.exportData(
dtable,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment