Commit e2add4c4 authored by Paul McCarthy 🚵
Browse files

RF,BK: Intending to move column formatting/non-numeric split out of the

exporting code and into a prior step.
parent ffc576f1
......@@ -30,7 +30,7 @@ To ensure that the ``funpack`` command line help is nicely formatted, all
plugin functions should have a docstring of the form::
\"\"\"functionSignature(args)
Short description of function.
Short description of function on a single line.
Extended description
\"\"\"
......
......@@ -41,6 +41,67 @@ def exportData(dtable, outfile, fileFormat, **kwargs):
custom.runExporter(fileFormat, dtable, outfile, **kwargs)
def formatColumn(col,
                 dtable,
                 dateFormat,
                 timeFormat,
                 formatters,
                 chunki):
    """Formats the data for the specified column.

    A formatter is looked up in ``formatters``, first by the variable ID of
    the column, and then by the column name. If neither is present,
    ``dateFormat`` is used for date-typed variables, and ``timeFormat`` for
    time-typed variables or for any column with a datetime64 dtype. Columns
    which match none of these are not passed through a formatter.

    The column fill value, if there is one, is then substituted for missing
    values. The fill produces a new series rather than operating in place,
    so the fill step cannot write through into the data held by ``dtable``.

    :arg col:        :class:`.Column` to format
    :arg dtable:     :class:`.DataTable` containing the data
    :arg dateFormat: Name of formatter to use for date columns.
    :arg timeFormat: Name of formatter to use for time columns.
    :arg formatters: Dict of ``{ [vid|column] : formatter }`` mappings,
                     specifying custom formatters to use for specific
                     variables.
    :arg chunki:     Output chunk index (used for logging).
    :returns:        ``pandas.Series`` instance containing the formatted data.
    """

    vid      = col.basevid
    vartable = dtable.vartable
    series   = dtable[:, col.name]

    # formatters can be specified
    # by VID or by column name
    formatter = formatters.get(vid, None)
    if formatter is None:
        formatter = formatters.get(col.name, None)

    # the variable may not have an
    # entry in the variable table
    if vid in vartable.index: vtype = vartable['Type'][vid]
    else:                     vtype = None

    # fall back to date/time formatting
    # if relevant for this column
    if formatter is None:
        if vtype == util.CTYPES.date:
            formatter = dateFormat
        elif vtype == util.CTYPES.time or \
             pdtypes.is_datetime64_any_dtype(series):
            formatter = timeFormat

    if formatter is not None:
        log.debug('Formatting column %s [chunk %u] with %s formatter',
                  col.name, chunki, formatter)
        series = custom.runFormatter(formatter, dtable, col, series)

    # apply column-specific fill value, if
    # there is one. This is deliberately not
    # done in place - when no formatter has
    # been run, series is whatever the data
    # table returned, and an in-place fill
    # could modify the table data itself.
    if col.fillval is not None:
        series = series.fillna(value=col.fillval)

    return series
@custom.formatter('default')
def defaultDateTimeFormat(dtable, column, series):
"""Default format converter for date and time columns. """
......
......@@ -9,12 +9,10 @@
import logging
import h5py
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
import h5py
import numpy as np
import pandas as pd
from . import util
from . import custom
......@@ -120,33 +118,20 @@ def exportPandasStyle(dtable,
else: s.append(key, towrite, format='table')
def exportFunpackStyle(dtable,
outfile,
key,
dateFormat=None,
timeFormat=None):
def exportFunpackStyle(dtable, outfile, key, **kwargs):
"""Save the data to a ``funpack``-style HDF5 file.
Each column is saved as a separate data set, and named according to the
column name. All columns are saved under a single group named according to
the ``key`` argument.
:arg dtable: :class:`.DataTable` containing the data
:arg outfile: File to output to
:arg key: Name to give the HDF5 group.
:arg dtable: :class:`.DataTable` containing the data
:arg dateFormat: Name of formatter to use for date columns.
:arg outfile: File to output to
:arg timeFormat: Name of formatter to use for time columns.
:arg key: Name to give the HDF5 group.
"""
if dateFormat is None: dateFormat = 'default'
if timeFormat is None: timeFormat = 'default'
vartable = dtable.vartable
log.info('Writing %u columns in to %s ...',
len(dtable.allColumns), outfile)
......@@ -158,27 +143,8 @@ def exportFunpackStyle(dtable,
for col in dtable.dataColumns:
name = '/'.join((key, col.name))
series = dtable[:, col.name]
vid = col.basevid
formatter = None
if vid in vartable.index: vtype = vartable['Type'][vid]
else: vtype = None
if vtype == util.CTYPES.date:
formatter = dateFormat
elif vtype == util.CTYPES.time or \
pdtypes.is_datetime64_any_dtype(series):
formatter = timeFormat
if formatter is not None:
log.debug('Formatting column %s with %s formatter',
name, formatter)
series = custom.runFormatter(
formatter, dtable, col, series)
else:
series = np.asarray(series)
name = '/'.join((key, col.name))
series = np.asarray(dtable[:, col.name])
if np.issubdtype(series.dtype, np.number):
dtype = None
......
......@@ -67,7 +67,7 @@ def exportTSV(dtable,
:arg outfile: File to output to
:arg sep: Separator character to use. Defaults to `'\\t'`
:arg sep: Separator character to use. Defaults to ``'\\t'``
:arg missingValues: String to use for missing/NA values. Defaults to the
empty string.
......@@ -245,63 +245,3 @@ def writeDataFrame(dtable,
na_rep=missingValues,
header=header,
index_label=idcol)
def formatColumn(col,
                 dtable,
                 dateFormat,
                 timeFormat,
                 formatters,
                 chunki):
    """Formats the data for the specified column.

    A formatter is looked up in ``formatters``, first by the variable ID of
    the column, and then by the column name. If neither is present,
    ``dateFormat`` is used for date-typed variables, and ``timeFormat`` for
    time-typed variables or for any column with a datetime64 dtype. Columns
    which match none of these are not passed through a formatter.

    The column fill value, if there is one, is then substituted for missing
    values. The fill produces a new series rather than operating in place,
    so the fill step cannot write through into the data held by ``dtable``.

    :arg col:        :class:`.Column` to format
    :arg dtable:     :class:`.DataTable` containing the data
    :arg dateFormat: Name of formatter to use for date columns.
    :arg timeFormat: Name of formatter to use for time columns.
    :arg formatters: Dict of ``{ [vid|column] : formatter }`` mappings,
                     specifying custom formatters to use for specific
                     variables.
    :arg chunki:     Output chunk index (used for logging).
    :returns:        ``pandas.Series`` instance containing the formatted data.
    """

    vid      = col.basevid
    vartable = dtable.vartable
    series   = dtable[:, col.name]

    # formatters can be specified
    # by VID or by column name
    formatter = formatters.get(vid, None)
    if formatter is None:
        formatter = formatters.get(col.name, None)

    # the variable may not have an
    # entry in the variable table
    if vid in vartable.index: vtype = vartable['Type'][vid]
    else:                     vtype = None

    # fall back to date/time formatting
    # if relevant for this column
    if formatter is None:
        if vtype == util.CTYPES.date:
            formatter = dateFormat
        elif vtype == util.CTYPES.time or \
             pdtypes.is_datetime64_any_dtype(series):
            formatter = timeFormat

    if formatter is not None:
        log.debug('Formatting column %s [chunk %u] with %s formatter',
                  col.name, chunki, formatter)
        series = custom.runFormatter(formatter, dtable, col, series)

    # apply column-specific fill value, if
    # there is one. This is deliberately not
    # done in place - when no formatter has
    # been run, series is whatever the data
    # table returned, and an in-place fill
    # could modify the table data itself.
    if col.fillval is not None:
        series = series.fillna(value=col.fillval)

    return series
......@@ -125,6 +125,7 @@ def main(argv=None):
dryrun.doDryRun(dtable, unknowns, uncategorised, drop, args)
else:
doCleanAndProcess( dtable, args)
doFormat( dtable, args)
doUnknownsExport( dtable, args, unknowns, uncategorised)
doExport( dtable, args)
doICD10Export( args)
......@@ -267,6 +268,28 @@ def doCleanAndProcess(dtable, args):
processing.processData(dtable)
def doFormat(dtable, args):
    """Data formatting stage, called just prior to :func:`doExport`.

    This stage is intended to apply any variable-specific formatting that
    has been specified, and to format date/time columns. At present it is
    a stub - it only reads the relevant options from ``args`` and
    initialises an empty list of subtables; ``dtable`` is not touched, and
    nothing is returned.

    :arg dtable: :class:`.DataTable` containing the data
    :arg args:   :class:`argparse.Namespace` object containing command line
                 arguments. The ``date_format``, ``time_format`` and
                 ``tsv_var_format`` attributes are read.
    """

    # formatting options, read in
    # one go from the command line
    dateFormat, timeFormat, formatters = (args.date_format,
                                          args.time_format,
                                          args.tsv_var_format)

    # list of subtables for the columns
    # which need formatting - all date/time
    # columns, and any for which a custom
    # formatter has been provided. Nothing
    # is added to it yet.
    subtables = list()
def doExport(dtable, args):
"""Data export stage.
......@@ -289,14 +312,11 @@ def doExport(dtable, args):
# General export options
fileFormat=args.format,
dateFormat=args.date_format,
timeFormat=args.time_format,
numRows=args.num_rows,
# TSV options
sep=args.tsv_sep,
missingValues=args.tsv_missing_values,
formatters=args.tsv_var_format,
nonNumericFile=args.non_numeric_file,
# HDF5 options
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment