exporting_tsv.py 9.57 KB
Newer Older
1
2
#!/usr/bin/env python
#
3
# exporting_tsv.py - Export data to a TSV file.
4
5
6
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
7
8
"""This module provides the :func:`exportTSV` and :func:`exportCSV` functions,
which export the data contained in a :class:`.DataTable` to a TSV or CSV file.
9
"""
10
11


12
13
14
15
import functools as ft
import os.path   as op
import              os
import              logging
16
17
18
19
20
21
22
23
24
25
26
27

import numpy            as np
import pandas           as pd
import pandas.api.types as pdtypes

from . import util
from . import custom


log = logging.getLogger(__name__)


28
29
30
31
32
33
NUM_ROWS = 10000
"""Default number of rows to export at a time by :func:`exportTSV` - the
default value for its ``numRows`` argument.
"""


34
35
36
37
38
39
40
41
@custom.exporter('csv')
def exportCSV(dtable, outfile, **kwargs):
    """Export data to a CSV-style file.

    This function is identical to the :func:`exportTSV`, except that the
    default value for the `sep`` argument is a ``','`` instead of a ``'\t'``.
    """

42
43
44
45
    sep = kwargs.pop('sep', None)
    if sep is None:
        sep = ','

46
47
48
    exportTSV(dtable, outfile, sep=sep, **kwargs)


49
50
51
52
53
54
55
56
57
@custom.exporter('tsv')
def exportTSV(dtable,
              outfile,
              sep=None,
              missingValues=None,
              dateFormat=None,
              timeFormat=None,
              formatters=None,
              numRows=None,
58
              nonNumericFile=None,
59
60
61
              **kwargs):
    """Export data to a TSV-style file.

62
63
64
65
    This may be parallelised by row - chunks of ``numRows`` rows will be
    saved to separate temporary output files in parallel, and then concatenated
    afterwards to produce the final output file.

66
    :arg dtable:         :class:`.DataTable` containing the data
67

68
    :arg outfile:        File to output to
69

70
    :arg sep:            Separator character to use. Defaults to `'\\t'`
71

72
73
    :arg missingValues:  String to use for missing/NA values. Defaults to the
                         empty string.
74

75
    :arg dateFormat:     Name of formatter to use for date columns.
76

77
    :arg timeFormat:     Name of formatter to use for time columns.
78

79
80
81
    :arg formatters:     Dict of ``{ [vid|column] : formatter }`` mappings,
                         specifying custom formatters to use for specific
                         variables.
82

83
84
    :arg numRows:        Number of rows to write at a time. Defaults to
                         :attr:`NUM_ROWS`.
85
86
87

    :arg nonNumericFile: If provided, non-numeric columns (after formatting)
                         are saved to this file instead of to ``outfile``
88
    """
89

90
    if sep           is None: sep           = '\t'
91
92
93
94
    if missingValues is None: missingValues = ''
    if dateFormat    is None: dateFormat    = 'default'
    if timeFormat    is None: timeFormat    = 'default'
    if formatters    is None: formatters    = {}
95
    if numRows       is None: numRows       = NUM_ROWS
96

97
98
    # We're going to output each chunk of
    # subjects to a separate file (in
99
    # parallel), and then cat the files
100
    # together afterwards
101
102
103
104
105
    index      = dtable.index
    nchunks    = int(np.ceil(len(index) / numRows))
    idxchunks  = [index[i:i + numRows] for i in
                  range(0, len(index), numRows)]
    subtables  = [dtable.subtable(rows=c) for c in idxchunks]
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
    outfiles   = ['{}_{}'.format(outfile, i) for i in range(nchunks)]

    if nonNumericFile is not None:
        nnfiles = ['{}_{}'.format(nonNumericFile, i) for i in range(nchunks)]
    else:
        nnfiles = [None] * nchunks

    # write each chunk in parallel
    args = zip(subtables,
               outfiles,
               nnfiles,
               [True] + [False] * (nchunks - 1),
               range(nchunks))
    func = ft.partial(writeDataFrame,
                      sep=sep,
                      missingValues=missingValues,
122
123
124
                      dateFormat=dateFormat,
                      timeFormat=timeFormat,
                      formatters=formatters)
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

    with dtable.pool() as pool:
        pool.starmap(func, args)

    # concatenate the chunks to
    # form the final output file
    if len(outfiles) == 1:
        os.rename(outfiles[0], outfile)
    else:
        util.cat(outfiles, outfile)

    if nonNumericFile is not None and all([op.exists(f) for f in nnfiles]):
        if len(nnfiles) == 1:
            os.rename(nnfiles[0], nonNumericFile)
        else:
            util.cat(nnfiles, nonNumericFile)

    # remove intermediate files
    for f in outfiles + nnfiles:
        if f is not None and op.exists(f):
            os.remove(f)


def writeDataFrame(dtable,
                   outfile,
                   nonNumericFile,
                   header,
                   chunki,
                   sep,
                   missingValues,
155
156
157
158
159
160
161
162
163
164
165
                   dateFormat,
                   timeFormat,
                   formatters):
    """Writes all of the data in ``dtable`` to ``outfile``.

    Called by :func:`exportTSV` to output one chunk of data.

    :arg dtable:         :class:`.DataTable` containing the data

    :arg outfile:        File to output to

166
167
168
    :arg nonNumericFile: If not ``None``, non-numeric columns (after
                         formatting) are saved to this file instead of to
                         ``outfile``.
169
170
171
172
173

    :arg header:         If ``True``, write the header row (column names).

    :arg chunki:         Chunk index (used for logging)

174
    :arg sep:            Separator character to use.
175

176
    :arg missingValues:  String to use for missing/NA values.
177

178
179
180
181
182
183
184
185
    :arg dateFormat:     Name of formatter to use for date columns.

    :arg timeFormat:     Name of formatter to use for time columns.

    :arg formatters:     Dict of ``{ [vid|column] : formatter }`` mappings,
                         specifying custom formatters to use for specific
                         variables.
    """
186

187
188
189
190
    # If nonNumericFile is specified, we
    # store the names of all numeric and
    # non-numeric columns here so we can
    # figure out which columns to put
191
192
193
    # where. We run the columns through
    # formatting before deciding whether
    # they are numeric or non-numeric.
194
195
196
    numericCols    = []
    nonNumericCols = []

197
    columns = dtable.dataColumns
198
    towrite = pd.DataFrame(index=dtable.index)
199

Paul McCarthy's avatar
Paul McCarthy committed
200
201
    log.info('Writing %u columns and %u rows [chunk %u] to %s ...',
             len(columns), len(dtable), chunki, outfile)
202
203
204
205
206

    # Format every column, and
    # separate out into numeric
    # and non-numeric
    for col in columns:
207
        name   = col.name
208
209
        series = formatColumn(
            col, dtable, dateFormat, timeFormat, formatters, chunki)
210
211
212
213
214
215
216
217
218

        towrite[name] = series

        # now we can tell whether this column
        # is numeric or non- numeric - store
        # its name accordingly
        if pdtypes.is_numeric_dtype(series): numericCols   .append(name)
        else:                                nonNumericCols.append(name)

219
    # Do we have any non-numeric columns?
220
221
222
    if (nonNumericFile is None) or (len(nonNumericCols) == 0):
        numericChunk   = towrite
        nonNumericFile = None
223

224
    else:
225
226
227
        numericChunk    = towrite[numericCols]
        nonNumericChunk = towrite[nonNumericCols]

228
229
230
231
232
        log.debug('Redirecting %i non-numeric columns to %s '
                  '(remaining %i columns will be written to %s)',
                  len(nonNumericCols), nonNumericFile,
                  len(numericCols),    outfile)

233
234
    if header: idcol = towrite.index.name
    else:      idcol = None
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

    numericChunk.to_csv(outfile,
                        sep=sep,
                        na_rep=missingValues,
                        header=header,
                        index_label=idcol)

    if nonNumericFile is not None:
        nonNumericChunk.to_csv(nonNumericFile,
                               sep=sep,
                               na_rep=missingValues,
                               header=header,
                               index_label=idcol)


250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def formatColumn(col,
                 dtable,
                 dateFormat,
                 timeFormat,
                 formatters,
                 chunki):
    """Formats the data for the specified column.

    :arg col:        :class:`.Column` to format

    :arg dtable:     :class:`.DataTable` containing the data

    :arg dateFormat: Name of formatter to use for date columns.

    :arg timeFormat: Name of formatter to use for time columns.

    :arg formatters: Dict of ``{ [vid|column] : formatter }`` mappings,
                     specifying custom formatters to use for specific
                     variables.

    :arg chunki:     Output chunk index (used for logging).

    :returns:        ``pandas.Series`` instance containing the formatted data.
    """

275
    vid      = col.basevid
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
    vartable = dtable.vartable
    series   = dtable[:, col.name]

    # formatters can be specified
    # by VID or by column name
    formatter = formatters.get(vid, None)
    if formatter is None:
        formatter = formatters.get(col.name, None)

    if vid in vartable.index: vtype = vartable['Type'][vid]
    else:                     vtype = None

    # fall back to date/time formatting
    # if relevant for this column
    if formatter is None:
        if   vtype == util.CTYPES.date:
            formatter = dateFormat
        elif vtype == util.CTYPES.time or \
             pdtypes.is_datetime64_any_dtype(series):
            formatter = timeFormat

    if formatter is not None:
        log.debug('Formatting column %s [chunk %u] with %s formatter',
                  col.name, chunki, formatter)
        series = custom.runFormatter(formatter, dtable, col, series)

302
303
304
305
306
    # apply column-specific fill
    # value, if there is one
    if col.fillval is not None:
        series.fillna(value=col.fillval, inplace=True)

307
    return series