Commit a0c4ab45 authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'rf/unknown_unprocessed' into 'master'

Rf/unknown unprocessed

See merge request fsl/ukbparse!115
parents ae664cb5 c281dea3
Pipeline #3627 passed with stages
in 11 minutes and 44 seconds
......@@ -2,6 +2,9 @@
set -e
apt-get update -y
apt-get install -y bsdmainutils
pip install wheel
python setup.py sdist
python setup.py bdist_wheel
......
#!/bin/bash
set -e
apt-get update -y
apt-get install -y bsdmainutils
pip install --upgrade pip setuptools
pip install -r requirements.txt
pip install -r requirements-demo.txt
......
......@@ -96,6 +96,12 @@ test:3.6:
<<: *test_template
test:3.7:
stage: test
image: python:3.7
<<: *test_template
#############
# Style stage
#############
......
......@@ -2,6 +2,39 @@
======================
0.17.0 (Monday 22nd April 2019)
-------------------------------
Added
^^^^^
* New ``--non_numeric_file`` option allows non-numeric columns to be saved to
a separate file (TSV export only).
* Built-in ``fmrib.cfg`` configuration file, which can be used via ``-cfg
fmrib``.
Changed
^^^^^^^
* The file generated by ``--unknown_vars_file`` now includes variables which
are known, but are not in an existing category, and do not have any cleaning
or processing rules specified for them.
* Built-in categories have been updated.
Fixed
^^^^^
* A bug in the column names generated for binarised ICD10 categorical codes
has been fixed. This bug would potentially have resulted in collisions
between column names for different ICD10 codes.
0.16.0 (Friday 22nd March 2019)
-------------------------------
......
......@@ -6,7 +6,7 @@
#
__version__ = '0.16.0'
__version__ = '0.17.0'
"""The ``ukbparse`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
......@@ -154,7 +154,7 @@ def _runChildValues(dtable, exprs, cvals, vid):
if not dtable.present(vid):
return
# TODO I'm currently evaluating expressions
# NOTE I'm currently evaluating expressions
# *within visit* and *within instance*, i.e.
# for a variable at a given visit/instance,
# i am evaluating the expression on the
......
......@@ -137,6 +137,7 @@ CLI_ARGUMENTS = collections.OrderedDict((
('TSV export options', [
(('ts', 'tsv_sep'), {'default' : DEFAULT_TSV_SEP}),
(('tm', 'tsv_missing_values'), {'default' : ''}),
(('nn', 'non_numeric_file'), {}),
(('tvf', 'tsv_var_format'), {'nargs' : 2,
'metavar' : ('VID', 'FORMATTER'),
'action' : 'append'})]),
......@@ -184,10 +185,13 @@ CLI_DESCRIPTIONS = {
'Export options' :
'The --unknown_vars_file (only active if --import_all is also active)\n'
'allows a file to be saved containing information about columns which\n'
'were in the input data, but not in the variable table. It contains\n'
'four columns - the column name, the originating input file, whether\n'
'the column passed the processing stage (e.g. sparsity/redundancy\n'
'checks), and whether the column was exported.',
'were in the input data, but were either not in the variable table, or\n'
'were uncategorised and did not have any cleaning/processing rules\n'
'specified. It contains five columns - the column name, the originating\n'
'input file, the reason the column is being included (either unknown\n'
'or uncategorised/unprocessed), whether the column passed the processing\n'
'stage (e.g. sparsity/redundancy checks), and whether the column was\n'
'exported.',
}
......@@ -361,11 +365,12 @@ CLI_ARGUMENT_HELP = {
'tsv_sep' :
'Column separator string to use in output file (default: "{}")'.format(
DEFAULT_TSV_SEP.replace('\t', '\\t')),
'tsv_missing_values' :
'String to use for missing values in output file (default: empty '
'string).' ,
'non_numeric_file' :
'Export all non-numeric columns (after formatting) to this file instead '
'of the primary output file.',
'tsv_var_format' :
'Apply custom formatter to the specified variable.',
......@@ -771,6 +776,15 @@ def loadConfigFile(cfgfile):
argv = []
# If the specified config file does
# not exist, assume it is a reference
# to a built-in config file
if not op.exists(cfgfile):
if not cfgfile.endswith('.cfg'):
cfgfile = cfgfile + '.cfg'
moddir = op.dirname(op.abspath(__file__))
cfgfile = op.join(moddir, 'configs', cfgfile)
log.debug('Loading arguments from configuration file %s', cfgfile)
with open(cfgfile, 'rt') as f:
......
overwrite
import_all
unknown_vars_file unknowns.tsv
non_numeric_file non_numerics.tsv
icd10_map_file icd10_codes.tsv
plugin_file fmrib
loader FMRIB_internal_info.txt FMRIBImaging
date_format FMRIBImagingDate
time_format FMRIBImagingTime
category 1
category 2
category 3
category 10
category 11
category 12
category 13
category 14
category 20
category 21
category 22
category 23
category 24
category 25
category 30
category 32
category 50
category 51
......@@ -131,6 +131,8 @@ class PluginRegistry(object):
# it is a direct reference to a
# file in the plugins directory.
if not op.exists(filename):
if not filename.endswith('.py'):
filename = filename + '.py'
moddir = op.dirname(op.abspath(__file__))
filename = op.join(moddir, 'plugins', filename)
......
......@@ -2,20 +2,20 @@ ID Category Variables
1 age, sex, brain MRI protocol, Phase 31,34,22200,25780
2 genetics 21000,22000:22125,22201:22325
3 early life factors 52,129,130,1677,1687,1697,1737,1767,1777,1787,20022
10 lifestyle and environment - general 3:6,189,670,680,699,709,728,738,1031,1797,1807,1835,1845,1873,1883,2139,2149,2375,2385,2395,2405,2267,2277,2714:10:2834,2946,3526,3536,3546,3581,3591,3659,3669,3700,3710,3720,3829,3839,3849,3872,3882,3912,3942,3972,3982,4501,4674,4825,4836,5057,6138,6142,6139:6141,6145:6146,6160,10016,10105,10114,10721,10722,10740,10749,10860,10877,10886,20074:20075,20110:20113,20118:20119,20121,22501,22599,22606,22700,22702,22704
10 lifestyle and environment - general 3:6,189,670,680,699,709,728,738,1031,1797,1807,1835,1845,1873,1883,2139,2149,2159,2375,2385,2395,2405,2267,2277,2714:10:2834,2946,3526,3536,3546,3581,3591,3659,3669,3700,3710,3720,3829,3839,3849,3872,3882,3912,3942,3972,3982,4501,4674,4825,4836,5057,6138,6142,6139:6141,6145:6146,6160,10016,10105,10114,10721,10722,10740,10749,10860,10877,10886,20074:20075,20110:20113,20118:20119,20121,22501,22599,22606,22700,22702,22704
11 lifestyle and environment - exercise and work 1001,1011,767,777,796,806,816,826,845,864,874,884,894,904,914,924,943,971,981,991,1021,1050:10:1220,2624,2634,3426,3637,3647,6143,6162,6164,10953,10962,10971,22604,22605,22607:22615,22620,22630,22631,22640:22655,104900,104910,104920
12 lifestyle and environment - food and drink 1289:10:1389,1408:10:1548,2654,3680,6144,10007,10723,10767,10776,10855,10912,20084:20094,20098:20109,100001:100009,100011:100019,100021:100025,100010:10:100560,100760:10:104670
13 lifestyle and environment - alcohol 1558:10:1628,2664,3731,3859,4407,4418,4429,4440,4451,4462,5364,10818,20095:20097,20117,100580:10:100740
14 lifestyle and environment - tobacco 1239:10:1279,2644,2867:10:2907,2926,2936,3159,3436:10:3506,5959,6157,6158,6183,6194,10115,10827,10895,20116,22506:22508
20 physical measures - general 46:50,1707,1717,1727,1747,1757,2306,3062:3065,3088,3089,3160,10691,10694:10696,20015,21001,21002,22400:22410,22412,22414,22427,23098:23130,23244:23289
21 physical measures - bone density and sizes 77,78,3083:3085,3143:3144,3148,4100:4101,4104:4106,4119:4120,4124:4125,4138:4141,4143:4146,23200:23243,23290:23320
22 physical measures - cardiac & blood vessels 93:95,102,4079,4080,4194,4195,4196,5983,5984,5986,5992,5993,6019,6020,6022,6024,6032:6034,6038,6039,12673:12687,12697,12698,12702,21021,22420:22426
23 hearing test 4229,4230,4232,4233,4236,4240:4244,4268:4270,4275:4277,10793,20019,20021,20060
24 eye test 5084:5089,5096:5119,5132:5135,5149,5155:5164,5181,5198,5221,5237,5251,5254:5257,5262:5265,5276,5292,5306,5324:5325,5327,6070,6072,20052,20055
25 physical activity measures 90002:90003,90011:90013,90015,90019:90083,90086:90089,90091:90150,90159:90177,90179,90182:90195
21 physical measures - bone density and sizes 77,78,3083:3085,3143:3144,3147:3148,4092,4095,4100:4101,4104:4106,4119:4120,4124:4125,4138:4141,4143:4146,23200:23243,23290:23320
22 physical measures - cardiac & blood vessels 93:95,102,4079,4080,4194,4195,4196,4205,5983,5984,5986,5992,5993,6014:6017,6019,6020,6022,6024,6032:6034,6038,6039,12673:12687,12697,12698,12702,21021,22420:22426
23 hearing test 4229,4230,4232,4233,4236,4240:4244,4268:4270,4275:4277,4849,10793,20019,20021,20060
24 eye test 5084:5091,5096:5119,5132:5136,5138:5149,5152,5155:5164,5181,5183,5198,5215,5221,5237,5251,5254:5259,5262:5267,5274,5276,5292,5306,5324:5328,6070,6072,20052,20055
25 physical activity measures 90002:90003,90010:90013,90015,90017:90177,90179,90181:90195
30 blood assays 74,30000:10:30300,30104,30112,30114,30172,30174,30242,30252,30254
31 brain IDPs 25000:25746,25761:25768,25781:25920
32 cognitive phenotypes 111,398:404,630,4250:4256,4258:4259,4282:4283,4285:4286,4288:4292,4957,4968,4979,4990,5001,5012,5556,5699,5779,5790,5866,10137:10141,10144,10146:10147,10241,10609,10612,20016,20018,20023,20082,20128:20157,20159,20169:2:20191,20193,20195,20196,20198,20200,20229,20230,20240,20242,20244:20248
50 health and medical history, health outcomes 84,87,92,134:137,2178,2188,2207:10:2257,2296,2316,2335:10:2365,2415,2443:10:2473,2492,2674,2684,2704,2844,2956:10:2986,3005,3079,3140,3393,3404,3414,3571,3606,3616,3627,3741,3751,3761,3773,3786,3799,3809,3894,3992,4012,4022,4041,4056,4067,4689,4700,4717,4728,4792,4803,4814,5408,5419,5430,5441,5452,5463,5474,5485,5496,5507,5518,5529,5540,5610,5832,5843,5855,5877,5890,5901,5912,5923,5934,5945,6119,6147,6148,6149,6150,6151,6152,6153,6154,6155,6159,6177,6179,6205,10004,10006,10854,20001:20011,22126:22181,22502:22505,22616,22618,22619,40001:41253,42000:42013
51 mental health self-report 1920:10:2110,4526,4537,4548,4559,4570,4581,4598,4609,4620,4631,4653,5375,5386,5663,5674,6156
99 miscellaneous 19,21,53,54,68,120,132,757,1647,2129,3137,12139,12140,12141,12187,12188,12223,12224,12253,12254,12623,12624,12651,12652,12663,12664,12671,12695,12699,12700,12704,12706,12848,12851,12854,20012:20014,20024,20049,20053:20054,20061:20062,20077,20078,20083,20114:20115,20158,20201:20227,20249:20253,21003,22499,22500,22600:22603,22617,22660:22664,25747:25753,40000,105010,105030,110005,110006
32 cognitive phenotypes 62,111,398:404,630,4250:4256,4258:4259,4282:4283,4285:4286,4288:4292,4294,4935,4957,4968,4979,4990,5001,5012,5556,5699,5779,5790,5866,10137:10141,10144:10147,10241,10609:10610,10612,20016,20018,20023,20082,20128:20157,20159,20165,20169:2:20191,20193,20195,20196,20198,20200,20229,20230,20240,20242,20244:20248
50 health and medical history, health outcomes 84,87,92,134:137,2178,2188,2207:10:2257,2296,2316,2335:10:2365,2415,2443:10:2473,2492,2674,2684,2694,2704,2844,2956:10:2986,3005,3079,3140,3393,3404,3414,3571,3606,3616,3627,3741,3751,3761,3773,3786,3799,3809,3894,3992,4012,4022,4041,4056,4067,4689,4700,4717,4728,4792,4803,4814,5408,5419,5430,5441,5452,5463,5474,5485,5496,5507,5518,5529,5540,5610,5832,5843,5855,5877,5890,5901,5912,5923,5934,5945,6119,6147,6148,6149,6150,6151,6152,6153,6154,6155,6159,6177,6179,6205,10004:10006,10854,20001:20011,22126:22181,22502:22505,22616,22618,22619,40001:41253,42000:42013
51 mental health self-report 1920:10:2110,4526,4537,4548,4559,4570,4581,4598,4609,4620,4631,4642,4653,5375,5386,5663,5674,6156
99 miscellaneous 19,21,53,54,68,120,132,757,1647,2129,3066,3137,4257,4293,4295,5987:5988,5991,10697,12139,12140,12141,12187,12188,12223,12224,12253,12254,12623,12624,12651,12652,12663,12664,12671,12695,12699,12700,12704,12706,12848,12851,12854,20012:20014,20024,20031:20032,20049,20053:20054,20061:20062,20077,20078,20083,20114:20115,20158,20201:20227,20249:20253,21003,22499,22500,22600:22603,22617,22660:22664,25747:25753,30002:10:30302,30003:10:30303,40000,105010,105030,110005,110006
Variable Process
40001 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value}')
40002 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value}')
40006 binariseCategorical(acrossInstances=True, acrossVisits=True, nameFormat='{vid}{value}')
41202 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}')
41204 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}')
40001 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}.0')
40002 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}.0')
40006 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}.0')
41202 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}.0')
41204 binariseCategorical(acrossInstances=True, nameFormat='{vid}{value}-{visit}.0')
all_independent removeIfSparse(minpres=51, maxcat=0.99)
all removeIfRedundant(0.99, 0.2)
......@@ -13,18 +13,21 @@ anything.
import ukbparse
def doDryRun(dtable, unknowns, dropped, args):
def doDryRun(dtable, unknowns, unprocessed, dropped, args):
"""Dry run.
Prints out information about the cleaning/processing that would be applied.
:arg dtable: :class:`.DataTable` containing the data
:arg unknowns: List of :class:`.Column` objects representing the
unknown columns.
:arg dropped: List of :class:`.Column` objects representing the
unknown columns.
:arg args: :class:`argparse.Namespace` object containing command line
arguments
:arg dtable: :class:`.DataTable` containing the data
:arg unknowns: List of :class:`.Column` objects representing the
unknown columns.
:arg unprocessed: A sequence of :class:`.Column` objects representing
columns which are uncategorised, and have no processing
or cleaning rules specified on them.
:arg dropped: List of :class:`.Column` objects representing
columns which were ignored/dropped on import.
:arg args: :class:`argparse.Namespace` object containing command
line arguments
"""
variables = dtable.variables
......@@ -36,19 +39,23 @@ def doDryRun(dtable, unknowns, dropped, args):
print(' [Using naive merge strategy - column/'
'variable count may not be accurate!]')
print(' Loaded columns: {}'.format(len(dtable.allColumns)))
print(' Loaded columns: {}'.format(len(dtable.allColumns)))
if args.noisy > 0:
print('\n'.join([' {}'.format(c.name) for c in dtable.allColumns]))
print(' Ignored columns: {}'.format(len(dropped)))
print(' Ignored columns: {}'.format(len(dropped)))
if args.noisy > 0:
print('\n'.join([' {}'.format(c.name) for c in dropped]))
print(' Unknown columns: {}'.format(len(unknowns)))
print(' Unknown columns: {}'.format(len(unknowns)))
if args.noisy > 0:
print('\n'.join([' {}'.format(c.name) for c in unknowns]))
print(' Loaded variables: {}'.format(len(variables)))
print(' Unprocessed/uncategorised columns: {}'.format(len(unprocessed)))
if args.noisy > 0:
print('\n'.join([' {}'.format(c.name) for c in unprocessed]))
print(' Loaded variables: {}'.format(len(variables)))
if args.noisy > 0:
print('\n'.join([' {}'.format(v) for v in variables]))
print()
......
......@@ -119,6 +119,8 @@ def exportData(dtable,
:arg fileFormat: File format to export to - the name of a ``@exporter``
plugin. If not provided, defaults to
:attr:`EXPORT_FORMAT`
All other arguments are passed through to the exporter plugin.
"""
if fileFormat is None: fileFormat = EXPORT_FORMAT
......
......@@ -35,35 +35,39 @@ def exportTSV(dtable,
timeFormat=None,
formatters=None,
numRows=None,
nonNumericFile=None,
**kwargs):
"""Export data to a TSV-style file.
:arg dtable: :class:`.DataTable` containing the data
:arg dtable: :class:`.DataTable` containing the data
:arg outfile: File to output to
:arg outfile: File to output to
:arg subjects: Sequence containing subjects (and order) to export.
:arg subjects: Sequence containing subjects (and order) to export.
:arg idcol: Name to use for the subject ID column
:arg idcol: Name to use for the subject ID column
:arg colnames: Sequence containing column names
:arg colnames: Sequence containing column names
:arg sep: Separator character to use. Defaults to
:attr:`TSV_SEP`
:arg sep: Separator character to use. Defaults to
:attr:`TSV_SEP`
:arg missingValues: String to use for missing/NA values. Defaults to the
empty string.
:arg missingValues: String to use for missing/NA values. Defaults to the
empty string.
:arg dateFormat: Name of formatter to use for date columns.
:arg dateFormat: Name of formatter to use for date columns.
:arg timeFormat: Name of formatter to use for time columns.
:arg timeFormat: Name of formatter to use for time columns.
:arg formatters: Dict of ``{ [vid|column] : formatter }`` mappings,
specifying custom formatters to use for specific
variables.
:arg formatters: Dict of ``{ [vid|column] : formatter }`` mappings,
specifying custom formatters to use for specific
variables.
:arg numRows: Number of rows to write at a time. Defaults to writing
all rows in one go.
:arg numRows: Number of rows to write at a time. Defaults to writing
all rows in one go.
:arg nonNumericFile: If provided, non-numeric columns (after formatting)
are saved to this file instead of to ``outfile``
"""
if sep is None: sep = TSV_SEP
if missingValues is None: missingValues = ''
......@@ -76,6 +80,15 @@ def exportTSV(dtable,
nchunks = int(np.ceil(len(subjects) / numRows))
vartable = dtable.vartable
# Created during the first iteration.
# If nonNumericFile is specified, we
# store the names of all numeric and
# non-numeric columns here so we can
# figure out which columns to put
# where.
numericCols = None
nonNumericCols = None
log.info('Writing %u columns in %u chunk(s) to %s ...',
len(dtable.allColumns), nchunks, outfile)
......@@ -123,18 +136,65 @@ def exportTSV(dtable,
else:
towrite[name] = series
if chunki > 0:
mode = 'a'
header = False
index_label = None
if nonNumericFile is None:
numericCols = colnames
nonNumericCols = None
numericChunk = towrite
nonNumericChunk = None
else:
mode = 'w'
header = colnames
index_label = idcol
towrite.to_csv(outfile,
sep=sep,
na_rep=missingValues,
header=header,
index_label=index_label,
mode=mode)
if numericCols is None:
numericCols = [c for c in towrite.columns
if pdtypes.is_numeric_dtype(towrite[c])]
nonNumericCols = [c for c in towrite.columns
if not pdtypes.is_numeric_dtype(towrite[c])]
log.debug('Redirecting %i non-numeric columns to %s '
'(remaining %i columns will be written to %s)',
len(nonNumericCols), nonNumericFile,
len(numericCols), outfile)
numericChunk = towrite[numericCols]
nonNumericChunk = towrite[nonNumericCols]
_writeChunk(numericChunk,
chunki,
outfile,
sep,
missingValues,
numericCols,
idcol)
if nonNumericFile is not None:
_writeChunk(nonNumericChunk,
chunki,
nonNumericFile,
sep,
missingValues,
nonNumericCols,
idcol)
def _writeChunk(chunk, i, outfile, sep, missingValues, colnames, idcol):
"""Write a chunk of data to a file.
:arg chunk: ``pandas.DataFrame`` containing the data to write
:arg i: Chunk index - if ``0``, column headers are written
:arg outfile: File to write to
:arg sep: Separater character
:arg missingValues: String to use for missing values
:arg colnames: Names for all columns
:arg idcol: Name for the index column
"""
if i > 0:
mode = 'a'
colnames = False
idcol = None
else:
mode = 'w'
chunk.to_csv(outfile,
sep=sep,
na_rep=missingValues,
header=colnames,
index_label=idcol,
mode=mode)
......@@ -52,7 +52,7 @@ def codeToNumeric(code):
try:
if code[0].isalpha() and code[1:].isdecimal():
prefix = ord(code[0].lower()) - ord('a') + 11
return prefix + int(code[1:])
return int(str(prefix) + str(int(code[1:])))
else:
return np.nan
except Exception:
......
......@@ -320,19 +320,27 @@ def loadTables(datafiles,
- The variable table
- The processing table
- The category table
- List of integer variable IDs which are present in the
data, but were not present in the variable table.
- List of :class:`.Column` objects representing columns
which were in the data file(s), but not in the variable
table
- List of :class:`.Column` objects representing columns
which are uncategorised, and do not have any cleaning or
processing explicitly applied for them.
"""
vartable, uvs = loadVariableTable(datafiles,
varfiles,
dcfiles,
typefile,
**kw)
proctable = loadProcessingTable(procfile, **kw)
cattable = loadCategoryTable(catfile, uvs)
vartable, unk, unc = loadVariableTable(datafiles,
varfiles,
dcfiles,
typefile,
**kw)
proctable = loadProcessingTable(procfile, **kw)
cattable = loadCategoryTable(catfile, unk)
unc = identifyUnprocessedVariables(vartable,
proctable,
cattable,
unc)
return vartable, proctable, cattable, uvs
return vartable, proctable, cattable, unk, unc
def loadDefaultTables(datafiles, **kw):
......@@ -425,6 +433,10 @@ def loadVariableTable(datafiles,
- A sequence of :class:`.Column` objects representing variables
which were present in the data files, but not in the variable
table, but were added to the variable table.
- A sequence of :class:`.Column` objects representing variables
which were present in the data files and in the variable
table, but which did not have any cleaning rules specified.
"""
if sniffers is None: sniffers = {}
......@@ -518,10 +530,34 @@ def loadVariableTable(datafiles,
['ParentValues', 'ChildValues'],
childValues)
# Before merging the cleaning functions
# in, we generate a list of variables
# which are "uncleaned", i.e. have not
# had any cleaning specified, as this
# may indicate that a variable has been
# overlooked.
#
# If a variable has indirectly had NA
# value insertion or recoding applied
# via its data coding, it is not included
# in this list.
if clean is not None: ucmask = ~vartable.index.isin(clean.keys())
else: ucmask = vartable.index.notna()
ucmask = (ucmask &
vartable['NAValues'] .isna() &
vartable['RawLevels'] .isna() &
vartable['ParentValues'].isna() &
vartable['Clean'] .isna())
ucmask = ucmask[ucmask]
uncleanVars = [c for c in cols
if (c.vid in ucmask.index and
c not in unknownVars)]
# Merge clean options into variable table
mergeCleanFunctions(vartable, tytable, clean, typeClean, globalClean)
return vartable, unknownVars
return vartable, unknownVars, uncleanVars
def loadTableBases():
......@@ -1047,3 +1083,47 @@ def columnTypes(vartable, columns):
dtypes[name] = dtype
return vttypes, dtypes
def identifyUnprocessedVariables(vartable,
                                 proctable,
                                 cattable,
                                 unclean):
    """Called by :func:`loadTables`. Identifies all variables which are in the
    data file(s), but which:

     - are uncategorised (not present in any categories in the category
       table), or

     - did not have any cleaning/processing specifically applied to them or
       to their data coding.

    Such variables might have been overlooked, so the user may need to be
    warned about them.

    :arg vartable:  Variable table (not consulted by this function)
    :arg proctable: Processing table
    :arg cattable:  Category table
    :arg unclean:   List of :class:`.Column` objects representing columns for
                    which there were no cleaning rules specified. Generated
                    by :func:`loadVariableTable`.

    :returns:       A list of variables which are not in any category, and
                    do not have any cleaning or processing rules applied.
    """

    def categorised(col):
        # True if the column's VID appears in the
        # variable list of any category.
        return cattable['Variables'].apply(
            lambda catvars: col.vid in catvars).any()

    def processed(col):
        # True if the column's VID appears in any row
        # of the processing table. Entries which are
        # not lists (e.g. "all"-style selectors) are
        # skipped.
        def hit(procvars):
            return isinstance(procvars, list) and col.vid in procvars
        return proctable['Variable'].apply(hit).any()

    return [c for c in unclean if not (categorised(c) or processed(c))]
......@@ -106,13 +106,13 @@ def main(argv=None):
with util.timed(
None, log, fmt='Total time: %i minutes, %i seconds (%+iMB)'):
dtable, unknowns, drop = doImport(args, pool, mgr)
dtable, unknowns, unprocessed, drop = doImport(args, pool, mgr)
if args.dry_run:
dryrun.doDryRun(dtable, unknowns, drop, args)
dryrun.doDryRun(dtable, unknowns, unprocessed, drop, args)
else:
doCleanAndProcess( dtable, args)
finaliseColumns( dtable, args, unknowns)
finaliseColumns( dtable, args, unknowns, unprocessed)
doExport( dtable, args)
doICD10Export( args)
......@@ -143,34 +143,43 @@ def doImport(args, pool, mgr):
- A :class:`.DataTable` containing the data
- A sequence of :class:`.Column` objects representing the
unknown columns.
- A sequence of :class:`.Column` objects representing columns
which are uncategorised, and have no processing or cleaning
rules specified on them.