Commit 698371f7 authored by Paul McCarthy

Merge branch 'rf/nn' into 'master'

Rf/nn

See merge request !51
parents 902a8b39 9a5cf04d
@@ -2,6 +2,43 @@ FUNPACK changelog
=================
2.0.0 (Tuesday 7th April 2020)
------------------------------
Changed
^^^^^^^
* The ``fmrib`` and ``fmrib_logs`` configuration profiles no longer define the
variables/categories to be loaded - by default all variables in the input file
will be loaded and processed.
* The ``--non_numeric_file`` option has been replaced with ``--suppress_non_numerics``
(which tells FUNPACK to only save numeric columns to the main output file),
and the ``--write_non_numerics`` and ``--non_numerics_file`` options, which
tell FUNPACK to save non-numeric columns to an auxiliary output file (see
the example below).
* The ``--tsv_var_format`` option has been renamed to ``--var_format``, and is
applied to all export formats.
* The default output file format is now inferred from the output file suffix -
one of ``tsv``, ``csv``, or ``h5``.
* The format of the ``--unknown_vars_file`` has changed - the ``processed``
column has been removed (as with the removal of the ``--import_all`` option,
it is now equivalent to the ``exported`` column), and uncategorised columns
now have a ``class`` of ``uncategorised`` instead of ``unprocessed``.
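For example (an illustrative sketch - the file names are hypothetical, and
the exact invocation should be checked against ``funpack --help``), the
following would save numeric columns to ``out.tsv``, with the format
inferred from the suffix, and non-numeric columns to a separate file::

    funpack --suppress_non_numerics --write_non_numerics out.tsv input.tsv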
Removed
^^^^^^^
* Removed several obscure, redundant, or deprecated options, including
``--import_all``, ``--remove_unknown``, ``--pass_through``,
``--output_id_column``, ``--column_pattern``, ``--column_name``,
``--low_memory``, and ``--work_dir``.
* Removed the unused :mod:`funpack.storage` module.
* Removed the unused :meth:`.DataTable.order` method.
1.9.1 (Sunday 29th March 2020)
------------------------------
......
@@ -6,7 +6,7 @@
#
__version__ = '2.0.0.dev0'
__version__ = '2.0.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
@@ -24,9 +24,7 @@ import funpack
import funpack.util as util
import funpack.custom as custom
import funpack.importing as importing
import funpack.exporting as exporting
import funpack.exporting_hdf5 as exporting_hdf5
import funpack.exporting_tsv as exporting_tsv
log = logging.getLogger(__name__)
@@ -35,11 +33,8 @@ log = logging.getLogger(__name__)
VERSION = funpack.__version__
DEFAULT_MERGE_AXIS = importing.MERGE_AXIS
DEFAULT_MERGE_STRATEGY = importing.MERGE_STRATEGY
DEFAULT_EXPORT_FORMAT = exporting.EXPORT_FORMAT
AVAILABLE_MERGE_AXES = importing.MERGE_AXIS_OPTIONS
AVAILABLE_MERGE_STRATEGIES = importing.MERGE_STRATEGY_OPTIONS
DEFAULT_COLUMN_PATTERN = exporting.COLUMN_PATTERN
DEFAULT_TSV_SEP = exporting_tsv.TSV_SEP
DEFAULT_HDF5_KEY = exporting_hdf5.HDF5_KEY
DEFAULT_HDF5_STYLE = exporting_hdf5.HDF5_STYLE
AVAILABLE_HDF5_STYLES = exporting_hdf5.HDF5_STYLES
@@ -112,37 +107,33 @@ CLI_ARGUMENTS = collections.OrderedDict((
'metavar' : ('VARS', 'PROCS')})]),
('Export options', [
(('f', 'format'), {'default' : DEFAULT_EXPORT_FORMAT}),
(('cp', 'column_pattern'), {'default' : DEFAULT_COLUMN_PATTERN}),
(('rc', 'rename_column'), {'action' : 'append',
'nargs' : 2,
'metavar' : ('OLD_NAME', 'NEW_NAME')}), # noqa
(('oi', 'output_id_column'), {}),
(('edf', 'date_format'), {'default' : 'default'}),
(('etf', 'time_format'), {'default' : 'default'})]),
('TSV export options', [
(('ts', 'tsv_sep'), {'default' : DEFAULT_TSV_SEP}),
(('tm', 'tsv_missing_values'), {'default' : ''}),
(('nn', 'non_numeric_file'), {}),
(('nr', 'num_rows'), {'type' : int}),
(('tvf', 'tsv_var_format'), {'nargs' : 2,
'metavar' : ('VID', 'FORMATTER'),
'action' : 'append'})]),
(('f', 'format'), {}),
(('edf', 'date_format'), {'default' : 'default'}),
(('etf', 'time_format'), {'default' : 'default'}),
(('evf', 'var_format'), {'nargs' : 2,
'metavar' : ('VID', 'FORMATTER'),
'action' : 'append'}),
(('esn', 'suppress_non_numerics'), {'action' : 'store_true'}),
(('nr', 'num_rows'), {'type' : int})]),
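# Each entry pairs a short and a long flag with argparse keyword
# arguments - e.g. ('esn', 'suppress_non_numerics') presumably becomes
# the -esn/--suppress_non_numerics command-line options.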
('TSV/CSV export options', [
(('ts', 'tsv_sep'), {}),
(('tm', 'tsv_missing_values'), {'default' : ''})]),
('HDF5 export options', [
(('hk', 'hdf5_key'), {'default' : DEFAULT_HDF5_KEY}),
(('hs', 'hdf5_style'), {'default' : DEFAULT_HDF5_STYLE,
'choices' : AVAILABLE_HDF5_STYLES})]),
('Auxiliary output file options', [
(('wl', 'write_log'), {'action' : 'store_true'}),
(('wnn', 'write_non_numerics'), {'action' : 'store_true'}),
(('wu', 'write_unknown_vars'), {'action' : 'store_true'}),
(('wim', 'write_icd10_map'), {'action' : 'store_true'}),
(('wde', 'write_description'), {'action' : 'store_true'}),
(('ws', 'write_summary'), {'action' : 'store_true'}),
(('lf', 'log_file'), {}),
(('nnf', 'non_numerics_file'), {}),
(('uf', 'unknown_vars_file'), {}),
(('imf', 'icd10_map_file'), {}),
(('def', 'description_file'), {}),
@@ -170,12 +161,26 @@ CLI_DESCRIPTIONS = {
'times - all provided files will be merged into a single table using the\n'
'variable/data coding IDs.',
'Export options' :
'Non-numeric columns are exported to the main output file by default, but\n'
'you can control this behaviour using one of the following options:\n'
'\n'
' - The --suppress_non_numerics option tells FUNPACK to only save\n'
' numeric columns to the main output file.\n'
' - The --write_non_numerics option (described in the "Auxiliary output\n'
' file options" section) tells FUNPACK to save non-numeric columns to\n'
' a separate output file.\n'
'\n'
'Note that the --suppress_non_numerics option is independent of the\n'
'--write_non_numerics option - if you want to save non-numeric columns\n'
'to a separate file, instead of to the main file, you must use both\n'
'options together.',
'Auxiliary output file options' :
'If the --write_log option is used, a default name, based on the main\n'
'output file name, will be given to the log file. Alternatively, the\n'
'--log_file option can be used with a specific name to use for the log\n'
'file. This logic also applies to the other auxiliary output files.\n'
''
'\n'
'The --unknown_vars_file allows a file to be saved containing\n'
'information about columns which were in the input data, but were either\n'
@@ -322,39 +327,26 @@ CLI_ARGUMENT_HELP = {
# Export options
'format' :
'Output file format (default: "{}").'.format(DEFAULT_EXPORT_FORMAT),
'column_pattern' :
'Pattern defining output column names (default: "{}").'.format(
DEFAULT_COLUMN_PATTERN),
'rename_column' :
'Rename the given column instead of applying --column_pattern. Can '
'be used multiple times',
'output_id_column' :
'Name of ID column in output file.',
'Output file format (default: inferred from the output file suffix - one '
'of "tsv", "csv", or "h5").',
'date_format' :
'Formatter to use for date variables (default: "default").',
'time_format' :
'Formatter to use for time variables (default: "default").',
'var_format' :
'Apply custom formatter to the specified variable.',
'num_rows' :
'Number of rows to write at a time. Ignored if --num_jobs is set to 1.',
'suppress_non_numerics' :
'Do not save non-numeric columns to the main output file.',
# TSV export options
'tsv_sep' :
'Column separator string to use in output file (default: "{}")'.format(
DEFAULT_TSV_SEP.replace('\t', '\\t')),
# TSV/CSV export options
'tsv_sep' :
'Column separator string to use in output file (default: "," for csv, '
'"\\t" for tsv).',
'tsv_missing_values' :
'String to use for missing values in output file (default: empty '
'string).' ,
'num_rows' :
'Number of rows to write at a time. Ignored if --num_jobs is set to 1.',
'non_numeric_file' :
'Export all non-numeric columns (after formatting) to this file instead '
'of the primary output file.',
'tsv_var_format' :
'Apply custom formatter to the specified variable.',
# HDF5 export options
'hdf5_key' :
@@ -365,30 +357,24 @@
# aux file options
'write_log' : 'Save log messages to file.',
'write_non_numerics' : 'Save non-numeric columns to file.',
'write_unknown_vars' :
'Save list of unknown/uncategorised variables/columns to file.',
'write_icd10_map' :
'Save converted ICD10 code mappings to file',
'write_description' :
'Save descriptions of each column to file',
'write_summary' :
'Save summary of cleaning applied to each column to file',
'log_file' : 'Save log messages to file.',
'non_numerics_file' : 'Save non-numeric columns to file.',
'unknown_vars_file' :
'Save list of unknown/uncategorised variables/columns to file.',
'icd10_map_file' :
'Save converted ICD10 code mappings to file',
'description_file' :
'Save descriptions of each column to file',
'summary_file' :
'Save summary of cleaning applied to each column to file',
@@ -585,8 +571,11 @@ def parseArgs(argv=None, namespace=None):
# assign default names to
# auxiliary output files
auxfiles = ['log', 'unknown_vars', 'icd10_map', 'description', 'summary']
outbase = op.splitext(args.outfile)[0]
outbase, outext = op.splitext(args.outfile)
auxfiles = ['log', 'non_numerics', 'unknown_vars',
'icd10_map', 'description', 'summary']
auxexts = collections.defaultdict(lambda : '.txt',
non_numerics=outext)
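# e.g. for an output file "data.tsv", default auxiliary file names are
# "data_log.txt", "data_summary.txt", etc., while the non-numerics file
# keeps the main output suffix: "data_non_numerics.tsv"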
for auxfile in auxfiles:
writeatt = 'write_{}'.format(auxfile)
fileatt = '{}_file' .format(auxfile)
@@ -594,9 +583,12 @@
filename = getattr(args, fileatt)
if write and filename is None:
setattr(args, fileatt, '{}_{}.txt'.format(outbase, auxfile))
setattr(args, fileatt, '{}_{}{}'.format(outbase,
auxfile,
auxexts[auxfile]))
else:
setattr(args, writeatt, filename is not None)
# -1 implies max-parallel
if args.num_jobs <= -1: args.num_jobs = mp.cpu_count()
elif args.num_jobs == 0: args.num_jobs = 1
@@ -626,23 +618,23 @@
args.index = indexes
# turn formatters into dict of { vid : name } mappings
if args.tsv_var_format is not None:
tsv_var_format = {}
if args.var_format is None:
args.var_format = []
for i, (v, n) in enumerate(args.tsv_var_format):
var_format = {}
for i, (v, n) in enumerate(args.var_format):
# Formatters should be set on integer
# variable IDs. But we allow non-integers
# to pass through, as the exportData
# function will also check against column
# names.
try: v = int(v)
except ValueError: pass
tsv_var_format[v] = n
var_format[v] = n
args.tsv_var_format = tsv_var_format
args.var_format = var_format
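# e.g. "--var_format 31 someFormatter" (hypothetical ID/formatter
# names) results in args.var_format == {31 : 'someFormatter'}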
# turn --subject/--variable/--exclude
# arguments into lists of IDs. If
@@ -741,12 +733,6 @@ def parseArgs(argv=None, namespace=None):
try: args.category[i] = int(c)
except ValueError: continue
# convert rename_column from a sequence of
# [(oldname, newname)] pairs into a dict of
# { oldname : newname } mappings.
if args.rename_column is not None:
args.rename_column = dict(args.rename_column)
def numlist(s):
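# Parses a comma-separated string into a float array,
# e.g. numlist('1,2,3') gives array([1., 2., 3.])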
return np.fromstring(s, sep=',', dtype=np.float)
......
@@ -6,7 +6,7 @@
# Use local settings
config_file local
#
@@ -37,37 +37,34 @@ config_file local
# Suppress with `-sp`
# - NA insertion
datacoding_file fmrib/datacodings_navalues.tsv
# - Categorical recoding
datacoding_file fmrib/datacodings_recoding.tsv
# - Cleaning
variable_file fmrib/variables_clean.tsv
# - Child value replacement
variable_file fmrib/variables_parentvalues.tsv
# - Processing
processing_file fmrib/processing.tsv
#
# FMRIB Categories
#
# FMRIB-curated categories, largely drawn from showcase categories, with default
# suggested subset to use
#
category_file fmrib/categories.tsv
# Select useful categories
config_file fmrib_cats
# FMRIB-curated categories, largely drawn from showcase categories
category_file fmrib/categories.tsv
#
# FMRIB processing of dates
#
# Converts a date or date+time into a single value x, where floor(x) is the
# calendar year and the fraction is the day/time within the year, *except*
# that 'a day' is redefined as the time between 7am and 8pm (scanning only
# takes place within these hours).
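# For example, under this scheme noon on 1st July 2020 would map to a value
# of roughly 2020.5 (an illustrative figure - the exact fraction depends on
# the FMRIBImagingDate/FMRIBImagingTime plugin implementations).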
#
plugin_file fmrib
date_format FMRIBImagingDate
time_format FMRIBImagingTime
# Drop non-numeric columns - the main output file only contains numeric data.
suppress_non_numerics
@@ -19,7 +19,6 @@ ID ParentValues ChildValues
1021 v6164 == 4 0
1120 v1110 == -1 || v1110 == 0 0
1130 v1110 == -1 || v1110 == 0 0
2654 v1428 == 3 0
2664 v1558 >= 1 && v1558 <= 5 && v1568 == 3 0
2867 v1249 == 1 0
2877 v1249 == 1 0
......
#
# FUNPACK "fmrib_cats.cfg" configuration file
#
# Provides recommended set of categories to process, from full list of
# FMRIB-curated categories (see funpack/configs/fmrib/categories.tsv)
#
category 1
category 2
category 3
category 10
category 11
category 12
category 13
category 14
category 20
category 21
category 22
category 23
category 24
category 25
category 26
category 30
category 32
category 50
category 51
category 60
category 70
category 98
category 99
@@ -7,8 +7,10 @@
# separate file containing summary information about these variables.
#
# base this config on fmrib_logs
# base this config on fmrib_logs,
# and include all fmrib categories
config_file fmrib_logs
config_file fmrib_cats
# Include unknown/uncategorised variables
category unknown
......
#
# FUNPACK "fmrib_standard.cfg" configuration file
#
# This is the configuration used for standard internal FMRIB runs.
config_file fmrib_logs
config_file fmrib_cats
@@ -7,8 +7,8 @@
# Overwrite files
overwrite
# Number of cores to use in parallelisation; -1 for 'all possible'
num_jobs -1
# It is always recommended to trust data types of each column; without this option
# processing is considerably slower. Disable when debugging a possibly problematic file.
......
@@ -30,7 +30,7 @@ To ensure that the ``funpack`` command line help is nicely formatted, all
plugin functions should have a docstring of the form::
\"\"\"functionSignature(args)
Short description of function.
Short description of function on a single line.
Extended description
\"\"\"
......
@@ -9,7 +9,7 @@ encoding_id title availability coded_as structure num_members descript
7 Yes No dichotomous choice 0 11 1 2 Options for basic Yes or No choice
8 Calendar Month 0 11 1 12 Calendar month within year
9 Sex 0 11 1 2 Biological sex
10 Assessment Centre 0 11 1 26 The assessment centre at which a participant consented
10 Assessment Centre 0 11 1 27 The assessment centre at which a participant consented
11 Consent statuses 0 11 1 4 Ongoing status/nature of participant consent
12 ACE boolean 0 11 1 2 True/False boolean value
13 event-time-exceptions 0 31 1 2 Coding for time unknown
@@ -72,10 +72,11 @@ encoding_id title availability coded_as structure num_members descript
91 Urban/rural area definitions 0 11 1 17 Population density descriptions for UK areas.
92 Card faces used by touchscreen 0 11 1 25 Descriptions of the card faces displayed on the touchscreen at the UK Biobank centres during cognitive testing.
93 Accelerometer data quality 0 11 1 3 Indicates the perceived quality/usability of the accelerometer datafile.
96 Scale of liking 0 11 1 11 This scale was used to record the degree to which a participant liked a particular food, product or experience. Only the values 1, 5 and 9 were assigned explicit meanings - intermediate values should be interpreted as lying between these.
100 Pass/Fail test result 0 11 1 2 Results of a pass/fail test
101 Result of a pass/fail/not-tested test 0 11 1 3 Results of a pass/fail test with individuals who were explicitly not-tested indicated.
102 inclusion status 0 11 1 2 Whether a participant was included in a particular subset
123 UK Biobank staff 0 11 1 1087 This is a pseudonymised index of the UK Biobank staff who have had authority to sign-off individual-level records which are included in the UK Biobank core repository. UK Biobank will not publish or reveal the actual identities of staff.
123 UK Biobank staff 0 11 1 1115 This is a pseudonymised index of the UK Biobank staff who have had authority to sign-off individual-level records which are included in the UK Biobank core repository. UK Biobank will not publish or reveal the actual identities of staff.
165 Assay correction level 0 11 1 3 Indicates the type of assay correction that was applied to a biochemistry result.
170 Map co-ordinates 0 11 1 1 Special map co-ordinates
196 Result flags for sodium in urine assay 0 41 1 2 Result flags returned by sodium in urine assay when outside of device analytical range.
@@ -206,7 +207,7 @@ encoding_id title availability coded_as structure num_members descript
527 TAF answer-set 527 0 11 1 6 Multiple choice answers from Thoughts and Feelings questionnaire
528 TAF answer-set 528 0 11 1 3 Multiple choice answers from Thoughts and Feelings questionnaire
529 TAF answer-set 529 0 11 1 6 Multiple choice answers from Thoughts and Feelings questionnaire
530 TAF answer-set 530 0 11 1 4 Multiple choice answers from Thoughts and Feelings questionnaire
530 TAF answer-set 530 0 11 1 3 Multiple choice answers from Thoughts and Feelings questionnaire
531 TAF answer-set 531 0 11 1 7 Multiple choice answers from Thoughts and Feelings questionnaire
532 TAF answer-set 532 0 11 1 6 Multiple choice answers from Thoughts and Feelings questionnaire
533 TAF answer-set 533 0 11 1 4 Multiple choice answers from Thoughts and Feelings questionnaire
@@ -284,6 +285,7 @@ encoding_id title availability coded_as structure num_members descript
1210 ACE-coding1210 0 41 1 1 Artificial coding, generated after data collection
1211 ACE-coding1211 0 41 1 2 Artificial coding, generated after data collection
1212 UKB Assessment Centre visit 0 11 1 2 Denotes the assessment centre visit at which a particular item of data was produced.
1313 Masked date for location 0 51 1 1 Used to obscure dates which are near to participant DOB.
1317 Scanner position issues 0 31 1 1 Identifies problems determining position of MRI scanners.
1401 Mental health problems 0 11 1 18 List of professionally diagnosed mental health problems
1405 Depression substances 0 11 1 4 Substances taken to potentially alleviate depressive symptoms.
@@ -323,6 +325,9 @@ encoding_id title availability coded_as structure num_members descript
5012 Answers to FI12 0 11 1 5 List of possible answers to online fluid intelligence question FI12.
5013 Answers to FI13 0 11 1 5 List of possible answers to online fluid intelligence question FI13.
5014 Answers to FI14 0 11 1 5 List of possible answers to online fluid intelligence question FI14.
5159 Mood scale 0 11 1 5 Scale of answers to mood questions.
5160 Input device/method 0 11 1 5 Method of data input used by participant
5178 Pointing method 0 11 1 5 Method used by participants to point-to/select/click a control on a screen.
6312 Tower responses 0 11 1 6 Responses were integers in the range 1-6.
6314 Word/picture groups 0 11 1 340 Relates indices of word/picture groups to the actual words displayed
6315 Position of picture on screen 0 11 1 4 Index relating each picture to their position on screen
......
@@ -359,49 +359,6 @@ class DataTable:
return [c for c in self.allColumns if c.vid != 0]
def order(self, vids):
"""Orders the data columns according to the given list of variable
IDs.
This method only affects the order in which columns and variables are
returned from the :meth:`variables`, :meth:`allColumns`, and
:meth:`dataColumns` methods - the order of the underlying dataframe
is not changed.
Variables which are in the data, but not in the ``vids`` list, are
removed from the data set.
:arg vids: Sequence of variable IDs in the desired order.
"""
if not all([self.present(v) for v in vids]):
raise ValueError('One of these variables is not '
'in the data: {}'.format(vids))
oldvarmap = self.__varmap
newvarmap = collections.OrderedDict()
# index column stays
newvarmap[0] = oldvarmap[0]
# generate new variable map
# (a dict of {vid : [Column]}
# mappings)
for vid in vids:
newvarmap[vid] = self.columns(vid)
# drop any columns associated
# with variables that were not
# listed in the new order.
for vid, cols in list(oldvarmap.items()):
if vid == 0:
continue
if vid not in vids:
self.removeColumns(cols)
self.__varmap = newvarmap
def present(self, variable, visit=None, instance=None):
"""Returns ``True`` if the specified variable (and optionally visit/
instance) is present in the data, ``False`` otherwise.
......
@@ -7,12 +7,10 @@
"""This module provides functions for exporting data to a file. """
import itertools as it
import logging
import collections
import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
from . import util
@@ -22,113 +20,84 @@ from . import custom
log = logging.getLogger(__name__)
COLUMN_PATTERN = '{name}'
"""Default output column naming pattern. A python-style formatting string
which may refer to:
- ``'{variable}'``
- ``'{name}'``
- ``'{description}'``
- ``'{visit}'``
- ``'{instance}'``
"""
EXPORT_FORMAT = 'tsv'
"""Default export format."""
def genColumnNames(dtable, colpat=None, colmap=None):
"""Generate column names to use in the output file.
def exportData(dtable, outfile, fileFormat, **kwargs):
"""Export the data contained in ``dtable`` to ``outfile`` using the
specified ``fileFormat``.
:arg dtable: :class:`.DataTable` containing the data to export.
:arg colpat: Output column name pattern. If not provided, defaults to
:attr:`COLUMN_PATTERN`.
:arg outfile: File to export data to.
:arg colmap: Dictionary containing ``{variable : name}`` mappings.