Commit 1c9f55c0 authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'bf/column_pattern' into 'master'

Bf/column pattern

See merge request fsl/ukbparse!113
parents 8f83fbcc f48aa823
Pipeline #3486 passed with stages
in 10 minutes and 38 seconds
...@@ -2,6 +2,17 @@ ...@@ -2,6 +2,17 @@
====================== ======================
0.15.1 (Thursday 21st March 2019)
---------------------------------
Fixed
^^^^^
* Fixed a bug which arose when using the ``--rename_column`` option.
0.15.0 (Monday 18th March 2019) 0.15.0 (Monday 18th March 2019)
------------------------------- -------------------------------
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
# #
__version__ = '0.15.0' __version__ = '0.15.1'
"""The ``ukbparse`` versioning scheme roughly follows Semantic Versioning """The ``ukbparse`` versioning scheme roughly follows Semantic Versioning
conventions. conventions.
""" """
......
...@@ -22,7 +22,7 @@ from . import custom ...@@ -22,7 +22,7 @@ from . import custom
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
COLUMN_PATTERN = None COLUMN_PATTERN = '{name}'
"""Default output column naming pattern. A python-style formatting string """Default output column naming pattern. A python-style formatting string
which may refer to: which may refer to:
...@@ -31,9 +31,6 @@ which may refer to: ...@@ -31,9 +31,6 @@ which may refer to:
- ``'{description}'`` - ``'{description}'``
- ``'{visit}'`` - ``'{visit}'``
- ``'{instance}'`` - ``'{instance}'``
Setting this variable to ``None`` has the same effect as setting it to
``'{name}'``.
""" """
...@@ -41,7 +38,7 @@ EXPORT_FORMAT = 'tsv' ...@@ -41,7 +38,7 @@ EXPORT_FORMAT = 'tsv'
"""Default export format.""" """Default export format."""
def genColumnNames(dtable, colpat, colmap=None): def genColumnNames(dtable, colpat=None, colmap=None):
"""Generate column names to use in the output file. """Generate column names to use in the output file.
:arg dtable: :class:`.DataTable` containing the data to export. :arg dtable: :class:`.DataTable` containing the data to export.
...@@ -54,8 +51,8 @@ def genColumnNames(dtable, colpat, colmap=None): ...@@ -54,8 +51,8 @@ def genColumnNames(dtable, colpat, colmap=None):
:returns: A dictionary containing ``{incolumn : outcolumn}`` mappings. :returns: A dictionary containing ``{incolumn : outcolumn}`` mappings.
""" """
if colmap is None: if colpat is None: colpat = COLUMN_PATTERN
colmap = {} if colmap is None: colmap = {}
variables = dtable.variables variables = dtable.variables
newcols = {} newcols = {}
...@@ -74,18 +71,18 @@ def genColumnNames(dtable, colpat, colmap=None): ...@@ -74,18 +71,18 @@ def genColumnNames(dtable, colpat, colmap=None):
desc = '' desc = ''
for visit, instance in it.product(visits, instances): for visit, instance in it.product(visits, instances):
for oldcol in dtable.columns(var, visit, instance):
oldcol = dtable.columns(var, visit, instance)[0] newcol = colmap.get(oldcol.name, None)
newcol = colmap.get(oldcol.name, None)
if newcol is None: if newcol is None:
newcol = colpat.format(variable=var, newcol = colpat.format(variable=var,
visit=visit, visit=visit,
name=oldcol.name, name=oldcol.name,
description=desc, description=desc,
instance=instance) instance=instance)
newcols[oldcol.name] = newcol newcols[oldcol.name] = newcol
return newcols return newcols
...@@ -124,7 +121,6 @@ def exportData(dtable, ...@@ -124,7 +121,6 @@ def exportData(dtable,
:attr:`EXPORT_FORMAT` :attr:`EXPORT_FORMAT`
""" """
if colpat is None: colpat = COLUMN_PATTERN
if fileFormat is None: fileFormat = EXPORT_FORMAT if fileFormat is None: fileFormat = EXPORT_FORMAT
if idcol is None: idcol = dtable.index.name if idcol is None: idcol = dtable.index.name
...@@ -133,11 +129,8 @@ def exportData(dtable, ...@@ -133,11 +129,8 @@ def exportData(dtable,
else: else:
subjects = [s for s in subjects if s in dtable.index] subjects = [s for s in subjects if s in dtable.index]
if colpat is not None or colmap is not None: colnames = genColumnNames(dtable, colpat, colmap)
colnames = genColumnNames(dtable, colpat, colmap) colnames = [colnames[c.name] for c in dtable.allColumns[1:]]
colnames = [colnames[c.name] for c in dtable.allColumns[1:]]
else:
colnames = [c.name for c in dtable.allColumns[1:]]
custom.runExporter( custom.runExporter(
fileFormat, dtable, outfile, subjects, idcol, colnames, **kwargs) fileFormat, dtable, outfile, subjects, idcol, colnames, **kwargs)
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
import itertools as it import itertools as it
import functools as ft import functools as ft
import os.path as op
import unittest.mock as mock import unittest.mock as mock
import os import os
import shutil import shutil
...@@ -168,16 +169,21 @@ table_templates = { ...@@ -168,16 +169,21 @@ table_templates = {
'processing' : '', 'processing' : '',
} }
def gen_tables(variables, vtypes=None): def gen_tables(variables, vtypes=None, datafiles=None):
if vtypes is None: if vtypes is None:
vtypes = {} vtypes = {}
if datafiles is not None:
datafiles = [op.abspath(f) for f in datafiles]
with tempdir(): with tempdir():
with open('datafile.txt', 'wt') as f: if datafiles is None:
colnames = ['eid'] + ['{}-0.0'.format(v) for v in variables] with open('datafile.txt', 'wt') as f:
f.write('\t'.join(colnames)) colnames = ['eid'] + ['{}-0.0'.format(v) for v in variables]
f.write('\t'.join(colnames))
datafiles = ['datafile.txt']
for table in ['variables', for table in ['variables',
'datacodings', 'datacodings',
...@@ -200,7 +206,7 @@ def gen_tables(variables, vtypes=None): ...@@ -200,7 +206,7 @@ def gen_tables(variables, vtypes=None):
f.write(tmpl.format(variable=v, type=vtype) + '\n') f.write(tmpl.format(variable=v, type=vtype) + '\n')
return loadtables.loadTables( return loadtables.loadTables(
['datafile.txt'], datafiles,
'variables.tsv', 'variables.tsv',
'datacodings.tsv', 'datacodings.tsv',
'types.tsv', 'types.tsv',
......
...@@ -35,6 +35,10 @@ def test_genColumnNames(): ...@@ -35,6 +35,10 @@ def test_genColumnNames():
dtable.vartable.loc[6, 'Description'] = 'abcde' dtable.vartable.loc[6, 'Description'] = 'abcde'
exp = {c.name : c.name for c in dtable.allColumns[1:]}
names = exporting.genColumnNames(dtable)
assert exp == names
colpat = '{variable}|{name}|{description}|{visit}|{instance}' colpat = '{variable}|{name}|{description}|{visit}|{instance}'
colmap = { '1-0.0' : 'variable_one', '2-0.0' : 'variable_two'} colmap = { '1-0.0' : 'variable_one', '2-0.0' : 'variable_two'}
names = exporting.genColumnNames(dtable, colpat, colmap) names = exporting.genColumnNames(dtable, colpat, colmap)
...@@ -54,6 +58,42 @@ def test_genColumnNames(): ...@@ -54,6 +58,42 @@ def test_genColumnNames():
assert exp == names assert exp == names
exp = ['var1', 'var2'] + ['{}-0.0'.format(v) for v in range(3, 11)]
exp = {c.name : n for c, n in zip(dtable.allColumns[1:], exp)}
names = exporting.genColumnNames(dtable, None, {'1-0.0' : 'var1',
'2-0.0' : 'var2'})
assert exp == names
exp = ['{}##0'.format(v) for v in range(1, 11)]
exp = {c.name : n for c, n in zip(dtable.allColumns[1:], exp)}
names = exporting.genColumnNames(dtable, '{variable}##{visit}')
assert exp == names
df = pd.DataFrame({'col1' : [1, 2, 3],
'col2' : [4, 5, 6],
'id' : [1, 2, 3]}).set_index('id')
dtable = gen_DataTableFromDataFrame(df)
exp = {c.name : c.name for c in dtable.allColumns[1:]}
names = exporting.genColumnNames(dtable)
assert exp == names
exp = ['col1', 'cc2']
exp = {c.name : e for c, e in zip(dtable.allColumns[1:], exp)}
names = exporting.genColumnNames(dtable, None, {'col2' : 'cc2'})
assert exp == names
exp = ['00col1', 'cc2']
exp = {c.name : e for c, e in zip(dtable.allColumns[1:], exp)}
names = exporting.genColumnNames(dtable, '{visit}{instance}{name}',
{'col2' : 'cc2'})
assert exp == names
exp = ['00col1', '00col2']
exp = {c.name : e for c, e in zip(dtable.allColumns[1:], exp)}
names = exporting.genColumnNames(dtable, '{visit}{instance}{name}')
assert exp == names
def test_exportData(): def test_exportData():
...@@ -447,7 +487,7 @@ def test_exporting_id_column(): ...@@ -447,7 +487,7 @@ def test_exporting_id_column():
with open('data.txt', 'wt') as f: with open('data.txt', 'wt') as f:
f.write(data) f.write(data)
vartable, proctable, cattable, _ = gen_tables([1]) vartable, proctable, cattable, _ = gen_tables([1], datafiles=['data.txt'])
dt, _ = importing.importData('data.txt', vartable, proctable, cattable) dt, _ = importing.importData('data.txt', vartable, proctable, cattable)
exporting.exportData(dt, exporting.exportData(dt,
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ukbparse 0.15.0\n" "ukbparse 0.15.1\n"
] ]
} }
], ],
...@@ -2172,7 +2172,7 @@ ...@@ -2172,7 +2172,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ukbparse 0.15.0 dry run\n", "ukbparse 0.15.1 dry run\n",
"\n", "\n",
"Input data\n", "Input data\n",
" Loaded columns: 11\n", " Loaded columns: 11\n",
...@@ -2229,7 +2229,7 @@ ...@@ -2229,7 +2229,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ukbparse 0.15.0 dry run\n", "ukbparse 0.15.1 dry run\n",
"\n", "\n",
"Input data\n", "Input data\n",
" Loaded columns: 14190\n", " Loaded columns: 14190\n",
...@@ -3167,7 +3167,7 @@ ...@@ -3167,7 +3167,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ukbparse 0.15.0 dry run\n", "ukbparse 0.15.1 dry run\n",
"\n", "\n",
"Input data\n", "Input data\n",
" Loaded columns: 11\n", " Loaded columns: 11\n",
......
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
![image.png](attachment:image.png) ![image.png](attachment:image.png)
# `ukbparse` # `ukbparse`
> Paul McCarthy <paul.mccarthy@ndcn.ox.ac.uk> ([WIN@FMRIB](https://www.win.ox.ac.uk/)) > Paul McCarthy <paul.mccarthy@ndcn.ox.ac.uk> ([WIN@FMRIB](https://www.win.ox.ac.uk/))
`ukbparse` is a command-line program which you can use to extract data from UK BioBank (and other tabular) data. `ukbparse` is a command-line program which you can use to extract data from UK BioBank (and other tabular) data.
You can give `ukbparse` one or more input files (e.g. `.csv`, `.tsv`), and it will merge them together, perform some preprocessing, and produce a single output file. You can give `ukbparse` one or more input files (e.g. `.csv`, `.tsv`), and it will merge them together, perform some preprocessing, and produce a single output file.
A large number of rules are built into `ukbparse` which are specific to the UK BioBank data set. But you can control and customise everything that `ukbparse` does to your data, including which rows and columns to extract, and which cleaning/processing steps to perform on each column. A large number of rules are built into `ukbparse` which are specific to the UK BioBank data set. But you can control and customise everything that `ukbparse` does to your data, including which rows and columns to extract, and which cleaning/processing steps to perform on each column.
The `ukbparse` source code is available at https://git.fmrib.ox.ac.uk/fsl/ukbparse. You can install `ukbparse` into a Python environment using `pip`: The `ukbparse` source code is available at https://git.fmrib.ox.ac.uk/fsl/ukbparse. You can install `ukbparse` into a Python environment using `pip`:
pip install ukbparse pip install ukbparse
Get command-line help by typing: Get command-line help by typing:
ukbparse -h ukbparse -h
*The examples in this notebook assume that you have installed `ukbparse` 0.14.0 or newer.* *The examples in this notebook assume that you have installed `ukbparse` 0.14.0 or newer.*
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -V ukbparse -V
``` ```
%%%% Output: stream %%%% Output: stream
ukbparse 0.15.0 ukbparse 0.15.1
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Contents ### Contents
1. [Overview](#Overview) 1. [Overview](#Overview)
1. [Import](#1.-Import) 1. [Import](#1.-Import)
2. [Cleaning](#2.-Cleaning) 2. [Cleaning](#2.-Cleaning)
3. [Processing](#3.-Processing) 3. [Processing](#3.-Processing)
4. [Export](#4.-Export) 4. [Export](#4.-Export)
2. [Examples](#Examples) 2. [Examples](#Examples)
3. [Import examples](#Import-examples) 3. [Import examples](#Import-examples)
1. [Selecting variables (columns)](#Selecting-variables-(columns)) 1. [Selecting variables (columns)](#Selecting-variables-(columns))
1. [Selecting individual variables](#Selecting-individual-variables) 1. [Selecting individual variables](#Selecting-individual-variables)
2. [Selecting variable ranges](#Selecting-variable-ranges) 2. [Selecting variable ranges](#Selecting-variable-ranges)
3. [Selecting variables with a file](#Selecting-variables-with-a-file) 3. [Selecting variables with a file](#Selecting-variables-with-a-file)
4. [Selecting variables from pre-defined categories](#Selecting-variables-from-pre-defined-categories) 4. [Selecting variables from pre-defined categories](#Selecting-variables-from-pre-defined-categories)
2. [Selecting subjects (rows)](#Selecting-subjects-(rows)) 2. [Selecting subjects (rows)](#Selecting-subjects-(rows))
1. [Selecting individual subjects](#Selecting-individual-subjects) 1. [Selecting individual subjects](#Selecting-individual-subjects)
2. [Selecting subject ranges](#Selecting-subject-ranges) 2. [Selecting subject ranges](#Selecting-subject-ranges)
3. [Selecting subjects from a file](#Selecting-subjects-from-a-file) 3. [Selecting subjects from a file](#Selecting-subjects-from-a-file)
4. [Selecting subjects by variable value](#Selecting-subjects-by-variable-value) 4. [Selecting subjects by variable value](#Selecting-subjects-by-variable-value)
5. [Excluding subjects](#Excluding-subjects) 5. [Excluding subjects](#Excluding-subjects)
3. [Selecting visits](#Selecting-visits) 3. [Selecting visits](#Selecting-visits)
4. [Merging multiple input files](#Merging-multiple-input-files) 4. [Merging multiple input files](#Merging-multiple-input-files)
1. [Merging by subject](#Merging-by-subject) 1. [Merging by subject](#Merging-by-subject)
2. [Merging by column](#Merging-by-column) 2. [Merging by column](#Merging-by-column)
3. [Naive merging](#Naive-merging) 3. [Naive merging](#Naive-merging)
4. [Cleaning examples](#Cleaning-examples) 4. [Cleaning examples](#Cleaning-examples)
1. [NA insertion](#NA-insertion) 1. [NA insertion](#NA-insertion)
2. [Variable-specific cleaning functions](#Variable-specific-cleaning-functions) 2. [Variable-specific cleaning functions](#Variable-specific-cleaning-functions)
3. [Categorical recoding](#Categorical-recoding) 3. [Categorical recoding](#Categorical-recoding)
4. [Child value replacement](#Child-value-replacement) 4. [Child value replacement](#Child-value-replacement)
5. [Processing examples](#Processing-examples) 5. [Processing examples](#Processing-examples)
1. [Sparsity check](#Sparsity-check) 1. [Sparsity check](#Sparsity-check)
2. [Redundancy check](#Redundancy-check) 2. [Redundancy check](#Redundancy-check)
3. [Categorical binarisation](#Categorical-binarisation) 3. [Categorical binarisation](#Categorical-binarisation)
6. [Custom cleaning, processing and loading - ukbparse plugins](#Custom-cleaning,-processing-and-loading---ukbparse-plugins) 6. [Custom cleaning, processing and loading - ukbparse plugins](#Custom-cleaning,-processing-and-loading---ukbparse-plugins)
1. [Custom cleaning functions](#Custom-cleaning-functions) 1. [Custom cleaning functions](#Custom-cleaning-functions)
2. [Custom processing functions](#Custom-processing-functions) 2. [Custom processing functions](#Custom-processing-functions)
3. [Custom file loaders](#Custom-file-loaders) 3. [Custom file loaders](#Custom-file-loaders)
7. [Miscellaneous topics](#Miscellaneous-topics) 7. [Miscellaneous topics](#Miscellaneous-topics)
1. [Dry run](#Dry-run) 1. [Dry run](#Dry-run)
2. [Built-in rules](#Built-in-rules) 2. [Built-in rules](#Built-in-rules)
3. [Using a configuration file](#Using-a-configuration-file) 3. [Using a configuration file](#Using-a-configuration-file)
4. [Reporting unknown variables](#Reporting-unknown-variables) 4. [Reporting unknown variables](#Reporting-unknown-variables)
5. [Low-memory mode](#Low-memory-mode) 5. [Low-memory mode](#Low-memory-mode)
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Overview # Overview
`ukbparse` performs the following steps: `ukbparse` performs the following steps:
## 1. Import ## 1. Import
All data files are loaded in, unwanted columns and subjects are dropped, and the data files are merged into a single table (a.k.a. data frame). Multiple files can be merged according to an index column (e.g. subject ID). Or, if the input files contain the same columns/subjects, they can be naively concatenated along rows or columns. All data files are loaded in, unwanted columns and subjects are dropped, and the data files are merged into a single table (a.k.a. data frame). Multiple files can be merged according to an index column (e.g. subject ID). Or, if the input files contain the same columns/subjects, they can be naively concatenated along rows or columns.
## 2. Cleaning ## 2. Cleaning
The following cleaning steps are applied to each column: The following cleaning steps are applied to each column:
1. **NA value replacement:** Specific values for some columns are replaced with NA, for example, variables where a value of `-1` indicates *Do not know*. 1. **NA value replacement:** Specific values for some columns are replaced with NA, for example, variables where a value of `-1` indicates *Do not know*.
2. **Variable-specific cleaning functions:** Certain columns are re-formatted - for example, the [ICD10](https://en.wikipedia.org/wiki/ICD-10) disease codes are converted to integer representations. 2. **Variable-specific cleaning functions:** Certain columns are re-formatted - for example, the [ICD10](https://en.wikipedia.org/wiki/ICD-10) disease codes are converted to integer representations.
3. **Categorical recoding:** Certain categorical columns are re-coded. 3. **Categorical recoding:** Certain categorical columns are re-coded.
4. **Child value replacement:** NA values within some columns which are dependent upon other columns may have values inserted based on the values of their parent columns. 4. **Child value replacement:** NA values within some columns which are dependent upon other columns may have values inserted based on the values of their parent columns.
## 3. Processing ## 3. Processing
During the processing stage, columns may be removed, merged, or expanded into additional columns. For example, a categorical column may be expanded into a set of binary columns, one for each category. During the processing stage, columns may be removed, merged, or expanded into additional columns. For example, a categorical column may be expanded into a set of binary columns, one for each category.
A column may also be removed on the basis of being too sparse, or being redundant with respect to another column. A column may also be removed on the basis of being too sparse, or being redundant with respect to another column.
## 4. Export ## 4. Export
The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file. The processed data can be saved as a `.csv`, `.tsv`, or `.hdf5` file.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Examples # Examples
Throughout these examples, we are going to use a few command line options, which you will probably **not** normally want to use: Throughout these examples, we are going to use a few command line options, which you will probably **not** normally want to use:
- `-nb` (short for `--no_builtins`): This tells `ukbparse` not to use the built-in processing rules, which are specifically tailored for UK BioBank data. - `-nb` (short for `--no_builtins`): This tells `ukbparse` not to use the built-in processing rules, which are specifically tailored for UK BioBank data.
- `-ow` (short for `--overwrite`): This tells `ukbparse` not to complain if the output file already exists. - `-ow` (short for `--overwrite`): This tells `ukbparse` not to complain if the output file already exists.
- `-q` (short for `--quiet`): This tells `ukbparse` to be quiet. - `-q` (short for `--quiet`): This tells `ukbparse` to be quiet.
Without the `-q` option, `ukbparse` can be quite verbose, which can be annoying, but is very useful when things go wrong. A good strategy is to tell `ukbparse` to send all of its output to a log file with the `--log_file` (or `-lf`) option. For example: Without the `-q` option, `ukbparse` can be quite verbose, which can be annoying, but is very useful when things go wrong. A good strategy is to tell `ukbparse` to send all of its output to a log file with the `--log_file` (or `-lf`) option. For example:
ukbparse --log_file log.txt out.tsv in.tsv ukbparse --log_file log.txt out.tsv in.tsv
Here's the first example input data set, with UK BioBank-style column names: Here's the first example input data set, with UK BioBank-style column names:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
cat data_01.tsv cat data_01.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0
1 31 65 10 11 84 22 56 65 90 12 1 31 65 10 11 84 22 56 65 90 12
2 56 52 52 42 89 35 3 65 50 67 2 56 52 52 42 89 35 3 65 50 67
3 45 84 20 84 93 36 96 62 48 59 3 45 84 20 84 93 36 96 62 48 59
4 7 46 37 48 80 20 18 72 37 27 4 7 46 37 48 80 20 18 72 37 27
5 8 86 51 68 80 84 11 28 69 10 5 8 86 51 68 80 84 11 28 69 10
6 6 29 85 59 7 46 14 60 73 80 6 6 29 85 59 7 46 14 60 73 80
7 24 49 41 46 92 23 39 68 7 63 7 24 49 41 46 92 23 39 68 7 63
8 80 92 97 30 92 83 98 36 6 23 8 80 92 97 30 92 83 98 36 6 23
9 84 59 89 79 16 12 95 73 2 62 9 84 59 89 79 16 12 95 73 2 62
10 23 96 67 41 8 20 97 57 59 23 10 23 96 67 41 8 20 97 57 59 23
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
The numbers in each column name represent: The numbers in each column name represent:
1. The variable ID 1. The variable ID
2. The visit, for variables which were collected at multiple points in time. 2. The visit, for variables which were collected at multiple points in time.
3. The "instance", for multi-valued variables. 3. The "instance", for multi-valued variables.
Note that one **variable** is typically associated with several **columns**, although we're keeping things simple for this first example - there is only one visit for each variable, and there are no multi-valued variables. Note that one **variable** is typically associated with several **columns**, although we're keeping things simple for this first example - there is only one visit for each variable, and there are no multi-valued variables.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Import examples # Import examples
## Selecting variables (columns) ## Selecting variables (columns)
You can specify which variables you want to load in the following ways, using the `--variable` (`-v` for short) and `--category` (`-c` for short) command line options: You can specify which variables you want to load in the following ways, using the `--variable` (`-v` for short) and `--category` (`-c` for short) command line options:
* By variable ID * By variable ID
* By variable ranges * By variable ranges
* By a text file which contains the IDs you want to keep. * By a text file which contains the IDs you want to keep.
* By pre-defined variable categories * By pre-defined variable categories
* By column name * By column name
### Selecting individual variables ### Selecting individual variables
Simply provide the IDs of the variables you want to extract: Simply provide the IDs of the variables you want to extract:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -v 1 -v 5 out.tsv data_01.tsv ukbparse -nb -q -ow -v 1 -v 5 out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 5-0.0 eid 1-0.0 5-0.0
1 31 84 1 31 84
2 56 89 2 56 89
3 45 93 3 45 93
4 7 80 4 7 80
5 8 80 5 8 80
6 6 7 6 6 7
7 24 92 7 24 92
8 80 92 8 80 92
9 84 16 9 84 16
10 23 8 10 23 8
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting variable ranges ### Selecting variable ranges
The `--variable`/`-v` option accepts MATLAB-style ranges of the form `start:step:stop` (where the `stop` is inclusive): The `--variable`/`-v` option accepts MATLAB-style ranges of the form `start:step:stop` (where the `stop` is inclusive):
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -v 1:3:10 out.tsv data_01.tsv ukbparse -nb -q -ow -v 1:3:10 out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 4-0.0 7-0.0 10-0.0 eid 1-0.0 4-0.0 7-0.0 10-0.0
1 31 11 56 12 1 31 11 56 12
2 56 42 3 67 2 56 42 3 67
3 45 84 96 59 3 45 84 96 59
4 7 48 18 27 4 7 48 18 27
5 8 68 11 10 5 8 68 11 10
6 6 59 14 80 6 6 59 14 80
7 24 46 39 63 7 24 46 39 63
8 80 30 98 23 8 80 30 98 23
9 84 79 95 62 9 84 79 95 62
10 23 41 97 23 10 23 41 97 23
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting variables with a file ### Selecting variables with a file
If your variables of interest are listed in a plain-text file, you can simply pass that file: If your variables of interest are listed in a plain-text file, you can simply pass that file:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo -e "1\n6\n9" > vars.txt echo -e "1\n6\n9" > vars.txt
ukbparse -nb -q -ow -v vars.txt out.tsv data_01.tsv ukbparse -nb -q -ow -v vars.txt out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 6-0.0 9-0.0 eid 1-0.0 6-0.0 9-0.0
1 31 22 90 1 31 22 90
2 56 35 50 2 56 35 50
3 45 36 48 3 45 36 48
4 7 20 37 4 7 20 37
5 8 84 69 5 8 84 69
6 6 46 73 6 6 46 73
7 24 23 7 7 24 23 7
8 80 83 6 8 80 83 6
9 84 12 2 9 84 12 2
10 23 20 59 10 23 20 59
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting variables from pre-defined categories ### Selecting variables from pre-defined categories
Some UK BioBank-specific categories are baked into `ukbparse`, but you can also define your own categories - you just need to create a `.tsv` file, and pass it to `ukbparse` via the `--category_file` option (`-cf` for short): Some UK BioBank-specific categories are baked into `ukbparse`, but you can also define your own categories - you just need to create a `.tsv` file, and pass it to `ukbparse` via the `--category_file` option (`-cf` for short):
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo -e "ID\tCategory\tVariables" > custom_categories.tsv echo -e "ID\tCategory\tVariables" > custom_categories.tsv
echo -e "1\tCool variables\t1:5,7" >> custom_categories.tsv echo -e "1\tCool variables\t1:5,7" >> custom_categories.tsv
echo -e "2\tUncool variables\t6,8:10" >> custom_categories.tsv echo -e "2\tUncool variables\t6,8:10" >> custom_categories.tsv
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Use the `--category` option (`-c` for short) to select categories to output. You can refer to categories by their ID: Use the `--category` option (`-c` for short) to select categories to output. You can refer to categories by their ID:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -cf custom_categories.tsv -c 1 out.tsv data_01.tsv ukbparse -nb -q -ow -cf custom_categories.tsv -c 1 out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 7-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 7-0.0
1 31 65 10 11 84 56 1 31 65 10 11 84 56
2 56 52 52 42 89 3 2 56 52 52 42 89 3
3 45 84 20 84 93 96 3 45 84 20 84 93 96
4 7 46 37 48 80 18 4 7 46 37 48 80 18
5 8 86 51 68 80 11 5 8 86 51 68 80 11
6 6 29 85 59 7 14 6 6 29 85 59 7 14
7 24 49 41 46 92 39 7 24 49 41 46 92 39
8 80 92 97 30 92 98 8 80 92 97 30 92 98
9 84 59 89 79 16 95 9 84 59 89 79 16 95
10 23 96 67 41 8 97 10 23 96 67 41 8 97
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Or by name: Or by name:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -cf custom_categories.tsv -c uncool out.tsv data_01.tsv ukbparse -nb -q -ow -cf custom_categories.tsv -c uncool out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 6-0.0 8-0.0 9-0.0 10-0.0 eid 6-0.0 8-0.0 9-0.0 10-0.0
1 22 65 90 12 1 22 65 90 12
2 35 65 50 67 2 35 65 50 67
3 36 62 48 59 3 36 62 48 59
4 20 72 37 27 4 20 72 37 27
5 84 28 69 10 5 84 28 69 10
6 46 60 73 80 6 46 60 73 80
7 23 68 7 63 7 23 68 7 63
8 83 36 6 23 8 83 36 6 23
9 12 73 2 62 9 12 73 2 62
10 20 57 59 23 10 20 57 59 23
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting column names ### Selecting column names
If you are working with data that has non-UK BioBank style column names, you can use the `--column` option (`-co` for short) to select individual columns by their name, rather than the variable with which they are associated. The `--column` option accepts full column names, and also shell-style wildcard patterns: If you are working with data that has non-UK BioBank style column names, you can use the `--column` option (`-co` for short) to select individual columns by their name, rather than the variable with which they are associated. The `--column` option accepts full column names, and also shell-style wildcard patterns:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -co 4-0.0 -co "??-0.0" out.tsv data_01.tsv ukbparse -nb -q -ow -co 4-0.0 -co "??-0.0" out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 4-0.0 10-0.0 eid 4-0.0 10-0.0
1 11 12 1 11 12
2 42 67 2 42 67
3 84 59 3 84 59
4 48 27 4 48 27
5 68 10 5 68 10
6 59 80 6 59 80
7 46 63 7 46 63
8 30 23 8 30 23
9 79 62 9 79 62
10 41 23 10 41 23
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Selecting subjects (rows) ## Selecting subjects (rows)
`ukbparse` assumes that the first column in every input file is a subject ID. You can specify which subjects you want to load via the `--subject` (`-s` for short) option. You can specify subjects in the same way that you specified variables above, and also: `ukbparse` assumes that the first column in every input file is a subject ID. You can specify which subjects you want to load via the `--subject` (`-s` for short) option. You can specify subjects in the same way that you specified variables above, and also:
* By specifying a conditional expression on variable values - only subjects for which the expression evaluates to true will be imported * By specifying a conditional expression on variable values - only subjects for which the expression evaluates to true will be imported
* By specifying subjects to exclude * By specifying subjects to exclude
### Selecting individual subjects ### Selecting individual subjects
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -s 1 -s 3 -s 5 out.tsv data_01.tsv ukbparse -nb -q -ow -s 1 -s 3 -s 5 out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0
1 31 65 10 11 84 22 56 65 90 12 1 31 65 10 11 84 22 56 65 90 12
3 45 84 20 84 93 36 96 62 48 59 3 45 84 20 84 93 36 96 62 48 59
5 8 86 51 68 80 84 11 28 69 10 5 8 86 51 68 80 84 11 28 69 10
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting subject ranges ### Selecting subject ranges
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -s 2:2:10 out.tsv data_01.tsv ukbparse -nb -q -ow -s 2:2:10 out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0
2 56 52 52 42 89 35 3 65 50 67 2 56 52 52 42 89 35 3 65 50 67
4 7 46 37 48 80 20 18 72 37 27 4 7 46 37 48 80 20 18 72 37 27
6 6 29 85 59 7 46 14 60 73 80 6 6 29 85 59 7 46 14 60 73 80
8 80 92 97 30 92 83 98 36 6 23 8 80 92 97 30 92 83 98 36 6 23
10 23 96 67 41 8 20 97 57 59 23 10 23 96 67 41 8 20 97 57 59 23
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting subjects from a file ### Selecting subjects from a file
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo -e "5\n6\n7\n8\n9\n10" > subjects.txt echo -e "5\n6\n7\n8\n9\n10" > subjects.txt
ukbparse -nb -q -ow -s subjects.txt out.tsv data_01.tsv ukbparse -nb -q -ow -s subjects.txt out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0
5 8 86 51 68 80 84 11 28 69 10 5 8 86 51 68 80 84 11 28 69 10
6 6 29 85 59 7 46 14 60 73 80 6 6 29 85 59 7 46 14 60 73 80
7 24 49 41 46 92 23 39 68 7 63 7 24 49 41 46 92 23 39 68 7 63
8 80 92 97 30 92 83 98 36 6 23 8 80 92 97 30 92 83 98 36 6 23
9 84 59 89 79 16 12 95 73 2 62 9 84 59 89 79 16 12 95 73 2 62
10 23 96 67 41 8 20 97 57 59 23 10 23 96 67 41 8 20 97 57 59 23
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Selecting subjects by variable value ### Selecting subjects by variable value
The `--subject` option accepts *variable expressions* - you can write an expression performing numerical comparisons on variables (denoted with a leading `v`) and combine these expressions using boolean algebra. Only subjects for which the expression evaluates to true will be imported. For example, to only import subjects where variable 1 is greater than 10, and variable 2 is less than 70, you can type: The `--subject` option accepts *variable expressions* - you can write an expression performing numerical comparisons on variables (denoted with a leading `v`) and combine these expressions using boolean algebra. Only subjects for which the expression evaluates to true will be imported. For example, to only import subjects where variable 1 is greater than 10, and variable 2 is less than 70, you can type:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -sp -s "v1 > 10 && v2 < 70" out.tsv data_01.tsv ukbparse -nb -q -ow -sp -s "v1 > 10 && v2 < 70" out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0
1 31 65 10 11 84 22 56 65 90 12 1 31 65 10 11 84 22 56 65 90 12
2 56 52 52 42 89 35 3 65 50 67 2 56 52 52 42 89 35 3 65 50 67
7 24 49 41 46 92 23 39 68 7 63 7 24 49 41 46 92 23 39 68 7 63
9 84 59 89 79 16 12 95 73 2 62 9 84 59 89 79 16 12 95 73 2 62
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
The following symbols can be used in variable expressions: The following symbols can be used in variable expressions:
| Symbol | Meaning | | Symbol | Meaning |
|---------------------------|--------------------------| |---------------------------|--------------------------|
| `==` | equal to | | `==` | equal to |
| `!=` | not equal to | | `!=` | not equal to |
| `>` | greater than | | `>` | greater than |
| `>=` | greater than or equal to | | `>=` | greater than or equal to |
| `<` | less than | | `<` | less than |
| `<=` | less than or equal to | | `<=` | less than or equal to |
| `na` | N/A | | `na` | N/A |
| `&&` | logical and | | `&&` | logical and |
| <code>&#x7c;&#x7c;</code> | logical or | | <code>&#x7c;&#x7c;</code> | logical or |
| `~` | logical not | | `~` | logical not |
| `()` | To denote precedence | | `()` | To denote precedence |
### Excluding subjects ### Excluding subjects
The `--exclude` (`-ex` for short) option allows you to exclude subjects - it accepts individual IDs, an ID range, or a file containing IDs. The `--exclude`/`-ex` option takes precedence over the `--subject`/`-s` option: The `--exclude` (`-ex` for short) option allows you to exclude subjects - it accepts individual IDs, an ID range, or a file containing IDs. The `--exclude`/`-ex` option takes precedence over the `--subject`/`-s` option:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -s 1:8 -ex 5:10 out.tsv data_01.tsv ukbparse -nb -q -ow -s 1:8 -ex 5:10 out.tsv data_01.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 7-0.0 8-0.0 9-0.0 10-0.0
1 31 65 10 11 84 22 56 65 90 12 1 31 65 10 11 84 22 56 65 90 12
2 56 52 52 42 89 35 3 65 50 67 2 56 52 52 42 89 35 3 65 50 67
3 45 84 20 84 93 36 96 62 48 59 3 45 84 20 84 93 36 96 62 48 59
4 7 46 37 48 80 20 18 72 37 27 4 7 46 37 48 80 20 18 72 37 27
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Selecting visits ## Selecting visits
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Many variables in the UK BioBank data contain observations at multiple points in time, or visits. `ukbparse` allows you to specify which visits you are interested in. Here is an example data set with variables that have data for multiple visits (remember that the second number in the column names denotes the visit): Many variables in the UK BioBank data contain observations at multiple points in time, or visits. `ukbparse` allows you to specify which visits you are interested in. Here is an example data set with variables that have data for multiple visits (remember that the second number in the column names denotes the visit):
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
cat data_02.tsv cat data_02.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 2-1.0 2-2.0 3-0.0 3-1.0 4-0.0 5-0.0 eid 1-0.0 2-0.0 2-1.0 2-2.0 3-0.0 3-1.0 4-0.0 5-0.0
1 86 76 82 75 34 99 50 5 1 86 76 82 75 34 99 50 5
2 20 25 40 44 30 57 54 44 2 20 25 40 44 30 57 54 44
3 85 2 48 42 23 77 84 27 3 85 2 48 42 23 77 84 27
4 23 30 18 97 44 55 97 20 4 23 30 18 97 44 55 97 20
5 83 45 76 51 18 64 8 33 5 83 45 76 51 18 64 8 33
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
We can use the `--visit` (`-vi` for short) option to get just the last visit for each variable: We can use the `--visit` (`-vi` for short) option to get just the last visit for each variable:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -vi last out.tsv data_02.tsv ukbparse -nb -q -ow -vi last out.tsv data_02.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-2.0 3-1.0 4-0.0 5-0.0 eid 1-0.0 2-2.0 3-1.0 4-0.0 5-0.0
1 86 75 99 50 5 1 86 75 99 50 5
2 20 44 57 54 44 2 20 44 57 54 44
3 85 42 77 84 27 3 85 42 77 84 27
4 23 97 55 97 20 4 23 97 55 97 20
5 83 51 64 8 33 5 83 51 64 8 33
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
You can also specify which visit you want by its number: You can also specify which visit you want by its number:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -vi 1 out.tsv data_02.tsv ukbparse -nb -q -ow -vi 1 out.tsv data_02.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 2-1.0 3-1.0 eid 2-1.0 3-1.0
1 82 99 1 82 99
2 40 57 2 40 57
3 48 77 3 48 77
4 18 55 4 18 55
5 76 64 5 76 64
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Merging multiple input files ## Merging multiple input files
If your data is split across multiple files, you can specify how `ukbparse` should merge them together. If your data is split across multiple files, you can specify how `ukbparse` should merge them together.
### Merging by subject ### Merging by subject
For example, let's say we have these two input files (shown side-by-side): For example, let's say we have these two input files (shown side-by-side):
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo " " | paste data_03.tsv - data_04.tsv echo " " | paste data_03.tsv - data_04.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 eid 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 eid 4-0.0 5-0.0 6-0.0
1 89 47 26 2 19 17 62 1 89 47 26 2 19 17 62
2 94 37 70 3 41 12 7 2 94 37 70 3 41 12 7
3 63 5 97 4 8 86 9 3 63 5 97 4 8 86 9
4 98 97 91 5 7 65 71 4 98 97 91 5 7 65 71
5 37 10 11 6 3 23 15 5 37 10 11 6 3 23 15
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Note that each file contains different variables, and different, but overlapping, subjects. By default, when you pass these files to `ukbparse`, it will output the intersection of the two files (more formally known as an *inner join*), i.e. subjects which are present in both files: Note that each file contains different variables, and different, but overlapping, subjects. By default, when you pass these files to `ukbparse`, it will output the intersection of the two files (more formally known as an *inner join*), i.e. subjects which are present in both files:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow out.tsv data_03.tsv data_04.tsv ukbparse -nb -q -ow out.tsv data_03.tsv data_04.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0
2 94 37 70 19 17 62 2 94 37 70 19 17 62
3 63 5 97 41 12 7 3 63 5 97 41 12 7
4 98 97 91 8 86 9 4 98 97 91 8 86 9
5 37 10 11 7 65 71 5 37 10 11 7 65 71
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
If you want to keep all subjects, you can instruct `ukbparse` to output the union (a.k.a. *outer join*) via the `--merge_strategy` (`-ms` for short) option: If you want to keep all subjects, you can instruct `ukbparse` to output the union (a.k.a. *outer join*) via the `--merge_strategy` (`-ms` for short) option:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -ms outer out.tsv data_03.tsv data_04.tsv ukbparse -nb -q -ow -ms outer out.tsv data_03.tsv data_04.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0
1 89.0 47.0 26.0 1 89.0 47.0 26.0
2 94.0 37.0 70.0 19.0 17.0 62.0 2 94.0 37.0 70.0 19.0 17.0 62.0
3 63.0 5.0 97.0 41.0 12.0 7.0 3 63.0 5.0 97.0 41.0 12.0 7.0
4 98.0 97.0 91.0 8.0 86.0 9.0 4 98.0 97.0 91.0 8.0 86.0 9.0
5 37.0 10.0 11.0 7.0 65.0 71.0 5 37.0 10.0 11.0 7.0 65.0 71.0
6 3.0 23.0 15.0 6 3.0 23.0 15.0
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Merging by column ### Merging by column
Your data may be organised in a different way. For example, these next two files contain different groups of subjects, but overlapping columns: Your data may be organised in a different way. For example, these next two files contain different groups of subjects, but overlapping columns:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo " " | paste data_05.tsv - data_06.tsv echo " " | paste data_05.tsv - data_06.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 eid 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 eid 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0
1 69 80 70 60 42 4 17 36 56 90 12 1 69 80 70 60 42 4 17 36 56 90 12
2 64 15 82 99 67 5 63 16 87 57 63 2 64 15 82 99 67 5 63 16 87 57 63
3 33 67 58 96 26 6 43 19 84 53 63 3 33 67 58 96 26 6 43 19 84 53 63
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
In this case, we need to tell `ukbparse` to merge along the row axis, rather than along the column axis. We can do this with the `--merge_axis` (`-ma` for short) option: In this case, we need to tell `ukbparse` to merge along the row axis, rather than along the column axis. We can do this with the `--merge_axis` (`-ma` for short) option:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -ma rows out.tsv data_05.tsv data_06.tsv ukbparse -nb -q -ow -ma rows out.tsv data_05.tsv data_06.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 2-0.0 3-0.0 4-0.0 5-0.0 eid 2-0.0 3-0.0 4-0.0 5-0.0
1 80 70 60 42 1 80 70 60 42
2 15 82 99 67 2 15 82 99 67
3 67 58 96 26 3 67 58 96 26
4 17 36 56 90 4 17 36 56 90
5 63 16 87 57 5 63 16 87 57
6 43 19 84 53 6 43 19 84 53
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Again, if we want to retain all columns, we can tell `ukbparse` to perform an outer join with the `-ms` option: Again, if we want to retain all columns, we can tell `ukbparse` to perform an outer join with the `-ms` option:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -ma rows -ms outer out.tsv data_05.tsv data_06.tsv ukbparse -nb -q -ow -ma rows -ms outer out.tsv data_05.tsv data_06.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0
1 69.0 80 70 60 42 1 69.0 80 70 60 42
2 64.0 15 82 99 67 2 64.0 15 82 99 67
3 33.0 67 58 96 26 3 33.0 67 58 96 26
4 17 36 56 90 12.0 4 17 36 56 90 12.0
5 63 16 87 57 63.0 5 63 16 87 57 63.0
6 43 19 84 53 63.0 6 43 19 84 53 63.0
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Naive merging ### Naive merging
Finally, your data may be organised such that you simply want to "paste", or concatenate them together, along either rows or columns. For example, your data files might look like this: Finally, your data may be organised such that you simply want to "paste", or concatenate them together, along either rows or columns. For example, your data files might look like this:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo " " | paste data_07.tsv - data_08.tsv echo " " | paste data_07.tsv - data_08.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 eid 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 eid 4-0.0 5-0.0 6-0.0
1 30 99 57 1 16 54 60 1 30 99 57 1 16 54 60
2 3 6 75 2 43 59 9 2 3 6 75 2 43 59 9
3 13 91 36 3 71 73 38 3 13 91 36 3 71 73 38
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Here, we have columns for different variables on the same set of subjects, and we just need to concatenate them together horizontally. We do this by using `--merge_strategy naive` (`-ms naive` for short): Here, we have columns for different variables on the same set of subjects, and we just need to concatenate them together horizontally. We do this by using `--merge_strategy naive` (`-ms naive` for short):
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -q -ow -ms naive out.tsv data_07.tsv data_08.tsv ukbparse -q -ow -ms naive out.tsv data_07.tsv data_08.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0 eid 1-0.0 2-0.0 3-0.0 4-0.0 5-0.0 6-0.0
1 30 99 57.0 16.0 54.0 60.0 1 30 99 57.0 16.0 54.0 60.0
2 3 6 75.0 43.0 59.0 9.0 2 3 6 75.0 43.0 59.0 9.0
3 13 91 36.0 71.0 73.0 38.0 3 13 91 36.0 71.0 73.0 38.0
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
For files which need to be concatenated vertically, such as these: For files which need to be concatenated vertically, such as these:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo " " | paste data_09.tsv - data_10.tsv echo " " | paste data_09.tsv - data_10.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 eid 1-0.0 2-0.0 3-0.0 eid 1-0.0 2-0.0 3-0.0 eid 1-0.0 2-0.0 3-0.0
1 16 34 10 4 40 89 58 1 16 34 10 4 40 89 58
2 62 78 16 5 25 75 9 2 62 78 16 5 25 75 9
3 72 29 53 6 28 74 57 3 72 29 53 6 28 74 57
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
We need to tell `ukbparse` which axis to concatenate along, again using the `-ma` option: We need to tell `ukbparse` which axis to concatenate along, again using the `-ma` option:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -ms naive -ma rows out.tsv data_09.tsv data_10.tsv ukbparse -nb -q -ow -ms naive -ma rows out.tsv data_09.tsv data_10.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 eid 1-0.0 2-0.0 3-0.0
1 16 34 10 1 16 34 10
2 62 78 16 2 62 78 16
3 72 29 53 3 72 29 53
4 40 89 58 4 40 89 58
5 25 75 9 5 25 75 9
6 28 74 57 6 28 74 57
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Cleaning examples # Cleaning examples
Once the data has been imported, a sequence of cleaning steps is applied to each column. Once the data has been imported, a sequence of cleaning steps is applied to each column.
## NA insertion ## NA insertion
For some variables it may make sense to discard or ignore certain values. For example, if an individual selects *"Do not know"* to a question such as *"How much milk did you drink yesterday?"*, that answer will be coded with a specific value (e.g. `-1`). It does not make any sense to include these values in most analyses, so `ukbparse` can be used to mark such values as *Not Available (NA)*. For some variables it may make sense to discard or ignore certain values. For example, if an individual selects *"Do not know"* to a question such as *"How much milk did you drink yesterday?"*, that answer will be coded with a specific value (e.g. `-1`). It does not make any sense to include these values in most analyses, so `ukbparse` can be used to mark such values as *Not Available (NA)*.
A large number of NA insertion rules, specific to UK BioBank variables, are coded into `ukbparse` (although they will not be used in these examples, as we are using the `--no_builtins`/`-nb` option). You can also specify your own rules via the `--na_values` (`-nv` for short) option. A large number of NA insertion rules, specific to UK BioBank variables, are coded into `ukbparse` (although they will not be used in these examples, as we are using the `--no_builtins`/`-nb` option). You can also specify your own rules via the `--na_values` (`-nv` for short) option.
Let's say we have this data set: Let's say we have this data set:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
cat data_11.tsv cat data_11.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 eid 1-0.0 2-0.0 3-0.0
1 4 1 6 1 4 1 6
2 2 6 0 2 2 6 0
3 7 0 -1 3 7 0 -1
4 -1 6 1 4 -1 6 1
5 2 8 4 5 2 8 4
6 0 2 7 6 0 2 7
7 -1 0 0 7 -1 0 0
8 7 7 2 8 7 7 2
9 4 -1 -1 9 4 -1 -1
10 8 -1 2 10 8 -1 2
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
For variable 1, we want to ignore values of -1, for variable 2 we want to ignore -1 and 0, and for variable 3 we want to ignore 1 and 2: For variable 1, we want to ignore values of -1, for variable 2 we want to ignore -1 and 0, and for variable 3 we want to ignore 1 and 2:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -nv 1 " -1" -nv 2 " -1,0" -nv 3 "1,2" out.tsv data_11.tsv ukbparse -nb -q -ow -nv 1 " -1" -nv 2 " -1,0" -nv 3 "1,2" out.tsv data_11.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 2-0.0 3-0.0 eid 1-0.0 2-0.0 3-0.0
1 4.0 1.0 6.0 1 4.0 1.0 6.0
2 2.0 6.0 0.0 2 2.0 6.0 0.0
3 7.0 -1.0 3 7.0 -1.0
4 6.0 4 6.0
5 2.0 8.0 4.0 5 2.0 8.0 4.0
6 0.0 2.0 7.0 6 0.0 2.0 7.0
7 0.0 7 0.0
8 7.0 7.0 8 7.0 7.0
9 4.0 -1.0 9 4.0 -1.0
10 8.0 10 8.0
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
> The `--na_values` option expects two arguments: > The `--na_values` option expects two arguments:
> * The variable ID > * The variable ID
> * A comma-separated list of values to replace with NA > * A comma-separated list of values to replace with NA
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## Variable-specific cleaning functions ## Variable-specific cleaning functions
A small number of cleaning/preprocessing functions are built into `ukbparse`, which can be applied to specific variables. For example, some variables in the UK BioBank contain ICD10 disease codes, which may be more useful if converted to a numeric format. Imagine that we have some data with ICD10 codes: A small number of cleaning/preprocessing functions are built into `ukbparse`, which can be applied to specific variables. For example, some variables in the UK BioBank contain ICD10 disease codes, which may be more useful if converted to a numeric format. Imagine that we have some data with ICD10 codes:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
cat data_12.tsv cat data_12.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 eid 1-0.0
1 A481 1 A481
2 A590 2 A590
3 B391 3 B391
4 D596 4 D596
5 Z980 5 Z980
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
We can use the `--clean` (`-cl` for short) option with the built-in `convertICD10Codes` cleaning function to convert the codes to a numeric representation: We can use the `--clean` (`-cl` for short) option with the built-in `convertICD10Codes` cleaning function to convert the codes to a numeric representation:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -cl 1 convertICD10Codes out.tsv data_12.tsv ukbparse -nb -q -ow -cl 1 convertICD10Codes out.tsv data_12.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 eid 1-0.0
1 492 1 492
2 601 2 601
3 403 3 403
4 610 4 610
5 1016 5 1016
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
> The `--clean` option expects two arguments: > The `--clean` option expects two arguments:
> * The variable ID > * The variable ID
> * The cleaning function to apply. Some cleaning functions accept arguments - refer to the command-line help for a summary of available functions. > * The cleaning function to apply. Some cleaning functions accept arguments - refer to the command-line help for a summary of available functions.
> >
> You can define your own cleaning functions by passing them in as a `--plugin_file` (see the [section on custom plugins below](#Custom-cleaning,-processing-and-loading----ukbparse-plugins)). > You can define your own cleaning functions by passing them in as a `--plugin_file` (see the [section on custom plugins below](#Custom-cleaning,-processing-and-loading----ukbparse-plugins)).
### Example: flattening hierarchical data ### Example: flattening hierarchical data
Several variables in the UK Biobank (including the ICD10 disease categorisations) are organised in a hierarchical manner - each value is a child of a more general parent category. The `flattenHierarchical` cleaning function can be used to replace each value in a data set with the value that corresponds to a parent category. Let's apply this to our example ICD10 data set. Several variables in the UK Biobank (including the ICD10 disease categorisations) are organised in a hierarchical manner - each value is a child of a more general parent category. The `flattenHierarchical` cleaning function can be used to replace each value in a data set with the value that corresponds to a parent category. Let's apply this to our example ICD10 data set.
> `ukbparse` needs to know the data coding of hierarchical variables, as it uses this to look up an internal table containing the hierarchy information. So in this example we are creating a dummy variable table file which tells `ukbparse` that the example data uses [data coding 19](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=19), which is the ICD10 data coding. > `ukbparse` needs to know the data coding of hierarchical variables, as it uses this to look up an internal table containing the hierarchy information. So in this example we are creating a dummy variable table file which tells `ukbparse` that the example data uses [data coding 19](https://biobank.ctsu.ox.ac.uk/crystal/coding.cgi?id=19), which is the ICD10 data coding.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
echo -e "ID\tType\tDescription\tDataCoding\tNAValues\tRawLevels\tNewLevels\tParentValues\tChildValues\tClean echo -e "ID\tType\tDescription\tDataCoding\tNAValues\tRawLevels\tNewLevels\tParentValues\tChildValues\tClean
1\t\t\t19 1\t\t\t19
" > variables.tsv " > variables.tsv
ukbparse -nb -q -ow -vf variables.tsv -cl 1 flattenHierarchical out.tsv data_12.tsv ukbparse -nb -q -ow -vf variables.tsv -cl 1 flattenHierarchical out.tsv data_12.tsv
cat out.tsv cat out.tsv
``` ```
%%%% Output: stream %%%% Output: stream
eid 1-0.0 eid 1-0.0
1 Chapter I 1 Chapter I
2 Chapter I 2 Chapter I
3 Chapter I 3 Chapter I
4 Chapter III 4 Chapter III
5 Chapter XXI 5 Chapter XXI
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### Aside: ICD10 mapping file ### Aside: ICD10 mapping file
`ukbparse` has a feature specific to these ICD10 disease categorisations - you can use the `--icd10_map_file` (`-imf` for short) option to tell `ukbparse` to save a file which contains a list of all ICD10 codes that were present in the input data, and the corresponding numerical codes that `ukbparse` generated: `ukbparse` has a feature specific to these ICD10 disease categorisations - you can use the `--icd10_map_file` (`-imf` for short) option to tell `ukbparse` to save a file which contains a list of all ICD10 codes that were present in the input data, and the corresponding numerical codes that `ukbparse` generated:
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` bash ``` bash
ukbparse -nb -q -ow -cl 1 convertICD10Codes -imf icd10_codes.tsv out.tsv data_12.tsv ukbparse -nb -q -ow -cl 1 convertICD10Codes -imf icd10_codes.tsv out.tsv data_12.tsv
cat icd10_codes.tsv cat icd10_codes.tsv
```