Commit d3c2b522 authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

Merge branch 'enh/exclude-variables' into 'master'

Options to exclude variables/categories

See merge request !92
parents 3a222219 8181f1f1
......@@ -3,6 +3,25 @@ FUNPACK release history
3.2.0 (Friday 13th May 2022)
----------------------------
Added
^^^^^
* New ``--exclude_variable`` and ``--exclude_category`` options, allowing
variables / categories to be explicitly excluded from being imported.
Changed
^^^^^^^
* The :func:`.removeIfRedundant` processing function now uses ``double``
precision by default.
3.1.1 (Tuesday 10th May 2022)
-----------------------------
......
......@@ -6,7 +6,7 @@
#
__version__ = '3.1.1'
__version__ = '3.2.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
......@@ -5,7 +5,9 @@
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
"""This module contains functions for parsing ``funpack`` command line
arguments and configuration files.
arguments and configuration files. A ``funpack`` configuration file
is simply a plain-text file which contains command-line options,
sans-hyphens.
"""
......@@ -64,14 +66,16 @@ CLI_ARGUMENTS = collections.OrderedDict((
(('cf', 'category_file'), {})]),
('Import options', [
(('s', 'subject'), {'action' : 'append'}),
(('v', 'variable'), {'action' : 'append'}),
(('co', 'column'), {'action' : 'append'}),
(('c', 'category'), {'action' : 'append'}),
(('vi', 'visit'), {'action' : 'append'}),
(('ex', 'exclude'), {'action' : 'append'}),
(('iv', 'index_visits'), {'action' : 'store_true'}),
(('tt', 'trust_types'), {'action' : 'store_true'})]),
(('s', 'subject'), {'action' : 'append'}),
(('v', 'variable'), {'action' : 'append'}),
(('co', 'column'), {'action' : 'append'}),
(('c', 'category'), {'action' : 'append'}),
(('vi', 'visit'), {'action' : 'append'}),
(('ex', 'exclude'), {'action' : 'append'}),
(('ev', 'exclude_variable'), {'action' : 'append'}),
(('ec', 'exclude_category'), {'action' : 'append'}),
(('iv', 'index_visits'), {'action' : 'store_true'}),
(('tt', 'trust_types'), {'action' : 'store_true'})]),
('Cleaning options', [
(('sn', 'skip_insertna'), {'action' : 'store_true'}),
......@@ -281,6 +285,15 @@ CLI_ARGUMENT_HELP = {
'Subject ID, range, comma-separated list, or file containing a list of '
'subject IDs specifying subjects to exclude. Can be used multiple times.',
'exclude_variable' :
'Variable ID, range, comma-separated list, or file containing a list of '
'variable IDs, to exclude. Takes precedence over the --variable and '
'--category options. Can be used multiple times.',
'exclude_category' :
'Category ID or label to exclude. Takes precedence over the --variable '
'and --category options. Can be used multiple times.',
'index_visits' :
'If set, the data is re-arranged so that visits form part of the row '
'indices, rather than being stored in separate columns for each variable. '
......@@ -800,10 +813,11 @@ def _prepareSubjectAndVariableSelectors(args):
:arg args: ``argparse.Namespace`` object.
"""
# turn --subject/--variable/--exclude
# arguments into lists of IDs. If
# error is True, an error is raised on
# unparseable arguments.
# turn --subject/--variable/--exclude/
# --exclude_variable arguments into
# lists of IDs. If error is True, an
# error is raised on unparseable
# arguments.
def replaceIDs(things, error=True):
newthings = []
failed = []
......@@ -854,11 +868,17 @@ def _prepareSubjectAndVariableSelectors(args):
# a list of integer IDs, but subject is
# transformed into a tuple containing
# ([ID], [exprStr])
if args.subject is not None: args.subject = replaceIDs(args.subject,
False)
else: args.subject = None, None
if args.variable is not None: args.variable = replaceIDs(args.variable)[0]
if args.exclude is not None: args.exclude = replaceIDs(args.exclude)[0]
if args.subject is not None:
args.subject = replaceIDs(args.subject, False)
else:
args.subject = None, None
if args.variable is not None:
args.variable = replaceIDs(args.variable)[0]
if args.exclude is not None:
args.exclude = replaceIDs(args.exclude)[0]
if args.exclude_variable is not None:
args.exclude_variable = replaceIDs(args.exclude_variable)[0]
def _prepareCategorySelectors(args):
......@@ -875,6 +895,10 @@ def _prepareCategorySelectors(args):
for i, c in enumerate(args.category):
try: args.category[i] = int(c)
except ValueError: continue
if args.exclude_category is not None:
for i, c in enumerate(args.exclude_category):
try: args.exclude_category[i] = int(c)
except ValueError: continue
def _prepareColumnSelectors(args):
......
ID Category Variables
1 age, sex, brain MRI protocol, Phase 0,31,34,21022,22200,25780
0 exclude 21811
1 ID, age, sex, brain MRI protocol Phase 0,31,33,34,52,21022,22200,25780
2 genetics 21000,22000:22031,22041:22125,22190:22194,22201:22325,22182,22800:22823
3 early life factors 52,129,130,1677,1687,1697,1737,1767,1777,1787,21066,20022
3 early life factors 129,130,1677,1687,1697,1737,1767,1777,1787,21066,20022
10 lifestyle and environment - general 3:6,132,189,670,680,699,709,728,738,767,777,1031,1797,1807,1835,1845,1873,1883,2139,2149,2159,2237,2375,2385,2395,2405,2267,2277,2714:10:2834,2946,3526,3536,3546,3581,3591,3659,3669,3700,3710,3720,3829,3839,3849,3872,3882,3912,3942,3972,3982,4501,4674,4825,4836,5057,6138,6142,6139:6141,6145:6146,6160,10016,10105,10114,10721,10722,10740,10749,10860,10877,10886,20074:20075,20107,20110:20113,20118:20119,20121,22501,22599,22606,22700,22702,22704,24003:24024,24500:24508,26410:26434
11 lifestyle and environment - exercise and work 1001,1011,796,806,816,826,845,864,874,884,894,904,914,924,943,971,981,991,1021,1050:10:1220,2624,2634,3426,3637,3647,6143,6162,6164,10953,10962,10971,20277,20614,20656,20657,20668,20669,20670,20733,20741,20749,22604,22605,22607:22615,22620,22630,22631,22640:22655,104900,104910,104920
12 lifestyle and environment - food and drink 1289:10:1389,1408:10:1548,2654,3089,3680,6144,10007,10723,10767,10776,10855,10912,20084:20094,20098:20106,20108:20109,20600:20613,20615:20616,20618:20640,20642:20655,20658:20667,20671:20681,20683:20708,20710:20728,20730:20732,20734:20740,20743:20748,100001:100009,100011:100019,100021:100025,100010:10:100560,100760:10:104670
......@@ -11,10 +12,10 @@ ID Category Variables
21 physical measures - bone density and sizes 77,78,3083:3086,3143:3144,3146:3148,4092,4095,4100:4101,4103:4106,4119:4120,4122:4125,4138:4147,23200:23243,23290:23320
22 physical measures - cardiac & blood vessels 93:95,102,4079,4080,4136,4194:4196,4198:4200,4204:4205,4207,5983,5984,5986,5992,5993,6014:6017,6019,6020,6022,6024,6032:6034,6038,6039,12673:12687,12336,12338,12340,12697,12698,12702,21021,22330:22338,22420:22426,22670:22685
23 hearing test 4229:4230,4232:4237,4239:4247,4249,4268:4270,4272,4275:4277,4279,4849,10793,20019,20021,20060
24 eye test 5076:5079,5082:5091,5096:5119,5132:5136,5138:5149,5152,5155:5164,5181:5183,5186,5188,5190,5193,5198:5199,5201,5202,5204,5206,5208,5209,5211,5215,5221,5237,5251,5254:5259,5261:5267,5273,5274,5276,5292,5306,5324:5328,6070:6075,20052,20055,20057,20261:20262
24 eye test 5076:5079,5082:5091,5096:5119,5132:5136,5138:5149,5152,5155:5164,5181:5183,5186,5188,5190,5193,5198:5199,5201,5202,5204,5206,5208,5209,5211,5215,5221,5237,5251,5254:5259,5261:5267,5273,5274,5276,5292,5306,5324:5328,6070:6075,20055,20057,20261:20262
25 physical activity measures 5985,22032:22040,90002:90003,90010:90013,90015:90177,90179:90195
26 abdominal measures 22415:22417,22432:22436
30 blood assays 74,23000:23044,23049:23060,23062,23063,23065:23071,23073:23075,23400:23434,23436:23440,23442:23578,30000:10:30300,30104,30112,30114,30172,30174,30242,30252,30254,30314:10:30344,30364:10:30424,30500:10:30530,30600:10:30890
30 blood assays 74,23000:23044,23049:23060,23062,23063,23065:23071,23073:23075,23400:23434,23436:23440,23442:23578,30000:10:30300,30314:10:30344,30364:10:30424,30500:10:30530,30600:10:30890
31 brain IDPs 25000:25746,25754:25759,25761:25768,25781:25930,26500:27772
32 cognitive phenotypes 62,111,396:404,630,4250:4256,4258:4260,4281:4283,4285,4287,4290:4292,4294,4924,4935,4957,4968,4979,4990,5001,5012,5556,5699,5779,5790,5866,6312,6332,6333,6348:6351,6362,6373,6374,6382,6383,6770:6773,10133:10134,10136:10144,10146:10147,10241,10609:10610,10612,20016,20018,20023,20082,20128:20157,20159,20165,20167,20169:2:20197,20196:2:20200,20229,20230,20240,20242,20244:20248,21004,23045:23047,23072,23076:23079,23321:23324
50 health and medical history, health outcomes 84,87,92,134:137,2178,2188,2207,2217,2227,2247,2257,2296,2316,2335:10:2365,2415,2443:10:2473,2492,2674,2684,2694,2704,2844,2956:10:2986,3005,3079,3140,3393,3404,3414,3571,3606,3616,3627,3741,3751,3761,3773,3786,3799,3809,3894,3992,4012,4022,4041,4056,4067,4689,4700,4717,4728,4792,4803,4814,5408,5419,5430,5441,5452,5463,5474,5485,5496,5507,5518,5529,5540,5610,5832,5843,5855,5877,5890,5901,5912,5923,5934,5945,6119,6147,6148,6149,6150,6151,6152,6153,6154,6155,6159,6177,6179,6205,6671,10004:10006,10854,20001:20011,20199,21024:21045,21047:21061,21064:21065,21067,21068,21070:21076,22126:22181,22502:22505,22616,22618,22619,27984,40001:41253,41256,41258,41266,41267,41269:41273,41275:41278,41284:41286,42000:42013
......@@ -22,5 +23,5 @@ ID Category Variables
52 experience of pain 120000:120127
60 health dates 41257,41260,41262,41263,41268,41280:41283,42014:2:42020,42026,42030,42032,130004,130008,130014:2:130022,130062,130064,130070,130082,130104,130106,130134,130148,130174:2:130178,130184:2:130190,130194:2:130200,130202,130212,130216,130218,130224:2:130230,130254,130264,130310,130320,130336:2:130344,130622:2:130626,130632,130634,130642,130646,130648,130656:2:130660,130664,130666,130670,130686,130688,130696:2:130708,130714,130718,130722:2:130726,130734:2:130738,130770,130774,130784,130792,130804,130814,130818,130820,130826:2:130832,130836:2:130842,130846,130848,130852,130854,130868,130874,130890:2:130898,130902:2:130910,130914:2:130924,130932,130944,130998,131000,131022,131030,131032,131036,131038,131042,131046,131048,131052:2:131056,131060:2:131066,131070:2:131078,131082:2:131088,131092,131102:2:131110,131114,131118,131124:2:131132,131136,131138,131142,131144,131148,131150,131154,131158,131160,131164,131166,131174,131178:2:131186,131190,131192,131196,131198,131202,131204,131208:2:131216,131220:2:131224,131228,131230,131234,131236,131242:2:131246,131250,131252,131256:2:131264,131270,131280,131282,131286,131290,131296:2:131300,131304:2:131310,131314,131316,131322,131324,131328,131330,131338,131342:2:131356,131360:2:131370,131374,131378:2:131392,131396,131400:2:131410,131414,131416,131422:2:131432,131436,131440:2:131446,131450,131456,131458,131462:2:131484,131488:2:131494,131498,131502,131518,131524,131528,131534,131538,131540,131546,131548,131554,131556,131560:2:131586,131590:2:131594,131598:2:131604,131608:2:131620,131624:2:131654,131658,131662,131666:2:131670,131674:2:131684,131688:2:131692,131698:2:131708,131716,131720,131722,131726:2:131730,131734:2:131742,131746,131748,131754,131760,131766,131768,131774,131778,131782,131788:2:131798,131802:2:131806,131810,131812,131820:2:131826,131830,131834,131836,131840,131848:2:131852,131858:2:131864,131868:2:131888,131892,131894,131898,131900,131904,131906,131910:2:131914,131916,131918,131922:2:131930,131934,131938
:2:131942,131946:2:131950,131954:2:131964,131970:2:131980,131986:2:131994,132002,132004,132008,132014,132016,132020,132022,132030:2:132038,132042,132050,132054:2:132058,132062:2:132066,132070:2:132078,132082:2:132092,132096:2:132112,132116,132118,132122,132124,132128:2:132152,132156,132160:2:132170,132174,132186,132188,132192:2:132196,132202,132206,132212,132216,132220,132224,132230,132232,132238:2:132244,132248:2:132252,132256,132260:2:132264,132268,132274:2:132280,132288,132298,132312,132468,132500,132510,132522,132532,132536,132542,132562,132574
70 health sources 42015:2:42019,42027,42031,42033,130005,130009,130015:2:130023,130063,130065,130071,130083,130105,130107,130135,130149,130175:2:130179,130185:2:130191,130195:2:130201,130203,130213,130217,130219,130225:2:130231,130265,130311,130321,130337,130341:2:130345,130623:2:130627,130633,130635,130643,130647,130649,130657:2:130661,130665,130667,130671,130687,130689,130697:2:130709,130715,130719,130723:2:130727,130735,130737,130739,130771,130775,130785,130793,130805,130815,130819,130821,130827:2:130833,130839,130843,130847,130849,130853,130855,130869,130875,130891:2:130899,130903:2:130911,130915:2:130925,130933,130999,131001,131023,131031,131033,131037,131039,131043,131047,131049,131053,131055,131057,131061:2:131067,131071:2:131079,131083:2:131089,131093,131103:2:131111,131115,131119,131125,131129,131131,131133,131137,131139,131143,131145,131149,131151,131155,131159,131161,131165,131167,131175,131179:2:131187,131191,131193,131197,131199,131203,131205,131209:2:131217,131223,131225,131229,131231,131237,131243:2:131247,131251,131253,131257:2:131265,131271,131281,131283,131287,131291,131297,131299,131305,131307,131309,131311,131315,131317,131323,131325,131329,131331,131339,131343:2:131357,131361:2:131371,131375,131381:2:131393,131397,131401,131403,131407:2:131411,131415,131417,131423:2:131433,131437,131441:2:131447,131451,131457,131459,131463:2:131485,131489:2:131495,131499,131503,131519,131525,131529,131535,131539,131541,131547,131549,131555,131557,131561,131563,131565:2:131587,131591,131593,131595,131599:2:131605,131609:2:131621,131625:2:131655,131659,131663,131667:2:131671,131675:2:131685,131689:2:131693,131699:2:131709,131717,131721,131723,131727:2:131731,131735:2:131743,131747,131749,131755,131761,131767,131769,131775,131779,131783,131791:2:131799,131803,131805,131807,131811,131813,131821:2:131827,131831,131835,131837,131841,131849,131851,131859:2:131865,131869:2:131889,131893,131895,131899,131901,131905,131907,131911,131913:2:131919,131923:2:131931,131935,131
939,131941,131943,131947,131949,131951,131955:2:131965,131971:2:131981,131987:2:131995,132003,132005,132009,132015,132017,132021,132023,132031:2:132039,132043,132051,132055,132057,132059,132063:2:132067,132071:2:132079,132083:2:132093,132097:2:132109,132111,132113,132117,132119,132123,132125,132129:2:132153,132157,132161:2:132171,132175,132187,132189,132193:2:132197,132203,132207,132213,132217,132221,132225,132245,132265,132269,132275:2:132281,132289,132299,132469,132501,132511,132523,132533,132537,132543,132563,132575,132313
98 pending 41259,41261,41264,42038:42040
99 miscellaneous 19,21,35:45,53:55,68,96,120,200,393,757,1647,2129,3060,3061,3066,3077,3081:3082,3090,3132,3137,3166,4081,4093,4096,4186,4206,4238,4248,4257,4286,4288:4289,4293,4295,5074,5075,5080,5081,5214,5253,5270,5987:5988,5991,6023,6025,6334,6448,6459,6470,6481,6492,6503,6514,6525,6536,6547,10145,10697,12139:12141,12148,12187,12188,12223,12224,12253,12254,12291,12323,12623,12624,12651:12654,12658,12663,12664,12671,12688,12695,12699,12700,12704,12706,12848,12851,12854,20012:20014,20024:20025,20031:20032,20035,20041:20051,20053,20054,20058:20059,20061:20062,20072,20077:20081,20083,20114:20115,20158,20201:20227,20249:20254,20259,20260,20263,20400,20750,20751,21003,21011:21018,21023,21069,21611,21621,21622,21625,21631,21634,21642,21651,21661:21666,21671,21711,21721:21723,21725,21731:21734,21736,21738,21741,21742,21751,21761:21766,21771,21811,21821:21823,21825,21831:21834,21836,21838,21841:21842,21851,21861:21866,21871,22499,22500,22600:22603,22617,22660:22664,23048,23160:23164,23650,23762,23774,23775,25747:25753,30001:10:30301,30002:10:30102,30122:10:30162,30182:10:30232,30262:10:30302,30003:10:30303,30004:10:30094,30124:10:30164,30184:10:30244,30264:10:30304,30354,30502:10:30522,30532,30601:10:30891,30602:10:30742,30762:10:30892,30605:10:30895,30666,30796,30806,30826,30856,30897,40000,41289,41290,90001,90004,105010,105030,110001,110002,110005,110006,110008,120128
98 pending, to sort out categories later 41259,41261,41264,42038:42040
99 misc, ignored 19,21,35:45,53:55,68,96,120,200,393,757,1647,2129,3060,3061,3066,3077,3081:3082,3090,3132,3137,3166,4081,4093,4096,4186,4206,4238,4248,4257,4286,4288:4289,4293,4295,5074,5075,5080,5081,5214,5253,5270,5987:5988,5991,6023,6025,6334,6448,6459,6470,6481,6492,6503,6514,6525,6536,6547,10145,10697,12139:12141,12148,12187,12188,12223,12224,12253,12254,12291,12323,12623,12624,12651:12654,12658,12663,12664,12671,12688,12695,12699,12700,12704,12706,12848,12851,12854,20012:20014,20024:20025,20031:20032,20035,20041:20054,20058:20059,20061:20062,20072,20077:20081,20083,20114:20115,20158,20201:20227,20249:20254,20259,20260,20263,20400,20750,20751,21003,21011:21018,21023,21069,21611,21621,21622,21625,21631,21634,21642,21651,21661:21666,21671,21711,21721:21723,21725,21731:21734,21736,21738,21741,21742,21751,21761:21766,21771,21821:21823,21825,21831:21834,21836,21838,21841:21842,21851,21861:21866,21871,22499,22500,22600:22603,22617,22660:22664,23048,23160:23164,23650,23762,23774,23775,25747:25753,30001:10:30301,30002:10:30302,30003:10:30303,30004:10:30304,30354,30502:10:30522,30532,30601:10:30891,30602:10:30742,30762:10:30892,30605:10:30895,30666,30796,30806,30826,30856,30897,40000,41289,41290,90001,90004,105010,105030,110001,110002,110005,110006,110008,120128
......@@ -5,26 +5,27 @@
# FMRIB-curated categories (see funpack/configs/fmrib/categories.tsv)
#
category 1
category 2
category 3
category 10
category 11
category 12
category 13
category 14
category 20
category 21
category 22
category 23
category 24
category 25
category 26
category 30
category 32
category 50
category 51
category 60
category 70
category 98
category 99
exclude_category 0
category 1
category 2
category 3
category 10
category 11
category 12
category 13
category 14
category 20
category 21
category 22
category 23
category 24
category 25
category 26
category 30
category 32
category 50
category 51
category 60
category 70
category 98
category 99
......@@ -71,6 +71,8 @@ def importData(fileinfo,
subjects=None,
subjectExprs=None,
exclude=None,
excludeVariables=None,
excludeCategories=None,
trustTypes=False,
mergeAxis=None,
mergeStrategy=None,
......@@ -90,73 +92,79 @@ def importData(fileinfo,
3. Creates and returns a :class:`DataTable`.
:arg fileinfo: :class:`.FileInfo` object describing the input
file(s).
:arg fileinfo: :class:`.FileInfo` object describing the input
file(s).
:arg vartable: The data coding table
:arg vartable: The data coding table
:arg proctable: The processing table
:arg proctable: The processing table
:arg cattable: The category table
:arg cattable: The category table
:arg variables: List of variable IDs to import
:arg variables: List of variable IDs to import
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg colnames: List of names/glob-style wildcard patterns
specifying columns to import.
:arg excludeColnames: List of column name suffixes specifying columns
to exclude.
:arg excludeColnames: List of column name suffixes specifying columns
to exclude.
:arg categories: List of category names to import
:arg categories: List of category names/IDs to import
:arg subjects: List of subjects to include
:arg subjects: List of subjects to include
:arg subjectExprs: List of subject inclusion expressions
:arg subjectExprs: List of subject inclusion expressions
:arg exclude: List of subjects to exclude
:arg exclude: List of subjects to exclude
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg excludeVariables: List of variables to exclude
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg excludeCategories: List of category names/IDs to exclude
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg trustTypes: If ``True``, it is assumed that columns with a
known data type do not contain any bad/unparseable
values. This improves performance, but will cause
an error if the assumption does not hold.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg mergeAxis: Merging axis to use when loading multiple data
files - see the :func:`mergeData` function.
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg mergeStrategy: Merging strategy to use when loading multiple
data files - see the :func:`mergeData` function.
:arg njobs: Number of processes to use for parallelising tasks.
:arg indexVisits: Re-arrange the data so that rows are indexed by
subject ID and visit, rather than visits being
split into separate columns. Only applied to
variables which are labelled with Instancing 2.
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
:arg dropNaRows: If ``True``, rows which do not contain data for any
columns are not loaded.
:arg dryrun: If ``True`` the data is not loaded.
:arg njobs: Number of processes to use for parallelising tasks.
:returns: A tuple containing:
:arg mgr: :class:`multiprocessing.Manager` object for
parallelisation
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
:arg dryrun: If ``True`` the data is not loaded.
- A list of :class:`.Column` objects that were not
loaded from each input file.
:returns: A tuple containing:
- A :class:`DataTable`, which contains references
to the data, and the variable and processing
tables.
- A list of :class:`.Column` objects that were not
loaded from each input file.
"""
variables = filter.restrictVariables(cattable, variables, categories)
variables, excludevars = filter.restrictVariables(
cattable, variables, categories, excludeVariables, excludeCategories)
# Figure out which columns to load
cols, drop = filter.columnsToLoad(fileinfo,
vartable,
variables,
excludevars,
colnames,
excludeColnames)
......
......@@ -15,6 +15,10 @@ import fnmatch
import logging
import collections
from typing import Sequence, Union
import pandas as pd
import funpack.expression as expression
import funpack.loadtables as loadtables
......@@ -37,19 +41,32 @@ def _ispattern(s):
return any([c in s for c in '*?['])
def restrictVariables(cattable, variables, categories):
"""Determines which variables should be loaded (and the order
they should appear in the output) from the given sequences of
``variables`` and ``categories``.
def restrictVariables(
cattable : pd.DataFrame,
variables : Sequence[int] = None,
categories : Sequence[Union[str, int]] = None,
excludeVariables : Sequence[int] = None,
excludeCategories : Sequence[Union[str, int]] = None
) -> Union[None, Sequence[int]]:
"""Determines which variables should be loaded (and the order they should
appear in the output), and which variables should be excluded, from the
given sequences of ``variables``, ``categories``, and ``excludeVariables``
and ``excludeCategories``.
If neither ``variables`` nor ``categories`` are provided, the first returned
value is ``None``, indicating that all variables should be loaded.
:arg cattable: The category table
:arg variables: List of variable IDs to import.
:arg categories: List of category names or IDs to import.
:arg excludeVariables: List of variable IDs to exclude.
:arg excludeCategories: List of category names or IDs to exclude.
:arg cattable: The category table
:arg variables: List of variable IDs to import. May be ``None``.
:arg categories: List of category names to import. May be ``None``.
:returns: Sequence of variables to load, or ``None`` if all
variables should be loaded.
:returns: A tuple containing:
- a sequence of variables to load, or ``None`` if
all variables should be loaded.
- a sequence of variables to exclude, which is empty
if no variables should be excluded.
"""
# Build a list of all the variables we
......@@ -63,12 +80,25 @@ def restrictVariables(cattable, variables, categories):
catvars = loadtables.categoryVariables(cattable, categories)
variables = variables + [c for c in catvars if c not in variables]
return variables
exclude = []
if excludeVariables is not None:
exclude = list(excludeVariables)
if excludeCategories is not None:
catvars = loadtables.categoryVariables(cattable, excludeCategories)
exclude = exclude + catvars
if variables is not None:
variables = [v for v in variables if v not in exclude]
return variables, exclude
def columnsToLoad(fileinfo,
vartable,
variables,
exclude=None,
colnames=None,
excludeColnames=None):
"""Determines which columns should be loaded from ``datafiles``.
......@@ -83,6 +113,8 @@ def columnsToLoad(fileinfo,
:arg variables: List of variables to load.
:arg exclude: List of variables to exclude.
:arg colnames: List of column names/glob-style wildcard patterns,
specifying columns to load.
......@@ -100,8 +132,8 @@ def columnsToLoad(fileinfo,
*ignore*.
"""
if excludeColnames is None:
excludeColnames = []
if exclude is None: exclude = []
if excludeColnames is None: excludeColnames = []
# We apply these cleaning steps by
# omitting the relevant columns.
......@@ -150,9 +182,11 @@ def columnsToLoad(fileinfo,
load[col.datafile].append(col)
continue
# excludeColnames takes precedence
# over all other column selection
# mechanisms
# exclude/excludeColnames take precedence
# over all other column selection mechanisms
if vid in exclude:
drop.extend(cols)
continue
for suf in excludeColnames:
for col in list(cols):
if col.name.endswith(suf):
......
......@@ -208,10 +208,8 @@ def doImport(args, mgr):
prependProcess=args.prepend_process,
appendProcess=args.append_process)
subjects, exprs = args.subject
variables = args.variable
categories = args.category
columns = args.column
if suffix is None: excludeColnames = []
else: excludeColnames = [suffix]
......@@ -222,13 +220,15 @@ def doImport(args, mgr):
vartable=vartable,
proctable=proctable,
cattable=cattable,
variables=variables,
colnames=columns,
variables=args.variable,
colnames=args.column,
excludeColnames=excludeColnames,
categories=categories,
categories=args.category,
subjects=subjects,
subjectExprs=exprs,
exclude=args.exclude,
excludeVariables=args.exclude_variable,
excludeCategories=args.exclude_category,
trustTypes=args.trust_types,
mergeAxis=args.merge_axis,
mergeStrategy=args.merge_strategy,
......
......@@ -185,9 +185,10 @@ def removeIfSparse(
def removeIfRedundant(dtable : datatable.DataTable,
vids : List[int],
corrthres : float,
nathres : Optional[float] = None,
pairwise : Optional[bool] = False,
skipUnknowns : Optional[bool] = False):
nathres : float = None,
skipUnknowns : bool = False,
precision : str = None,
pairwise : bool = False):
"""removeIfRedundant(corrthres, [nathres])
Removes columns deemed to be redundant.
......@@ -207,6 +208,12 @@ def removeIfRedundant(dtable : datatable.DataTable,
The ``skipUnknowns`` option defaults to ``False``. If it is set to
``True``, columns which are deemed to be redundant with respect to an
unknown or uncategorised column are **not** dropped.
The ``precision`` option can be set to either ``'double'`` (the default)
or ``'single'`` - this controls whether 32 bit (single) or 64 bit (double)
precision floating point is used for the correlation calculation. Double
precision is recommended, as the correlation calculation algorithm can
be unstable for data with large values (>10e5).
"""
# :arg pairwise: Use alternative pairwise implementation. If ``pairwise``
# is ``True``, an alternative implementation is used which
......@@ -226,7 +233,7 @@ def removeIfRedundant(dtable : datatable.DataTable,
dtable, data, corrthres, nathres)
else:
redundant = _removeIfRedundant(
dtable, data, corrthres, nathres)
dtable, data, corrthres, nathres, precision)
redundant = util.dedup(sorted(redundant))
......@@ -253,7 +260,7 @@ def removeIfRedundant(dtable : datatable.DataTable,
return [cols[r[0]] for r in redundant]
def _removeIfRedundant(dtable, data, corrthres, nathres=None):
def _removeIfRedundant(dtable, data, corrthres, nathres=None, precision=None):
"""Default fast implementation of redundancy check. Used when the
``pairwise`` option to :func:`removeIfRedundant` is ``False``.
......@@ -262,12 +269,13 @@ def _removeIfRedundant(dtable, data, corrthres, nathres=None):
:arg corrthres: Correlation threshold - see :func:`.redundantColumns`.
:arg nathres: Missingness correlation threshold - see
:func:`.redundantColumns`.
:arg precision: Floating point precision - ``'single'`` or ``'double'``.
:returns: Sequence of tuples of column indices, where each tuple
``(a, b)`` indicates that column ``a`` is redundant with
respect to column ``b``.
"""
return core.matrixRedundantColumns(data, corrthres, nathres)
return core.matrixRedundantColumns(data, corrthres, nathres, precision)
def _pairwiseRemoveIfRedundant(dtable, data, corrthres, nathres=None):
......
......@@ -267,7 +267,8 @@ def pairwiseRedundantColumns(
def matrixRedundantColumns(
data : pd.DataFrame,
corrthres : float,
nathres : Optional[float] = None) -> List[Tuple[int, int]]:
nathres : float = None,
precision : str = None) -> List[Tuple[int, int]]:
"""Identifies redundant columns based on their correlation with each
other using dot products to calculate a correlation matrix.
......@@ -281,6 +282,12 @@ def matrixRedundantColumns(
``corrthres`` *and* a missing-value correlation greater
than ``nathres`` to be identified as redundant.
:arg precision: ``'double'`` (the default) or ``'single'``, specifying
the floating point precision to use. Note that the
algorithm used to calculate the correlation values is
unstable for data with a range larger than ~10e5, so
double precision is used as default.
:returns: Sequence of tuples of column indices, where each tuple
``(a, b)`` indicates that column ``a`` is redundant with
respect to column ``b``.
......@@ -289,11 +296,17 @@ def matrixRedundantColumns(
if len(data.columns) < 2:
return []
if precision == 'single':
dtype = np.float32
elif precision in (None, 'double'):
dtype = np.float64
else:
raise ValueError(f'Invalid precision: {precision}')
# Keep a copy of the column names for logging.
# Create a 2D matrix containing all data, using
# float32 to limit memory consumption.
# Create a 2D matrix containing all data
columns = data.columns
data = data.to_numpy(dtype=np.float32, copy=True)
data = data.to_numpy(dtype=dtype, copy=True)
namask = np.isnan(data)
nacounts = namask.sum(axis=0)
......@@ -309,7 +322,7 @@ def matrixRedundantColumns(
data[namask] = 0
# p=present elements
namask = (~namask).astype(np.float32)
namask = (~namask).astype(dtype)
Ap = Bp = namask
A = B = data
......
......@@ -31,7 +31,7 @@
"\n",
"\n",
"**Important** The examples in this notebook assume that you have installed\n",
"FUNPACK 3.1.1 or newer.\n",
"FUNPACK 3.2.0 or newer.\n",
"\n",
"\n",
"> **Note:** The `fmrib_unpack` command was called `funpack` in older versions\n",
......
%% Cell type:markdown id: tags:
# FUNPACK overview
![win logo](attachment:win.png)
> **Note:** If you have FUNPACK installed, you can start an interactive
> version of this page by running `fmrib_unpack_demo`.
FUNPACK is a command-line program which you can use to extract data from UK
BioBank (and other tabular) data. You can run FUNPACK by calling the
`fmrib_unpack` command.
You can give FUNPACK one or more input files (e.g. `.csv`, `.tsv`), and it
will merge them together, perform some preprocessing, and produce a single
output file.
A large number of rules are built into FUNPACK which are specific to the UK
BioBank data set. But you can control and customise everything that FUNPACK
does to your data, including which rows and columns to extract, and which
cleaning/processing steps to perform on each column.