Commit d5c3d3b5 authored by Paul McCarthy 🚵
Browse files

Merge branch 'rf/navalue-types' into 'master'

Update cleaning options to support non-numeric data

Closes #23

See merge request !80
parents 012a9748 55189eb4
......@@ -2,6 +2,21 @@ FUNPACK changelog
=================
2.7.0 (Friday 14th May 2021)
----------------------------
Changed
^^^^^^^
* The ``--na_values``, ``--child_values`` and ``--recoding`` options can now
be applied to non-numeric data fields.
* Updated internal data field and data coding definitions to the latest
versions from the UK Biobank showcase.
* Updates to FMRIB categories and processing rules.
2.6.0 (Monday 29th March 2021)
------------------------------
......
......@@ -6,7 +6,7 @@
#
__version__ = '2.6.0'
__version__ = '2.7.0'
"""The ``funpack`` versioning scheme roughly follows Semantic Versioning
conventions.
"""
......
......@@ -92,10 +92,12 @@ def applyNAInsertion(dtable):
if not dtable.present(vid):
continue
columns = dtable.columns(vid)
dtype = dtable[:, columns[0].name].dtype
values = np.array(vtable['NAValues'][vid]).astype(dtype)
navals = {v : np.nan for v in values}
for col in dtable.columns(vid):
series = dtable[:, col.name]
values = vtable['NAValues'][vid].astype(series.dtype)
navals = {v : np.nan for v in values}
dtable[:, col.name] = dtable[:, col.name].replace(navals)
......@@ -160,10 +162,10 @@ def _runChildValues(dtable, exprs, cvals, vid):
# present for dependent and parent variables.
# Replacement on child variables for which
# this assumption does not hold is skipped.
expr = exprs[ vid]
cval = cvals[ vid]
visits = dtable.visits( vid)
instances = dtable.instances(vid)
expr = exprs[ vid]
cval = np.array(cvals[ vid])
visits = dtable.visits( vid)
instances = dtable.instances(vid)
for visit, instance in it.product(visits, instances):
......@@ -186,7 +188,7 @@ def _runChildValues(dtable, exprs, cvals, vid):
# And there should only be one
# variable for a given
# (vid, visit, instance)
if any([len(pc) != 1 for pc in pcols]):
if any(len(pc) != 1 for pc in pcols):
continue
except KeyError:
......@@ -221,7 +223,8 @@ def _runChildValues(dtable, exprs, cvals, vid):
mask = mask & dtable[:, colname].isna()
# Finally we apply it to the data.
dtable[mask, colname] = cval[idxs[mask]]
dtype = dtable[:, colname].dtype
dtable[mask, colname] = cval[idxs[mask]].astype(dtype)
def applyChildValues(dtable):
......@@ -283,12 +286,33 @@ def applyNewLevels(dtable):
continue
for col in dtable.columns(vid):
old = dtable[:, col.name]
valmap = dict(zip(rawlevels[vid].astype(old.dtype),
newlevels[vid]))
new = old.replace(valmap)
corr = old.corr(new)
dtable[:, col.name] = new
old = dtable[:, col.name]
# Make sure raw/new levels are numeric.
# We should support re-coding integer
# values into decimal values, so new
# levels are coerced to floating point
if np.issubdtype(old.dtype, np.number):
valmap = dict(zip(np.array(rawlevels[vid]).astype(old.dtype),
np.array(newlevels[vid]).astype(np.float64)))
# Otherwise (text, date, etc) assume
# that raw/new levels are already of
# the correct type
else:
valmap = dict(zip(rawlevels[vid], newlevels[vid]))
new = old.replace(valmap)
# For numeric columns, check to see if
# the recoding has caused the data to
# become inversely correlated with its
# un-re-encoded version, and add an
# "inverted" flag accordingly. This
# will end up in the summary file (see
# --write_summary)
if np.issubdtype(old.dtype, np.number):
if old.corr(new) < 0:
dtable.addFlag(col, 'inverted')
if corr < 0:
dtable.addFlag(col, 'inverted')
dtable[:, col.name] = new
......@@ -915,26 +915,23 @@ def _prepareCleaningSelectors(args):
:arg args: ``argparse.Namespace`` object.
"""
def numlist(s):
return np.fromstring(s, sep=',', dtype=np.float)
# convert na_values from a sequence of [(varid, str)]
# pairs into a dict of {varid : [value]} mappings
# convert na_values from a sequence of [(varid, values)]
# pairs into a dict of {varid : values} mappings
if args.na_values is not None:
args.na_values = {int(vid) : numlist(values)
for vid, values in args.na_values}
args.na_values = {int(vid) : values for vid, values in args.na_values}
# Convert recoding from a sequence of [(varid, rawlevels, newlevels)]
# tuples to a dict of {varid : (rawlevels, newlevels)} mappings
if args.recoding is not None:
args.recoding = {int(vid) : (numlist(rawlevels), numlist(newlevels))
args.recoding = {int(vid) : (rawlevels, newlevels)
for vid, rawlevels, newlevels in args.recoding}
# Convert child_values from a sequence of
# [(varid, exprs, values)] tuples to a dict of
# {varid : ([exprs], [values])} mappings
# {varid : (exprs, values)} mappings
if args.child_values is not None:
args.child_values = {int(vid) : (exprs, numlist(values))
args.child_values = {int(vid) : (exprs, values)
for vid, exprs, values in args.child_values}
# convert clean from a sequence of [(varid, expr)]
......
ID Category Variables
1 age, sex, brain MRI protocol, Phase 31,34,21022,22200,25780
2 genetics 21000,22000:22125,22201:22325,22182,22800:22823
2 genetics 21000,22000:22125,22190,22191,22194,22201:22325,22182,22800:22823
3 early life factors 52,129,130,1677,1687,1697,1737,1767,1777,1787,21066,20022
10 lifestyle and environment - general 3:6,132,189,670,680,699,709,728,738,767,777,1031,1797,1807,1835,1845,1873,1883,2139,2149,2159,2237,2375,2385,2395,2405,2267,2277,2714:10:2834,2946,3526,3536,3546,3581,3591,3659,3669,3700,3710,3720,3829,3839,3849,3872,3882,3912,3942,3972,3982,4501,4674,4825,4836,5057,6138,6142,6139:6141,6145:6146,6160,10016,10105,10114,10721,10722,10740,10749,10860,10877,10886,20074:20075,20110:20113,20118:20119,20121,22501,22599,22606,22700,22702,22704,24003:24020,24024,24500:24508,26410:26434
10 lifestyle and environment - general 3:6,132,189,670,680,699,709,728,738,767,777,1031,1797,1807,1835,1845,1873,1883,2139,2149,2159,2237,2375,2385,2395,2405,2267,2277,2714:10:2834,2946,3526,3536,3546,3581,3591,3659,3669,3700,3710,3720,3829,3839,3849,3872,3882,3912,3942,3972,3982,4501,4674,4825,4836,5057,6138,6142,6139:6141,6145:6146,6160,10016,10105,10114,10721,10722,10740,10749,10860,10877,10886,20074:20075,20107,20110:20113,20118:20119,20121,22501,22599,22606,22700,22702,22704,24003:24021,24024,24500:24508,26410:26434
11 lifestyle and environment - exercise and work 1001,1011,796,806,816,826,845,864,874,884,894,904,914,924,943,971,981,991,1021,1050:10:1220,2624,2634,3426,3637,3647,6143,6162,6164,10953,10962,10971,22604,22605,22607:22615,22620,22630,22631,22640:22655,104900,104910,104920
12 lifestyle and environment - food and drink 1289:10:1389,1408:10:1548,2654,3089,3680,6144,10007,10723,10767,10776,10855,10912,20084:20094,20098:20109,100001:100009,100011:100019,100021:100025,100010:10:100560,100760:10:104670
12 lifestyle and environment - food and drink 1289:10:1389,1408:10:1548,2654,3089,3680,6144,10007,10723,10767,10776,10855,10912,20084:20094,20098:20106,20108:20109,100001:100009,100011:100019,100021:100025,100010:10:100560,100760:10:104670
13 lifestyle and environment - alcohol 1558:10:1628,2664,3731,3859,4407,4418,4429,4440,4451,4462,5364,10818,20095:20097,20117,20403:20410,20414:20416,100580:10:100740
14 lifestyle and environment - tobacco 1239:10:1279,2644,2867:10:2907,2926,2936,3159,3436:10:3506,5959,6157,6158,6183,6194,10115,10827,10895,20116,20160:20162,22506:22508
20 physical measures - general 46:51,1707,1717,1727,1747,1757,2306,3059,3062:3065,3088,3160,10691,10693:10696,10714,10717,12143:12144,20015,20255:20258,21001,21002,22400:22414,22427,23098:23130,23244:23289
21 physical measures - bone density and sizes 77,78,3083:3086,3143:3144,3146:3148,4092,4095,4100:4101,4103:4106,4119:4120,4122:4125,4138:4147,23200:23243,23290:23320
22 physical measures - cardiac & blood vessels 93:95,102,4079,4080,4136,4194:4196,4198:4200,4204:4205,4207,5983,5984,5986,5992,5993,6014:6017,6019,6020,6022,6024,6032:6034,6038,6039,12673:12687,12336,12338,12340,12697,12698,12702,21021,22330:22338,22420:22426,22670:22685
23 hearing test 4229:4230,4232:4237,4239:4247,4249,4268:4270,4272,4275:4277,4279,4849,10793,20019,20021,20060
24 eye test 5076:5079,5082:5091,5096:5119,5132:5136,5138:5149,5152,5155:5164,5181:5183,5186,5188,5190,5193,5198:5199,5201,5202,5204,5206,5208,5209,5211,5215,5221,5237,5251,5254:5259,5261:5267,5273,5274,5276,5292,5306,5324:5328,6070:6075,20052,20055,20261:20262
24 eye test 5076:5079,5082:5091,5096:5119,5132:5136,5138:5149,5152,5155:5164,5181:5183,5186,5188,5190,5193,5198:5199,5201,5202,5204,5206,5208,5209,5211,5215,5221,5237,5251,5254:5259,5261:5267,5273,5274,5276,5292,5306,5324:5328,6070:6075,20052,20055,20057,20261:20262
25 physical activity measures 5985,90002:90003,90010:90013,90015:90177,90179:90195
26 abdominal measures 22415:22416
30 blood assays 74,23000:23044,23049:23060,23062,23063,23065:23071,23073:23075,30000:10:30300,30104,30112,30114,30172,30174,30242,30252,30254,30314:10:30344,30364:10:30424,30500:10:30530,30600:10:30890
31 brain IDPs 25000:25746,25754:25759,25761:25768,25781:25920,26500:26514,26517:26518,26520:27772
32 cognitive phenotypes 62,111,396:404,630,4250:4256,4258:4260,4281:4283,4285,4287,4290:4292,4294,4924,4935,4957,4968,4979,4990,5001,5012,5556,5699,5779,5790,5866,6312,6332,6333,6348:6351,6362,6373,6374,6382,6383,6671,6770:6773,10133:10134,10136:10144,10146:10147,10241,10609:10610,10612,20016,20018,20023,20082,20128:20157,20159,20165,20167,20169:2:20195,20196:2:20200,20229,20230,20240,20242,20244:20248,21004,23321:23324
50 health and medical history, health outcomes 84,87,92,134:137,2178,2188,2207,2217,2227,2247,2257,2296,2316,2335:10:2365,2415,2443:10:2473,2492,2674,2684,2694,2704,2844,2956:10:2986,3005,3079,3140,3393,3404,3414,3571,3606,3616,3627,3741,3751,3761,3773,3786,3799,3809,3894,3992,4012,4022,4041,4056,4067,4689,4700,4717,4728,4792,4803,4814,5408,5419,5430,5441,5452,5463,5474,5485,5496,5507,5518,5529,5540,5610,5832,5843,5855,5877,5890,5901,5912,5923,5934,5945,6119,6147,6148,6149,6150,6151,6152,6153,6154,6155,6159,6177,6179,6205,10004:10006,10854,20001:20011,20199,21024:21045,21047:21061,21064:21065,21067,21068,21070:21076,22126:22181,22502:22505,22616,22618,22619,40001:41253,41256,41258,41266,41267,41269:41273,41275:41278,41284:41286,42000:42013
30 blood assays 74,23000:23044,23049:23060,23062,23063,23065:23071,23073:23075,23400:23407,23409:23412,23414,23416,23419,23421:23434,23437,23438,23440,23442:23481,23484,23491,23492,23499,23501:23503,23505:23509,23511,23512,23515,23516,23519,23522:23530,23537,23538,23542,23544:23546,23548:23551,23556:23558,23564,23568,23571:23574,23576:23578,30000:10:30300,30104,30112,30114,30172,30174,30242,30252,30254,30314:10:30344,30364:10:30424,30500:10:30530,30600:10:30890
31 brain IDPs 25000:25746,25754:25759,25761:25768,25781:25920,26500:27772
32 cognitive phenotypes 62,111,396:404,630,4250:4256,4258:4260,4281:4283,4285,4287,4290:4292,4294,4924,4935,4957,4968,4979,4990,5001,5012,5556,5699,5779,5790,5866,6312,6332,6333,6348:6351,6362,6373,6374,6382,6383,6770:6773,10133:10134,10136:10144,10146:10147,10241,10609:10610,10612,20016,20018,20023,20082,20128:20157,20159,20165,20167,20169:2:20195,20196:2:20200,20229,20230,20240,20242,20244:20248,21004,23321:23324
50 health and medical history, health outcomes 84,87,92,134:137,2178,2188,2207,2217,2227,2247,2257,2296,2316,2335:10:2365,2415,2443:10:2473,2492,2674,2684,2694,2704,2844,2956:10:2986,3005,3079,3140,3393,3404,3414,3571,3606,3616,3627,3741,3751,3761,3773,3786,3799,3809,3894,3992,4012,4022,4041,4056,4067,4689,4700,4717,4728,4792,4803,4814,5408,5419,5430,5441,5452,5463,5474,5485,5496,5507,5518,5529,5540,5610,5832,5843,5855,5877,5890,5901,5912,5923,5934,5945,6119,6147,6148,6149,6150,6151,6152,6153,6154,6155,6159,6177,6179,6205,6671,10004:10006,10854,20001:20011,20199,21024:21045,21047:21061,21064:21065,21067,21068,21070:21076,22126:22181,22502:22505,22616,22618,22619,40001:41253,41256,41258,41266,41267,41269:41273,41275:41278,41284:41286,42000:42013
51 mental health self-report 1920:10:2110,4526,4537,4548,4559,4570,4581,4598,4609,4620,4631,4642,4653,5375,5386,5663,5674,6156,20122,20126:20127,20401,20411,20417:20423,20425:20429,20431:20442,20445:20450,20453:20460,20463,20465:20467,20470:20471,20473,20476,20477,20479:20484,20485:20502,20505:20544,20546:20551,20553:20554,21062:21063
60 health dates 41257,41260,41262,41263,41268,41280:41283,42014,42016,130004,130008,130014:2:130020,130062,130064,130070,130082,130106,130134,130174:2:130178,130184:2:130190,130194,130202,130216,130218,130224:2:130230,130264,130310,130320,130336,130338,130342,130344,130622,130624,130648,130656:2:130660,130664,130670,130686,130696:2:130708,130714,130718,130722,130726,130734,130736,130770,130774,130792,130814,130818,130820,130826,130828,130832,130854,130868,130892:2:130898,130902:2:130910,130914,130918,130922,130924,130998,131000,131022,131030,131032,131042,131046,131048,131052:2:131056,131060:2:131064,131070:2:131076,131086,131102,131114,131118,131124,131128:2:131132,131136,131138,131142,131144,131148,131150,131154,131158,131164,131166,131178:2:131186,131190,131192,131196,131198,131204,131208:2:131212,131216,131222,131224,131228,131230,131234,131236,131242,131252,131256:2:131264,131270,131282,131286,131296,131298,131304:2:131308,131314,131316,131322,131324,131338,131342,131344,131348:2:131356,131360,131366:2:131370,131374,131382,131386,131390,131392,131396,131402,131404,131408,131410,131414,131416,131424:2:131432,131436,131442,131456,131458,131462:2:131476,131480:2:131484,131490:2:131494,131498,131528,131534,131538,131540,131546,131548,131554,131556,131560:2:131586,131590:2:131594,131598:2:131604,131608,131612:2:131620,131624:2:131654,131666:2:131670,131674:2:131684,131688,131692,131698:2:131708,131720,131722,131726,131730,131734:2:131742,131746,131748,131754,131760,131768,131774,131778,131782,131790:2:131798,131802:2:131806,131810,131812,131822:131826,131830,131836,131850,131852,131858,131864,131868:2:131888,131892,131894,131900,131906,131910:2:131914,131916,131918,131922:2:131928,131934,131938:2:131942,131946:2:131950,131954:2:131964,131970:2:131974,131980,131988:2:131992,132002,132008,132016,132020,132022,132030:2:132038,132042,132050,132054:2:132058,132062:2:132066,132070:2:132078,132082:2:132088,132092,132096:2:132106,132110,132112,132116,132118,132122,132124,13
2128:2:132152,132156,132160:2:132170,132186,132192,132194,132202,132206,132212,132216,132220,132224,132230,132238:2:132244,132250,132252,132260:2:132264,132268,132274:2:132280,132298,132522,132532,132542,132562,132574,132312
70 health sources 42015,42017,130005,130009,130015:2:130019,130063,130065,130071,130083,130107,130135,130175:130179,130185:2:130191,130195,130203,130217,130219,130225,130231,130265,130311,130337,130343,130345,130623,130625,130649,130657:2:130661,130665,130671,130687,130697:2:130709,130715,130719,130723,130727,130735,130737,130771,130775,130793,130815,130819,130821,130827,130829,130833,130855,130869,130893:2:130899,130903:2:130911,130915,130919,130923,130925,130999,131001,131023,131031,131033,131043,131047,131049,131053,131055,131057,131061,131063,131065,131071:2:131077,131087,131103,131115,131119,131125,131129,131131,131133,131137,131139,131145,131149,131151,131155,131159,131165,131167,131179:2:131187,131191,131193,131197,131199,131205,131209,131211,131213,131217,131223,131225,131229,131231,131237,131243,131253,131257:2:131265,131271,131283,131287,131297,131299,131305,131307,131309,131315,131317,131323,131325,131339,131343,131345,131349:131357,131361,131367:131371,131375,131383,131387,131391,131393,131397,131403,131409,131411,131415,131417,131425:2:131433,131437,131443,131457,131459,131463:2:131477,131481,131483,131485,131491:2:131495,131499,131529,131535,131539,131541,131547,131549,131555,131557,131561,131563,131565:2:131587,131591,131593,131595,131599:2:131605,131609,131613:2:131621,131625:2:131655,131667:2:131671,131675:2:131685,131689,131693,131701:2:131709,131727,131731,131735:2:131743,131747,131749,131755,131761,131769,131775,131779,131783,131793,131795,131797,131803,131805,131807,131811,131813,131823,131825,131827,131831,131837,131851,131859,131865,131869:2:131889,131893,131895,131901,131907,131911,131913:2:131919,131923:2:131929,131935,131939,131941,131943,131947,131949,131951,131955:2:131965,131971,131973,131975,131981,131989,131991,131993,132003,132009,132017,132021,132023,132031:2:132039,132043,132051,132055,132057,132059,132063:2:132067,132071:2:132079,132083:2:132089,132093,132097:2:132107,132111,132113,132117,132119,132123,132125,132129:2:132153,132157
,132161:2:132171,132187,132193,132195,132203,132207,132213,132217,132221,132225,132245,132265,132269,132275:2:132281,132299,132523,132533,132543,132563,132575,132313
60 health dates 41257,41260,41262,41263,41268,41280:41283,42014:2:42020,42026,42030,130004,130008,130014:2:130022,130062,130064,130070,130082,130104,130106,130134,130148,130174:2:130178,130184:2:130190,130194,130200,130202,130212,130216,130218,130224:2:130230,130254,130264,130310,130320,130336:2:130344,130622:2:130626,130632,130634,130642,130646,130648,130656:2:130660,130664,130666,130670,130686,130688,130696:2:130708,130714,130718,130722:2:130726,130734:2:130738,130770,130774,130784,130792,130804,130814,130818,130820,130826:2:130832,130836:2:130842,130846,130848,130852,130854,130868,130874,130890:2:130898,130902:2:130910,130914,130918:2:130924,130932,130944,130998,131000,131022,131030,131032,131036,131038,131042,131046,131048,131052:2:131056,131060:2:131066,131070:2:131078,131082:2:131088,131092,131102:2:131110,131114,131118,131124:2:131132,131136,131138,131142,131144,131148,131150,131154,131158,131160,131164,131166,131174,131178:2:131186,131190,131192,131196,131198,131202,131204,131208:2:131216,131220:2:131224,131228,131230,131234,131236,131242:2:131246,131250,131252,131256:2:131264,131270,131280,131282,131286,131290,131296:2:131300,131304:2:131310,131314,131316,131322,131324,131328,131330,131338,131342:2:131356,131360:2:131370,131374,131378:2:131392,131396,131400:2:131410,131414,131416,131422:2:131432,131436,131440:2:131446,131450,131456,131458,131462:2:131484,131488:2:131494,131498,131502,131518,131524,131528,131534,131538,131540,131546,131548,131554,131556,131560:2:131586,131590:2:131594,131598:2:131604,131608:2:131620,131624:2:131654,131658,131662,131666:2:131670,131674:2:131684,131688:2:131692,131698:2:131708,131720,131722,131726:2:131730,131734:2:131742,131746,131748,131754,131760,131766,131768,131774,131778,131782,131788:2:131798,131802:2:131806,131810,131812,131820:2:131826,131830,131834,131836,131840,131848:2:131852,131858:2:131864,131868:2:131888,131892,131894,131898,131900,131904,131906,131910:2:131914,131916,131918,131922:2:131930,131934,131938:2:13194
2,131946:2:131950,131954:2:131964,131970:2:131980,131986:2:131994,132002,132004,132008,132014,132016,132020,132022,132030:2:132038,132042,132050,132054:2:132058,132062:2:132066,132070:2:132078,132082:2:132092,132096:2:132112,132116,132118,132122,132124,132128:2:132152,132156,132160:2:132170,132174,132186,132188,132192:2:132196,132202,132206,132212,132216,132220,132224,132230,132232,132238:2:132244,132248:2:132252,132256,132260:2:132264,132268,132274:2:132280,132288,132298,132312,132468,132500,132510,132522,132532,132536,132542,132562,132574
70 health sources 42015:2:42019,42027,42031,130005,130009,130015:2:130023,130063,130065,130071,130083,130105,130107,130135,130149,130175:2:130179,130185:2:130191,130195,130201,130203,130213,130217,130219,130225:2:130231,130265,130311,130321,130337,130343,130345,130623:2:130627,130633,130635,130643,130647,130649,130657:2:130661,130665,130667,130671,130687,130689,130697:2:130709,130715,130719,130723:2:130727,130735,130737,130739,130771,130775,130785,130793,130805,130815,130819,130821,130827:2:130833,130839,130843,130849,130853,130855,130869,130875,130891:2:130899,130903:2:130911,130915,130919:2:130925,130933,130999,131001,131023,131031,131033,131037,131039,131043,131047,131049,131053,131055,131057,131061:2:131067,131071:2:131079,131083:2:131087,131093,131103:2:131111,131115,131119,131125,131129,131131,131133,131137,131139,131143,131145,131149,131151,131155,131159,131161,131165,131167,131175,131179:2:131187,131191,131193,131197,131199,131203,131205,131209:2:131217,131223,131225,131229,131231,131237,131243:2:131247,131251,131253,131257:2:131265,131271,131281,131283,131287,131291,131297,131299,131305,131307,131309,131311,131315,131317,131323,131325,131331,131339,131343:2:131357,131361:2:131371,131375,131381:2:131393,131397,131401,131403,131407:2:131411,131415,131417,131423:2:131433,131437,131441:2:131447,131451,131457,131459,131463:2:131485,131489:2:131495,131499,131503,131519,131525,131529,131535,131539,131541,131547,131549,131555,131557,131561,131563,131565:2:131587,131591,131593,131595,131599:2:131605,131609:2:131621,131625:2:131655,131659,131663,131667:2:131671,131675:2:131685,131689:2:131693,131699:2:131709,131721,131723,131727:2:131731,131735:2:131743,131747,131749,131755,131761,131767,131769,131775,131779,131783,131791:2:131799,131803,131805,131807,131811,131813,131821:2:131827,131831,131835,131837,131841,131849,131851,131859:2:131865,131869:2:131889,131893,131895,131899,131901,131905,131907,131911,131913:2:131919,131923:2:131931,131935,131939,131941,131943,131947
,131949,131951,131955:2:131965,131971:2:131981,131987:2:131995,132003,132005,132009,132015,132017,132021,132023,132031:2:132039,132043,132051,132055,132057,132059,132063:2:132067,132071:2:132079,132083:2:132093,132097:2:132109,132111,132113,132117,132119,132123,132125,132129:2:132153,132157,132161:2:132171,132175,132187,132189,132193:2:132197,132203,132207,132213,132217,132221,132225,132245,132265,132269,132275:2:132281,132289,132299,132469,132501,132511,132523,132533,132537,132543,132563,132575,132313
98 pending 41259,41261,41264,42038:42040
99 miscellaneous 19,21,35:45,53:55,68,96,120,200,393,757,1647,2129,3060,3061,3066,3077,3081:3082,3090,3132,3137,3166,4081,4093,4096,4206,4238,4248,4257,4286,4288:4289,4293,4295,5074,5075,5080,5081,5214,5253,5270,5987:5988,5991,6023,6025,6334,10145,10697,12139:12141,12148,12187,12188,12223,12224,12253,12254,12291,12323,12623,12624,12651:12654,12658,12663,12664,12671,12688,12695,12699,12700,12704,12706,12848,12851,12854,20012:20014,20024:20025,20031:20032,20035,20041:20054,20058:20059,20061:20062,20072,20077:20081,20083,20114:20115,20158,20201:20227,20249:20254,20259,20260,20263,20400,21003,21011:21018,21023,21069,21611,21621,21622,21625,21631,21634,21642,21651,21661:21666,21671,21711,21721:21723,21725,21731:21734,21736,21738,21741,21742,21751,21761:21766,21771,21811,21821:21823,21825,21831:21834,21836,21838,21841:21842,21851,21861:21866,21871,22499,22500,22600:22603,22617,22660:22664,23048,23160:23164,25747:25753,30001:10:30301,30002:10:30302,30003:10:30303,30004:10:30304,30354,30502:10:30522,30532,30601:10:30891,30615,30622,30635,30645,30665,30666,30692,30725,30755,30775,30795,30796,30805,30806,30825,30826,30835,30845,30855,30856,30875,30885,30895,30897,40000,90001,90004,105010,105030,110005,110006,110008
99 miscellaneous 19,21,35:45,53:55,68,96,120,200,393,757,1647,2129,3060,3061,3066,3077,3081:3082,3090,3132,3137,3166,4081,4093,4096,4186,4206,4238,4248,4257,4286,4288:4289,4293,4295,5074,5075,5080,5081,5214,5253,5270,5987:5988,5991,6023,6025,6334,10145,10697,12139:12141,12148,12187,12188,12223,12224,12253,12254,12291,12323,12623,12624,12651:12654,12658,12663,12664,12671,12688,12695,12699,12700,12704,12706,12848,12851,12854,20012:20014,20024:20025,20031:20032,20035,20041:20054,20058:20059,20061:20062,20072,20077:20081,20083,20114:20115,20158,20201:20227,20249:20254,20259,20260,20263,20400,21003,21011:21018,21023,21069,21611,21621,21622,21625,21631,21634,21642,21651,21661:21666,21671,21711,21721:21723,21725,21731:21734,21736,21738,21741,21742,21751,21761:21766,21771,21811,21821:21823,21825,21831:21834,21836,21838,21841:21842,21851,21861:21866,21871,22499,22500,22600:22603,22617,22660:22664,23048,23160:23164,25747:25753,30001:10:30301,30002:10:30302,30003:10:30303,30004:10:30304,30354,30502:10:30522,30532,30601:10:30891,30615,30622,30635,30645,30665,30666,30692,30725,30755,30775,30795,30796,30805,30806,30825,30826,30835,30845,30855,30856,30875,30885,30895,30897,40000,90001,90004,105010,105030,110005,110006,110008
......@@ -4,12 +4,14 @@ ID NAValues
37 -1,-3
90 -3
101 -1
272 1900-01-01
402 0
480 -818
485 -1
486 -121,-818
493 -121
496 -818
819 1900-01-01,1901-01-01,1902-02-02,1903-03-03,2037-07-07
946 -818
1001 -1,-3
1010 -11,-13,-21,-23
......
......@@ -76,7 +76,7 @@ encoding_id title availability coded_as structure num_members descript
100 Pass/Fail test result 0 11 1 2 Results of a pass/fail test
101 Result of a pass/fail/not-tested test 0 11 1 3 Results of a pass/fail test with individuals who were explicitly not-tested indicated.
102 inclusion status 0 11 1 2 Whether a participant was included in a particular subset
123 UK Biobank staff 0 11 1 1115 This is a pseudonymised index of the UK Biobank staff who have had authority to sign-off individual-level records which are included in the UK Biobank core repository. UK Biobank will not publish or reveal the actual identities of staff.
123 UK Biobank staff 0 11 1 1128 This is a pseudonymised index of the UK Biobank staff who have had authority to sign-off individual-level records which are included in the UK Biobank core repository. UK Biobank will not publish or reveal the actual identities of staff.
165 Assay correction level 0 11 1 3 Indicates the type of assay correction that was applied to a biochemistry result.
169 Blood haplotypes 0 41 1 6 Blood haplotypes
170 Map co-ordinates 0 11 1 1 Special map co-ordinates
......@@ -308,21 +308,48 @@ encoding_id title availability coded_as structure num_members descript
1834 TRUD mapping of Read2 into 3-character ICD10 0 41 1 22340 TRUD mapping of Read2 into 3-character ICD10<p>ICD-10 codes, terms and text used by permission of WHO, from: International Statistical Classification of Diseases and Related Health Problems, Tenth Revision (ICD-10). Vols 1-3. Geneva, World Health Organization, 1992-2016.<p>Contains information from NHS Digital, licenced under the current version of the Open Government Licence available at www.nationalarchives.gov.uk/doc/open-government-licence/open-government-licence.htm.
1835 TRUD mapping of Read3 into 3-character ICD10 0 41 1 33284 TRUD mapping of Read3 into 3-character ICD10<p>ICD-10 codes, terms and text used by permission of WHO, from: International Statistical Classification of Diseases and Related Health Problems, Tenth Revision (ICD-10). Vols 1-3. Geneva, World Health Organization, 1992-2016.<p>Contains information from NHS Digital, licenced under the current version of the Open Government Licence available at www.nationalarchives.gov.uk/doc/open-government-licence/open-government-licence.htm.
1836 Partial mapping of ICD9 to ICD10 at 3-character level 0 41 1 2875 This encoding maps ICD9 (~C87~) codes onto the 3-character level of the ICD10 (~C19~) codes. It is not intended to be exhaustive and cases where the mapping is ambiguous have been omitted.<p>ICD-10 codes, terms and text used by permission of WHO, from: International Statistical Classification of Diseases and Related Health Problems, Tenth Revision (ICD-10). Vols 1-3. Geneva, World Health Organization, 1992-2016.
1853 COVID19 test locations 0 11 1 39 Locations/methods used to generate samples for COVID19 testing.
1853 COVID19 test locations 0 11 1 44 Locations/methods used to generate samples for COVID19 testing.
1854 Test result 0 11 1 2 Result of a binary test
1855 Origin of test sample 0 11 1 2 Indicates where a participant was believed to be (or be doing) when their sample was taken.
1856 COVID19 testing laboratories 0 11 1 141 Laboratories performing tests for COVID19
1856 COVID19 testing laboratories 0 11 1 175 Laboratories performing tests for COVID19
1862 Diagnosis route 0 11 1 6 Method of diagnosis
1965 Lost to follow-up 0 11 1 5 This field indicates the participants for whom it is believed to be impossible to fully complete data collection and the reason for this belief.
1970 Record providence 0 41 1 6 Indicates the provider/origin of Hes data records.
1990 Trail completion 0 11 1 1 Indicates that a participant did not complete a trail challenge.
2171 Source(s) of first reports of health outcomes 0 11 1 8 Data source(s) of each code mapped to 3-character ICD10
2226 LIMS data transfer route 0 11 1 3 This identifies the route whereby data from laboratory analysers reached the central UK Biobank LIMS (laboratory information management system) database. Its relevance is that it affects the interpretation of the time that an item of data was acquired, since the timestamp reflects the time that the data reached the LIMS system rather than when the actual measurement was done.
2301 Spectrometer ID 0 11 1 6 This is an arbitrary encoding to all samples analysed on the same spectrometer to be identified.
2302 QC flags 0 11 1 2 QC flags for Nightingale data
2310 QC flags (nightingale) 0 11 1 10 Nightingale QC flags
2360 EMIS clinical value coding 0 31 1 6 Special codes used to redact values associated with EMIS GP clinical codes.<p>Please note that floating-point format appears as scientific notation in the Showcase and obscures small differences between the individual codes. These differences are visible when the encoding is downloaded.
2730 Onset speed 0 11 1 4 Speed of onset
3002 Headache pain frequency 0 11 1 5 Frequency of pain when headaches were at their worst with option of prefer not to answer
3003 Symptoms severity 0 11 1 5 Severity of symptoms over past week with option of prefer not to answer
3004 Mobility problem severity 0 11 1 5 Severity of problems with walking
3005 Anxiety severity 0 11 1 5 Severity of anxiety for describing health today
3006 Depression frequency 0 11 1 5 Frequency of current depression symptoms with option of prefer not to answer
3008 Location of pain that bothers most 0 11 1 14 Location of pain that bothers most in last three months with option of prefer not to answer
3012 Level of difficulty experienced with tasks 0 11 1 5 Level of difficulty experienced with tasks with option of prefer not to answer
3013 Length of time suffering with pain 0 11 1 5 Length of time suffering with pain or discomfort with options of do not know or prefer not to answer
3014 Scale of pain over last 24 hours 0 11 1 4 Scale of pain over last 24 hours with options of do not know and prefer not to answer
3015 Level of pain relief provided by treatments or medications 0 11 1 13 Percentage of pain relief received from treatments or medications with option of prefer not to answer
3017 DNK/PNA 0 11 1 2 Do not know or prefer not to answer
3018 Length of time since headaches began or ended 0 11 1 3 Length of time since headaches began or ended with option of prefer not to answer
3019 Self-care problem severity 0 11 1 5 Severity of problems with self-care today
3020 Usual activities problem severity 0 11 1 5 Severity of problems with usual activities today
3021 Pain severity 0 11 1 5 Severity of pain or discomfort today
3022 Location of pain during past week 0 11 1 21 Location of pain or tenderness during past week with option of prefer not to answer
3175 GP code type 0 11 1 11 Coding system used to classify GP clinical and prescription data
3311 Requesting organisation 0 11 1 15 Organisations responsible for requesting blood tests.
3311 Requesting organisation 0 11 1 16 Organisations responsible for requesting blood tests.
3432 UKB laboratory analysers 0 41 1 4 Analyser machines used at the UK Biobank laboratories.
3500 Welsh COVID19 testing laboratories 0 11 1 19 Welsh laboratories performing tests for COVID19
3501 Welsh COVID19 patient types 0 11 1 18 Descriptions of patient types for Welsh COVID19 testing
3502 Welsh COVID19 person types 0 11 1 80 Description of person tested for COVID19 (e.g. essential worker, inpatient, healthcare worker, education worker). Also provides additional information on whether the individual was asymptomatic or immunocompromised
3503 Welsh COVID19 test locations 0 11 1 9 Locations/methods used to generate samples for Welsh COVID19 testing
3504 Scottish COVID19 testing laboratories 0 41 1 18 Scottish laboratories performing tests for COVID19
3505 Scottish COVID19 medical facility types 0 11 1 8 Types of Scottish medical facilities performing tests for COVID19
3506 Scottish COVID19 medical facilities 0 41 1 255 Names of Scottish medical facilities performing tests for COVID19
3510 QC flag 0 11 1 10 QC flag for Nightgale data
4214 dm+d special codes 0 41 1 2 Special values used to recode missing dm+d codes
4917 Reportability 0 11 1 5 Reportability of assay results
4982 Block count exception 0 11 1 1 Used to indicate cases where block count is not known.
......@@ -343,7 +370,7 @@ encoding_id title availability coded_as structure num_members descript
5159 Mood scale 0 11 1 5 Scale of answers to mood questions.
5160 Input device/method 0 11 1 5 Method of data input used by participant
5178 Pointing method 0 11 1 5 Method used by participants to point-to/select/click a control on a screen.
5702 TPP GP clinical value coding 0 31 1 2 Special codes used to redact values associated with sensitive TPP GP clinical codes.
5702 TPP GP clinical value coding 0 31 1 5 Special codes used to redact values associated with sensitive TPP GP clinical codes.
6312 Tower responses 0 11 1 6 Responses were integers in the range 1-6.
6314 Word/picture groups 0 11 1 340 Relates indices of word/picture groups to the actual words displayed
6315 Position of picture on screen 0 11 1 4 Index relating each picture to their position on screen
......@@ -369,12 +396,13 @@ encoding_id title availability coded_as structure num_members descript
7010 Critical care discharge destination 0 11 1 6 Critical care discharge destination
7011 Critical care discharge destination location 0 11 1 9 Critical care discharge destination location
7012 Critical care APC relationship 0 11 1 14 Critical care APC relationship
7128 CTV3 clinical codes 0 41 1 332115 CTV3 clinical codes. Redacted sensitive/potentially identifying codes are represented by negative values.
7128 CTV3 clinical codes 0 41 1 332115 CTV3 clinical codes and redaction values, for use in TPP GP clinical data (COVID-19 research only; please see ~R3151~). Redacted sensitive/potentially identifying codes are represented by negative values. CTV3 codes are also employed in general-use GP clinical data (~F42040~). Please see the resources in ~L3000~ for more information and code lookups.
7310 Initial infection 0 11 1 7 Infections diagnosed alongside IBS
7618 Antigen QC 0 11 1 2 Description of antigen quality control observation.
7678 EMIS local script codes 0 41 1 6995 Local codes created by EMIS for use in GP prescription data. Codes that have been redacted are represented by negative values.
7689 EMIS local clinical codes 0 41 1 19280 Local codes created by EMIS for use in GP clinical data. Codes that have been redacted are represented by negative values.
8708 TPP local codes 0 41 1 5683 Codes created by TPP for use in GP clinical data. Redacted sensitive/potentially identifying codes are represented by negative values.
7667 Signal to noise ratio exclusion 0 31 1 1 Used to flag the value -99999 as not a valid result
7678 EMIS local script codes 0 41 1 7210 Local codes created by EMIS for use in GP prescription data. Codes that have been redacted are represented by negative values.
7689 EMIS local clinical codes 0 41 1 19614 Local codes created by EMIS for use in GP clinical data. Codes that have been redacted are represented by negative values.
8708 TPP local codes 0 41 1 5747 Codes created by TPP for use in GP clinical data. Redacted sensitive/potentially identifying codes are represented by negative values.
22000 Genotyping array 0 11 2 108 This encoding categorises the particular types of arrays used to measure genotypes in UK Biobank participants.
100001 WEBDIET-coding100000001 0 11 1 3 Coding for online diet questionnaire
100002 WEBDIET-coding100000002 0 11 1 5 Coding for online diet questionnaire
......
This diff is collapsed.
......@@ -21,8 +21,7 @@ import collections.abc as abc
import pandas as pd
from . import loadtables
from . import util
import funpack.util as util
log = logging.getLogger(__name__)
......@@ -498,6 +497,8 @@ class DataTable(util.Singleton):
(e.g. ``visit``, ``metadata``, etc).
"""
import funpack.loadtables as loadtables # noqa: E501 pylint: disable=import-outside-toplevel
if vids is None: vids = [None] * len(series)
if kwargs is None: kwargs = [None] * len(series)
......
......@@ -43,10 +43,15 @@ import logging
import warnings
import collections
from typing import Tuple, Sequence, Union, Dict, List, Type, Any, Callable
from typing_extensions import Literal
import numpy as np
import pandas as pd
import funpack.util as util
import funpack.fileinfo as finfo
import funpack.datatable as datatable
import funpack.processing as processing
import funpack.expression as expression
......@@ -54,7 +59,7 @@ import funpack.expression as expression
log = logging.getLogger(__name__)
def convert_type(val):
def convert_type(val : str) -> util.CTYPES:
"""Convert a string containing a UK BioBank type into a numerical
identifier for that type - see :attr:`funpack.util.CTYPES`.
"""
......@@ -86,7 +91,7 @@ def convert_type(val):
return valmap.get(val.lower(), util.CTYPES.unknown)
def convert_dtype(val):
def convert_dtype(val : str) -> Union[np.dtype, Literal[np.nan]]:
"""Convert a string containing a ``numpy.dtype`` (e.g. ``'float32'``)
into a ``dtype`` object.
"""
......@@ -102,7 +107,7 @@ def convert_dtype(val):
return dtype
def convert_comma_sep_text(val):
def convert_comma_sep_text(val : str) -> Union[List[str], Literal[np.nan]]:
"""Convert a string containing comma-separated text into a list. """
if val.strip() == '':
return np.nan
......@@ -110,7 +115,7 @@ def convert_comma_sep_text(val):
return [w.strip() for w in words]
def convert_comma_sep_numbers(val):
def convert_comma_sep_numbers(val : str) -> Union[np.ndarray, Literal[np.nan]]:
"""Convert a string containing comma-separated numbers into a ``numpy``
array.
"""
......@@ -119,7 +124,8 @@ def convert_comma_sep_numbers(val):
return np.fromstring(val, sep=',', dtype=np.float)
def convert_ParentValues(val):
def convert_ParentValues(
val : str) -> Union[List[expression.Expression], Literal[np.nan]]:
"""Convert a string containing a sequence of comma-separated
``ParentValue`` expressions into a sequence of :class:`.Expression`
objects.
......@@ -129,7 +135,7 @@ def convert_ParentValues(val):
return [expression.Expression(e) for e in val.split(',')]
def convert_Process_Variable(val):
def convert_Process_Variable(val : str) -> Tuple[str, List[int]]:
"""Convert a string containing a process variable specification - one of:
- One or more comma-separated MATLAB-style ``start:stop:step`` ranges,
......@@ -179,7 +185,10 @@ def convert_Process_Variable(val):
return ptype, list(it.chain(*[util.parseMatlabRange(t) for t in tokens]))
def convert_Process(ptype, val):
def convert_Process(
ptype : str,
val : str
) -> Dict[str, processing.Process]:
"""Convert a string containing a sequence of comma-separated ``Process`` or
``Clean`` expressions into an ``OrderedDict`` of :class:`.Process`
objects (with the process names used as dictionary keys).
......@@ -192,7 +201,7 @@ def convert_Process(ptype, val):
return collections.OrderedDict([(p.name, p) for p in procs])
def convert_category_variables(val):
def convert_category_variables(val : str) -> List[int]:
"""Convert a string containing a sequence of comma-separated variable IDs
or ranges into a list of variable IDs. Variables may be specified as
integer IDs, or via a MATLAB-style ``start:step:stop`` range. See
......@@ -272,11 +281,7 @@ VARTABLE_DTYPES = {
VARTABLE_CONVERTERS = {
'Type' : convert_type,
'InternalType' : convert_dtype,
'NAValues' : convert_comma_sep_numbers,
'RawLevels' : convert_comma_sep_numbers,
'NewLevels' : convert_comma_sep_numbers,
'ParentValues' : convert_ParentValues,
'ChildValues' : convert_comma_sep_numbers,
'Clean' : ft.partial(convert_Process, 'cleaner'),
}
"""Custom converter functions to use for some columns in the variable
......@@ -293,15 +298,6 @@ DCTABLE_DTYPES = {
"""Types to use for some columns in the data coding table. """
DCTABLE_CONVERTERS = {
'NAValues' : convert_comma_sep_numbers,
'RawLevels' : convert_comma_sep_numbers,
'NewLevels' : convert_comma_sep_numbers,
}
"""Custom converter functions to use for some columns in the data coding
table.
"""
TYPETABLE_DTYPES = {
'Type' : object,
......@@ -351,13 +347,19 @@ call to :func:`addImplicitCategories`).
"""
def loadTables(fileinfo,
varfiles=None,
dcfiles=None,
typefile=None,
procfile=None,
catfile=None,
**kw):
def loadTables(
fileinfo : finfo.FileInfo,
varfiles : Sequence[str] = None,
dcfiles : Sequence[str] = None,
typefile : str = None,
procfile : str = None,
catfile : str = None,
**kw
) -> Tuple[pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
List[datatable.Column],
List[datatable.Column]]:
"""Loads the data tables used to run ``funpack``.
:arg fileinfo: :class:`.FileInfo` object describing the input data files.
......@@ -401,19 +403,23 @@ def loadTables(fileinfo,
return vartable, proctable, cattable, unk, unc
def loadVariableTable(fileinfo,
varfiles=None,
dcfiles=None,
typefile=None,
noBuiltins=False,
naValues=None,
childValues=None,
recoding=None,
clean=None,
typeClean=None,
globalClean=None,
dropAbsent=True,
**kwargs):
def loadVariableTable(
fileinfo : finfo.FileInfo,
varfiles : Sequence[str] = None,
dcfiles : Sequence[str] = None,
typefile : str = None,
noBuiltins : bool = False,
naValues : Dict[int, str] = None,
childValues : Dict[int, Tuple[str, str]] = None,
recoding : Dict[int, Tuple[str, str]] = None,
clean : Dict[int, str] = None,
typeClean : Dict[util.CTYPES, str] = None,
globalClean : str = None,
dropAbsent : bool = True,
**kwargs # pylint: disable=unused-argument
) -> Tuple[pd.DataFrame,
Sequence[datatable.Column],
Sequence[datatable.Column]]:
"""Given variable table and datacoding table file names, builds and returns
the variable table.
......@@ -429,30 +435,37 @@ def loadVariableTable(fileinfo,
:arg noBuiltins: If provided, the built-in variable and datacoding base
tables are not loaded.
:arg naValues: Dictionary of ``{vid : [values]}`` mappings, specifying
values which should be replaced with NA.
:arg naValues: Dictionary of ``{vid : values}`` mappings, specifying
values which should be replaced with NA. The values
are expected to be strings of comma-separated values.
:arg childValues: Dictionary of ``{vid : [exprs], [values]}`` mappings,
:arg childValues: Dictionary of ``{vid : (exprs, values)}`` mappings,
specifying parent value expressions, and corresponding
child values.
child values. The expressions and values
are expected to be strings of comma-separated values
of the same length.
:arg recoding: Dictionary of ``{vid : [rawlevel], [newlevel]}``
mappings
:arg recoding: Dictionary of ``{vid : (rawlevel, newlevel)}``
mappings. The raw and new levels are expected to be
strings of comma-separated values of the same length.
:arg clean: Dictionary of ``{vid : expr}`` mappings containing
cleaning functions to apply - this will override
any cleaning specified in the variable file, and
any cleaning specified in ``typeClean``.
any cleaning specified in ``typeClean``. The expressions
are expected to be strings.
:arg typeClean: Dictionary of ``{type : expr}`` mappings containing
cleaning functions to apply to all variables of a
specific type - this will override any cleaning
specified in the type file.
specified in the type file. The expressions
are expected to be strings.
:arg globalClean: Expression containing cleaning functions to
apply to every variable - this will be performed after
variable-specific cleaning in the variable table,
or specified via ``clean`` or ``typeClean``.
or specified via ``clean`` or ``typeClean``. The
expressions are expected to be strings.
:arg dropAbsent: If ``True`` (the default), remove all variables from the
variable table which are not present in the data
......@@ -486,7 +499,7 @@ def loadVariableTable(fileinfo,
dcfiles,
'data coding',
DCTABLE_DTYPES,
DCTABLE_CONVERTERS,
{},
DCTABLE_COLUMNS)
tytable = mergeTableFiles(None,
[typefile],
......@@ -533,29 +546,55 @@ def loadVariableTable(fileinfo,
# table (overriding whatever was specified
# in the datacoding/variable tables)
if naValues is not None:
naValues = {vid : np.array(vals) for vid, vals in naValues.items()}
mergeIntoVariableTable(
vartable,
'NAValues',
naValues)
if recoding is not None:
recoding = {vid : (np.array(raw), np.array(new))
for vid, (raw, new) in recoding.items()}
mergeIntoVariableTable(
vartable,
['RawLevels', 'NewLevels'],
recoding)
if childValues is not None:
childValues = {vid : (convert_ParentValues(expr),
np.array(values))
childValues = {vid : (convert_ParentValues(expr), values)
for vid, (expr, values) in childValues.items()}
mergeIntoVariableTable(
vartable,
['ParentValues', 'ChildValues'],
childValues)
# navalues, raw/new levels and child values
# are all still comma-separated strings -
# convert them to types appropriate to the
# datafield/variable
def convert(rowvalues, column):
    """Convert the comma-separated string stored in ``column`` of one
    variable table row into an array suited to that variable.

    :arg rowvalues: Variable table row; indexed here by ``column`` and by
                    ``'Type'``.
    :arg column:    Name of the column holding the comma-separated string.
    :returns:       ``np.nan`` if the cell is NA. For date/time variables,
                    the ``.values`` of ``pd.to_datetime`` applied to the
                    parsed tokens. Otherwise a ``numpy`` array built with
                    the dtype found in ``util.DATA_TYPES`` (``None``, i.e.
                    numpy's own inference, when the type has no entry).
    """
    raw = rowvalues[column]

    # Nothing was specified for this variable
    if pd.isna(raw):
        return np.nan

    tokens = convert_comma_sep_text(raw)
    vartype = rowvalues['Type']
    nptype = util.DATA_TYPES.get(vartype)

    # Everything except dates/times is cast directly through numpy
    if vartype not in (util.CTYPES.date, util.CTYPES.time):
        return np.array(tokens, dtype=nptype)

    # Dates/times are parsed by pandas instead
    return pd.to_datetime(tokens).values
# Make sure the series type stays as "object" - if
# no rules are specified on any vid, the convert
# function will return all nans, and pandas will
# coerce the series type to float64, which may