Commit 7b65cb1a authored by Paul McCarthy 🚵
Browse files

TEST: Various type-related updates, also test --na_values, --recoding and --child_values support for non-numeric data
parent 0210adc8
......@@ -247,8 +247,6 @@ def test_codeToNumeric():
dt = None
def test_makeNa():
data = np.zeros((100, 3), dtype=np.float32)
......@@ -293,18 +291,30 @@ def test_makeNa():
def test_applyNewLevels():
data = np.random.randint(0, 10, (50, 4))
data[:, 0] = np.arange(1, 51)
eids = np.arange(1, 51)
data1 = np.random.randint(0, 10, 50)
data2 = np.random.randint(0, 10, 50)
data3 = np.random.randint(0, 10, 50)
data4 = np.array(random.choices(string.digits, k=50))
codes1 = np.random.randint(100, 200, 10)
codes2 = np.random.randint(100, 200, 10)
codes3 = np.arange(9, -1, -1)
exp1 = [codes1[data[i, 1]] for i in range(50)]
exp2 = [codes2[data[i, 2]] for i in range(50)]
exp3 = [codes3[data[i, 3]] for i in range(50)]
vartable, proctable, cattable, _ = gen_tables([1, 2, 3])
codes4 = list(string.digits)
random.shuffle(codes4)
codes4 = np.array(codes4)
exp1 = [codes1[ data1[i]] for i in range(50)]
exp2 = [codes2[ data2[i]] for i in range(50)]
exp3 = [codes3[ data3[i]] for i in range(50)]
exp4 = [codes4[int(data4[i])] for i in range(50)]
vartable, proctable, cattable, _ = gen_tables(
[1, 2, 3, 4],
vtypes={1 : 'integer',
2 : 'integer',
3 : 'integer',
4 : 'text'})
with warnings.catch_warnings():
warnings.simplefilter('ignore')
......@@ -314,15 +324,17 @@ def test_applyNewLevels():
vartable['NewLevels'][2] = codes2
vartable['RawLevels'][3] = np.arange(10)
vartable['NewLevels'][3] = codes3
vartable['RawLevels'][4] = np.array([c for c in string.digits])
vartable['NewLevels'][4] = codes4
cols = ['eid', '1-0.0', '2-0.0', '3-0.0']
cols = ['eid', '1-0.0', '2-0.0', '3-0.0', '4-0.0']
with tempdir():
with open('data.txt', 'wt') as f:
f.write('\t'.join(cols) + '\n')
np.savetxt(f, data, delimiter='\t')
for eid, d1, d2, d3, d4 in zip(eids, data1, data2, data3, data4):
f.write(f'{eid}\t{d1}\t{d2}\t{d3}\t{d4}\n')
finfo = fileinfo.FileInfo('data.txt')
dt, _ = importing.importData(finfo,
......@@ -335,6 +347,7 @@ def test_applyNewLevels():
assert np.all(dt[:, '1-0.0'] == exp1)
assert np.all(dt[:, '2-0.0'] == exp2)
assert np.all(dt[:, '3-0.0'] == exp3)
assert np.all(dt[:, '4-0.0'] == exp4)
col3 = dt.columns(3)[0]
......@@ -344,34 +357,51 @@ def test_applyNewLevels():
def test_applyNAInsertion():
data = np.random.randint(0, 10, (100, 3)).astype(np.float32)
data[:, 0] = np.arange(1, 101)
miss1 = np.random.choice(range(10), 4, replace=False)
miss2 = np.random.choice(range(10), 4, replace=False)
eids = np.arange(1, 101)
data1 = np.random.randint(0, 10, 100).astype(np.float32)
data2 = np.random.randint(0, 10, 100).astype(np.float32)
data3 = np.array(random.choices(string.ascii_letters, k=100))
data1 = pd.Series(data1, index=eids)
data2 = pd.Series(data2, index=eids)
data3 = pd.Series(data3, index=eids)
exp1 = data[:, 1].copy()
exp2 = data[:, 2].copy()
miss1 = np.random.choice(data1, 4, replace=False)
miss2 = np.random.choice(data2, 4, replace=False)
miss3 = np.random.choice(data3, 4, replace=False)
exp1 = data1.copy()
exp2 = data2.copy()
exp3 = data3.copy()
for m in miss1: exp1[exp1 == m] = np.nan
for m in miss2: exp2[exp2 == m] = np.nan
for i, e in enumerate(exp3):
if e in miss3:
exp3.iloc[i] = np.nan
vartable, proctable, cattable, _ = gen_tables([1, 2, 3])
vartable, proctable, cattable, _ = gen_tables(
[1, 2, 3], vtypes={1 : 'integer',
2 : 'integer',
3 : 'text'})
with warnings.catch_warnings():
warnings.simplefilter("ignore")
vartable['NAValues'][1] = miss1
vartable['NAValues'][2] = miss2
vartable['NAValues'][3] = np.array([1, 2, 3])
vartable['NAValues'][3] = miss3
vartable['NAValues'][4] = np.array([1, 2, 3])
cols = ['eid', '1-0.0', '2-0.0']
cols = ['eid', '1-0.0', '2-0.0', '3-0.0']
with tempdir():
mgr = mp.Manager()
with open('data.txt', 'wt') as f:
f.write('\t'.join(cols) + '\n')
np.savetxt(f, data, delimiter='\t')
for eid, d1, d2, d3 in zip(eids, data1, data2, data3):
f.write(f'{eid}\t{d1}\t{d2}\t{d3}\n')
finfo = fileinfo.FileInfo('data.txt')
dt, _ = importing.importData(finfo,
......@@ -385,15 +415,19 @@ def test_applyNAInsertion():
na1mask = dt[:, '1-0.0'].isna()
na2mask = dt[:, '2-0.0'].isna()
na3mask = dt[:, '3-0.0'].isna()
d1 = dt[:, '1-0.0'][~na1mask]
d2 = dt[:, '2-0.0'][~na2mask]
d3 = dt[:, '3-0.0'][~na3mask]
assert np.all(na1mask == np.isnan(exp1))
assert np.all(na2mask == np.isnan(exp2))
assert np.all(na1mask == pd.isna(exp1))
assert np.all(na2mask == pd.isna(exp2))
assert np.all(na3mask == pd.isna(exp3))
assert np.all(d1 == exp1[~na1mask])
assert np.all(d2 == exp2[~na2mask])
assert np.all(d3 == exp3[~na3mask])
dt = None
mgr = None
pool = None
......@@ -402,17 +436,20 @@ def test_applyNAInsertion():
def test_applyChildValues():
sz = 100
data = np.zeros((sz, 6), dtype=np.float32)
data[:, 0] = np.arange(1, sz + 1)
data[:, 1:] = np.random.randint(1, 10, (sz, 5))
ndata = np.zeros((sz, 6), dtype=np.float32)
ndata[:, 0] = np.arange(1, sz + 1)
ndata[:, 1:] = np.random.randint(1, 10, (sz, 5))
sdata = np.array(random.choices(string.ascii_letters, k=sz))
sdata = pd.Series(sdata, index=ndata[:, 0])
cols = ['eid', '1-0.0', '2-0.0', '3-0.0', '4-0.0', '5-0.0']
cols = ['eid', '1-0.0', '2-0.0', '3-0.0', '4-0.0', '5-0.0', '6-0.0']
# parents
# 1: 2, 3
# 2: 3
# 3: 4
# 4: 5
# 6: 5
pvals = {
1 : 'v2 == 5, v3 > 5',
......@@ -420,7 +457,8 @@ def test_applyChildValues():
3 : 'v4 >= 6',
4 : 'v5 == 2',
5 : 'v123 == 7',
6 : 'v8 < 5'
6 : 'v5 < 5',
7 : 'v8 < 5'
}
cvals = {
1 : '100, 101',
......@@ -428,35 +466,50 @@ def test_applyChildValues():
3 : '300',
4 : '400',
5 : '123',
6 : '1234'
6 : 'abc',
7 : '1234'
}
data[ data[:, 5] == 2, 4] = np.nan
data[ data[:, 4] >= 6, 3] = np.nan
data[ data[:, 3] < 8, 2] = np.nan
data[(data[:, 2] == 5) | (data[:, 3] > 5), 1] = np.nan
ndata[ ndata[:, 5] == 2, 4] = np.nan
ndata[ ndata[:, 4] >= 6, 3] = np.nan
ndata[ ndata[:, 3] < 8, 2] = np.nan
ndata[(ndata[:, 2] == 5) | (ndata[:, 3] > 5), 1] = np.nan
sdata[ ndata[:, 5] < 5] = np.nan
nexp = np.copy(ndata)
nan1 = np.isnan(nexp[:, 1])
nexp[ nan1 & (nexp[:, 2] == 5), 1] = 100
nexp[ nan1 & (nexp[:, 3] > 5), 1] = 101
nexp[np.isnan(nexp[:, 2]) & (nexp[:, 3] < 8), 2] = 200
nexp[np.isnan(nexp[:, 3]) & (nexp[:, 4] >= 6), 3] = 300
nexp[np.isnan(nexp[:, 4]) & (nexp[:, 5] == 2), 4] = 400
sexp = sdata.copy()
sexp[pd.isna(sexp)] = 'abc'
exp = np.copy(data)
nan1 = np.isnan(exp[:, 1])
exp[ nan1 & (exp[:, 2] == 5), 1] = 100
exp[ nan1 & (exp[:, 3] > 5), 1] = 101
exp[np.isnan(exp[:, 2]) & (exp[:, 3] < 8), 2] = 200
exp[np.isnan(exp[:, 3]) & (exp[:, 4] >= 6), 3] = 300
exp[np.isnan(exp[:, 4]) & (exp[:, 5] == 2), 4] = 400
with tempdir():
mgr = mp.Manager()
with open('data.txt', 'wt') as f:
f.write('\t'.join(cols) + '\n')
np.savetxt(f, data, delimiter='\t')
for i in range(sz):
f.write('\t'.join([str(i) for i in ndata[i, :]]))
f.write(f'\t{sdata.iloc[i]}\n')
vartable, proctable, cattable, _ = gen_tables([1, 2, 3, 4, 5, 6])
vartable, proctable, cattable, _ = gen_tables(
[1, 2, 3, 4, 5, 6, 7],
vtypes={1 : 'integer',
2 : 'integer',
3 : 'integer',
4 : 'integer',
5 : 'integer',
6 : 'text',
7 : 'integer'})
vartable.loc[pvals.keys(), 'ParentValues'] = \
[lt.convert_ParentValues(v) for v in pvals.values()]
vartable.loc[cvals.keys(), 'ChildValues'] = \
[lt.convert_comma_sep_numbers(v) for v in cvals.values()]
[lt.convert_comma_sep_text(v) for v in cvals.values()]
finfo = fileinfo.FileInfo('data.txt')
dt, _ = importing.importData(finfo,
......@@ -468,7 +521,10 @@ def test_applyChildValues():
cleaning.applyChildValues(dt)
assert np.all(np.asarray(dt[:, :].values) == exp[:, 1:])
ngot = dt[:, cols[1:-1]]
sgot = dt[:, cols[-1]]
assert np.all(np.asarray(ngot.values) == nexp[:, 1:])
assert np.all(np.asarray(sgot.values) == sexp)
dt = None
mgr = None
pool = None
......
......@@ -290,10 +290,10 @@ def test_loadVariableTable_naValues():
99 : '10,11,12',
111 : '1,2,3'})
# 99 has unknown type, so it is interpreted as float
# 99 has unknown type, so it is left as string
expvids = [1, 2, 5, 6, 7, 99]
expnavals = [[1, 2, 3], [10, 20, 30], [], [], ['a', 'b', 'c'],
[10, 11, 12]]
['10', '11', '12']]
navals = [list(v) if isinstance(v, np.ndarray) else []
for v in vartable['NAValues']]
......@@ -319,9 +319,9 @@ def test_loadVariableTable_recoding():
expvids = [1, 2, 5, 6, 7, 99]
exprawlevels = [[10, 20], [1, 2, 3], [], [], ['a', 'b', 'c'],
[40, 50]]
['40', '50']]
expnewlevels = [[100, 200], [10, 20, 30], [], [], ['A', 'B', 'C'],
[400, 500]]
['400', '500']]
rawlevels = [list(v) if isinstance(v, np.ndarray) else []
for v in vartable['RawLevels']]
......@@ -353,7 +353,7 @@ def test_loadVariableTable_childValues():
expvids = [1, 2, 5, 6, 7, 99]
expparentvals = [[2], [], [], [], [99], [5]]
expchildvals = [[12345], [], [], [], ['ABC'], [54321]]
expchildvals = [[12345], [], [], [], ['ABC'], ['54321']]
parentvals = [[] if pd.isna(exps)
else list(it.chain(*[e.variables for e in exps]))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment