Commit 643e9195 authored by Paul McCarthy 🚵
Browse files

ENH: Column class now has a "metadata" attribute for storing arbitrary
stuff. Processing functions have the option of returning metadata for newly
created columns. Used by binariseCategorical.
parent 0bb5527d
......@@ -42,7 +42,8 @@ class Column(object):
index,
vid=None,
visit=0,
instance=0):
instance=0,
metadata=None):
self.datafile = datafile
self.name = name
......@@ -50,6 +51,7 @@ class Column(object):
self.vid = vid
self.visit = visit
self.instance = instance
self.metadata = metadata
def __str__(self):
......@@ -356,7 +358,7 @@ class DataTable(object):
else: self.__varmap.pop(col.vid)
def addColumns(self, series, vids=None):
def addColumns(self, series, vids=None, meta=None):
"""Adds one or more new columns to the data set.
:arg series: Sequence of ``pandas.Series`` objects containing the
......@@ -365,10 +367,12 @@ class DataTable(object):
:arg vids: Sequence of variables each new column is associated
with. If ``None`` (the default), variable IDs are
automatically assigned.
:arg meta: Sequence of metadata associated with each new column. If ``None`` (the default), the metadata for each new column is set to ``None``.
"""
if vids is None:
vids = [None] * len(series)
if vids is None: vids = [None] * len(series)
if meta is None: meta = [None] * len(series)
for s in series:
if s.name in self.__data.columns:
......@@ -390,13 +394,13 @@ class DataTable(object):
# a vid for each column starting from here.
startvid = max(max(self.variables) + 1, AUTO_VARIABLE_ID)
for s, idx, vid in zip(series, idxs, vids):
for s, idx, vid, m in zip(series, idxs, vids, meta):
if vid is None:
vid = startvid
startvid = startvid + 1
col = Column(None, s.name, idx, vid, 0, 0)
col = Column(None, s.name, idx, vid, 0, 0, m)
self.__data[s.name] = s
# new column on existing variable.
......
......@@ -136,10 +136,7 @@ def runProcess(proc, dtable, vids):
remove = []
add = []
addvids = []
def genvids(result, vi, si):
if result[vi] is None: return [None] * len(result[si])
else: return result[vi]
addmeta = []
for result in results:
if result is None:
......@@ -152,15 +149,20 @@ def runProcess(proc, dtable, vids):
# series/vids to add
if len(result) == 2:
add .extend( result[0])
addvids.extend(genvids(result, 1, 0))
add .extend(result[0])
addvids.extend(result[1])
# columns to remove, and
# series/vids to add
elif len(result) == 3:
remove .extend( result[0])
add .extend( result[1])
addvids.extend(genvids(result, 2, 1))
elif len(result) in (3, 4):
if len(result) == 3: meta = [None] * len(result[1])
else: meta = result[3]
remove .extend(result[0])
add .extend(result[1])
addvids.extend(result[2])
addmeta.extend(meta)
else:
raise error
......@@ -170,7 +172,7 @@ def runProcess(proc, dtable, vids):
else:
raise error
if len(add) > 0: dtable.addColumns(add, addvids)
if len(add) > 0: dtable.addColumns(add, addvids, meta)
if len(remove) > 0: dtable.removeColumns(remove)
......
......@@ -43,6 +43,12 @@ Furthermore, all processing functions must return one of the following:
- List of ``Series`` to be added
- List of variable IDs for each new ``Series``.
- A ``tuple`` of length 4, containing the above, and:
- List of metadata associated with each of the new ``Series``. This will
be added to the :class:`.Column` objects that represent each of the new
``Series``.
The following processing functions are defined:
.. autosummary::
......@@ -217,6 +223,7 @@ def binariseCategorical(dtable,
toremove = []
newseries = []
newvids = []
newmeta = []
for vid in vids:
......@@ -261,13 +268,14 @@ def binariseCategorical(dtable,
}
newvids .append(vid)
newmeta .append(val)
newseries.append(pd.Series(
col,
index=dtable.index,
name=nameFormat.format(**fmtargs)))
if replace: return toremove, newseries, newvids
else: return newseries, newvids
if replace: return toremove, newseries, newvids, newmeta
else: return [], newseries, newvids, newmeta
@custom.processor()
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment