Commit a8584816 authored by Paul McCarthy's avatar Paul McCarthy 🚵
Browse files

ENH: Expression value can now be non-numeric (but must be quoted). New

'contains' operator for substring test
parent 67259895
......@@ -36,9 +36,12 @@ where:
- ``variable`` is the ID of a parent variable of the variable in question.
Variable IDs must be an integer preceded by the letter ``v``.
- ``operator`` is a comparison operator (e.g. equals, greater than, etc.).
- ``value`` is either ``'na'`` indicating missing, or a numeric value against
which the parent variable is to be compared.
- ``value`` is one of:
- ``'na'`` indicating missing,
- a numeric value against which the parent variable is to be compared, or
- a non-numeric value (i.e. a string), against which the parent variable
is to be compared. The value must be quoted with either single or
double quotes.
The following comparison operators are allowed (and the symbols used in a
statement can be found in the :attr:`SYMBOLS` dictionary):
......@@ -53,7 +56,8 @@ statement can be found in the :attr:`SYMBOLS` dictionary):
The *equal to* and *not equal to* operators may be used with a value of
``'na'`` to test whether the values for a variable are missing or present
respectively.
respectively. Similarly, the *equal to* and *not equal to* operators may be
used with a non-numeric value to test for string equality.
Multiple conditional statements may be combined with ``and``, ``or``, and
``not`` logical operations (specific symbols can be found in the
......@@ -70,32 +74,34 @@ import collections
import itertools as it
import functools as ft
import pyparsing as pp
import numpy as np
log = logging.getLogger(__name__)
# Lookup table from an operation name (e.g. 'eq') to the literal token that
# represents it in an expression string (e.g. '==').
#
# NOTE(review): the keys 'var' through 'na' are listed twice in this literal
# with identical values. Python keeps the last value given for a repeated
# key, so the resulting mapping is unaffected, but the repetition looks
# unintentional - confirm and remove one copy.
SYMBOLS = {
'var' : 'v',
'and' : '&&',
'or' : '||',
'not' : '~',
'any' : 'any',
'all' : 'all',
'eq' : '==',
'ne' : '!=',
'lt' : '<',
'le' : '<=',
'gt' : '>',
'ge' : '>=',
'na' : 'na',
'var' : 'v',
'and' : '&&',
'or' : '||',
'not' : '~',
'any' : 'any',
'all' : 'all',
'eq' : '==',
'ne' : '!=',
'lt' : '<',
'le' : '<=',
'gt' : '>',
'ge' : '>=',
# substring-test operator
'contains' : 'contains',
'na' : 'na',
}
"""This dictionary contains the symbols for variables and operations that
may be used in expressions.
"""
class Expression(object):
class Expression:
"""The ``Expression`` class is a convenience class which can be used to
parse and access an expression.
"""
......@@ -310,8 +316,9 @@ def makeParser():
return makeParser.parser
CMP = ['eq', 'ne', 'lt', 'le', 'gt', 'ge']
CMP = pp.oneOf([SYMBOLS[c] for c in CMP])
EQS = pp.oneOf([SYMBOLS[c] for c in ['eq', 'ne']])
CMPOP = pp.oneOf([SYMBOLS[c] for c in CMP])
EQOP = pp.oneOf([SYMBOLS[c] for c in ['eq', 'ne']])
STROP = pp.oneOf([SYMBOLS[c] for c in ['eq', 'ne', 'contains']])
ANY = pp.CaselessLiteral(SYMBOLS['any'])
ALL = pp.CaselessLiteral(SYMBOLS['all'])
AND = pp.CaselessLiteral(SYMBOLS['and'])
......@@ -319,14 +326,16 @@ def makeParser():
NOT = pp.CaselessLiteral(SYMBOLS['not'])
NA = pp.CaselessLiteral(SYMBOLS['na'])
NUM = pp.pyparsing_common.number
STR = pp.QuotedString("'") ^ pp.QuotedString('"')
VAR = (pp.CaselessLiteral(SYMBOLS['var']) +
pp.pyparsing_common.integer).setParseAction(parseVariable)
# a single conditional statement:
# "variable comparison_operator value"
NUMCOND = pp.Group(VAR + CMP + NUM).setParseAction(parseCondition)
NACOND = pp.Group(VAR + EQS + NA) .setParseAction(parseCondition)
COND = NUMCOND ^ NACOND
NUMCOND = pp.Group(VAR + CMPOP + NUM).setParseAction(parseCondition)
STRCOND = pp.Group(VAR + STROP + STR).setParseAction(parseCondition)
NACOND = pp.Group(VAR + EQOP + NA) .setParseAction(parseCondition)
COND = NUMCOND ^ STRCOND ^ NACOND
# the infixNotation helper does the heavy
# lifting for boolean/combine operations
......@@ -422,14 +431,23 @@ def parseBinary(toks):
return fn
def _isna( var, val, dt, data): return dt[:, data[var]].isna() # noqa
def _notna(var, val, dt, data): return dt[:, data[var]].notna() # noqa
def _eq( var, val, dt, data): return dt[:, data[var]] == val # noqa
def _ne( var, val, dt, data): return dt[:, data[var]] != val # noqa
def _gt( var, val, dt, data): return dt[:, data[var]] > val # noqa
def _ge( var, val, dt, data): return dt[:, data[var]] >= val # noqa
def _lt( var, val, dt, data): return dt[:, data[var]] < val # noqa
def _le( var, val, dt, data): return dt[:, data[var]] <= val # noqa
def _isna(var, val, dt, data):
    """Return ``isna()`` of the ``data[var]`` columns of ``dt``.

    ``val`` is accepted only so that all condition functions share the same
    signature; it is not used.
    """
    columns = data[var]
    selected = dt[:, columns]
    return selected.isna()


def _notna(var, val, dt, data):
    """Return ``notna()`` of the ``data[var]`` columns of ``dt``.

    ``val`` is accepted only so that all condition functions share the same
    signature; it is not used.
    """
    columns = data[var]
    selected = dt[:, columns]
    return selected.notna()


def _eq(var, val, dt, data):
    """Test the ``data[var]`` columns of ``dt`` for equality with ``val``."""
    columns = data[var]
    return dt[:, columns] == val


def _ne(var, val, dt, data):
    """Test the ``data[var]`` columns of ``dt`` for inequality with ``val``."""
    columns = data[var]
    return dt[:, columns] != val


def _gt(var, val, dt, data):
    """Test whether the ``data[var]`` columns of ``dt`` are above ``val``."""
    columns = data[var]
    return dt[:, columns] > val


def _ge(var, val, dt, data):
    """Test whether the ``data[var]`` columns of ``dt`` are at least ``val``."""
    columns = data[var]
    return dt[:, columns] >= val


def _lt(var, val, dt, data):
    """Test whether the ``data[var]`` columns of ``dt`` are below ``val``."""
    columns = data[var]
    return dt[:, columns] < val


def _le(var, val, dt, data):
    """Test whether the ``data[var]`` columns of ``dt`` are at most ``val``."""
    columns = data[var]
    return dt[:, columns] <= val
# we can't perform str.contains
# on multiple columns at once
def _contains(var, val, dt, data):
    """Case-insensitive test of whether each entry contains ``val``.

    The test is applied separately to every column listed in ``data[var]``.

    :arg var:  Key into ``data`` identifying the columns to test.
    :arg val:  Text to search for. It is handed to ``Series.str.contains``
               unchanged, so pandas interprets it as a regular expression
               by default.
    :arg dt:   Table object supporting ``len(dt)`` and ``dt[:, column]``.
    :arg data: Mapping from ``var`` to a sequence of column identifiers.

    :returns:  A boolean ``numpy`` array of shape
               ``(len(dt), len(data[var]))``. Missing entries are reported
               as ``False``.
    """
    cols = data[var]

    # The builtin bool is used as the dtype because the
    # np.bool alias is not available in every numpy release.
    result = np.zeros((len(dt), len(cols)), dtype=bool)

    for i, col in enumerate(cols):
        # na=False: without it, missing entries come back as
        # NaN, which turns into True when stored in a bool array.
        result[:, i] = dt[:, col].str.contains(val, case=False, na=False)

    return result
def _asarray(func, *args):
......@@ -441,9 +459,15 @@ def _asarray(func, *args):
This function is used by :func:`parseCondition` to construct functions
for evaluating conditional statements.
"""
# DataFrame.to_numpy is only
# available in pandas >= 0.24
return func(*args).to_numpy()
val = func(*args)
if not isinstance(val, np.ndarray):
# DataFrame.to_numpy is only
# available in pandas >= 0.24
val = val.to_numpy()
return val
def parseCondition(toks):
......@@ -475,6 +499,7 @@ def parseCondition(toks):
elif operation == SYMBOLS['gt']: fn = _gt
elif operation == SYMBOLS['le']: fn = _le
elif operation == SYMBOLS['lt']: fn = _lt
elif operation == SYMBOLS['contains']: fn = _contains
fn = ft.partial(_asarray, fn, variable, value)
fn.ftype = 'condition'
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment