Commit c4afb999 authored by Paul McCarthy 🚵
Browse files

MNT: typing

parent 5317a643
......@@ -21,8 +21,7 @@ import collections.abc as abc
import pandas as pd
from . import loadtables
from . import util
import funpack.util as util
log = logging.getLogger(__name__)
......@@ -498,6 +497,8 @@ class DataTable(util.Singleton):
(e.g. ``visit``, ``metadata``, etc).
"""
import funpack.loadtables as loadtables # noqa: E501 pylint: disable=import-outside-toplevel
if vids is None: vids = [None] * len(series)
if kwargs is None: kwargs = [None] * len(series)
......
......@@ -43,10 +43,15 @@ import logging
import warnings
import collections
from typing import Tuple, Sequence, Union, Dict, List, Type, Any, Callable
from typing_extensions import Literal
import numpy as np
import pandas as pd
import funpack.util as util
import funpack.fileinfo as finfo
import funpack.datatable as datatable
import funpack.processing as processing
import funpack.expression as expression
......@@ -54,7 +59,7 @@ import funpack.expression as expression
log = logging.getLogger(__name__)
def convert_type(val):
def convert_type(val : str) -> util.CTYPES:
"""Convert a string containing a UK BioBank type into a numerical
identifier for that type - see :attr:`funpack.util.CTYPES`.
"""
......@@ -86,7 +91,7 @@ def convert_type(val):
return valmap.get(val.lower(), util.CTYPES.unknown)
def convert_dtype(val):
def convert_dtype(val : str) -> Union[np.dtype, Literal[np.nan]]:
"""Convert a string containing a ``numpy.dtype`` (e.g. ``'float32'``)
into a ``dtype`` object.
"""
......@@ -102,7 +107,7 @@ def convert_dtype(val):
return dtype
def convert_comma_sep_text(val):
def convert_comma_sep_text(val : str) -> Union[List[str], Literal[np.nan]]:
"""Convert a string containing comma-separated text into a list. """
if val.strip() == '':
return np.nan
......@@ -110,7 +115,7 @@ def convert_comma_sep_text(val):
return [w.strip() for w in words]
def convert_comma_sep_numbers(val):
def convert_comma_sep_numbers(val : str) -> Union[np.ndarray, Literal[np.nan]]:
"""Convert a string containing comma-separated numbers into a ``numpy``
array.
"""
......@@ -119,7 +124,8 @@ def convert_comma_sep_numbers(val):
return np.fromstring(val, sep=',', dtype=np.float)
def convert_ParentValues(val):
def convert_ParentValues(
val : str) -> Union[List[expression.Expression], Literal[np.nan]]:
"""Convert a string containing a sequence of comma-separated
``ParentValue`` expressions into a sequence of :class:`.Expression`
objects.
......@@ -129,7 +135,7 @@ def convert_ParentValues(val):
return [expression.Expression(e) for e in val.split(',')]
def convert_Process_Variable(val):
def convert_Process_Variable(val : str) -> Tuple[str, List[int]]:
"""Convert a string containing a process variable specification - one of:
- One or more comma-separated MATLAB-style ``start:stop:step`` ranges,
......@@ -179,7 +185,10 @@ def convert_Process_Variable(val):
return ptype, list(it.chain(*[util.parseMatlabRange(t) for t in tokens]))
def convert_Process(ptype, val):
def convert_Process(
ptype : str,
val : str
) -> Dict[str, processing.Process]:
"""Convert a string containing a sequence of comma-separated ``Process`` or
``Clean`` expressions into an ``OrderedDict`` of :class:`.Process`
objects (with the process names used as dictionary keys).
......@@ -192,7 +201,7 @@ def convert_Process(ptype, val):
return collections.OrderedDict([(p.name, p) for p in procs])
def convert_category_variables(val):
def convert_category_variables(val : str) -> List[int]:
"""Convert a string containing a sequence of comma-separated variable IDs
or ranges into a list of variable IDs. Variables may be specified as
integer IDs, or via a MATLAB-style ``start:step:stop`` range. See
......@@ -351,13 +360,19 @@ call to :func:`addImplicitCategories`).
"""
def loadTables(fileinfo,
varfiles=None,
dcfiles=None,
typefile=None,
procfile=None,
catfile=None,
**kw):
def loadTables(
fileinfo : finfo.FileInfo,
varfiles : Sequence[str] = None,
dcfiles : Sequence[str] = None,
typefile : str = None,
procfile : str = None,
catfile : str = None,
**kw
) -> Tuple[pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
List[datatable.Column],
List[datatable.Column]]:
"""Loads the data tables used to run ``funpack``.
:arg fileinfo: :class:`.FileInfo` object describing the input data files.
......@@ -401,19 +416,23 @@ def loadTables(fileinfo,
return vartable, proctable, cattable, unk, unc
def loadVariableTable(fileinfo,
varfiles=None,
dcfiles=None,
typefile=None,
noBuiltins=False,
naValues=None,
childValues=None,
recoding=None,
clean=None,
typeClean=None,
globalClean=None,
dropAbsent=True,
**kwargs):
def loadVariableTable(
fileinfo : finfo.FileInfo,
varfiles : Sequence[str] = None,
dcfiles : Sequence[str] = None,
typefile : str = None,
noBuiltins : bool = False,
naValues : Dict[int, str] = None,
childValues : Dict[int, Tuple[str, str]] = None,
recoding : Dict[int, Tuple[str, str]] = None,
clean : Dict[int, str] = None,
typeClean : Dict[util.CTYPES, str] = None,
globalClean : str = None,
dropAbsent : bool = True,
**kwargs # pylint: disable=unused-argument
) -> Tuple[pd.DataFrame,
Sequence[datatable.Column],
Sequence[datatable.Column]]:
"""Given variable table and datacoding table file names, builds and returns
the variable table.
......@@ -429,30 +448,37 @@ def loadVariableTable(fileinfo,
:arg noBuiltins: If provided, the built-in variable and datacoding base
tables are not loaded.
:arg naValues: Dictionary of ``{vid : [values]}`` mappings, specifying
values which should be replaced with NA.
:arg naValues: Dictionary of ``{vid : values}`` mappings, specifying
values which should be replaced with NA. The values
are expected to be strings of comma-separated values.
:arg childValues: Dictionary of ``{vid : [exprs], [values]}`` mappings,
:arg childValues: Dictionary of ``{vid : (exprs, values)}`` mappings,
specifying parent value expressions, and corresponding
child values.
child values. The expressions and values
are expected to be strings of comma-separated values
of the same length.
:arg recoding: Dictionary of ``{vid : [rawlevel], [newlevel]}``
mappings
:arg recoding: Dictionary of ``{vid : (rawlevel, newlevel)}``
mappings. The raw and new levels are expected to be
strings of comma-separated values of the same length.
:arg clean: Dictionary of ``{vid : expr}`` mappings containing
cleaning functions to apply - this will override
any cleaning specified in the variable file, and
any cleaning specified in ``typeClean``.
any cleaning specified in ``typeClean``. The expressions
are expected to be strings.
:arg typeClean: Dictionary of ``{type : expr}`` mappings containing
cleaning functions to apply to all variables of a
specific type - this will override any cleaning
specified in the type file.
specified in the type file. The expressions
are expected to be strings.
:arg globalClean: Expression containing cleaning functions to
apply to every variable - this will be performed after
variable-specific cleaning in the variable table,
or specified via ``clean`` or ``typeClean``.
or specified via ``clean`` or ``typeClean``. The
expressions are expected to be strings.
:arg dropAbsent: If ``True`` (the default), remove all variables from the
variable table which are not present in the data
......@@ -611,7 +637,7 @@ def loadVariableTable(fileinfo,
return vartable, unknownVars, uncleanVars
def loadTableBases():
def loadTableBases() -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Loads the UK Biobank variable and data coding schema files.
This function is called by :func:`loadVariableTable`. It loads the UK
......@@ -691,7 +717,14 @@ def loadTableBases():
return varbase, dcbase
def mergeTableFiles(base, fnames, what, dtypes, converters, columns):
def mergeTableFiles(
base : pd.DataFrame,
fnames : List[str],
what : str,
dtypes : Dict[str, Type],
converters : Dict[str, Callable],
columns : List[str]
) -> pd.DataFrame:
"""Load and merge one or more table files.
This function is called by :func:`loadVariableTable` to load the variable,
......@@ -790,7 +823,11 @@ def mergeTableFiles(base, fnames, what, dtypes, converters, columns):
return base
def sanitiseVariableTable(vartable, cols, dropAbsent):
def sanitiseVariableTable(
vartable : pd.DataFrame,
cols : Sequence[datatable.Column],
dropAbsent : bool
) -> List[datatable.Column]:
"""Ensures that the variable table contains an entry for every
variable in the input data.
......@@ -836,7 +873,10 @@ def sanitiseVariableTable(vartable, cols, dropAbsent):
return unknownVars
def mergeIntoVariableTable(vartable, cols, mapping):
def mergeIntoVariableTable(
vartable : pd.DataFrame,
cols : Sequence[str],
mapping : Union[str, Dict[int, Any]]):
"""Merge data from ``mapping`` into the variable table.
Called by :func:`loadVariableTable`.
......@@ -869,7 +909,9 @@ def mergeIntoVariableTable(vartable, cols, mapping):
vartable.at[vid, col] = val
def mergeDataCodingTable(vartable, dctable):
def mergeDataCodingTable(
vartable : pd.DataFrame,
dctable : pd.DataFrame):
"""Merges information from the data coding table into the variable
table.
......@@ -891,7 +933,12 @@ def mergeDataCodingTable(vartable, dctable):
vartable.loc[mask, field] = newvals
def mergeCleanFunctions(vartable, tytable, clean, typeClean, globalClean):
def mergeCleanFunctions(
vartable : pd.DataFrame,
tytable : pd.DataFrame,
clean : Dict[int, str],
typeClean : Dict[str, str],
globalClean : str):
"""Merges custom clean functions into the variable table.
Called by :func:`loadVariableTable`.
......@@ -964,7 +1011,12 @@ def mergeCleanFunctions(vartable, tytable, clean, typeClean, globalClean):
else: vartable.loc[ vid, 'Clean'].update(gpp)
def addNewVariable(vartable, vid, name, dtype=None, instancing=None):
def addNewVariable(
vartable : pd.DataFrame,
vid : int,
name : str,
dtype : np.dtype = None,
instancing : int = None):
"""Add a new row to the variable table.
The ``instancing`` argument defines the meaning of the
......@@ -1021,11 +1073,13 @@ def addNewVariable(vartable, vid, name, dtype=None, instancing=None):
vartable.loc[vid, 'Instancing'] = instancing
def loadProcessingTable(procfile=None,
skipProcessing=False,
prependProcess=None,
appendProcess=None,
**kwargs):
def loadProcessingTable(
procfile : str = None,
skipProcessing : bool = False,
prependProcess : Sequence[Tuple[List[int], str]] = None,
appendProcess : Sequence[Tuple[List[int], str]] = None,
**kwargs # pylint: disable=unused-argument
) -> pd.DataFrame:
"""Loads the processing table from the given file.
:arg procfile: Path to the processing table file.
......@@ -1077,7 +1131,7 @@ def loadProcessingTable(procfile=None,
return proctable
def loadCategoryTable(catfile=None):
def loadCategoryTable(catfile : str = None) -> pd.DataFrame:
"""Loads the category table from the given file.
:arg catfile: Path to the category file.
......@@ -1097,14 +1151,17 @@ def loadCategoryTable(catfile=None):
return cattable
def categoryVariables(cattable, categories):
def categoryVariables(
cattable : pd.DataFrame,
categories : Sequence[int]
) -> List[int]:
"""Returns a list of variable IDs from ``cattable`` which correspond to
the strings in ``categories``.
:arg cattable: The category table.
:arg categories: Sequence of integer category IDs or label sub-strings
specifying the categories to return.
:returns: A list of variable IDs as strings.
:returns: A list of variable IDs.
"""
allvars = []
......@@ -1128,7 +1185,10 @@ def categoryVariables(cattable, categories):
return allvars
def addImplicitCategories(cattable, unknown, uncat):
def addImplicitCategories(
cattable : pd.DataFrame,
unknown : Sequence[datatable.Column],
uncat : Sequence[datatable.Column]):
"""Adds some implicit/automatic categories to the category table.
The following implicit categories are added:
......@@ -1169,7 +1229,10 @@ def addImplicitCategories(cattable, unknown, uncat):
cattable.loc[idx, 'Variables'] = list(vids)
def columnTypes(vartable, columns):
def columnTypes(
vartable : pd.DataFrame,
columns : Sequence[datatable.Column]
) -> Tuple[List[util.CTYPES], Dict[str, np.dtype]]:
"""Retrieves the type of each column in ``columns`` as listed in ``vartable``.
Also identifies a suitable internal data type to use for each column where
possible.
......@@ -1214,7 +1277,10 @@ def columnTypes(vartable, columns):
return vttypes, dtypes
def identifyUncategorisedVariables(fileinfo, cattable):
def identifyUncategorisedVariables(
fileinfo : finfo.FileInfo,
cattable : pd.DataFrame
) -> List[datatable.Column]:
"""Called by :func:`loadTables`. Identifies all variables which are in the
data file(s), but which are uncategorised (not present in any categories
in the category table).
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment