General

Importing

from siuba.dply.vector import n, lead, lag

`siuba.dply.vector`

`between(x, left, right, default=False)`

Return whether a value is between left and right (including either side).

Examples:

>>> between(pd.Series([1,2,3]), 0, 2)
0     True
1     True
2    False
dtype: bool

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def between(x, left, right, default = False):
    """Test whether each value of x lies in the closed interval [left, right].

    Arguments:
        x: a pandas Series object
        left: lower bound (inclusive)
        right: upper bound (inclusive)
        default: must be left as False for a pandas Series.

    Raises:
        TypeError: if default is anything other than False.

    Example:
        >>> between(pd.Series([1,2,3]), 0, 2)
        0     True
        1     True
        2    False
        dtype: bool

    Note:
        This simply delegates to pd.Series.between(left, right).

    """
    # note: NA -> False here, whereas in tidyverse NA -> NA
    if default is False:
        return x.between(left, right)

    raise TypeError("between function must use default = False for pandas Series")

`coalesce(x, *args)`

Returns a copy of x, with NaN values filled in from *args. Ignores indexes.

Parameters:

Name	Type	Description	Default
`x`	`None`	a pandas Series object	required
`*args`	`None`	other Series that are the same length as x, or a scalar	`()`

Examples:

>>> x = pd.Series([1.1, None, None])
>>> abc = pd.Series(['a', 'b', None])
>>> xyz = pd.Series(['x', 'y', 'z'])
>>> coalesce(x, abc)
0     1.1
1       b
2    None
dtype: object

>>> coalesce(x, abc, xyz)
0    1.1
1      b
2      z
dtype: object

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def coalesce(x, *args):
    r"""Returns a copy of x, with NaN values filled in from \*args. Ignores indexes.

    Values are matched up by position rather than by index label, and the
    result carries the index of x.

    Arguments:
        x: a pandas Series object
        *args: other Series that are the same length as x, or a scalar

    Returns:
        A Series with the same index as x, where each missing entry has been
        replaced by the first non-missing candidate found in args (in order).

    Examples:
        >>> x = pd.Series([1.1, None, None])
        >>> abc = pd.Series(['a', 'b', None])
        >>> xyz = pd.Series(['x', 'y', 'z'])
        >>> coalesce(x, abc)
        0     1.1
        1       b
        2    None
        dtype: object

        >>> coalesce(x, abc, xyz)
        0    1.1
        1      b
        2      z
        dtype: object

    """
    # NOTE: the docstring is a raw string so that the markdown escape "\*" is
    # not treated as an (invalid) Python escape sequence.

    # work on positional indexes, so that fills line up entry by entry
    crnt = x.reset_index(drop = True)

    for other in args:
        if isinstance(other, pd.Series):
            other = other.reset_index(drop = True)

        # keep entries that are already filled, take the rest from other
        crnt = crnt.where(crnt.notna(), other)

    # restore the caller's index on the result
    crnt.index = x.index
    return crnt

`cumall(x)`

Return a same-length array. For each entry, indicates whether that entry and all previous are True-like.

Examples:

>>> cumall(pd.Series([True, False, False]))
0     True
1    False
2    False
dtype: bool

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def cumall(x):
    """Cumulative "all": flag whether each entry and every entry before it is True-like.

    The result has the same length as x.

    Example:
        >>> cumall(pd.Series([True, False, False]))
        0     True
        1    False
        2    False
        dtype: bool

    """
    # the running computation itself lives in _expand_bool, with np.all as reducer
    running_all = _expand_bool(x, np.all)
    return running_all

`cumany(x)`

Return a same-length array. For each entry, indicates whether that entry or any previous are True-like.

Examples:

>>> cumany(pd.Series([False, True, False]))
0    False
1     True
2     True
dtype: bool

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def cumany(x):
    """Cumulative "any": flag whether each entry or any entry before it is True-like.

    The result has the same length as x.

    Example:
        >>> cumany(pd.Series([False, True, False]))
        0    False
        1     True
        2     True
        dtype: bool

    """
    # the running computation itself lives in _expand_bool, with np.any as reducer
    running_any = _expand_bool(x, np.any)
    return running_any

`cume_dist(x, na_option='keep')`

Return the cumulative distribution corresponding to each value in x.

This reflects the proportion of values that are less than or equal to each value.

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def cume_dist(x, na_option = "keep"):
    """Compute the cumulative distribution value for every entry of x.

    Each result is the proportion of values that are less than or equal to
    the corresponding value.

    Arguments:
        x: a pandas Series object
        na_option: passed on to pd.Series.rank.

    """
    # "max" ranking gives, for each entry, how many ranked values are <= it
    max_ranks = x.rank(method = "max", na_option = na_option)
    n_present = x.count()

    return max_ranks / n_present

`cummean(x)`

Return a same-length array, containing the cumulative mean.

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def cummean(x):
    """Compute the cumulative (running) mean of x, as a same-length array."""
    growing_window = x.expanding()
    return growing_window.mean()

`dense_rank(x, na_option='keep')`

Return the dense rank.

This method of ranking returns values ranging from 1 to the number of unique entries. Ties are all given the same ranking.

Examples:

>>> dense_rank(pd.Series([1,3,3,5]))
0    1.0
1    2.0
2    2.0
3    3.0
dtype: float64

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def dense_rank(x, na_option = "keep"):
    """Rank the values of x without gaps between ranks.

    Ranks run from 1 up to the number of unique entries, and tied values all
    share the same rank.

    Arguments:
        x: a pandas Series object
        na_option: passed on to pd.Series.rank.

    Example:

        >>> dense_rank(pd.Series([1,3,3,5]))
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        dtype: float64

    """
    rank_options = {"method": "dense", "na_option": na_option}
    return x.rank(**rank_options)

`desc(x)`

Return array sorted in descending order.

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def desc(x):
    """Sort x from largest to smallest, returning it with a fresh positional index."""
    largest_first = x.sort_values(ascending = False)
    return largest_first.reset_index(drop = True)

`first(x, n, order_by=None, default=None)`

Return the nth entry of x. Similar to x[n].

Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).

Parameters:

Name	Type	Description	Default
`x`	`None`	series to get entry from.	required
`n`	`None`	position of entry to get from x (0 indicates first entry).	required
`order_by`	`None`	optional Series used to reorder x.	`None`
`default`	`None`	(not implemented) value to return if no entry at n.	`None`

Examples:

>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'

>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'

>>> nth(ser, 0), nth(ser, -1)
('a', 'c')

>>> first(ser), last(ser)
('a', 'c')

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
            Negative values count from the end.
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n (after reordering by order_by, if given), or
        None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or order_by is not a Series.
        ValueError: if x and order_by differ in length.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """

    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    size = len(x)
    if not -size <= n < size:
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if size != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # after dropping the index, the sorted index holds the positions of
    # order_by's entries in ascending order
    order_indx = order_by.reset_index(drop = True).sort_values().index
    return x.iloc[order_indx[n]]

`lag(x, n=1, default=None)`

Return an array with each value replaced by the previous (or further backward) value in the array.

Parameters:

Name	Type	Description	Default
`x`	`None`	a pandas Series object	required
`n`	`None`	number of next values backward to replace each value with	`1`
`default`	`None`	what to replace the n first values of the array with	`None`

Examples:

>>> lag(pd.Series([1,2,3]), n=1)
0    NaN
1    1.0
2    2.0
dtype: float64

>>> lag(pd.Series([1,2,3]), n=1, default = 99)
0    99.0
1     1.0
2     2.0
dtype: float64

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def lag(x, n = 1, default = None):
    """Return an array with each value replaced by the previous (or further backward) value in the array.

    Arguments:
        x: a pandas Series object
        n: number of next values backward to replace each value with
        default: what to replace the n final values of the array with

    Example:
        >>> lag(pd.Series([1,2,3]), n=1)
        0    NaN
        1    1.0
        2    2.0
        dtype: float64

        >>> lag(pd.Series([1,2,3]), n=1, default = 99)
        0    99.0
        1     1.0
        2     2.0
        dtype: float64


    """
    res = x.shift(n)

    if default is not None:
        res.iloc[:n] = default

    return res

`last(x, n, order_by=None, default=None)`

Return the nth entry of x. Similar to x[n].

Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).

Parameters:

Name	Type	Description	Default
`x`	`None`	series to get entry from.	required
`n`	`None`	position of entry to get from x (0 indicates first entry).	required
`order_by`	`None`	optional Series used to reorder x.	`None`
`default`	`None`	(not implemented) value to return if no entry at n.	`None`

Examples:

>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'

>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'

>>> nth(ser, 0), nth(ser, -1)
('a', 'c')

>>> first(ser), last(ser)
('a', 'c')

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
            Negative values count from the end.
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n (after reordering by order_by, if given), or
        None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or order_by is not a Series.
        ValueError: if x and order_by differ in length.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """

    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    size = len(x)
    if not -size <= n < size:
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if size != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # after dropping the index, the sorted index holds the positions of
    # order_by's entries in ascending order
    order_indx = order_by.reset_index(drop = True).sort_values().index
    return x.iloc[order_indx[n]]

`lead(x, n=1, default=None)`

Return an array with each value replaced by the next (or further forward) value in the array.

Parameters:

Name	Type	Description	Default
`x`	`None`	a pandas Series object	required
`n`	`None`	number of next values forward to replace each value with	`1`
`default`	`None`	what to replace the n final values of the array with	`None`

Examples:

>>> lead(pd.Series([1,2,3]), n=1)
0    2.0
1    3.0
2    NaN
dtype: float64

>>> lead(pd.Series([1,2,3]), n=1, default = 99)
0     2
1     3
2    99
dtype: int64

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def lead(x, n = 1, default = None):
    """Shift x so that each value is replaced by the next (or further forward) value.

    Arguments:
        x: a pandas Series object
        n: number of positions forward to take each replacement value from
        default: fill value for the entries that have no replacement
            (passed to pd.Series.shift as fill_value)

    Example:
        >>> lead(pd.Series([1,2,3]), n=1)
        0    2.0
        1    3.0
        2    NaN
        dtype: float64

        >>> lead(pd.Series([1,2,3]), n=1, default = 99)
        0     2
        1     3
        2    99
        dtype: int64

    """
    # leading by n is the same as shifting by the opposite amount
    periods = -1*n
    return x.shift(periods, fill_value = default)

`min_rank(x, na_option='keep')`

Return the min rank. See pd.Series.rank with method="min" for details.

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def min_rank(x, na_option = "keep"):
    """Return the min rank. See pd.Series.rank with method="min" for details.

    """
    return x.rank(method = "min", na_option = na_option)

`n(x)`

Return the total number of elements in the array (or rows in a DataFrame).

Examples:

>>> ser = pd.Series([1,2,3])
>>> n(ser)
3

>>> df = pd.DataFrame({'x': ser})
>>> n(df)
3

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = NDFrame)
def n(x):
    """Count the elements of an array (or the rows of a DataFrame).

    Example:
        >>> ser = pd.Series([1,2,3])
        >>> n(ser)
        3

        >>> df = pd.DataFrame({'x': ser})
        >>> n(df)
        3

    """
    # DataFrames report their row count; anything else reports its length
    return x.shape[0] if isinstance(x, pd.DataFrame) else len(x)

`n_distinct(x)`

Return the total number of distinct (i.e. unique) elements in an array.

Examples:

>>> n_distinct(pd.Series([1,1,2,2]))
2

Source code in siuba/dply/vector.py

@alias_series_agg('nunique')
@symbolic_dispatch(cls = Series)
def n_distinct(x):
    """Count the distinct (i.e. unique) elements in an array.

    This is a thin wrapper around pd.Series.nunique.

    Example:
        >>> n_distinct(pd.Series([1,1,2,2]))
        2

    """
    distinct_count = x.nunique()
    return distinct_count

`na_if(x, y)`

Return an array like x, but with values in y replaced by NAs.

Examples:

>>> na_if(pd.Series([1,2,3]), [1,3])
0    NaN
1    2.0
2    NaN
dtype: float64

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def na_if(x, y):
    """Return an array like x, but with values in y replaced by NAs.

    Arguments:
        x: a pandas Series object
        y: a single value, or a collection of values, to turn into NA.

    Examples:
        >>> na_if(pd.Series([1,2,3]), [1,3])
        0    NaN
        1    2.0
        2    NaN
        dtype: float64

    """
    # a zero-dimensional y is a lone value: wrap it so isin receives a collection
    if np.ndim(y):
        targets = y
    else:
        targets = [y]

    # blank out matches on a deep copy, leaving the input untouched
    result = x.copy(deep = True)
    result[x.isin(targets)] = np.nan

    return result

`near(x)`

TODO: Not Implemented

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def near(x):
    """Placeholder for near, which is not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    message = "near not implemented"
    raise NotImplementedError(message)

`nth(x, n, order_by=None, default=None)`

Return the nth entry of x. Similar to x[n].

Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).

Parameters:

Name	Type	Description	Default
`x`	`None`	series to get entry from.	required
`n`	`None`	position of entry to get from x (0 indicates first entry).	required
`order_by`	`None`	optional Series used to reorder x.	`None`
`default`	`None`	(not implemented) value to return if no entry at n.	`None`

Examples:

>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'

>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'

>>> nth(ser, 0), nth(ser, -1)
('a', 'c')

>>> first(ser), last(ser)
('a', 'c')

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
            Negative values count from the end.
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n (after reordering by order_by, if given), or
        None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or order_by is not a Series.
        ValueError: if x and order_by differ in length.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """

    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    size = len(x)
    if not -size <= n < size:
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if size != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # after dropping the index, the sorted index holds the positions of
    # order_by's entries in ascending order
    order_indx = order_by.reset_index(drop = True).sort_values().index
    return x.iloc[order_indx[n]]

`ntile(x, n)`

TODO: Not Implemented

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def ntile(x, n):
    """Placeholder for ntile, which is not implemented yet.

    Raises:
        NotImplementedError: always.
    """
    message = "ntile not implemented"
    raise NotImplementedError(message)

`percent_rank(x, na_option='keep')`

Return the percent rank.

Note: Uses minimum rank, and reports the proportion of unique ranks each entry is greater than.

Examples:

>>> percent_rank(pd.Series([1, 2, 3]))
0    0.0
1    0.5
2    1.0
dtype: float64

>>> percent_rank(pd.Series([1, 2, 2]))
0    0.0
1    0.5
2    0.5
dtype: float64

>>> percent_rank(pd.Series([1]))
0   NaN
dtype: float64

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = Series)
def percent_rank(x, na_option = "keep"):
    """Return the percent rank.

    Note:
        Uses minimum rank, and reports the proportion of unique ranks each entry is greater than.

    Arguments:
        x: a pandas Series object
        na_option: how missing values are ranked; forwarded to min_rank.

    Examples:
        >>> percent_rank(pd.Series([1, 2, 3]))
        0    0.0
        1    0.5
        2    1.0
        dtype: float64

        >>> percent_rank(pd.Series([1, 2, 2]))
        0    0.0
        1    0.5
        2    0.5
        dtype: float64

        >>> percent_rank(pd.Series([1]))
        0   NaN
        dtype: float64


    """
    # forward na_option, so that it is no longer silently ignored
    ranks = min_rank(x, na_option = na_option)

    # scale by the number of entries that actually received a rank; with the
    # default na_option = "keep" this equals x.count()
    n_ranked = ranks.count()

    return (ranks - 1) / (n_ranked - 1)

`row_number(x)`

Return the row number (position) for each value in x, beginning with 1.

Examples:

>>> ser = pd.Series([7,8])
>>> row_number(ser)
0    1
1    2
dtype: int64

>>> row_number(pd.DataFrame({'a': ser}))
0    1
1    2
dtype: int64

>>> row_number(pd.Series([7,8], index = [3, 4]))
3    1
4    2
dtype: int64

Source code in siuba/dply/vector.py

@symbolic_dispatch(cls = NDFrame)
def row_number(x):
    """Return the row number (position) for each value in x, beginning with 1.

    Arguments:
        x: a pandas Series or DataFrame.

    Returns:
        A Series of consecutive integers starting at 1, carrying the index of x.

    Example:
        >>> ser = pd.Series([7,8])
        >>> row_number(ser)
        0    1
        1    2
        dtype: int64

        >>> row_number(pd.DataFrame({'a': ser}))
        0    1
        1    2
        dtype: int64

        >>> row_number(pd.Series([7,8], index = [3, 4]))
        3    1
        4    2
        dtype: int64


    """
    n_rows = x.shape[0] if isinstance(x, pd.DataFrame) else len(x)

    arr = np.arange(1, n_rows + 1)

    # NOTE: the fastpath keyword is deliberately not passed to the Series
    # constructor, since it is deprecated in recent pandas versions.

    # could use single dispatch, but for now ensure output data type matches input
    if isinstance(x, pd.Series):
        return x._constructor(arr, x.index)

    return pd.Series(arr, x.index)

Conditionals

Note that these functions currently can be imported from the top level:

from siuba import case_when, if_else

They will be moved into the siuba.dply.vector module, but the above import will continue to work for backwards compatibility.

`case_when(__data, cases)`

Generalized, vectorized if statement.

Parameters:

Name	Type	Description	Default
`__data`		The input data.	required
`cases`	`dict`	A mapping of condition : value.	required

Examples:

>>> import pandas as pd
>>> from siuba import _, case_when

>>> df = pd.DataFrame({"x": [1, 2, 3]})
>>> case_when(df, {_.x == 1: "one", _.x == 2: "two"})
0     one
1     two
2    None
dtype: object

>>> df >> case_when({_.x == 1: "one", _.x == 2: "two"})
0     one
1     two
2    None
dtype: object

>>> df >> case_when({_.x == 1: "one", _.x == 2: "two", True: "other"})
0      one
1      two
2    other
dtype: object

Source code in siuba/dply/verbs.py

@singledispatch2((pd.DataFrame,pd.Series))
def case_when(__data, cases: dict):
    """Generalized, vectorized if statement.

    Cases are given in priority order: where several conditions hold for the
    same entry, the one listed first supplies the value. Entries matched by
    no case are left as None.

    Parameters
    ----------
    __data:
        The input data.
    cases: dict
        A mapping of condition : value.

    Returns
    -------
    pd.Series
        One entry per entry of the input data, built with a new default index.

    See Also
    --------
    if_else : Handles the special case of two conditions.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, case_when

    >>> df = pd.DataFrame({"x": [1, 2, 3]})
    >>> case_when(df, {_.x == 1: "one", _.x == 2: "two"})
    0     one
    1     two
    2    None
    dtype: object

    >>> df >> case_when({_.x == 1: "one", _.x == 2: "two"})
    0     one
    1     two
    2    None
    dtype: object

    >>> df >> case_when({_.x == 1: "one", _.x == 2: "two", True: "other"})
    0      one
    1      two
    2    other
    dtype: object


    """
    # cases may itself be a lazy expression, to be evaluated against the data
    if isinstance(cases, Call):
        cases = cases(__data)
    # TODO: handle when receive list of (k,v) pairs for py < 3.5 compat?

    plain_cases = {
        strip_symbolic(cond): strip_symbolic(value)
        for cond, value in cases.items()
    }

    n_rows = len(__data)
    filled = np.repeat(None, n_rows)

    # go through the cases last-to-first, so that earlier cases overwrite
    # later ones and therefore take priority
    for cond, value in list(plain_cases.items())[::-1]:
        if callable(cond):
            mask = _val_call(cond, __data, n_rows)
            hits = np.where(mask)[0]

            filled[hits] = _val_call(value, __data, n_rows, hits)
            continue

        if cond:
            # e.g. cond is just True, etc.. so the value applies everywhere
            filled[:] = _val_call(value, __data, n_rows)

    # by recreating an array, attempts to cast as best dtype
    return pd.Series(list(filled))

`if_else(condition, true, false)`

Parameters:

Name	Description	Default
`condition`	Logical vector (or lazy expression).	required
`true`	Values to be used when condition is True.	required
`false`	Values to be used when condition is False.	required

Examples:

>>> ser1 = pd.Series([1,2,3])
>>> if_else(ser1 > 2, np.nan, ser1)
0    1.0
1    2.0
2    NaN
dtype: float64

>>> from siuba import _
>>> f = if_else(_ < 2, _, 2)
>>> f(ser1)
0    1
1    2
2    2
dtype: int64

>>> import numpy as np
>>> ser2 = pd.Series(['NA', 'a', 'b'])
>>> if_else(ser2 == 'NA', np.nan, ser2)
0    NaN
1      a
2      b
dtype: object

Source code in siuba/dply/verbs.py

@singledispatch
def if_else(condition, true, false):
    """Vectorized if/else: pick values from true or false, depending on condition.

    This is the generic fallback of a single-dispatch function. It only
    hands condition to raise_type_error.
    NOTE(review): presumably the working implementations are registered for
    specific condition types elsewhere, and this fallback reports an
    unsupported type -- confirm against the registrations.

    Parameters
    ----------
    condition:
        Logical vector (or lazy expression).
    true:
        Values to be used when condition is True.
    false:
        Values to be used when condition is False.

    See Also
    --------
    case_when : Generalized if_else, for handling many cases.

    Examples
    --------
    >>> ser1 = pd.Series([1,2,3])
    >>> if_else(ser1 > 2, np.nan, ser1)
    0    1.0
    1    2.0
    2    NaN
    dtype: float64

    >>> from siuba import _
    >>> f = if_else(_ < 2, _, 2)
    >>> f(ser1)
    0    1
    1    2
    2    2
    dtype: int64

    >>> import numpy as np
    >>> ser2 = pd.Series(['NA', 'a', 'b'])
    >>> if_else(ser2 == 'NA', np.nan, ser2)
    0    NaN
    1      a
    2      b
    dtype: object

    """
    raise_type_error(condition)