Skip to content

General

Importing

from siuba.dply.vector import n, lead, lag

siuba.dply.vector

between(x, left, right, default=False)

Return whether a value is between left and right (including either side).

Examples:

>>> between(pd.Series([1,2,3]), 0, 2)
0     True
1     True
2    False
dtype: bool
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def between(x, left, right, default = False):
    """Check, element by element, whether x lies within [left, right].

    Both bounds are inclusive.

    Arguments:
        x: a pandas Series object
        left: lower bound (inclusive)
        right: upper bound (inclusive)
        default: must be left as False; any other value raises a TypeError.

    Example:
        >>> between(pd.Series([1,2,3]), 0, 2)
        0     True
        1     True
        2    False
        dtype: bool

    Note:
        This is a thin wrapper around pd.Series.between(left, right)

    """
    # note: NA -> False here, whereas in the tidyverse NA -> NA. That is why
    # only default = False can be honoured.
    if default is False:
        return x.between(left, right)

    raise TypeError("between function must use default = False for pandas Series")

coalesce(x, *args)

Returns a copy of x, with NaN values filled in from *args. Ignores indexes.

Parameters:

Name Type Description Default
x None

a pandas Series object

required
*args None

other Series that are the same length as x, or a scalar

()

Examples:

>>> x = pd.Series([1.1, None, None])
>>> abc = pd.Series(['a', 'b', None])
>>> xyz = pd.Series(['x', 'y', 'z'])
>>> coalesce(x, abc)
0     1.1
1       b
2    None
dtype: object
>>> coalesce(x, abc, xyz)
0    1.1
1      b
2      z
dtype: object
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def coalesce(x, *args):
    r"""Returns a copy of x, with NaN values filled in from \*args. Ignores indexes.

    Candidates are tried in the order given: an entry of x that is still
    missing after one candidate is offered to the next one.

    Arguments:
        x: a pandas Series object
        *args: other Series that are the same length as x, or a scalar

    Returns:
        A Series carrying the index of x.

    Examples:
        >>> x = pd.Series([1.1, None, None])
        >>> abc = pd.Series(['a', 'b', None])
        >>> xyz = pd.Series(['x', 'y', 'z'])
        >>> coalesce(x, abc)
        0     1.1
        1       b
        2    None
        dtype: object

        >>> coalesce(x, abc, xyz)
        0    1.1
        1      b
        2      z
        dtype: object

    """
    # NOTE: the docstring is a raw string so that the markdown escape "\*" is
    # not parsed as an (invalid) Python string escape sequence.

    # work on positional indexes so values are matched by position, not label
    crnt = x.reset_index(drop = True)

    for other in args:
        if isinstance(other, pd.Series):
            other = other.reset_index(drop = True)

        # keep entries that are already filled, take the rest from other
        crnt = crnt.where(crnt.notna(), other)

    # restore the caller's index on the result
    crnt.index = x.index
    return crnt

cumall(x)

Return a same-length array. For each entry, indicates whether that entry and all previous are True-like.

Examples:

>>> cumall(pd.Series([True, False, False]))
0     True
1    False
2    False
dtype: bool
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cumall(x):
    """Return a same-length array of cumulative "all" results.

    Each entry indicates whether that entry and all previous entries are True-like.

    Example:
        >>> cumall(pd.Series([True, False, False]))
        0     True
        1    False
        2    False
        dtype: bool

    """
    # the shared helper does the expanding; np.all is the reducer it applies
    running_all = _expand_bool(x, np.all)
    return running_all

cumany(x)

Return a same-length array. For each entry, indicates whether that entry or any previous are True-like.

Examples:

>>> cumany(pd.Series([False, True, False]))
0    False
1     True
2     True
dtype: bool
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cumany(x):
    """Return a same-length array of cumulative "any" results.

    Each entry indicates whether that entry or any previous entry is True-like.

    Example:
        >>> cumany(pd.Series([False, True, False]))
        0    False
        1     True
        2     True
        dtype: bool

    """
    # the shared helper does the expanding; np.any is the reducer it applies
    running_any = _expand_bool(x, np.any)
    return running_any

cume_dist(x, na_option='keep')

Return the cumulative distribution corresponding to each value in x.

This reflects the proportion of values that are less than or equal to each value.

Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cume_dist(x, na_option = "keep"):
    """Return the cumulative distribution corresponding to each value in x.

    Computed as the "max" rank of each value divided by x.count(), which
    reflects the proportion of values that are less than or equal to each value.

    Arguments:
        x: a pandas Series object
        na_option: passed through to x.rank as its na_option argument.

    """
    max_ranks = x.rank(method = "max", na_option = na_option)
    n_counted = x.count()

    return max_ranks / n_counted

cummean(x)

Return a same-length array, containing the cumulative mean.

Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cummean(x):
    """Return a same-length array holding the cumulative mean of x.

    Implemented as the mean over an expanding window of x.
    """
    expanding_window = x.expanding()
    return expanding_window.mean()

dense_rank(x, na_option='keep')

Return the dense rank.

This method of ranking returns values ranging from 1 to the number of unique entries. Ties are all given the same ranking.

Examples:

>>> dense_rank(pd.Series([1,3,3,5]))
0    1.0
1    2.0
2    2.0
3    3.0
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def dense_rank(x, na_option = "keep"):
    """Return the dense rank of each value in x.

    Dense ranks run from 1 up to the number of unique entries, and tied
    values all share the same rank.

    Arguments:
        x: a pandas Series object
        na_option: passed through to x.rank as its na_option argument.

    Example:

        >>> dense_rank(pd.Series([1,3,3,5]))
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        dtype: float64


    """
    dense_ranks = x.rank(method = "dense", na_option = na_option)
    return dense_ranks

desc(x)

Return array sorted in descending order.

Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def desc(x):
    """Return the values of x sorted in descending order.

    The original index is dropped and replaced by a fresh positional one.
    """
    largest_first = x.sort_values(ascending = False)
    return largest_first.reset_index(drop = True)

first(x, n, order_by=None, default=None)

Return the nth entry of x. Similar to x[n].

Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).

Parameters:

Name Type Description Default
x None

series to get entry from.

required
n None

position of entry to get from x (0 indicates first entry).

required
order_by None

optional Series used to reorder x.

None
default None

(not implemented) value to return if no entry at n.

None

Examples:

>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'
>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'
>>> nth(ser, 0), nth(ser, -1)
('a', 'c')
>>> first(ser), last(ser)
('a', 'c')
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
            Negative values count from the end.
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n, or None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or if order_by is
            given but is not a Series.
        ValueError: if x and order_by have different lengths.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """

    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    if n >= len(x) or abs(n) > len(x):
        # default is always None at this point (see the check above)
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        # note the trailing space: adjacent literals are joined with no separator
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if len(x) != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # after the index reset, the sorted index holds the positions of x's
    # entries in order_by order
    order_indx = order_by.reset_index(drop = True).sort_values().index
    return x.iloc[order_indx[n]]

lag(x, n=1, default=None)

Return an array with each value replaced by the previous (or further backward) value in the array.

Parameters:

Name Type Description Default
x None

a pandas Series object

required
n None

number of positions to shift backward, i.e. each value is replaced with the value n entries before it

1
default None

what to replace the first n values of the array with (the entries that have no earlier value to take)

None

Examples:

>>> lag(pd.Series([1,2,3]), n=1)
0    NaN
1    1.0
2    2.0
dtype: float64
>>> lag(pd.Series([1,2,3]), n=1, default = 99)
0    99.0
1     1.0
2     2.0
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def lag(x, n = 1, default = None):
    """Return an array with each value replaced by the previous (or further backward) value in the array.

    Arguments:
        x: a pandas Series object
        n: number of positions to shift backward; each value is replaced with
            the value n entries before it. A negative n shifts forward instead.
        default: what to replace the entries vacated by the shift with (the
            first n entries for a positive n). When None, they are left as
            produced by x.shift(n).

    Example:
        >>> lag(pd.Series([1,2,3]), n=1)
        0    NaN
        1    1.0
        2    2.0
        dtype: float64

        >>> lag(pd.Series([1,2,3]), n=1, default = 99)
        0    99.0
        1     1.0
        2     2.0
        dtype: float64


    """
    res = x.shift(n)

    if default is not None:
        # A positive shift vacates the first n slots and a negative shift
        # vacates the last |n| slots. Slicing with [:n] for a negative n would
        # instead overwrite everything except the vacated slots.
        if n >= 0:
            res.iloc[:n] = default
        else:
            res.iloc[n:] = default

    return res

last(x, n, order_by=None, default=None)

Return the nth entry of x. Similar to x[n].

Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).

Parameters:

Name Type Description Default
x None

series to get entry from.

required
n None

position of entry to get from x (0 indicates first entry).

required
order_by None

optional Series used to reorder x.

None
default None

(not implemented) value to return if no entry at n.

None

Examples:

>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'
>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'
>>> nth(ser, 0), nth(ser, -1)
('a', 'c')
>>> first(ser), last(ser)
('a', 'c')
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
            Negative values count from the end.
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n, or None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or if order_by is
            given but is not a Series.
        ValueError: if x and order_by have different lengths.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """

    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    if n >= len(x) or abs(n) > len(x):
        # default is always None at this point (see the check above)
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        # note the trailing space: adjacent literals are joined with no separator
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if len(x) != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # after the index reset, the sorted index holds the positions of x's
    # entries in order_by order
    order_indx = order_by.reset_index(drop = True).sort_values().index
    return x.iloc[order_indx[n]]

lead(x, n=1, default=None)

Return an array with each value replaced by the next (or further forward) value in the array.

Parameters:

Name Type Description Default
x None

a pandas Series object

required
n None

number of positions to shift forward, i.e. each value is replaced with the value n entries after it

1
default None

what to replace the n final values of the array with

None

Examples:

>>> lead(pd.Series([1,2,3]), n=1)
0    2.0
1    3.0
2    NaN
dtype: float64
>>> lead(pd.Series([1,2,3]), n=1, default = 99)
0     2
1     3
2    99
dtype: int64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def lead(x, n = 1, default = None):
    """Return an array with each value replaced by the next (or further forward) value in the array.

    Arguments:
        x: a pandas Series object
        n: number of positions to shift forward; each value is replaced with
            the value n entries after it.
        default: passed to x.shift as fill_value, i.e. what to put in the
            entries vacated by the shift (the final n entries).

    Example:
        >>> lead(pd.Series([1,2,3]), n=1)
        0    2.0
        1    3.0
        2    NaN
        dtype: float64

        >>> lead(pd.Series([1,2,3]), n=1, default = 99)
        0     2
        1     3
        2    99
        dtype: int64

    """
    # leading by n is shifting by the negated amount
    offset = -1*n

    return x.shift(offset, fill_value = default)

min_rank(x, na_option='keep')

Return the min rank. See pd.Series.rank with method="min" for details.

Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def min_rank(x, na_option = "keep"):
    """Return the min rank of each value in x.

    See pd.Series.rank with method="min" for details.

    Arguments:
        x: a pandas Series object
        na_option: passed through to x.rank as its na_option argument.

    """
    min_ranks = x.rank(method = "min", na_option = na_option)
    return min_ranks

n(x)

Return the total number of elements in the array (or rows in a DataFrame).

Examples:

>>> ser = pd.Series([1,2,3])
>>> n(ser)
3
>>> df = pd.DataFrame({'x': ser})
>>> n(df)
3
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = NDFrame)
def n(x):
    """Return the number of elements in an array, or the number of rows in a DataFrame.

    Example:
        >>> ser = pd.Series([1,2,3])
        >>> n(ser)
        3

        >>> df = pd.DataFrame({'x': ser})
        >>> n(df)
        3

    """
    # DataFrames report their row count, anything else its length
    return x.shape[0] if isinstance(x, pd.DataFrame) else len(x)

n_distinct(x)

Return the total number of distinct (i.e. unique) elements in an array.

Examples:

>>> n_distinct(pd.Series([1,1,2,2]))
2
Source code in siuba/dply/vector.py
@alias_series_agg('nunique')
@symbolic_dispatch(cls = Series)
def n_distinct(x):
    """Return how many distinct (i.e. unique) elements an array contains.

    Delegates to x.nunique().

    Example:
        >>> n_distinct(pd.Series([1,1,2,2]))
        2

    """
    distinct_count = x.nunique()
    return distinct_count

na_if(x, y)

Return an array like x, but with values in y replaced by NAs.

Examples:

>>> na_if(pd.Series([1,2,3]), [1,3])
0    NaN
1    2.0
2    NaN
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def na_if(x, y):
    """Return an array like x, but with values in y replaced by NAs.

    Arguments:
        x: a pandas Series object
        y: a value, or a collection of values, to turn into NA.

    Examples:
        >>> na_if(pd.Series([1,2,3]), [1,3])
        0    NaN
        1    2.0
        2    NaN
        dtype: float64

    """
    # a zero-dimensional y (a scalar) is wrapped so isin receives a collection
    if np.ndim(y):
        targets = y
    else:
        targets = [y]

    # operate on a deep copy so the caller's Series is left untouched
    result = x.copy(deep = True)
    result[x.isin(targets)] = np.nan

    return result

near(x)

TODO: Not Implemented

Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def near(x):
    """Placeholder for near, which is not implemented yet.

    TODO: Not Implemented. Calling this always raises NotImplementedError.
    """
    message = "near not implemented"
    raise NotImplementedError(message)

nth(x, n, order_by=None, default=None)

Return the nth entry of x. Similar to x[n].

Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).

Parameters:

Name Type Description Default
x None

series to get entry from.

required
n None

position of entry to get from x (0 indicates first entry).

required
order_by None

optional Series used to reorder x.

None
default None

(not implemented) value to return if no entry at n.

None

Examples:

>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'
>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'
>>> nth(ser, 0), nth(ser, -1)
('a', 'c')
>>> first(ser), last(ser)
('a', 'c')
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
            Negative values count from the end.
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n, or None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or if order_by is
            given but is not a Series.
        ValueError: if x and order_by have different lengths.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """

    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    if n >= len(x) or abs(n) > len(x):
        # default is always None at this point (see the check above)
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        # note the trailing space: adjacent literals are joined with no separator
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if len(x) != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # after the index reset, the sorted index holds the positions of x's
    # entries in order_by order
    order_indx = order_by.reset_index(drop = True).sort_values().index
    return x.iloc[order_indx[n]]

ntile(x, n)

TODO: Not Implemented

Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def ntile(x, n):
    """Placeholder for ntile, which is not implemented yet.

    TODO: Not Implemented. Calling this always raises NotImplementedError.
    """
    message = "ntile not implemented"
    raise NotImplementedError(message)

percent_rank(x, na_option='keep')

Return the percent rank.

Note: Uses minimum rank, and reports the proportion of unique ranks each entry is greater than.

Examples:

>>> percent_rank(pd.Series([1, 2, 3]))
0    0.0
1    0.5
2    1.0
dtype: float64
>>> percent_rank(pd.Series([1, 2, 2]))
0    0.0
1    0.5
2    0.5
dtype: float64
>>> percent_rank(pd.Series([1]))
0   NaN
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def percent_rank(x, na_option = "keep"):
    """Return the percent rank of each value in x.

    Note:
        Uses minimum rank, and reports the proportion of unique ranks each entry is greater than.

    Arguments:
        x: a pandas Series object
        na_option: accepted for parity with the other ranking functions.
            NOTE(review): it is not forwarded to min_rank, so it currently
            has no effect on the result.

    Examples:
        >>> percent_rank(pd.Series([1, 2, 3]))
        0    0.0
        1    0.5
        2    1.0
        dtype: float64

        >>> percent_rank(pd.Series([1, 2, 2]))
        0    0.0
        1    0.5
        2    0.5
        dtype: float64

        >>> percent_rank(pd.Series([1]))
        0   NaN
        dtype: float64


    """
    # shift ranks to start at zero, then scale by the largest possible value
    zero_based_ranks = min_rank(x) - 1
    max_zero_based = x.count() - 1

    return zero_based_ranks / max_zero_based

row_number(x)

Return the row number (position) for each value in x, beginning with 1.

Examples:

>>> ser = pd.Series([7,8])
>>> row_number(ser)
0    1
1    2
dtype: int64
>>> row_number(pd.DataFrame({'a': ser}))
0    1
1    2
dtype: int64
>>> row_number(pd.Series([7,8], index = [3, 4]))
3    1
4    2
dtype: int64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = NDFrame)
def row_number(x):
    """Return the row number (position) for each value in x, beginning with 1.

    Arguments:
        x: a pandas Series or DataFrame.

    Returns:
        A Series of the integers 1..len(x) that carries the index of x.

    Example:
        >>> ser = pd.Series([7,8])
        >>> row_number(ser)
        0    1
        1    2
        dtype: int64

        >>> row_number(pd.DataFrame({'a': ser}))
        0    1
        1    2
        dtype: int64

        >>> row_number(pd.Series([7,8], index = [3, 4]))
        3    1
        4    2
        dtype: int64


    """
    # len() gives the row count for a DataFrame and the element count for a
    # Series, so no type-specific branch is needed here.
    arr = np.arange(1, len(x) + 1)

    # The fastpath keyword is deliberately not passed: it is deprecated in the
    # Series constructor, and the plain constructor gives the same result.

    # could use single dispatch, but for now ensure output data type matches input
    if isinstance(x, pd.Series):
        return x._constructor(arr, x.index)

    return pd.Series(arr, x.index)

Conditionals

Note that these functions currently can be imported from the top level:

from siuba import case_when, if_else

They will be moved into the siuba.dply.vector module, but the above import will continue to work for backwards compatibility.

case_when(__data, cases)

Generalized, vectorized if statement.

Parameters:

Name Type Description Default
__data

The input data.

required
cases dict

A mapping of condition : value.

required

Examples:

>>> import pandas as pd
>>> from siuba import _, case_when
>>> df = pd.DataFrame({"x": [1, 2, 3]})
>>> case_when(df, {_.x == 1: "one", _.x == 2: "two"})
0     one
1     two
2    None
dtype: object
>>> df >> case_when({_.x == 1: "one", _.x == 2: "two"})
0     one
1     two
2    None
dtype: object
>>> df >> case_when({_.x == 1: "one", _.x == 2: "two", True: "other"})
0      one
1      two
2    other
dtype: object
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame,pd.Series))
def case_when(__data, cases: dict):
    """Generalized, vectorized if statement.

    Cases are given in priority order: where several conditions hold for the
    same row, the value of the earliest matching case is used. Rows matched
    by no case are left as None.

    Parameters
    ----------
    __data:
        The input data.
    cases: dict
        A mapping of condition : value. A condition may be a callable, which
        is evaluated against the data, or a plain truthy value such as True,
        which applies to every row.

    See Also
    --------
    if_else : Handles the special case of two conditions.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, case_when

    >>> df = pd.DataFrame({"x": [1, 2, 3]})
    >>> case_when(df, {_.x == 1: "one", _.x == 2: "two"})
    0     one
    1     two
    2    None
    dtype: object

    >>> df >> case_when({_.x == 1: "one", _.x == 2: "two"})
    0     one
    1     two
    2    None
    dtype: object

    >>> df >> case_when({_.x == 1: "one", _.x == 2: "two", True: "other"})
    0      one
    1      two
    2    other
    dtype: object


    """
    # a Call passed in place of the mapping is first evaluated on the data
    if isinstance(cases, Call):
        cases = cases(__data)
    # TODO: handle when receive list of (k,v) pairs for py < 3.5 compat?

    plain_cases = {
        strip_symbolic(cond): strip_symbolic(value)
        for cond, value in cases.items()
    }

    n_rows = len(__data)
    filled = np.repeat(None, n_rows)

    # walk the cases last-to-first, so that earlier cases overwrite later ones
    for cond, value in reversed(list(plain_cases.items())):
        if callable(cond):
            matches = _val_call(cond, __data, n_rows)
            positions = np.where(matches)[0]

            filled[positions] = _val_call(value, __data, n_rows, positions)
            continue

        if cond:
            # e.g. cond is just True, etc.. so the value applies to every row
            filled[:] = _val_call(value, __data, n_rows)

    # by recreating an array, attempts to cast as best dtype
    return pd.Series(list(filled))

if_else(condition, true, false)

Parameters:

Name Type Description Default
condition

Logical vector (or lazy expression).

required
true

Values to be used when condition is True.

required
false

Values to be used when condition is False.

required

Examples:

>>> ser1 = pd.Series([1,2,3])
>>> if_else(ser1 > 2, np.nan, ser1)
0    1.0
1    2.0
2    NaN
dtype: float64
>>> from siuba import _
>>> f = if_else(_ < 2, _, 2)
>>> f(ser1)
0    1
1    2
2    2
dtype: int64
>>> import numpy as np
>>> ser2 = pd.Series(['NA', 'a', 'b'])
>>> if_else(ser2 == 'NA', np.nan, ser2)
0    NaN
1      a
2      b
dtype: object
Source code in siuba/dply/verbs.py
@singledispatch
def if_else(condition, true, false):
    """Choose between `true` and `false` values according to `condition`.

    This is the generic singledispatch entry point; it only calls
    raise_type_error on the condition. The examples below presumably rely on
    type-specific implementations registered outside this listing.

    Parameters
    ----------
    condition:
        Logical vector (or lazy expression).
    true:
        Values to be used when condition is True.
    false:
        Values to be used when condition is False.

    See Also
    --------
    case_when : Generalized if_else, for handling many cases.

    Examples
    --------
    >>> ser1 = pd.Series([1,2,3])
    >>> if_else(ser1 > 2, np.nan, ser1)
    0    1.0
    1    2.0
    2    NaN
    dtype: float64

    >>> from siuba import _
    >>> f = if_else(_ < 2, _, 2)
    >>> f(ser1)
    0    1
    1    2
    2    2
    dtype: int64

    >>> import numpy as np
    >>> ser2 = pd.Series(['NA', 'a', 'b'])
    >>> if_else(ser2 == 'NA', np.nan, ser2)
    0    NaN
    1      a
    2      b
    dtype: object

    """
    # fallback for condition types with no registered implementation;
    # presumably raises a TypeError describing the type -- TODO confirm
    raise_type_error(condition)