General
Importing
from siuba.dply.vector import n, lead, lag
siuba.dply.vector
between(x, left, right, default=False)
Return whether a value is between left and right (including either side).
Examples:
>>> between(pd.Series([1,2,3]), 0, 2)
0 True
1 True
2 False
dtype: bool
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def between(x, left, right, default = False):
    """Check, element-wise, whether each value lies between left and right (both ends included).

    Arguments:
        x: a pandas Series object
        left: lower bound (inclusive)
        right: upper bound (inclusive)
        default: must stay False; any other value raises a TypeError.

    Example:
        >>> between(pd.Series([1,2,3]), 0, 2)
        0     True
        1     True
        2    False
        dtype: bool

    Note:
        This delegates directly to pd.Series.between(left, right)

    """
    # pandas maps NA -> False, whereas the tidyverse keeps NA -> NA, so only
    # the pandas-style default is accepted here.
    if default is False:
        return x.between(left, right)

    raise TypeError("between function must use default = False for pandas Series")
coalesce(x, *args)
Returns a copy of x, with NaN values filled in from *args. Ignores indexes.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
None |
a pandas Series object |
required |
*args |
None |
other Series that are the same length as x, or a scalar |
() |
Examples:
>>> x = pd.Series([1.1, None, None])
>>> abc = pd.Series(['a', 'b', None])
>>> xyz = pd.Series(['x', 'y', 'z'])
>>> coalesce(x, abc)
0 1.1
1 b
2 None
dtype: object
>>> coalesce(x, abc, xyz)
0 1.1
1 b
2 z
dtype: object
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def coalesce(x, *args):
    r"""Returns a copy of x, with NaN values filled in from \*args. Ignores indexes.

    Arguments:
        x: a pandas Series object
        *args: other Series that are the same length as x, or a scalar

    Returns:
        A new Series carrying the index of x. Each missing entry is taken from
        the first of \*args (in order) that supplies a non-missing value at
        that position.

    Examples:
        >>> x = pd.Series([1.1, None, None])
        >>> abc = pd.Series(['a', 'b', None])
        >>> xyz = pd.Series(['x', 'y', 'z'])
        >>> coalesce(x, abc)
        0     1.1
        1       b
        2    None
        dtype: object

        >>> coalesce(x, abc, xyz)
        0     1.1
        1       b
        2       z
        dtype: object

    """
    # NOTE: the docstring is a raw string so that "\*" is not treated as an
    # (invalid) escape sequence.

    # Work positionally: drop every index so values are matched by position
    # rather than by label.
    crnt = x.reset_index(drop = True)
    for other in args:
        if isinstance(other, pd.Series):
            other = other.reset_index(drop = True)
        # keep existing non-missing entries, fill the rest from other
        crnt = crnt.where(crnt.notna(), other)

    # restore the caller's original index on the result
    crnt.index = x.index
    return crnt
cumall(x)
Return a same-length array. For each entry, indicates whether that entry and all previous are True-like.
Examples:
>>> cumall(pd.Series([True, False, False]))
0 True
1 False
2 False
dtype: bool
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cumall(x):
    """Cumulative "all": flag whether each entry and every entry before it is True-like.

    The result has the same length as x.

    Example:
        >>> cumall(pd.Series([True, False, False]))
        0     True
        1    False
        2    False
        dtype: bool

    """
    # np.all is the reducer handed to the shared expanding-boolean helper
    running_all = _expand_bool(x, np.all)
    return running_all
cumany(x)
Return a same-length array. For each entry, indicates whether that entry or any previous are True-like.
Examples:
>>> cumany(pd.Series([False, True, False]))
0 False
1 True
2 True
dtype: bool
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cumany(x):
    """Cumulative "any": flag whether each entry or any entry before it is True-like.

    The result has the same length as x.

    Example:
        >>> cumany(pd.Series([False, True, False]))
        0    False
        1     True
        2     True
        dtype: bool

    """
    # np.any is the reducer handed to the shared expanding-boolean helper
    running_any = _expand_bool(x, np.any)
    return running_any
cume_dist(x, na_option='keep')
Return the cumulative distribution corresponding to each value in x.
This reflects the proportion of values that are less than or equal to each value.
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cume_dist(x, na_option = "keep"):
    """Compute the cumulative distribution value for every entry of x.

    Each result is the proportion of values that are less than or equal to
    the corresponding entry.

    Arguments:
        x: a pandas Series object
        na_option: forwarded to pd.Series.rank to control how NAs are ranked.

    """
    # "max" ranking gives tied entries the highest rank within the tie
    max_ranks = x.rank(method = "max", na_option = na_option)
    n_present = x.count()
    return max_ranks / n_present
cummean(x)
Return a same-length array, containing the cumulative mean.
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def cummean(x):
    """Compute the cumulative mean of x, returning an array of the same length."""
    expanding_window = x.expanding()
    return expanding_window.mean()
dense_rank(x, na_option='keep')
Return the dense rank.
This method of ranking returns values ranging from 1 to the number of unique entries. Ties are all given the same ranking.
Examples:
>>> dense_rank(pd.Series([1,3,3,5]))
0 1.0
1 2.0
2 2.0
3 3.0
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def dense_rank(x, na_option = "keep"):
    """Compute the dense rank of each entry.

    Dense ranks run from 1 up to the number of unique entries, and tied
    entries all share the same rank.

    Arguments:
        x: a pandas Series object
        na_option: forwarded to pd.Series.rank to control how NAs are ranked.

    Example:
        >>> dense_rank(pd.Series([1,3,3,5]))
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        dtype: float64

    """
    rank_options = {"method": "dense", "na_option": na_option}
    return x.rank(**rank_options)
desc(x)
Return array sorted in descending order.
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def desc(x):
    """Sort x from largest to smallest, returning the result with a fresh 0..n-1 index."""
    largest_first = x.sort_values(ascending = False)
    return largest_first.reset_index(drop = True)
first(x, n, order_by=None, default=None)
Return the nth entry of x. Similar to x[n].
Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
None |
series to get entry from. |
required |
n |
None |
position of entry to get from x (0 indicates first entry). |
required |
order_by |
None |
optional Series used to reorder x. |
None |
default |
None |
(not implemented) value to return if no entry at n. |
None |
Examples:
>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'
>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'
>>> nth(ser, 0), nth(ser, -1)
('a', 'c')
>>> first(ser), last(ser)
('a', 'c')
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n (after reordering by order_by, if given), or
        None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or order_by is not a Series.
        ValueError: if x and order_by have different lengths.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """
    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    if n >= len(x) or abs(n) > len(x):
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        # the two literals are concatenated, so the first must end in a space
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if len(x) != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # positions of x's entries, arranged by ascending order_by value
    order_indx = order_by.reset_index(drop = True).sort_values().index

    return x.iloc[order_indx[n]]
lag(x, n=1, default=None)
Return an array with each value replaced by the previous (or further backward) value in the array.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
None |
a pandas Series object |
required |
n |
None |
number of positions to look backward for each replacement value |
1 |
default |
None |
what to replace the first n values of the array with |
None |
Examples:
>>> lag(pd.Series([1,2,3]), n=1)
0 NaN
1 1.0
2 2.0
dtype: float64
>>> lag(pd.Series([1,2,3]), n=1, default = 99)
0 99.0
1 1.0
2 2.0
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def lag(x, n = 1, default = None):
    """Return an array with each value replaced by the previous (or further backward) value in the array.

    Arguments:
        x: a pandas Series object
        n: number of positions to shift values backward by
        default: what to replace the n leading values of the array (the slots
            left empty by the shift) with

    Returns:
        A shifted copy of x. Vacated slots hold NA unless default is given.

    Example:
        >>> lag(pd.Series([1,2,3]), n=1)
        0    NaN
        1    1.0
        2    2.0
        dtype: float64

        >>> lag(pd.Series([1,2,3]), n=1, default = 99)
        0    99.0
        1     1.0
        2     2.0
        dtype: float64

    """
    res = x.shift(n)

    if default is not None and n != 0:
        # shift() vacates the first n slots for a positive n and the last |n|
        # slots for a negative n; only those slots may receive the default.
        if n > 0:
            res.iloc[:n] = default
        else:
            res.iloc[n:] = default

    return res
last(x, n, order_by=None, default=None)
Return the nth entry of x. Similar to x[n].
Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
None |
series to get entry from. |
required |
n |
None |
position of entry to get from x (0 indicates first entry). |
required |
order_by |
None |
optional Series used to reorder x. |
None |
default |
None |
(not implemented) value to return if no entry at n. |
None |
Examples:
>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'
>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'
>>> nth(ser, 0), nth(ser, -1)
('a', 'c')
>>> first(ser), last(ser)
('a', 'c')
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n (after reordering by order_by, if given), or
        None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or order_by is not a Series.
        ValueError: if x and order_by have different lengths.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """
    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    if n >= len(x) or abs(n) > len(x):
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        # the two literals are concatenated, so the first must end in a space
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if len(x) != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # positions of x's entries, arranged by ascending order_by value
    order_indx = order_by.reset_index(drop = True).sort_values().index

    return x.iloc[order_indx[n]]
lead(x, n=1, default=None)
Return an array with each value replaced by the next (or further forward) value in the array.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
None |
a pandas Series object |
required |
n |
None |
number of next values forward to replace each value with |
1 |
default |
None |
what to replace the n final values of the array with |
None |
Examples:
>>> lead(pd.Series([1,2,3]), n=1)
0 2.0
1 3.0
2 NaN
dtype: float64
>>> lead(pd.Series([1,2,3]), n=1, default = 99)
0 2
1 3
2 99
dtype: int64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def lead(x, n = 1, default = None):
    """Shift x so that every value is replaced by the value n positions after it.

    Arguments:
        x: a pandas Series object
        n: number of positions forward to take each replacement value from
        default: value placed in the n final slots, which have no later value

    Example:
        >>> lead(pd.Series([1,2,3]), n=1)
        0    2.0
        1    3.0
        2    NaN
        dtype: float64

        >>> lead(pd.Series([1,2,3]), n=1, default = 99)
        0     2
        1     3
        2    99
        dtype: int64

    """
    # a negative shift pulls later values toward the front
    offset = -1*n
    return x.shift(offset, fill_value = default)
min_rank(x, na_option='keep')
Return the min rank. See pd.Series.rank with method="min" for details.
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def min_rank(x, na_option = "keep"):
    """Compute the min rank of each entry.

    See pd.Series.rank with method="min" for details.

    Arguments:
        x: a pandas Series object
        na_option: forwarded to pd.Series.rank to control how NAs are ranked.

    """
    rank_options = {"method": "min", "na_option": na_option}
    return x.rank(**rank_options)
n(x)
Return the total number of elements in the array (or rows in a DataFrame).
Examples:
>>> ser = pd.Series([1,2,3])
>>> n(ser)
3
>>> df = pd.DataFrame({'x': ser})
>>> n(df)
3
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = NDFrame)
def n(x):
    """Count the elements of an array, or the rows of a DataFrame.

    Example:
        >>> ser = pd.Series([1,2,3])
        >>> n(ser)
        3

        >>> df = pd.DataFrame({'x': ser})
        >>> n(df)
        3

    """
    if not isinstance(x, pd.DataFrame):
        return len(x)

    # for a DataFrame, the first entry of shape is the row count
    n_rows = x.shape[0]
    return n_rows
n_distinct(x)
Return the total number of distinct (i.e. unique) elements in an array.
Examples:
>>> n_distinct(pd.Series([1,1,2,2]))
2
Source code in siuba/dply/vector.py
@alias_series_agg('nunique')
@symbolic_dispatch(cls = Series)
def n_distinct(x):
    """Count the distinct (i.e. unique) elements in an array.

    Example:
        >>> n_distinct(pd.Series([1,1,2,2]))
        2

    """
    distinct_count = x.nunique()
    return distinct_count
na_if(x, y)
Return an array like x, but with values in y replaced by NAs.
Examples:
>>> na_if(pd.Series([1,2,3]), [1,3])
0 NaN
1 2.0
2 NaN
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def na_if(x, y):
    """Return an array like x, where every value that appears in y is replaced by NA.

    Arguments:
        x: a pandas Series object
        y: a single value, or a collection of values, to turn into NA

    Examples:
        >>> na_if(pd.Series([1,2,3]), [1,3])
        0    NaN
        1    2.0
        2    NaN
        dtype: float64

    """
    # a zero-dimensional y (a scalar) is wrapped so isin receives a list
    targets = y if np.ndim(y) else [y]

    # operate on a deep copy so the caller's Series is left untouched
    result = x.copy(deep = True)
    matches = x.isin(targets)
    result[matches] = np.nan

    return result
near(x)
TODO: Not Implemented
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def near(x):
    """TODO: Not Implemented

    Calling this always raises NotImplementedError.
    """
    raise NotImplementedError("near not implemented")
nth(x, n, order_by=None, default=None)
Return the nth entry of x. Similar to x[n].
Note: first(x) and last(x) are nth(x, 0) and nth(x, -1).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
x |
None |
series to get entry from. |
required |
n |
None |
position of entry to get from x (0 indicates first entry). |
required |
order_by |
None |
optional Series used to reorder x. |
None |
default |
None |
(not implemented) value to return if no entry at n. |
None |
Examples:
>>> ser = pd.Series(['a', 'b', 'c'])
>>> nth(ser, 1)
'b'
>>> sorter = pd.Series([1, 2, 0])
>>> nth(ser, 1, order_by = sorter)
'a'
>>> nth(ser, 0), nth(ser, -1)
('a', 'c')
>>> first(ser), last(ser)
('a', 'c')
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def nth(x, n, order_by = None, default = None):
    """Return the nth entry of x. Similar to x[n].

    Note:
        first(x) and last(x) are nth(x, 0) and nth(x, -1).

    Arguments:
        x: series to get entry from.
        n: position of entry to get from x (0 indicates first entry).
        order_by: optional Series used to reorder x.
        default: (not implemented) value to return if no entry at n.

    Returns:
        The entry at position n (after reordering by order_by, if given), or
        None when n is out of range.

    Raises:
        NotImplementedError: if default is not None, or order_by is not a Series.
        ValueError: if x and order_by have different lengths.

    Examples:
        >>> ser = pd.Series(['a', 'b', 'c'])
        >>> nth(ser, 1)
        'b'

        >>> sorter = pd.Series([1, 2, 0])
        >>> nth(ser, 1, order_by = sorter)
        'a'

        >>> nth(ser, 0), nth(ser, -1)
        ('a', 'c')

        >>> first(ser), last(ser)
        ('a', 'c')

    """
    if default is not None:
        raise NotImplementedError("default argument not implemented")

    # check indexing is in range, handles positive and negative cases.
    # TODO: is returning None the correct behavior for an empty Series?
    if n >= len(x) or abs(n) > len(x):
        return default

    if order_by is None:
        return x.iloc[n]

    # case where order_by is specified and n in range ----
    # TODO: ensure order_by is arraylike
    if not isinstance(order_by, pd.Series):
        # the two literals are concatenated, so the first must end in a space
        raise NotImplementedError(
                "order_by argument is type %s, but currently only "
                "implemented for Series" % type(order_by)
                )

    if len(x) != len(order_by):
        raise ValueError("x and order_by arguments must be same length")

    # positions of x's entries, arranged by ascending order_by value
    order_indx = order_by.reset_index(drop = True).sort_values().index

    return x.iloc[order_indx[n]]
ntile(x, n)
TODO: Not Implemented
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def ntile(x, n):
    """TODO: Not Implemented

    Calling this always raises NotImplementedError.
    """
    raise NotImplementedError("ntile not implemented")
percent_rank(x, na_option='keep')
Return the percent rank.
Note: Uses minimum rank, and reports the proportion of unique ranks each entry is greater than.
Examples:
>>> percent_rank(pd.Series([1, 2, 3]))
0 0.0
1 0.5
2 1.0
dtype: float64
>>> percent_rank(pd.Series([1, 2, 2]))
0 0.0
1 0.5
2 0.5
dtype: float64
>>> percent_rank(pd.Series([1]))
0 NaN
dtype: float64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = Series)
def percent_rank(x, na_option = "keep"):
    """Return the percent rank.

    Note:
        Uses minimum rank, and reports the proportion of unique ranks each entry is greater than.

    Arguments:
        x: a pandas Series object
        na_option: passed on to min_rank to control how NAs are ranked.

    Examples:
        >>> percent_rank(pd.Series([1, 2, 3]))
        0    0.0
        1    0.5
        2    1.0
        dtype: float64

        >>> percent_rank(pd.Series([1, 2, 2]))
        0    0.0
        1    0.5
        2    0.5
        dtype: float64

        >>> percent_rank(pd.Series([1]))
        0   NaN
        dtype: float64

    """
    # forward na_option, so the argument is honored rather than silently ignored
    ranks = min_rank(x, na_option = na_option)

    # x.count() is the number of non-NA entries; a single entry gives 0 / 0 -> NaN
    return (ranks - 1) / (x.count() - 1)
row_number(x)
Return the row number (position) for each value in x, beginning with 1.
Examples:
>>> ser = pd.Series([7,8])
>>> row_number(ser)
0 1
1 2
dtype: int64
>>> row_number(pd.DataFrame({'a': ser}))
0 1
1 2
dtype: int64
>>> row_number(pd.Series([7,8], index = [3, 4]))
3 1
4 2
dtype: int64
Source code in siuba/dply/vector.py
@symbolic_dispatch(cls = NDFrame)
def row_number(x):
    """Return the row number (position) for each value in x, beginning with 1.

    Arguments:
        x: a pandas Series or DataFrame

    Returns:
        A Series of the integers 1..len(x) that carries the index of x. When x
        is a Series, the result is built with x's own constructor.

    Example:
        >>> ser = pd.Series([7,8])
        >>> row_number(ser)
        0    1
        1    2
        dtype: int64

        >>> row_number(pd.DataFrame({'a': ser}))
        0    1
        1    2
        dtype: int64

        >>> row_number(pd.Series([7,8], index = [3, 4]))
        3    1
        4    2
        dtype: int64

    """
    # len() is the number of rows for a DataFrame and the number of elements
    # for a Series, so a single call covers both inputs
    positions = np.arange(1, len(x) + 1)

    # could use single dispatch, but for now ensure output data type matches input
    # NOTE: the deprecated fastpath keyword of the Series constructor is
    # deliberately not used; the index is passed by keyword instead.
    if isinstance(x, pd.Series):
        return x._constructor(positions, index = x.index)

    return pd.Series(positions, index = x.index)
Conditionals
Note that these functions currently can be imported from the top level:
from siuba import case_when, if_else
They will be moved into the siuba.dply.vector
module, but the above import will
continue to work for backwards compatibility.
case_when(__data, cases)
Generalized, vectorized if statement.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
The input data. |
required | |
cases |
dict |
A mapping of condition : value. |
required |
Examples:
>>> import pandas as pd
>>> from siuba import _, case_when
>>> df = pd.DataFrame({"x": [1, 2, 3]})
>>> case_when(df, {_.x == 1: "one", _.x == 2: "two"})
0 one
1 two
2 None
dtype: object
>>> df >> case_when({_.x == 1: "one", _.x == 2: "two"})
0 one
1 two
2 None
dtype: object
>>> df >> case_when({_.x == 1: "one", _.x == 2: "two", True: "other"})
0 one
1 two
2 other
dtype: object
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame,pd.Series))
def case_when(__data, cases: dict):
    """Vectorized, generalized if statement: pick a value per row from the first matching case.

    Parameters
    ----------
    __data:
        The input data.
    cases: dict
        A mapping of condition : value.

    See Also
    --------
    if_else : Handles the special case of two conditions.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, case_when
    >>> df = pd.DataFrame({"x": [1, 2, 3]})
    >>> case_when(df, {_.x == 1: "one", _.x == 2: "two"})
    0     one
    1     two
    2    None
    dtype: object

    >>> df >> case_when({_.x == 1: "one", _.x == 2: "two"})
    0     one
    1     two
    2    None
    dtype: object

    >>> df >> case_when({_.x == 1: "one", _.x == 2: "two", True: "other"})
    0      one
    1      two
    2    other
    dtype: object

    """
    # a lazy Call is evaluated against the data to obtain the actual mapping
    if isinstance(cases, Call):
        cases = cases(__data)

    # TODO: handle when receive list of (k,v) pairs for py < 3.5 compat?
    plain_cases = {
        strip_symbolic(cond): strip_symbolic(value)
        for cond, value in cases.items()
    }

    n_rows = len(__data)
    filled = np.repeat(None, n_rows)

    # Walk the cases from last to first, so that entries written by earlier
    # cases overwrite those written by later ones.
    for cond, value in reversed(list(plain_cases.items())):
        if callable(cond):
            mask = _val_call(cond, __data, n_rows)
            positions = np.where(mask)[0]
            filled[positions] = _val_call(value, __data, n_rows, positions)
            continue

        if cond:
            # e.g. cond is just True, etc..
            filled[:] = _val_call(value, __data, n_rows)

    # by recreating an array, attempts to cast as best dtype
    return pd.Series(list(filled))
if_else(condition, true, false)
Parameters:
Name | Type | Description | Default |
---|---|---|---|
condition |
Logical vector (or lazy expression). |
required | |
true |
Values to be used when condition is True. |
required | |
false |
Values to be used when condition is False. |
required |
Examples:
>>> ser1 = pd.Series([1,2,3])
>>> if_else(ser1 > 2, np.nan, ser1)
0 1.0
1 2.0
2 NaN
dtype: float64
>>> from siuba import _
>>> f = if_else(_ < 2, _, 2)
>>> f(ser1)
0 1
1 2
2 2
dtype: int64
>>> import numpy as np
>>> ser2 = pd.Series(['NA', 'a', 'b'])
>>> if_else(ser2 == 'NA', np.nan, ser2)
0 NaN
1 a
2 b
dtype: object
Source code in siuba/dply/verbs.py
@singledispatch
def if_else(condition, true, false):
    """Choose between two sets of values based on a condition.

    Parameters
    ----------
    condition:
        Logical vector (or lazy expression).
    true:
        Values to be used when condition is True.
    false:
        Values to be used when condition is False.

    See Also
    --------
    case_when : Generalized if_else, for handling many cases.

    Examples
    --------
    >>> ser1 = pd.Series([1,2,3])
    >>> if_else(ser1 > 2, np.nan, ser1)
    0    1.0
    1    2.0
    2    NaN
    dtype: float64

    >>> from siuba import _
    >>> f = if_else(_ < 2, _, 2)
    >>> f(ser1)
    0    1
    1    2
    2    2
    dtype: int64

    >>> import numpy as np
    >>> ser2 = pd.Series(['NA', 'a', 'b'])
    >>> if_else(ser2 == 'NA', np.nan, ser2)
    0    NaN
    1      a
    2      b
    dtype: object

    """
    # Fallback of the single-dispatch function, used when no implementation
    # is registered for the type of condition.
    raise_type_error(condition)