arrange
arrange(__data, *args)
Re-order the rows of a DataFrame using the values of specified columns.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
`__data` | | The input table. | required |
`*args` | | Columns or expressions used to sort the rows. | () |
Examples:
>>> import pandas as pd
>>> from siuba import _, arrange, mutate
>>> df = pd.DataFrame({"x": [2, 1, 1], "y": ["aa", "b", "aa"]})
>>> df
x y
0 2 aa
1 1 b
2 1 aa
Arrange sorts on the first argument, then the second, etc.
>>> df >> arrange(_.x, _.y)
x y
2 1 aa
1 1 b
0 2 aa
Use a minus sign (`-`) to sort in descending order.
>>> df >> arrange(-_.x)
x y
0 2 aa
1 1 b
2 1 aa
Note that arrange can sort on complex expressions:
>>> df >> arrange(-_.y.str.len())
x y
0 2 aa
2 1 aa
1 1 b
The case above is equivalent to running a mutate before arrange:
>>> df >> mutate(res = -_.y.str.len()) >> arrange(_.res)
x y res
0 2 aa -2
2 1 aa -2
1 1 b -1
Source code in siuba/dply/verbs.py
@singledispatch2(DataFrame)
def arrange(__data, *args):
    """Re-order the rows of a DataFrame using the values of specified columns.

    Parameters
    ----------
    __data:
        The input table.
    *args:
        Columns or expressions used to sort the rows.

    Returns
    -------
    DataFrame
        A new DataFrame with the rows re-ordered. The input table is not
        modified, and any helper columns created for sorting are removed.

    Raises
    ------
    NotImplementedError
        If a sorting expression evaluates to a DataFrame.

    Examples
    --------

    >>> import pandas as pd
    >>> from siuba import _, arrange, mutate

    >>> df = pd.DataFrame({"x": [2, 1, 1], "y": ["aa", "b", "aa"]})
    >>> df
       x   y
    0  2  aa
    1  1   b
    2  1  aa

    Arrange sorts on the first argument, then the second, etc.

    >>> df >> arrange(_.x, _.y)
       x   y
    2  1  aa
    1  1   b
    0  2  aa

    Use a minus sign (`-`) to sort in descending order.

    >>> df >> arrange(-_.x)
       x   y
    0  2  aa
    1  1   b
    2  1  aa

    Note that arrange can sort on complex expressions:

    >>> df >> arrange(-_.y.str.len())
       x   y
    0  2  aa
    2  1  aa
    1  1   b

    The case above is equivalent to running a mutate before arrange:

    >>> df >> mutate(res = -_.y.str.len()) >> arrange(_.res)
       x   y  res
    0  2  aa   -2
    2  1  aa   -2
    1  1   b   -1

    """
    # TODO:
    #   - add arguments to pass to sort_values (e.g. ascending, kind)
    #   - more careful handling of arg types (true across library :/ )
    #
    # basically need some (1) select behavior, (2) mutate-like behavior
    # df.sort_values is the obvious candidate, but only takes names, not
    # expressions. To work around this, we make a shallow copy of data, add
    # temporary sorting columns, then drop them at the end.
    #
    # sort order is determined by using a unary w/ Call e.g. -_.repo
    df = __data.copy(deep=False)

    # Nothing to sort on: hand back the (shallow) copy untouched rather than
    # relying on how sort_values treats an empty `by` list.
    if not args:
        return df

    n_cols = len(df.columns)

    tmp_cols = []     # labels of helper columns to drop before returning
    sort_cols = []    # labels passed to sort_values, in priority order
    ascending = []    # per-key sort direction, parallel to sort_cols

    for ii, arg in enumerate(args):
        # presumably strips a leading unary minus and reports the direction;
        # see _call_strip_ascending for the exact contract
        f, asc = _call_strip_ascending(arg)
        ascending.append(asc)

        # A non-None result names an existing column we can sort on directly.
        col = simple_varname(f)
        if col is not None:
            sort_cols.append(col)
            continue

        # Otherwise evaluate the expression and sort on a temporary column.
        res = f(df)

        if isinstance(res, pd.DataFrame):
            raise NotImplementedError(
                f"`arrange()` expression {ii} of {len(args)} returned a "
                "DataFrame, which is currently unsupported."
            )

        # Pick an integer label that is not already a column, so a table whose
        # columns are named with ints is never overwritten (and later dropped).
        tmp_col = n_cols + ii
        while tmp_col in df.columns:
            tmp_col += 1

        df[tmp_col] = res
        sort_cols.append(tmp_col)
        tmp_cols.append(tmp_col)

    # mergesort is stable, so rows that tie keep their original relative order
    ordered = df.sort_values(by=sort_cols, kind="mergesort", ascending=ascending)
    return ordered.drop(tmp_cols, axis=1)