Skip to content

arrange

arrange(__data, *args)

Re-order the rows of a DataFrame using the values of specified columns.

Parameters:

Name Type Description Default
__data

The input table.

required
*args

Columns or expressions used to sort the rows.

()

Examples:

>>> import pandas as pd
>>> from siuba import _, arrange, mutate
>>> df = pd.DataFrame({"x": [2, 1, 1], "y": ["aa", "b", "aa"]})
>>> df
   x   y
0  2  aa
1  1   b
2  1  aa

Arrange sorts on the first argument, then the second, etc.

>>> df >> arrange(_.x, _.y)
   x   y
2  1  aa
1  1   b
0  2  aa

Use a minus sign (-) to sort in descending order.

>>> df >> arrange(-_.x)
   x   y
0  2  aa
1  1   b
2  1  aa

Note that arrange can sort on complex expressions:

>>> df >> arrange(-_.y.str.len())
   x   y
0  2  aa
2  1  aa
1  1   b

The case above is equivalent to running a mutate before arrange:

>>> df >> mutate(res = -_.y.str.len()) >> arrange(_.res)
   x   y  res
0  2  aa   -2
2  1  aa   -2
1  1   b   -1
Source code in siuba/dply/verbs.py
@singledispatch2(DataFrame)
def arrange(__data, *args):
    """Re-order the rows of a DataFrame using the values of specified columns.

    Parameters
    ----------
    __data:
        The input table.
    *args:
        Columns or expressions used to sort the rows.

    Returns
    -------
    DataFrame
        A new DataFrame holding the rows of ``__data`` in sorted order. The
        sort uses mergesort, so rows that tie on every sort key keep their
        original relative order.

    Raises
    ------
    NotImplementedError
        If a sorting expression evaluates to a DataFrame.

    Examples
    --------

    >>> import pandas as pd
    >>> from siuba import _, arrange, mutate

    >>> df = pd.DataFrame({"x": [2, 1, 1], "y": ["aa", "b", "aa"]})
    >>> df
       x   y
    0  2  aa
    1  1   b
    2  1  aa

    Arrange sorts on the first argument, then the second, etc.

    >>> df >> arrange(_.x, _.y)
       x   y
    2  1  aa
    1  1   b
    0  2  aa

    Use a minus sign (`-`) to sort in descending order.

    >>> df >> arrange(-_.x)
       x   y
    0  2  aa
    1  1   b
    2  1  aa

    Note that arrange can sort on complex expressions:

    >>> df >> arrange(-_.y.str.len())
       x   y
    0  2  aa
    2  1  aa
    1  1   b

    The case above is equivalent to running a mutate before arrange:

    >>> df >> mutate(res = -_.y.str.len()) >> arrange(_.res)
       x   y  res
    0  2  aa   -2
    2  1  aa   -2
    1  1   b   -1

    """
    # TODO:
    #   - add arguments to pass to sort_values (e.g. ascending, kind)
    #
    # basically need some (1) select behavior, (2) mutate-like behavior
    # df.sort_values is the obvious candidate, but only takes names, not expressions
    # to work around this, we make a shallow copy of data, and add sorting columns
    # then drop them at the end
    #
    # sort order is determined by using a unary w/ Call e.g. -_.repo

    # shallow copy: temporary columns are added to `df`, not to the caller's frame
    df = __data.copy(deep=False)
    n_cols = len(df.columns)
    n_args = len(args)

    # Labels a temporary column must not reuse. Without this check, a frame
    # with integer column names could have a real column overwritten by a
    # temporary one and then dropped from the result.
    taken = set(df.columns)

    # TODO: more careful handling of arg types (true across library :/ )..
    tmp_cols = []
    sort_cols = []
    ascending = []
    for ii, arg in enumerate(args):
        # split the argument into the expression to evaluate and its sort direction
        f, asc = _call_strip_ascending(arg)
        ascending.append(asc)

        # a plain column reference can be handed to sort_values by name
        col = simple_varname(f)
        if col is not None:
            sort_cols.append(col)
            continue

        # anything else is evaluated and stored in a temporary column
        res = f(df)

        if isinstance(res, pd.DataFrame):
            raise NotImplementedError(
                f"`arrange()` expression {ii} of {n_args} returned a "
                "DataFrame, which is currently unsupported."
            )

        # start from the historical label (n_cols + ii) and step past any
        # label that is already an existing or temporary column
        tmp_label = n_cols + ii
        while tmp_label in taken:
            tmp_label += 1
        taken.add(tmp_label)

        df[tmp_label] = res
        sort_cols.append(tmp_label)
        tmp_cols.append(tmp_label)

    # mergesort is stable, so ties preserve the input row order
    return (
        df.sort_values(by=sort_cols, kind="mergesort", ascending=ascending)
          .drop(tmp_cols, axis=1)
    )