Skip to content

summarize

summarize(__data, *args, **kwargs)

Assign variables that are single number summaries of a DataFrame.

Grouped DataFrames will produce one row for each group. Otherwise, summarize produces a DataFrame with a single row.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `__data` | a DataFrame | The data being summarized. | required |
| `**kwargs` | | new_col_name=value pairs, where value can be a function taking a single argument for the data being operated on. | `{}` |

Examples:

>>> from siuba import _, group_by, summarize
>>> from siuba.data import cars
>>> cars >> summarize(avg = _.mpg.mean(), n = _.shape[0])
         avg   n
0  20.090625  32
>>> g_cyl = cars >> group_by(_.cyl)
>>> g_cyl >> summarize(min = _.mpg.min())
   cyl   min
0    4  21.4
1    6  17.8
2    8  10.4
>>> g_cyl >> summarize(mpg_std_err = _.mpg.std() / _.shape[0]**.5)
   cyl  mpg_std_err
0    4     1.359764
1    6     0.549397
2    8     0.684202
Source code in siuba/dply/verbs.py
@singledispatch2(DataFrame)
def summarize(__data, *args, **kwargs):
    """Assign variables that are single number summaries of a DataFrame.

    Grouped DataFrames will produce one row for each group. Otherwise, summarize
    produces a DataFrame with a single row.

    Parameters
    ----------
    __data: a DataFrame
        The data being summarized.
    *args:
        Unnamed callables. Each is called with the data and must return a
        DataFrame with exactly one row; every column of that frame becomes a
        column of the result.
    **kwargs:
        new_col_name=value pairs, where value can be a function taking
        a single argument for the data being operated on.

    Returns
    -------
    DataFrame
        A single-row DataFrame (index ``[0]``) holding one column per summary.

    Raises
    ------
    TypeError
        If an unnamed argument is not callable.
    ValueError
        If an unnamed argument returns something other than a single-row
        DataFrame, or if a named argument produces a non-scalar result whose
        length is not 1.


    Examples
    --------
    >>> from siuba import _, group_by, summarize
    >>> from siuba.data import cars

    >>> cars >> summarize(avg = _.mpg.mean(), n = _.shape[0])
             avg   n
    0  20.090625  32

    >>> g_cyl = cars >> group_by(_.cyl)
    >>> g_cyl >> summarize(min = _.mpg.min())
       cyl   min
    0    4  21.4
    1    6  17.8
    2    8  10.4

    >>> g_cyl >> summarize(mpg_std_err = _.mpg.std() / _.shape[0]**.5)
       cyl  mpg_std_err
    0    4     1.359764
    1    6     0.549397
    2    8     0.684202

    """
    results = {}

    # Unnamed arguments: each must be a callable producing a one-row DataFrame,
    # whose columns are spliced into the result.
    for ii, expr in enumerate(args):
        if not callable(expr):
            raise TypeError(
                "Unnamed arguments to summarize must be callable, but argument number "
                f"{ii} was type: {type(expr)}"
            )

        res = expr(__data)

        if not isinstance(res, DataFrame):
            # report the type of the *result*, not of the callable that made it
            raise ValueError(
                "Unnamed arguments to summarize must return a DataFrame, but argument "
                f"`{ii}` returned type: {type(res)}"
            )

        if len(res) != 1:
            raise ValueError(
                f"Summarize argument `{ii}` returned a DataFrame with {len(res)} rows."
                " Result must only be a single row."
            )

        for col_name in res.columns:
            # use the underlying array so the source index cannot interfere
            # with construction of the final DataFrame
            results[col_name] = res[col_name].array

    # Named arguments: either a literal value or a callable applied to the data.
    for k, v in kwargs.items():
        # TODO: raise error if a named expression returns a DataFrame
        res = v(__data) if callable(v) else v

        if not (is_scalar(res) or len(res) == 1):
            raise ValueError(
                f"Summarize argument `{k}` must return result of length 1 or a scalar.\n\n"
                f"Result type: {type(res)}\n"
                f"Result length: {len(res)}"
            )

        # keep result, but use underlying array to avoid crazy index issues
        # on DataFrame construction (#138)
        results[k] = res.array if isinstance(res, pd.Series) else res

    # must pass index, or raises error when using all scalar values
    return DataFrame(results, index = [0])