summarize
summarize(__data, *args, **kwargs)
Assign variables that are single number summaries of a DataFrame.
Grouped DataFrames will produce one row for each group. Otherwise, summarize produces a DataFrame with a single row.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
`__data` | a DataFrame | The data being summarized. | required |
`**kwargs` | | new_col_name=value pairs, where value can be a function taking a single argument for the data being operated on. | `{}` |
Examples:
>>> from siuba import _, group_by, summarize
>>> from siuba.data import cars
>>> cars >> summarize(avg = _.mpg.mean(), n = _.shape[0])
avg n
0 20.090625 32
>>> g_cyl = cars >> group_by(_.cyl)
>>> g_cyl >> summarize(min = _.mpg.min())
cyl min
0 4 21.4
1 6 17.8
2 8 10.4
>>> g_cyl >> summarize(mpg_std_err = _.mpg.std() / _.shape[0]**.5)
cyl mpg_std_err
0 4 1.359764
1 6 0.549397
2 8 0.684202
Source code in siuba/dply/verbs.py
@singledispatch2(DataFrame)
def summarize(__data, *args, **kwargs):
    """Assign variables that are single number summaries of a DataFrame.

    Grouped DataFrames will produce one row for each group. Otherwise, summarize
    produces a DataFrame with a single row.

    Parameters
    ----------
    __data: a DataFrame
        The data being summarized.
    *args:
        Unnamed callables that take the data and return a single-row
        DataFrame. Every column of that DataFrame becomes a result column.
    **kwargs:
        new_col_name=value pairs, where value can be a function taking
        a single argument for the data being operated on.

    Returns
    -------
    DataFrame
        A DataFrame with index ``[0]`` holding one column per summary.

    Raises
    ------
    TypeError
        If an unnamed argument is not callable.
    ValueError
        If an unnamed argument does not return a DataFrame with exactly one
        row, or a named argument yields a result that is neither a scalar
        nor of length 1.

    Examples
    --------

    >>> from siuba import _, group_by, summarize
    >>> from siuba.data import cars
    >>> cars >> summarize(avg = _.mpg.mean(), n = _.shape[0])
             avg   n
    0  20.090625  32

    >>> g_cyl = cars >> group_by(_.cyl)
    >>> g_cyl >> summarize(min = _.mpg.min())
       cyl   min
    0    4  21.4
    1    6  17.8
    2    8  10.4

    >>> g_cyl >> summarize(mpg_std_err = _.mpg.std() / _.shape[0]**.5)
       cyl  mpg_std_err
    0    4     1.359764
    1    6     0.549397
    2    8     0.684202

    """
    results = {}

    # Unnamed arguments: each must be a callable producing a one-row DataFrame,
    # whose columns are spliced into the result.
    for ii, expr in enumerate(args):
        if not callable(expr):
            raise TypeError(
                "Unnamed arguments to summarize must be callable, but argument number "
                f"{ii} was type: {type(expr)}"
            )

        res = expr(__data)

        if not isinstance(res, DataFrame):
            # Report the type of the returned value (not of the callable), since
            # that is what violated the contract.
            raise ValueError(
                "Unnamed arguments to summarize must return a DataFrame, but argument "
                f"`{ii}` returned type: {type(res)}"
            )

        if len(res) != 1:
            raise ValueError(
                f"Summarize argument `{ii}` returned a DataFrame with {len(res)} rows."
                " Result must only be a single row."
            )

        for col_name in res.columns:
            results[col_name] = res[col_name].array

    # Named arguments: either a literal value or a callable evaluated on the data.
    for k, v in kwargs.items():
        # TODO: raise error if a named expression returns a DataFrame
        res = v(__data) if callable(v) else v

        if is_scalar(res) or len(res) == 1:
            # keep result, but use underlying array to avoid crazy index issues
            # on DataFrame construction (#138)
            results[k] = res.array if isinstance(res, pd.Series) else res
        else:
            raise ValueError(
                f"Summarize argument `{k}` must return result of length 1 or a scalar.\n\n"
                f"Result type: {type(res)}\n"
                f"Result length: {len(res)}"
            )

    # must pass index, or raises error when using all scalar values
    return DataFrame(results, index = [0])