group_by, ungroup
group_by(__data, *args, add=False, **kwargs)
Return a grouped DataFrame, using columns or expressions to define groups.
Any operations (e.g. summarize, mutate, filter) performed on grouped data
will be performed "by group". Use ungroup()
to remove the groupings.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data | | The data being grouped. | required |
*args | | Lazy expressions used to select the grouping columns. Currently, each arg must refer to a single column (e.g. _.cyl, _.mpg). | () |
add | bool | If the data is already grouped, whether to add these groupings on top of those. | False |
**kwargs | | Keyword arguments define new columns used to group the data. | {} |
Examples:
>>> from siuba import _, group_by, summarize, filter, mutate, head
>>> from siuba.data import cars
>>> by_cyl = cars >> group_by(_.cyl)
>>> by_cyl >> summarize(max_mpg = _.mpg.max(), max_hp = _.hp.max())
cyl max_mpg max_hp
0 4 33.9 113
1 6 21.4 175
2 8 19.2 335
>>> by_cyl >> filter(_.mpg == _.mpg.max())
(grouped data frame)
cyl mpg hp
3 6 21.4 110
19 4 33.9 65
24 8 19.2 175
>>> cars >> group_by(cyl2 = _.cyl + 1) >> head(2)
(grouped data frame)
cyl mpg hp cyl2
0 6 21.0 110 7
1 6 21.0 110 7
Note that creating the new grouping column is always performed on ungrouped data. Use an explicit mutate on the grouped data to perform the operation within groups.
For example, the code below calls pd.cut on the mpg column, within each cyl group.
>>> from siuba.siu import call
>>> (cars
... >> group_by(_.cyl)
... >> mutate(mpg_bin = call(pd.cut, _.mpg, 3))
... >> group_by(_.mpg_bin, add=True)
... >> head(2)
... )
(grouped data frame)
cyl mpg hp mpg_bin
0 6 21.0 110 (20.2, 21.4]
1 6 21.0 110 (20.2, 21.4]
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def group_by(__data, *args, add = False, **kwargs):
    """Group a DataFrame by existing columns or by newly computed expressions.

    Verbs applied to the result (e.g. summarize, mutate, filter) operate
    "by group". Call `ungroup()` to drop the groupings again.

    Parameters
    ----------
    __data:
        The data being grouped.
    *args:
        Lazy expressions used to select the grouping columns. Currently, each
        arg must refer to a single column (e.g. _.cyl, _.mpg).
    add: bool
        If the data is already grouped, whether to add these groupings on top of those.
    **kwargs:
        Keyword arguments define new columns used to group the data.

    Examples
    --------
    >>> from siuba import _, group_by, summarize, filter, mutate, head
    >>> from siuba.data import cars
    >>> by_cyl = cars >> group_by(_.cyl)
    >>> by_cyl >> summarize(max_mpg = _.mpg.max(), max_hp = _.hp.max())
       cyl  max_mpg  max_hp
    0    4     33.9     113
    1    6     21.4     175
    2    8     19.2     335

    >>> by_cyl >> filter(_.mpg == _.mpg.max())
    (grouped data frame)
        cyl   mpg   hp
    3     6  21.4  110
    19    4  33.9   65
    24    8  19.2  175

    >>> cars >> group_by(cyl2 = _.cyl + 1) >> head(2)
    (grouped data frame)
       cyl   mpg   hp  cyl2
    0    6  21.0  110     7
    1    6  21.0  110     7

    Note that creating the new grouping column is always performed on ungrouped data.
    Use an explicit mutate on the grouped data to perform the operation within groups.

    For example, the code below calls pd.cut on the mpg column, within each cyl group.

    >>> from siuba.siu import call
    >>> (cars
    ...     >> group_by(_.cyl)
    ...     >> mutate(mpg_bin = call(pd.cut, _.mpg, 3))
    ...     >> group_by(_.mpg_bin, add=True)
    ...     >> head(2)
    ... )
    (grouped data frame)
       cyl   mpg   hp       mpg_bin
    0    6  21.0  110  (20.2, 21.4]
    1    6  21.0  110  (20.2, 21.4]

    """
    already_grouped = isinstance(__data, DataFrameGroupBy)

    # Always work on a copy of the underlying (ungrouped) frame, so the
    # caller's data is never modified by the column assignments below.
    frame = (__data.obj if already_grouped else __data).copy()

    # TODO: super inefficient, since it makes multiple copies of data
    #       need way to get the by_vars and apply (grouped) computation
    new_cols = transmute(frame, *args, **kwargs)
    group_names = list(new_cols.columns)
    for name in group_names:
        frame[name] = new_cols[name]

    if not (already_grouped and add):
        return frame.groupby(by = group_names)

    # Start from the existing groupings, keyed by name to keep their order.
    keys = {grouping.name: grouping for grouping in __data.grouper.groupings}

    # Referring to a column by name (rather than reusing the old grouping
    # object) ensures group levels are recalculated if it was in transmute.
    keys.update({name: name for name in group_names})

    return frame.groupby(list(keys.values()))
ungroup(__data)
Return an ungrouped DataFrame.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data | | The data being ungrouped. | required |
Examples:
>>> from siuba import _, group_by, ungroup
>>> from siuba.data import cars
>>> g_cyl = cars.groupby("cyl")
>>> res1 = ungroup(g_cyl)
>>> res2 = cars >> group_by(_.cyl) >> ungroup()
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def ungroup(__data):
    """Strip any groupings from the data and hand back a plain DataFrame.

    Parameters
    ----------
    __data:
        The data being ungrouped.

    Examples
    --------
    >>> from siuba import _, group_by, ungroup
    >>> from siuba.data import cars
    >>> g_cyl = cars.groupby("cyl")
    >>> res1 = ungroup(g_cyl)
    >>> res2 = cars >> group_by(_.cyl) >> ungroup()

    """
    # TODO: can we somehow just restore the original df used to construct
    #       the groupby?

    # A plain DataFrame has no groupings, so it is returned as-is (not copied).
    if isinstance(__data, pd.DataFrame):
        return __data

    # A grouped frame exposes the frame it was built from as `.obj`.
    if isinstance(__data, DataFrameGroupBy):
        return __data.obj

    raise TypeError(f"Unsupported type {type(__data)}")