count
count(__data, *args, wt=None, sort=False, name=None, **kwargs)
Summarize data with the number of rows for each grouping of data.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
A DataFrame. |
required | |
*args |
The names of columns to be used for grouping. Passed to group_by. |
() |
|
wt |
The name of a column to use as a weight for each row. |
None |
|
sort |
Whether to sort the results in descending order. |
False |
|
**kwargs |
Creates a new named column, and uses for grouping. Passed to group_by. |
{} |
Examples:
>>> from siuba import _, count, group_by, summarize, arrange
>>> from siuba.data import mtcars
>>> count(mtcars, _.cyl, high_mpg = _.mpg > 30)
cyl high_mpg n
0 4 False 7
1 4 True 4
2 6 False 7
3 8 False 14
Use sort to order results by number of observations (in descending order).
>>> count(mtcars, _.cyl, sort=True)
cyl n
0 8 14
1 4 11
2 6 7
count is equivalent to doing a grouped summarize:
>>> mtcars >> group_by(_.cyl) >> summarize(n = _.shape[0]) >> arrange(-_.n)
cyl n
2 8 14
0 4 11
1 6 7
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def count(__data, *args, wt = None, sort = False, name=None, **kwargs):
    """Tally the rows (or sum a weight column) within each group of the data.

    Parameters
    ----------
    __data:
        A DataFrame (the function is also registered for DataFrameGroupBy).
    *args:
        Columns to group by. Forwarded to group_by.
    wt:
        A column whose values are summed per group in place of counting rows.
        Must resolve to a simple column name.
    sort:
        If True, order the result by the count column, largest first.
    name:
        Name for the output count column. It is resolved by ``_check_name``
        against the columns already in the result (per the inline note below,
        the default is "n", lengthened when that name is taken).
    **kwargs:
        New named columns to create and group by. Forwarded to group_by.

    Returns
    -------
    A DataFrame holding the grouping columns followed by the count column.
    With no grouping at all, it is a single-row frame with only the count.

    Raises
    ------
    Exception
        If ``wt`` is given but ``simple_varname`` cannot reduce it to a
        column name.

    Examples
    --------

    >>> from siuba import _, count, group_by, summarize, arrange
    >>> from siuba.data import mtcars

    >>> count(mtcars, _.cyl, high_mpg = _.mpg > 30)
       cyl  high_mpg   n
    0    4     False   7
    1    4      True   4
    2    6     False   7
    3    8     False  14

    Use sort to order results by number of observations (in descending order).

    >>> count(mtcars, _.cyl, sort=True)
       cyl   n
    0    8  14
    1    4  11
    2    6   7

    count is equivalent to doing a grouped summarize:

    >>> mtcars >> group_by(_.cyl) >> summarize(n = _.shape[0]) >> arrange(-_.n)
       cyl   n
    2    8  14
    0    4  11
    1    6   7

    """
    # Only a plain DataFrame with no grouping arguments bypasses group_by.
    ungrouped = isinstance(__data, pd.DataFrame) and not (args or kwargs)

    # Validate the weight up front so both branches below can rely on it.
    weight_col = None
    if wt is not None:
        weight_col = simple_varname(wt)
        if weight_col is None:
            raise Exception("wt argument has to be simple column name")

    if ungrouped:
        # no groups: one row holding the row count, or the summed weights
        total = __data.shape[0] if wt is None else __data[weight_col].sum()
        tally = pd.DataFrame({'tmp': [total]})
    else:
        # add = True is handed to group_by as-is
        # NOTE(review): presumably keeps groupings already on __data -- confirm
        grouped = group_by(__data, *args, add = True, **kwargs)
        # plain tally per group, or weighted tally when wt was supplied
        per_group = grouped.size() if wt is None else grouped[weight_col].sum()
        tally = per_group.reset_index()

    # count col named, n. If that col already exists, add more "n"s...
    out_col = _check_name(name, set(tally.columns))

    # the tally is always the last column; give it its final name
    tally.rename(columns = {tally.columns[-1]: out_col}, inplace = True)

    if not sort:
        return tally

    ordered = tally.sort_values(out_col, ascending = False)
    return ordered.reset_index(drop = True)