nest, unnest
nest(__data, *args, key='data')
Nest columns within a DataFrame.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
A DataFrame. |
required | |
*args |
The names of columns to be nested. May use any syntax used by the `select` function.
|
() |
|
key |
The name of the column that will hold the nested columns. |
'data' |
Examples:
>>> from siuba import _, nest
>>> from siuba.data import cars
>>> nested_cars = cars >> nest(-_.cyl)
Note that pandas with nested DataFrames looks okay in Jupyter notebooks, but has a weird representation in the IPython console, so the example below shows that each entry in the data column is a DataFrame.
>>> nested_cars.shape
(3, 2)
>>> type(nested_cars.data[0])
<class 'pandas.core.frame.DataFrame'>
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def nest(__data, *args, key = "data"):
    """Nest columns within a DataFrame.

    Parameters
    ----------
    __data:
        A DataFrame.
    *args:
        The names of columns to be nested. May use any syntax used by the
        `select` function.
    key:
        The name of the column that will hold the nested columns.

    Returns
    -------
    A DataFrame with one row per group of the un-nested columns. The grouping
    columns are restored from the group index, and the `key` column holds the
    sub-DataFrame (restricted to the nested columns) for each group.

    Examples
    --------

    >>> from siuba import _, nest
    >>> from siuba.data import cars
    >>> nested_cars = cars >> nest(-_.cyl)

    Note that pandas with nested DataFrames looks okay in Jupyter notebooks,
    but has a weird representation in the IPython console, so the example below
    shows that each entry in the data column is a DataFrame.

    >>> nested_cars.shape
    (3, 2)

    >>> type(nested_cars.data[0])
    <class 'pandas.core.frame.DataFrame'>

    """
    # TODO: copied from select function
    var_list = var_create(*args)
    od = var_select(__data.columns, *var_list)

    # unselected columns are treated similar to using groupby
    nest_keys = list(od)
    # build the membership set once, rather than once per column
    nested_names = set(nest_keys)
    grp_keys = [k for k in __data.columns if k not in nested_names]

    # split into sub DataFrames, with only nest_keys as columns
    g_df = __data.groupby(grp_keys)

    # pandas 2.2 deprecated the public `grouper` attribute in favour of
    # `_grouper`; prefer the new name and fall back for older versions.
    grouper = getattr(g_df, "_grouper", None)
    if grouper is None:
        grouper = g_df.grouper

    splitter = grouper._get_splitter(g_df.obj[nest_keys])

    # TODO: iterating over splitter now only produces 1 item (the dataframe)
    # check backwards compat
    def _extract_subdf_pandas_1_3(entry):
        # in pandas < 1.3, splitter.__iter__ returns tuple entries (ii, df)
        if isinstance(entry, tuple):
            return entry[1]

        # in pandas 1.3, each entry is just the dataframe
        return entry

    result_index = grouper.result_index
    nested_dfs = [_extract_subdf_pandas_1_3(x) for x in splitter]

    # reset_index turns the grouping keys back into ordinary columns
    out = pd.DataFrame({key: nested_dfs}, index = result_index).reset_index()

    return out
unnest(__data, key='data')
Unnest a column holding nested data (e.g. Series of lists or DataFrames).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
A DataFrame. |
required | |
key |
The name of the column to be unnested. |
'data' |
Examples:
>>> import pandas as pd
>>> df = pd.DataFrame({'id': [1,2], 'data': [['a', 'b'], ['c']]})
>>> df >> unnest()
id data
0 1 a
1 1 b
2 2 c
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def unnest(__data, key = "data"):
    """Unnest a column holding nested data (e.g. Series of lists or DataFrames).

    Parameters
    ----------
    __data:
        A DataFrame.
    key:
        The name of the column to be unnested.

    Returns
    -------
    A DataFrame in which each row of `__data` is repeated once per element of
    its nested entry, joined with the flattened nested data.

    Examples
    --------

    >>> import pandas as pd
    >>> df = pd.DataFrame({'id': [1,2], 'data': [['a', 'b'], ['c']]})
    >>> df >> unnest()
       id data
    0   1    a
    1   1    b
    2   2    c

    """
    # TODO: currently only takes key, not expressions
    # Number of rows each nested entry expands to. `convert_dtype` is not
    # passed: True is its default, and the keyword is deprecated in pandas 2.1.
    nrows_nested = __data[key].apply(len)
    # repeat each original index label once per nested element
    indx_nested = nrows_nested.index.repeat(nrows_nested)

    grp_keys = list(__data.columns[__data.columns != key])

    # flatten nested data
    data_entries = map(_convert_nested_entry, __data[key])
    long_data = pd.concat(data_entries, ignore_index = True)
    long_data.name = key

    # may be a better approach using a multi-index
    # both sides get a fresh 0..n-1 index, so the join aligns them by position
    long_grp = __data.loc[indx_nested, grp_keys].reset_index(drop = True)

    return long_grp.join(long_data)