Skip to content

nest, unnest

nest(__data, *args, key='data')

Nest columns within a DataFrame.

Parameters:

Name Type Description Default
__data

A DataFrame.

required
*args

The names of columns to be nested. May use any syntax used by the select function.

()
key

The name of the column that will hold the nested columns.

'data'

Examples:

>>> from siuba import _, nest
>>> from siuba.data import cars
>>> nested_cars = cars >> nest(-_.cyl)

Note that pandas with nested DataFrames looks okay in Jupyter notebooks, but has a weird representation in the IPython console, so the example below shows that each entry in the data column is a DataFrame.

>>> nested_cars.shape
(3, 2)
>>> type(nested_cars.data[0])
<class 'pandas.core.frame.DataFrame'>
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def nest(__data, *args, key = "data"):
    """Nest columns within a DataFrame.

    The selected columns are split into one sub DataFrame per group, where
    groups are defined by the unique combinations of the remaining
    (unselected) columns.

    Parameters
    ----------
    __data:
        A DataFrame.
    *args:
        The names of columns to be nested. May use any syntax used by the
        `select` function.
    key:
        The name of the column that will hold the nested columns.

    Returns
    -------
    pd.DataFrame
        One row per group of the unselected columns, with those columns
        restored as regular columns, plus a `key` column whose entries are
        the nested sub DataFrames.

    Examples
    --------

    >>> from siuba import _, nest
    >>> from siuba.data import cars
    >>> nested_cars = cars >> nest(-_.cyl)

    Note that pandas with nested DataFrames looks okay in Jupyter notebooks,
    but has a weird representation in the IPython console, so the example below
    shows that each entry in the data column is a DataFrame.

    >>> nested_cars.shape
    (3, 2)

    >>> type(nested_cars.data[0])
    <class 'pandas.core.frame.DataFrame'>

    """
    # TODO: copied from select function
    var_list = var_create(*args)
    od = var_select(__data.columns, *var_list)

    # unselected columns are treated similar to using groupby.
    # The membership set is built once, rather than once per column.
    nest_keys = list(od)
    nested_names = set(nest_keys)
    grp_keys = [k for k in __data.columns if k not in nested_names]

    # split into sub DataFrames, with only nest_keys as columns
    # NOTE(review): `grouper` and `_get_splitter` are pandas internals, so this
    # is sensitive to the installed pandas version -- confirm when upgrading.
    g_df = __data.groupby(grp_keys)
    splitter = g_df.grouper._get_splitter(g_df.obj[nest_keys])

    # TODO: iterating over splitter now only produces 1 item (the dataframe)
    # check backwards compat
    def _extract_subdf_pandas_1_3(entry):
        # in pandas < 1.3, splitter.__iter__ returns tuple entries (ii, df)
        if isinstance(entry, tuple):
            return entry[1]

        # in pandas 1.3, each entry is just the dataframe
        return entry

    result_index = g_df.grouper.result_index
    nested_dfs = [_extract_subdf_pandas_1_3(x) for x in splitter]

    # the group labels become the index, then move back into regular columns
    out = pd.DataFrame({key: nested_dfs}, index = result_index).reset_index()

    return out

unnest(__data, key='data')

Unnest a column holding nested data (e.g. Series of lists or DataFrames).

Parameters:

Name Type Description Default
__data

A DataFrame.

required
key

The name of the column to be unnested.

'data'

Examples:

>>> import pandas as pd
>>> df = pd.DataFrame({'id': [1,2], 'data': [['a', 'b'], ['c']]})
>>> df >> unnest()
   id data
0   1    a
1   1    b
2   2    c
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def unnest(__data, key = "data"):
    """Unnest a column holding nested data (e.g. Series of lists or DataFrames).

    Each row is repeated once per element of its nested entry, and the
    flattened entries are joined back next to the repeated rows.

    Parameters
    ----------
    __data:
        A DataFrame.
    key:
        The name of the column to be unnested.

    Returns
    -------
    pd.DataFrame
        The non-nested columns, repeated to match the length of each nested
        entry, joined with the flattened nested data. The result has a fresh
        default integer index.

    Examples
    --------

    >>> import pandas as pd
    >>> df = pd.DataFrame({'id': [1,2], 'data': [['a', 'b'], ['c']]})
    >>> df >> unnest()
       id data
    0   1    a
    1   1    b
    2   2    c

    """
    # TODO: currently only takes key, not expressions
    # number of elements in each nested entry. The deprecated `convert_dtype`
    # keyword is omitted; True was already the default.
    nrows_nested = __data[key].apply(len)

    # row *positions* (not index labels) repeated once per nested element, so
    # that a DataFrame with duplicate index labels is expanded correctly
    row_positions = pd.RangeIndex(len(nrows_nested)).repeat(nrows_nested)

    grp_keys = list(__data.columns[__data.columns != key])

    # flatten nested data
    data_entries = map(_convert_nested_entry, __data[key])
    long_data = pd.concat(data_entries, ignore_index = True)

    # A Series needs a name to be joined. A DataFrame already carries its
    # column names, and assigning `.name` on it would overwrite a column that
    # happens to be called "name".
    if isinstance(long_data, pd.Series):
        long_data.name = key

    # may be a better approach using a multi-index
    long_grp = __data.loc[:, grp_keys].iloc[row_positions].reset_index(drop = True)

    return long_grp.join(long_data)