Skip to content

gather, spread

gather(__data, key='key', value='value', *args, *, drop_na=False, convert=False)

Reshape table by gathering it in to long format.

Parameters:

Name Type Description Default
__data

The input data.

required
key

Name of the key (or measure) column, which holds the names of the columns that were turned into rows.

'key'
value

Name of the value column, which holds the values from the columns that were turned into rows.

'value'
*args

A selection of columns. If unspecified, all columns are selected. Any arguments you could pass to the select() verb are allowed.

()
drop_na bool

Whether to remove any rows where the value column is NA.

False

Examples:

>>> import pandas as pd
>>> from siuba import _, gather
>>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]})

The code below gathers in all columns, except id:

>>> gather(df, "key", "value", -_.id)
  id key  value
0  a   x    1.0
1  b   x    2.0
2  a   y    3.0
3  b   y    NaN
>>> gather(df, "measure", "result", _.x, _.y, drop_na=True)
  id measure  result
0  a       x     1.0
1  b       x     2.0
2  a       y     3.0
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def gather(__data, key = "key", value = "value", *args, drop_na = False, convert = False):
    """Reshape table by gathering it in to long format.

    Parameters
    ----------
    __data:
        The input data.
    key:
        Name of the key (or measure) column, which holds the names of the columns
        that were turned into rows.
    value:
        Name of the value column, which holds the values from the columns that
        were turned into rows.
    *args:
        A selection of columns. If unspecified, all columns are selected. Any
        arguments you could pass to the select() verb are allowed.
    drop_na: bool
        Whether to remove any rows where the value column is NA.


    Examples
    --------

    >>> import pandas as pd
    >>> from siuba import _, gather

    >>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]})

    The code below gathers in all columns, except id:

    >>> gather(df, "key", "value", -_.id)
      id key  value
    0  a   x    1.0
    1  b   x    2.0
    2  a   y    3.0
    3  b   y    NaN

    >>> gather(df, "measure", "result", _.x, _.y, drop_na=True)
      id measure  result
    0  a       x     1.0
    1  b       x     2.0
    2  a       y     3.0

    """
    # TODO: implement var selection over *args
    if convert:
        raise NotImplementedError("convert not yet implemented")

    # TODO: copied from nest and select
    var_list = var_create(*(args or __data.columns))
    od = var_select(__data.columns, *var_list)

    if not od:
        return __data

    id_vars = [col for col in __data.columns if col not in od]
    long = pd.melt(__data, id_vars, list(od), key, value)

    if drop_na:
        return long[~long[value].isna()].reset_index(drop = True)

    return long

spread(__data, key, value, fill=None, reset_index=True)

Reshape table by spreading it out to wide format.

Parameters:

Name Type Description Default
__data

The input data.

required
key

Column whose values will be used as new column names.

required
value

Column whose values will fill the new column entries.

required
fill

Value to set for any missing values. By default keeps them as missing values.

None

Examples:

>>> import pandas as pd                                                
>>> from siuba import _, gather                                        
>>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]}) 
>>> long = gather(df, "key", "value", -_.id, drop_na=True)
>>> long
  id key  value
0  a   x    1.0
1  b   x    2.0
2  a   y    3.0
>>> spread(long, "key", "value")
  id    x    y
0  a  1.0  3.0
1  b  2.0  NaN
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def spread(__data, key, value, fill = None, reset_index = True):
    """Reshape table by spreading it out to wide format.

    Parameters
    ----------
    __data:
        The input data.
    key:
        Column whose values will be used as new column names.
    value:
        Column whose values will fill the new column entries.
    fill:
        Value to set for any missing values. By default keeps them as missing values.


    Examples
    --------
    >>> import pandas as pd                                                
    >>> from siuba import _, gather                                        

    >>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]}) 

    >>> long = gather(df, "key", "value", -_.id, drop_na=True)
    >>> long
      id key  value
    0  a   x    1.0
    1  b   x    2.0
    2  a   y    3.0

    >>> spread(long, "key", "value")
      id    x    y
    0  a  1.0  3.0
    1  b  2.0  NaN

    """
    key_col = _get_single_var_select(__data.columns, key)
    val_col = _get_single_var_select(__data.columns, value)

    id_cols = [col for col in __data.columns if col not in (key_col, val_col)]
    wide = __data.set_index(id_cols + [key_col]).unstack(level = -1)

    if fill is not None:
        wide.fillna(fill, inplace = True)

    # remove multi-index from both rows and cols
    wide.columns = wide.columns.droplevel().rename(None)
    if reset_index:
        wide.reset_index(inplace = True)

    return wide