Skip to content

pivot_longer, pivot_wider

Note that these functions are currently experimental.

from siuba.experimental.pivot import pivot_wider, pivot_longer

pivot_wider(__data, id_cols=None, id_expand=False, names_from='name', names_prefix='', names_sep='_', names_glue=None, names_sort=None, names_vary='fastest', names_expand=False, names_repair='check_unique', values_from='value', values_fill=None, values_fn=None, unused_fn=None)

Pivot data from long to wide format.

This function splits a column, putting the pieces side-by-side based on an index.

Parameters:

Name Type Description Default
__data

The input data.

required
id_cols

A selection of columns that uniquely identify each observation.

None
id_expand

Whether to ensure each unique combination of id_cols is a row in the data before pivoting, using expand(). This results in more rows. When True, this also sorts the final result by the id_cols.

False
names_from, values_from

A pair fo arguments describing which column (or columns) to get the name of the output column (names_from), and which column (or columns) to get the cell values from (values_from).

required
names_prefix

String added to the start of every variable name.

''
names_sep

If names_from or values_from contains multiple values, this will be used to join their values together into a single string to use as a column name.

'_'
names_glue

Instead of names_sep and names_prefix, supply a string template that uses the names_from columns (and a special .value variable) to create custom column names.

None
names_sort

Should the column names be sorted? The default is False, which results in column names ordered by first appearance.

None
names_vary

Option specifying how columns are ordered when names_from and values_from both identify new columns. "fastest" varies names_from fastest, while "slowest" varies names_from slowest.

'fastest'
names_expand

Whether to ensure all combinations of names_from columns are in the result using the expand() function. This results in more columns in the output.

False
names_repair

Strategy for fixing of invalid column names. "minimal" leaves them as is. "check_unique" raises an error if there are duplicate names. "unique" de-duplicates names by appending "___{position}" to them.

'check_unique'
values_fill

A scalar value used to fill in any missing values. Alternatively, a dictionary mapping column names to fill values.

None
values_fn

An optional function to apply to each cell of the output. This is useful when each cell would contain multiple values. E.g. values_fn="max" would calculate the max value.

None
unused_fn

Not implemented.

None

Examples:

>>> from siuba import _
>>> df = pd.DataFrame(
...    {"id": ["a", "b", "a"], "name": ["x", "x", "y"], "value": [1, 2, 3]}
... )
>>> df
  id name  value
0  a    x      1
1  b    x      2
2  a    y      3
>>> pivot_wider(df, names_from=_.name, values_from=_.value)
  id    x    y
0  a  1.0  3.0
1  b  2.0  NaN
>>> pivot_wider(df, names_from=_.name, values_from=_.value, values_fill=0)
  id  x  y
0  a  1  3
1  b  2  0
>>> many = pd.DataFrame({
...    "id": [1, 1, 2, 2],
...    "var": ["one", "two", "one", "two"],
...    "x": [1, 2, 3, 4],
...    "y": [6, 7, 8, 9]
... })
>>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y])
   id  x_one  x_two  y_one  y_two
0   1      1      2      6      7
1   2      3      4      8      9
>>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y], names_vary="slowest")
   id  x_one  y_one  x_two  y_two
0   1      1      6      2      7
1   2      3      8      4      9
>>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y], names_sep=".")
   id  x.one  x.two  y.one  y.two
0   1      1      2      6      7
1   2      3      4      8      9
>>> glue = "{variable}_X_{value}"
>>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y], names_glue=glue)
   id  x_X_one  x_X_two  y_X_one  y_X_two
0   1        1        2        6        7
1   2        3        4        8        9
>>> from siuba.data import warpbreaks
>>> warpbreaks.head()
   breaks wool tension
0      26    A       L
1      30    A       L
2      54    A       L
3      25    A       L
4      70    A       L
>>> pivot_wider(warpbreaks, names_from=_.wool, values_from=_.breaks, values_fn="mean")
  tension          A          B
0       H  24.555556  18.777778
1       L  44.555556  28.222222
2       M  24.000000  28.777778
Source code in siuba/experimental/pivot/pivot_wide.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def pivot_wider(
    __data,
    id_cols=None,
    id_expand=False,
    names_from="name",
    names_prefix="",
    names_sep="_",
    names_glue=None,
    names_sort=None,
    names_vary="fastest",
    names_expand=False,
    names_repair="check_unique",
    values_from="value",
    values_fill=None,
    values_fn=None,
    unused_fn=None
    ):
    """Pivot data from long to wide format.

    This function splits a column, putting the pieces side-by-side based on an index.

    Parameters
    ----------
    __data:
        The input data.
    id_cols:
        A selection of columns that uniquely identify each observation.
    id_expand:
        Whether to ensure each unique combination of id_cols is a row in the data
        before pivoting, using `expand()`. This results in more rows. When True,
        this also sorts the final result by the `id_cols`.
    names_from, values_from:
        A pair fo arguments describing which column (or columns) to get the name of
        the output column (names_from), and which column (or columns) to get the
        cell values from (values_from).
    names_prefix:
        String added to the start of every variable name.
    names_sep:
        If names_from or values_from contains multiple values, this will be used
        to join their values together into a single string to use as a column name.
    names_glue:
        Instead of names_sep and names_prefix, supply a string template that uses
        the names_from columns (and a special .value variable) to create custom
        column names.
    names_sort:
        Should the column names be sorted? The default is False, which results
        in column names ordered by first appearance.
    names_vary:
        Option specifying how columns are ordered when names_from and values_from
        both identify new columns. "fastest" varies names_from fastest, while "slowest"
        varies names_from slowest.
    names_expand:
        Whether to ensure all combinations of names_from columns are in the result
        using the `expand()` function. This results in more columns in the output.
    names_repair:
        Strategy for fixing of invalid column names. "minimal" leaves them as is.
        "check_unique" raises an error if there are duplicate names. "unique"
        de-duplicates names by appending "___{position}" to them.
    values_fill:
        A scalar value used to fill in any missing values. Alternatively, a
        dictionary mapping column names to fill values.
    values_fn:
        An optional function to apply to each cell of the output. This is useful
        when each cell would contain multiple values. E.g. values_fn="max" would
        calculate the max value.
    unused_fn:
        Not implemented.

    Examples
    --------
    >>> from siuba import _

    >>> df = pd.DataFrame(
    ...    {"id": ["a", "b", "a"], "name": ["x", "x", "y"], "value": [1, 2, 3]}
    ... )
    >>> df
      id name  value
    0  a    x      1
    1  b    x      2
    2  a    y      3

    >>> pivot_wider(df, names_from=_.name, values_from=_.value)
      id    x    y
    0  a  1.0  3.0
    1  b  2.0  NaN

    >>> pivot_wider(df, names_from=_.name, values_from=_.value, values_fill=0)
      id  x  y
    0  a  1  3
    1  b  2  0

    >>> many = pd.DataFrame({
    ...    "id": [1, 1, 2, 2],
    ...    "var": ["one", "two", "one", "two"],
    ...    "x": [1, 2, 3, 4],
    ...    "y": [6, 7, 8, 9]
    ... })
    >>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y])
       id  x_one  x_two  y_one  y_two
    0   1      1      2      6      7
    1   2      3      4      8      9

    >>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y], names_vary="slowest")
       id  x_one  y_one  x_two  y_two
    0   1      1      6      2      7
    1   2      3      8      4      9

    >>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y], names_sep=".")
       id  x.one  x.two  y.one  y.two
    0   1      1      2      6      7
    1   2      3      4      8      9

    >>> glue = "{variable}_X_{value}"
    >>> pivot_wider(many, names_from=_.var, values_from=_[_.x, _.y], names_glue=glue)
       id  x_X_one  x_X_two  y_X_one  y_X_two
    0   1        1        2        6        7
    1   2        3        4        8        9

    >>> from siuba.data import warpbreaks
    >>> warpbreaks.head()
       breaks wool tension
    0      26    A       L
    1      30    A       L
    2      54    A       L
    3      25    A       L
    4      70    A       L

    >>> pivot_wider(warpbreaks, names_from=_.wool, values_from=_.breaks, values_fn="mean")
      tension          A          B
    0       H  24.555556  18.777778
    1       L  44.555556  28.222222
    2       M  24.000000  28.777778

    """
    input_ = __data

    if isinstance(__data, DataFrameGroupBy):
        __data = __data.obj

    # create spec ----
    spec = build_wider_spec(
        __data,
        names_from = names_from,
        values_from = values_from,
        names_prefix = names_prefix,
        names_sep = names_sep,
        names_glue = names_glue,
        names_sort = names_sort,
        names_vary = names_vary,
        names_expand = names_expand
    )

    # select id columns ---
    # necessary here, since if the spec is 0 rows you cannot know values_from
    # TODO: clean up symbolic handling of slices
    if id_cols is None:
        name_vars = _tidy_select(__data, names_from, "names_from")
        val_vars = _tidy_select(__data, values_from, "values_from")
        others = {*name_vars, *val_vars}

        id_cols = tuple([col for col in __data.columns if col not in others])
        id_vars = _select_expr_slice(id_cols)
    else:
        id_vars = id_cols

    out = pivot_wider_spec(
        input_,
        spec,
        names_repair = names_repair,
        id_cols = id_vars,
        id_expand = id_expand,
        values_fill = values_fill,
        values_fn = values_fn,
        unused_fn = unused_fn
    )

    return out

pivot_longer(__data, *cols, *, names_to='name', names_prefix=None, names_sep=None, names_pattern=None, names_ptypes=None, names_repair='check_unique', values_to='value', values_drop_na=False, values_ptypes=None, values_transform=None)

Pivot data from wide to long format.

This function stacks columns of data, turning them into rows.

Parameters:

Name Type Description Default
__data

The input data.

required
*cols

Columns to pivot into longer format. This uses tidyselect (e.g. _[_.some_col, _.another_col]).

()
names_to Union[str, Tuple[str, ...]]

A list specifying the new column or columns to create from the information stored in the column names of data specified by cols.

'name'
names_prefix Optional[str]

A regular expression to strip off from the start of column selected by *cols.

None
names_sep Optional[str]

If names_to is a list of name parts, this is a separater the name is split on. This is the same as the sep argument in the separate() function.

None
names_pattern Optional[str]

If names_to is a list of name parts, this is a pattern to extract parts This is the same as the regex argument in the extract() function.

None
names_ptypes, values_ptypes

Not implemented.

required
names_transform

TODO

required
names_repair str

Strategy for fixing of invalid column names. "minimal" leaves them as is. "check_unique" raises an error if there are duplicate names. "unique" de-duplicates names by appending "___{position}" to them.

'check_unique'
values_to str

A string specifying the name of the column created to hold the stacked values of the selected *cols. If names_to is a list with the entry ".value", then this argument is ignored.

'value'

Examples:

>>> from siuba import _
>>> df = pd.DataFrame({"id": [1, 2], "x": [5, 6], "y": [7, 8]})
>>> pivot_longer(df, ~_.id, names_to="variable", values_to="number")
   id variable  number
0   1        x       5
0   1        y       7
1   2        x       6
1   2        y       8
>>> weeks = pd.DataFrame({"id": [1], "year": [2020], "wk1": [5], "wk2": [6]})
>>> pivot_longer(weeks, _.startswith("wk"), names_to="week", names_prefix="wk")
   id  year week  value
0   1  2020    1      5
0   1  2020    2      6
>>> df2 = pd.DataFrame({"id": [1], "a_x1": [2], "b_x2": [3], "a_y1": [4]})
>>> names = ["condition", "group", "number"]
>>> pat = "(.*)_(.)(.*)"
>>> pivot_longer(df2, _["a_x1":"a_y1"], names_to = names, names_pattern = pat)
   id condition group number  value
0   1         a     x      1      2
0   1         b     x      2      3
0   1         a     y      1      4
>>> names = ["x1", "x2", "y1", "y2"]
>>> wide = pd.DataFrame({
...    "x1": [1, 11], "x2": [2, 22], "y1": [3, 33], "y2": [4, 44]
... })
>>> pivot_longer(wide, _[:], names_to = [".value", "set"], names_pattern = "(.)(.)")
  set   x   y
0   1   1   3
0   2   2   4
1   1  11  33
1   2  22  44
Source code in siuba/experimental/pivot/pivot_long.py
@singledispatch2(pd.DataFrame)
def pivot_longer(
        __data,
        *cols,
        names_to: Union[str, Tuple[str, ...]] = "name",
        names_prefix: Optional[str] = None,
        names_sep: Optional[str] = None,
        names_pattern: Optional[str] = None,
        names_ptypes: Optional[Tuple] = None,
        names_repair: str = "check_unique",
        values_to: str = "value",
        values_drop_na: bool = False,
        values_ptypes: Optional[Union[str, Tuple[str, ...]]] = None,
        values_transform: Optional[Dict] = None,
):
    """Pivot data from wide to long format.

    This function stacks columns of data, turning them into rows.

    Parameters
    ----------
    __data:
        The input data.
    *cols: 
        Columns to pivot into longer format. This uses tidyselect
        (e.g. `_[_.some_col, _.another_col]`).
    names_to:
        A list specifying the new column or columns to create from the information
        stored in the column names of data specified by cols.
    names_prefix:
        A regular expression to strip off from the start of column selected by `*cols`.
    names_sep:
        If names_to is a list of name parts, this is a separater the name is split on.
        This is the same as the sep argument in the separate() function.
    names_pattern:
        If names_to is a list of name parts, this is a pattern to extract parts
        This is the same as the regex argument in the extract() function.
    names_ptypes, values_ptypes:
        Not implemented.
    names_transform:
        TODO
    names_repair:
        Strategy for fixing of invalid column names. "minimal" leaves them as is.
        "check_unique" raises an error if there are duplicate names. "unique"
        de-duplicates names by appending "___{position}" to them.
    values_to:
        A string specifying the name of the column created to hold the stacked
        values of the selected `*cols`. If names_to is a list with the entry ".value",
        then this argument is ignored.


    Examples
    --------
    >>> from siuba import _

    >>> df = pd.DataFrame({"id": [1, 2], "x": [5, 6], "y": [7, 8]})
    >>> pivot_longer(df, ~_.id, names_to="variable", values_to="number")
       id variable  number
    0   1        x       5
    0   1        y       7
    1   2        x       6
    1   2        y       8

    >>> weeks = pd.DataFrame({"id": [1], "year": [2020], "wk1": [5], "wk2": [6]})
    >>> pivot_longer(weeks, _.startswith("wk"), names_to="week", names_prefix="wk")
       id  year week  value
    0   1  2020    1      5
    0   1  2020    2      6

    >>> df2 = pd.DataFrame({"id": [1], "a_x1": [2], "b_x2": [3], "a_y1": [4]})
    >>> names = ["condition", "group", "number"]
    >>> pat = "(.*)_(.)(.*)"
    >>> pivot_longer(df2, _["a_x1":"a_y1"], names_to = names, names_pattern = pat)
       id condition group number  value
    0   1         a     x      1      2
    0   1         b     x      2      3
    0   1         a     y      1      4

    >>> names = ["x1", "x2", "y1", "y2"]
    >>> wide = pd.DataFrame({
    ...    "x1": [1, 11], "x2": [2, 22], "y1": [3, 33], "y2": [4, 44]
    ... })
    >>> pivot_longer(wide, _[:], names_to = [".value", "set"], names_pattern = "(.)(.)")
      set   x   y
    0   1   1   3
    0   2   2   4
    1   1  11  33
    1   2  22  44

    """

    df_spec = build_longer_spec(
        __data,
        *cols,
        names_to=names_to,
        values_to=values_to,
        names_prefix=names_prefix,
        names_sep=names_sep,
        names_pattern=names_pattern,
        names_ptypes=names_ptypes,
    )

    return pivot_longer_spec(
        __data,
        df_spec,
        names_repair,
        values_drop_na,
        values_ptypes, 
        values_transform
    )