gather, spread
gather(__data, key='key', value='value', *args, *, drop_na=False, convert=False)
Reshape table by gathering it in to long format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
The input data. |
required | |
key |
Name of the key (or measure) column, which holds the names of the columns that were turned into rows. |
'key' |
|
value |
Name of the value column, which holds the values from the columns that were turned into rows. |
'value' |
|
*args |
A selection of columns. If unspecified, all columns are selected. Any arguments you could pass to the select() verb are allowed. |
() |
|
drop_na |
bool |
Whether to remove any rows where the value column is NA. |
False |
Examples:
>>> import pandas as pd
>>> from siuba import _, gather
>>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]})
The code below gathers in all columns, except id:
>>> gather(df, "key", "value", -_.id)
id key value
0 a x 1.0
1 b x 2.0
2 a y 3.0
3 b y NaN
>>> gather(df, "measure", "result", _.x, _.y, drop_na=True)
id measure result
0 a x 1.0
1 b x 2.0
2 a y 3.0
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
def gather(__data, key = "key", value = "value", *args, drop_na = False, convert = False):
"""Reshape table by gathering it in to long format.
Parameters
----------
__data:
The input data.
key:
Name of the key (or measure) column, which holds the names of the columns
that were turned into rows.
value:
Name of the value column, which holds the values from the columns that
were turned into rows.
*args:
A selection of columns. If unspecified, all columns are selected. Any
arguments you could pass to the select() verb are allowed.
drop_na: bool
Whether to remove any rows where the value column is NA.
Examples
--------
>>> import pandas as pd
>>> from siuba import _, gather
>>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]})
The code below gathers in all columns, except id:
>>> gather(df, "key", "value", -_.id)
id key value
0 a x 1.0
1 b x 2.0
2 a y 3.0
3 b y NaN
>>> gather(df, "measure", "result", _.x, _.y, drop_na=True)
id measure result
0 a x 1.0
1 b x 2.0
2 a y 3.0
"""
# TODO: implement var selection over *args
if convert:
raise NotImplementedError("convert not yet implemented")
# TODO: copied from nest and select
var_list = var_create(*(args or __data.columns))
od = var_select(__data.columns, *var_list)
if not od:
return __data
id_vars = [col for col in __data.columns if col not in od]
long = pd.melt(__data, id_vars, list(od), key, value)
if drop_na:
return long[~long[value].isna()].reset_index(drop = True)
return long
spread(__data, key, value, fill=None, reset_index=True)
Reshape table by spreading it out to wide format.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
The input data. |
required | |
key |
Column whose values will be used as new column names. |
required | |
value |
Column whose values will fill the new column entries. |
required | |
fill |
Value to set for any missing values. By default keeps them as missing values. |
None |
Examples:
>>> import pandas as pd
>>> from siuba import _, gather
>>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]})
>>> long = gather(df, "key", "value", -_.id, drop_na=True)
>>> long
id key value
0 a x 1.0
1 b x 2.0
2 a y 3.0
>>> spread(long, "key", "value")
id x y
0 a 1.0 3.0
1 b 2.0 NaN
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def spread(__data, key, value, fill = None, reset_index = True):
"""Reshape table by spreading it out to wide format.
Parameters
----------
__data:
The input data.
key:
Column whose values will be used as new column names.
value:
Column whose values will fill the new column entries.
fill:
Value to set for any missing values. By default keeps them as missing values.
Examples
--------
>>> import pandas as pd
>>> from siuba import _, gather
>>> df = pd.DataFrame({"id": ["a", "b"], "x": [1, 2], "y": [3, None]})
>>> long = gather(df, "key", "value", -_.id, drop_na=True)
>>> long
id key value
0 a x 1.0
1 b x 2.0
2 a y 3.0
>>> spread(long, "key", "value")
id x y
0 a 1.0 3.0
1 b 2.0 NaN
"""
key_col = _get_single_var_select(__data.columns, key)
val_col = _get_single_var_select(__data.columns, value)
id_cols = [col for col in __data.columns if col not in (key_col, val_col)]
wide = __data.set_index(id_cols + [key_col]).unstack(level = -1)
if fill is not None:
wide.fillna(fill, inplace = True)
# remove multi-index from both rows and cols
wide.columns = wide.columns.droplevel().rename(None)
if reset_index:
wide.reset_index(inplace = True)
return wide