Skip to content

Forcats

Importing

from siuba.dply.forcats import fct_collapse, fct_infreq

siuba.dply.forcats

fct_collapse(fct, recat, group_other=None)

Return copy of fct with categories renamed. Optionally group all others.

Parameters:

Name Type Description Default
fct

A pandas.Categorical, or array(-like) used to create one.

required
recat

Dictionary of form {new_cat_name: old_cat_name}. old_cat_name may be a list of existing categories, to be given the same name.

required
group_other

An optional string, specifying what all other categories should be named. This will always be the last category level in the result.

None

Examples:

>>> fct_collapse(['a', 'b', 'c'], {'x': 'a'})
['x', 'b', 'c']
Categories (3, object): ['x', 'b', 'c']
>>> fct_collapse(['a', 'b', 'c'], {'x': 'a'}, group_other = 'others')
['x', 'others', 'others']
Categories (2, object): ['x', 'others']
>>> fct_collapse(['a', 'b', 'c'], {'ab': ['a', 'b']})
['ab', 'ab', 'c']
Categories (2, object): ['ab', 'c']
>>> fct_collapse(['a', 'b', None], {'a': ['b']})
['a', 'a', NaN]
Categories (1, object): ['a']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_collapse(fct, recat, group_other = None) -> pd.Categorical:
    """Rename (and optionally merge) the categories of fct, returning a copy.

    Parameters
    ----------
    fct :
        A pandas.Categorical, or array(-like) used to create one.
    recat :
        Dictionary of form {new_cat_name: old_cat_name}. old_cat_name may be
        a single existing category, or a list of existing categories that
        should all receive the same new name.
    group_other :
        Optional name given to every category not mentioned in recat. When it
        is used, it is always the last category level in the result.

    Raises
    ------
    KeyError
        If an old category name in recat is not a category of fct.
    Exception
        If the same old category is assigned a new name more than once.

    Notes
    -----
    Resulting levels index is ordered according to the earliest level replaced.
    If we rename the first and last levels to "c", then "c" is the first level.

    Examples
    --------
    >>> fct_collapse(['a', 'b', 'c'], {'x': 'a'})
    ['x', 'b', 'c']
    Categories (3, object): ['x', 'b', 'c']

    >>> fct_collapse(['a', 'b', 'c'], {'x': 'a'}, group_other = 'others')
    ['x', 'others', 'others']
    Categories (2, object): ['x', 'others']

    >>> fct_collapse(['a', 'b', 'c'], {'ab': ['a', 'b']})
    ['ab', 'ab', 'c']
    Categories (2, object): ['ab', 'c']

    >>> fct_collapse(['a', 'b', None], {'a': ['b']})
    ['a', 'a', NaN]
    Categories (1, object): ['a']

    """
    source = fct if isinstance(fct, pd.Categorical) else pd.Categorical(fct)

    # old category -> new category name; None marks "not reassigned (yet)"
    renames = dict.fromkeys(source.categories)
    for target, olds in recat.items():
        # a scalar (0-dim) entry is treated as a one-element list
        if not np.ndim(olds):
            olds = [olds]
        for old in olds:
            if renames[old] is not None:
                raise Exception("category %s was already re-assigned"%old)
            renames[old] = target

    # categories left untouched either keep their name or join group_other
    for old, target in renames.items():
        if target is not None:
            continue
        renames[old] = old if group_other is None else group_other

    # dict keys act as an insertion-ordered set of the new level names
    level_order = dict.fromkeys(renames.values(), True)

    # popping and re-inserting moves the "other" level to the end
    if group_other is not None and group_other in level_order:
        level_order[group_other] = level_order.pop(group_other)

    # new level name -> new integer code
    level_code = {name: pos for pos, name in enumerate(level_order)}

    # lookup table indexed by (old code + 1), so the missing code -1 lands on
    # slot 0 and stays -1
    lookup = np.array([-1, *(level_code[name] for name in renames.values())])

    out = pd.Categorical.from_codes(lookup[source.codes + 1], list(level_code))
    return _maybe_upcast(fct, out)

fct_infreq(fct, ordered=None)

Return a copy of fct, with categories ordered by frequency (largest first)

Parameters:

Name Type Description Default
fct list-like

A pandas Series, Categorical, or list-like object

required
ordered bool

Whether to return an ordered categorical. By default, a Categorical input's ordered setting is respected. Use this to override it.

None

Examples:

>>> fct_infreq(["c", "a", "c", "c", "a", "b"])
['c', 'a', 'c', 'c', 'a', 'b']
Categories (3, object): ['c', 'a', 'b']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_infreq(fct, ordered=None):
    """Return a copy of fct whose categories are sorted by frequency (largest first).

    Parameters
    ----------
    fct : list-like
        A pandas Series, Categorical, or list-like object
    ordered : bool
        Whether to return an ordered categorical. When left as None, the value
        is obtained from _get_cat_order(fct). Use this to override it.

    Returns
    -------
    A pandas Series when fct is a Series, otherwise a pandas Categorical.

    See Also
    --------
    fct_inorder : Order categories by when they're first observed.

    Examples
    --------

    >>> fct_infreq(["c", "a", "c", "c", "a", "b"])
    ['c', 'a', 'c', 'c', 'a', 'b']
    Categories (3, object): ['c', 'a', 'b']

    """

    if ordered is None:
        ordered = _get_cat_order(fct)

    if not isinstance(fct, pd.Categorical):
        # Series / list-like path: value_counts already yields the levels in
        # descending frequency order, so its index is used as the categories.
        is_series = isinstance(fct, pd.Series)
        values = fct if is_series else pd.Series(fct)
        counts = values.value_counts()
        result = pd.Categorical(values, categories=counts.index, ordered=ordered)

        # series in, series out
        return pd.Series(result) if is_series else result

    # Categorical path: counts come back in categories order, so the
    # categories are first put in first-observed order via fct_inorder. The
    # result is then ordered by frequency, with ties kept in observed order,
    # matching the Series path above.
    first_seen = fct_inorder(fct)
    counts = first_seen.value_counts().sort_values(ascending=False)

    # counts is a Series carrying a CategoricalIndex; recover the plain level
    # values in sorted order by indexing the categories with the index codes.
    count_index = counts.index
    by_freq = count_index.categories[count_index.dropna().codes]
    return pd.Categorical(fct, categories=by_freq, ordered=ordered)

fct_inorder(fct, ordered=None)

Return a copy of fct, with categories ordered by when they first appear.

Parameters:

Name Type Description Default
fct list-like

A pandas Series, Categorical, or list-like object

required
ordered bool

Whether to return an ordered categorical. By default, a Categorical input's ordered setting is respected. Use this to override it.

None

Examples:

>>> fct = pd.Categorical(["c", "a", "b"])
>>> fct
['c', 'a', 'b']
Categories (3, object): ['a', 'b', 'c']

Note that above the categories are sorted alphabetically. Use fct_inorder to keep the categories in first-observed order.

>>> fct_inorder(fct)
['c', 'a', 'b']
Categories (3, object): ['c', 'a', 'b']

fct_inorder also accepts pd.Series and list objects:

>>> fct_inorder(["z", "a"])
['z', 'a']
Categories (2, object): ['z', 'a']

By default, the ordered setting of categoricals is respected. Use the ordered parameter to override it.

>>> fct2 = pd.Categorical(["z", "a", "b"], ordered=True)
>>> fct_inorder(fct2)
['z', 'a', 'b']
Categories (3, object): ['z' < 'a' < 'b']
>>> fct_inorder(fct2, ordered=False)
['z', 'a', 'b']
Categories (3, object): ['z', 'a', 'b']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_inorder(fct, ordered=None):
    """Return a copy of fct whose categories follow first-appearance order.

    Parameters
    ----------
    fct : list-like
        A pandas Series, Categorical, or list-like object
    ordered : bool
        Whether to return an ordered categorical. When left as None, the value
        is obtained from _get_cat_order(fct). Use this to override it.

    See Also
    --------
    fct_infreq : Order categories by value frequency count.

    Examples
    --------

    >>> fct = pd.Categorical(["c", "a", "b"])
    >>> fct
    ['c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    Note that above the categories are sorted alphabetically. Use fct_inorder
    to keep the categories in first-observed order.

    >>> fct_inorder(fct)
    ['c', 'a', 'b']
    Categories (3, object): ['c', 'a', 'b']

    fct_inorder also accepts pd.Series and list objects:

    >>> fct_inorder(["z", "a"])
    ['z', 'a']
    Categories (2, object): ['z', 'a']

    By default, the ordered setting of categoricals is respected. Use the ordered
    parameter to override it.

    >>> fct2 = pd.Categorical(["z", "a", "b"], ordered=True)
    >>> fct_inorder(fct2)
    ['z', 'a', 'b']
    Categories (3, object): ['z' < 'a' < 'b']

    >>> fct_inorder(fct2, ordered=False)
    ['z', 'a', 'b']
    Categories (3, object): ['z', 'a', 'b']

    """

    if ordered is None:
        ordered = _get_cat_order(fct)

    # plain list-likes: go through a Series to find the non-missing unique
    # values, and return a Categorical
    if not isinstance(fct, (pd.Series, pd.Categorical)):
        observed = pd.Series(fct).dropna().unique()
        return pd.Categorical(fct, categories = observed, ordered=ordered)

    observed = fct.dropna().unique()

    # non-categorical Series: series in, so series out
    if not isinstance(observed, pd.Categorical):
        return pd.Series(pd.Categorical(fct, observed, ordered=ordered))

    # .unique() on categorical data gives back another categorical whose
    # categories are not in observed order, so rebuild the level order from
    # its codes (this branch is also reached for a Series holding categoricals)
    levels = observed.categories[observed.dropna().codes]
    return pd.Categorical(fct, levels, ordered=ordered)

fct_lump(fct, n=None, prop=None, w=None, other_level='Other', ties=None)

Return a copy of fct with categories lumped together.

Parameters:

Name Type Description Default
fct

A pandas.Categorical, or array(-like) used to create one.

required
n

Number of categories to keep.

None
prop

(not implemented) keep categories that occur prop proportion of the time.

None
w

Array of weights corresponding to each value in fct.

None
other_level

Name for all lumped together levels.

'Other'
ties

(not implemented) method to use in the case of ties.

None

Examples:

>>> fct_lump(['a', 'a', 'b', 'c'], n = 1)
['a', 'a', 'Other', 'Other']
Categories (2, object): ['a', 'Other']
>>> fct_lump(['a', 'a', 'b', 'b', 'c', 'd'], prop = .2)
['a', 'a', 'b', 'b', 'Other', 'Other']
Categories (3, object): ['a', 'b', 'Other']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_lump(fct, n = None, prop = None, w = None, other_level = "Other", ties = None) -> pd.Categorical:
    """Return a copy of fct in which the categories not kept are lumped together.

    The categories to keep are chosen by the _fct_lump_n_cats helper; every
    remaining category is collapsed into other_level.

    Parameters
    ----------
    fct :
        A pandas.Categorical, or array(-like) used to create one.
    n :
        Number of categories to keep.
    prop :
        Proportion threshold, forwarded to the category-selection helper.
    w :
        Array of weights corresponding to each value in fct.
    other_level :
        Name for all lumped together levels.
    ties :
        (not implemented) method to use in the case of ties.

    Raises
    ------
    NotImplementedError
        If ties is given, or if neither n nor prop is given.

    Notes
    -----
    Currently, one of n and prop must be specified.

    Examples
    --------
    >>> fct_lump(['a', 'a', 'b', 'c'], n = 1)
    ['a', 'a', 'Other', 'Other']
    Categories (2, object): ['a', 'Other']

    >>> fct_lump(['a', 'a', 'b', 'b', 'c', 'd'], prop = .2)
    ['a', 'a', 'b', 'b', 'Other', 'Other']
    Categories (3, object): ['a', 'b', 'Other']

    """

    if ties is not None:
        raise NotImplementedError("ties is not implemented")

    if all(arg is None for arg in (n, prop)):
        raise NotImplementedError("Either n or prop must be specified")

    kept = _fct_lump_n_cats(fct, w, other_level, ties, n = n, prop = prop)

    # map every kept level onto itself, so fct_collapse only regroups the rest
    identity = {}
    for level in kept:
        identity[level] = level

    lumped = fct_collapse(fct, identity, group_other = other_level)
    return _maybe_upcast(fct, lumped)

fct_recode(fct, recat=None, **kwargs)

Return copy of fct with renamed categories.

Parameters:

Name Type Description Default
fct

A pandas.Categorical, or array(-like) used to create one.

required
**kwargs

Arguments of form new_name = old_name.

{}

Examples:

>>> cat = ['a', 'b', 'c']
>>> fct_recode(cat, z = 'c')
['a', 'b', 'z']
Categories (3, object): ['a', 'b', 'z']
>>> fct_recode(cat, x = ['a', 'b'])
['x', 'x', 'c']
Categories (2, object): ['x', 'c']
>>> fct_recode(cat, {"x": ['a', 'b']})
['x', 'x', 'c']
Categories (2, object): ['x', 'c']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_recode(fct, recat=None, **kwargs) -> pd.Categorical:
    """Return a copy of fct with its categories renamed.

    Parameters
    ----------
    fct :
        A pandas.Categorical, or array(-like) used to create one.
    recat :
        Optional dictionary of form {new_name: old_name}. It is merged with
        any keyword arguments before being handed to fct_collapse.
    **kwargs :
        Arguments of form new_name = old_name.

    Raises
    ------
    TypeError
        If recat is given (and truthy) but is not a dict.
    ValueError
        If the same new name appears in both recat and the keyword arguments.

    Examples
    --------
    >>> cat = ['a', 'b', 'c']
    >>> fct_recode(cat, z = 'c')
    ['a', 'b', 'z']
    Categories (3, object): ['a', 'b', 'z']

    >>> fct_recode(cat, x = ['a', 'b'])
    ['x', 'x', 'c']
    Categories (2, object): ['x', 'c']

    >>> fct_recode(cat, {"x": ['a', 'b']})
    ['x', 'x', 'c']
    Categories (2, object): ['x', 'c']

    """

    if not recat:
        # nothing passed positionally: the keyword arguments are the mapping
        mapping = kwargs
    else:
        if not isinstance(recat, dict):
            raise TypeError("fct_recode requires named args or a dict.")

        # a new name may come from the dict or from a keyword, but not both
        clashes = set(recat).intersection(set(kwargs))
        if clashes:
            message = "The following recode name(s) were specified more than once: {}"
            raise ValueError(message.format(clashes))

        mapping = {**recat, **kwargs}

    renamed = fct_collapse(fct, mapping)
    return _maybe_upcast(fct, renamed)

fct_reorder(fct, x, func=np.median, desc=False)

Return copy of fct, with categories reordered according to values in x.

Parameters:

Name Type Description Default
fct

A pandas.Categorical, or array(-like) used to create one.

required
x

Values used to reorder categorical. Must be same length as fct.

required
func

Function run over all values within a level of the categorical.

np.median
desc

Whether to sort in descending order.

False

Examples:

>>> fct_reorder(['a', 'a', 'b'], [4, 3, 2])
['a', 'a', 'b']
Categories (2, object): ['b', 'a']
>>> fct_reorder(['a', 'a', 'b'], [4, 3, 2], desc = True)
['a', 'a', 'b']
Categories (2, object): ['a', 'b']
>>> fct_reorder(['x', 'x', 'y'], [4, 0, 2], np.max)
['x', 'x', 'y']
Categories (2, object): ['y', 'x']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical:
    """Return a copy of fct whose categories are reordered by a summary of x.

    The values of x are grouped by the corresponding entry of fct, func is
    applied to each group, and the categories are sorted by that result.

    Parameters
    ----------
    fct :
        A pandas.Categorical, or array(-like) used to create one.
    x :
        Values used to reorder categorical. Must be same length as fct.
    func :
        Function run over all values within a level of the categorical.
    desc :
        Whether to sort in descending order.

    Notes
    -----
    NaN categories can't be ordered. When func returns NaN, sorting
    is always done with NaNs last.


    Examples
    --------

    >>> fct_reorder(['a', 'a', 'b'], [4, 3, 2])
    ['a', 'a', 'b']
    Categories (2, object): ['b', 'a']

    >>> fct_reorder(['a', 'a', 'b'], [4, 3, 2], desc = True)
    ['a', 'a', 'b']
    Categories (2, object): ['a', 'b']

    >>> fct_reorder(['x', 'x', 'y'], [4, 0, 2], np.max)
    ['x', 'x', 'y']
    Categories (2, object): ['y', 'x']

    """

    # use the raw values of a Series, so only fct supplies the index below
    if isinstance(x, pd.Series):
        values = x.values
    else:
        values = x

    keyed = pd.Series(values, index = fct)

    # one summary value per level. groupby drops the NA group by default,
    # which is fine because a categorical cannot order an NA category anyway
    per_level = keyed.groupby(level = 0).agg(func)
    ranked = per_level.sort_values(ascending = not desc)

    reordered = pd.Categorical(fct, categories=ranked.index)
    return _maybe_upcast(fct, reordered)

fct_rev(fct)

Return a copy of fct with category level order reversed.

Parameters:

Name Type Description Default
fct

A pandas.Categorical, or array(-like) used to create one.

required

Examples:

>>> fct = pd.Categorical(["a", "b", "c"])
>>> fct
['a', 'b', 'c']
Categories (3, object): ['a', 'b', 'c']
>>> fct_rev(fct)
['a', 'b', 'c']
Categories (3, object): ['c', 'b', 'a']

Note that this function can also accept a list.

>>> fct_rev(["a", "b", "c"])
['a', 'b', 'c']
Categories (3, object): ['c', 'b', 'a']
Source code in siuba/dply/forcats.py
@symbolic_dispatch
def fct_rev(fct) -> pd.Categorical:
    """Return a copy of fct with the order of its category levels reversed.

    Parameters
    ----------
    fct :
        A pandas.Categorical, or array(-like) used to create one.

    Examples
    --------
    >>> fct = pd.Categorical(["a", "b", "c"])
    >>> fct
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> fct_rev(fct)
    ['a', 'b', 'c']
    Categories (3, object): ['c', 'b', 'a']

    Note that this function can also accept a list.

    >>> fct_rev(["a", "b", "c"])
    ['a', 'b', 'c']
    Categories (3, object): ['c', 'b', 'a']


    """

    as_cat = fct if isinstance(fct, pd.Categorical) else pd.Categorical(fct)

    flipped_levels = list(reversed(as_cat.categories))
    flipped = as_cat.reorder_categories(flipped_levels)

    # NOTE(review): _maybe_upcast is handed the converted Categorical here,
    # not the object the caller originally passed in -- confirm that is the
    # intended behavior for Series input.
    return _maybe_upcast(as_cat, flipped)