Skip to content

separate, unite

separate(__data, col, into, sep='[^a-zA-Z0-9]', remove=True, convert=False, extra='warn', fill='warn')

Split col into len(into) pieces. Return a DataFrame with a column added for each piece.

Parameters:

Name Type Description Default
__data

a DataFrame.

required
col

name of column to split (either string, or siu expression).

required
into

names of resulting columns holding each entry in split.

required
sep

regular expression used to split col. Passed to col.str.split method.

'[^a-zA-Z0-9]'
remove

whether to remove col from the returned DataFrame.

True
convert

whether to attempt to convert the split columns to numerics.

False
extra

what to do when more splits than into names. One of ("warn", "drop" or "merge"). "warn" discards the additional pieces and produces a warning; "drop" discards them silently; "merge" is currently not implemented.

'warn'
fill

what to do when fewer splits than into names. Currently not implemented: a ValueError is raised when the split produces fewer columns than into names.

'warn'

Examples:

>>> import pandas as pd
>>> from siuba import separate
>>> df = pd.DataFrame({"label": ["S1-1", "S2-2"]})

Split into two columns:

>>> separate(df, "label", into = ["season", "episode"])
  season episode
0     S1       1
1     S2       2

Split, and try to convert columns to numerics:

>>> separate(df, "label", into = ["season", "episode"], convert = True)
  season  episode
0     S1        1
1     S2        2
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def separate(__data, col, into, sep = r"[^a-zA-Z0-9]",
             remove = True, convert = False,
             extra = "warn", fill = "warn"
            ):
    """Split col into len(into) pieces. Return DataFrame with a column added for each piece.

    Parameters
    ----------
    __data:
        a DataFrame.
    col:
        name of column to split (either string, or siu expression).
    into:
        names of resulting columns holding each entry in split.
    sep:
        regular expression used to split col. Passed to col.str.split method.
    remove:
        whether to remove col from the returned DataFrame. col is kept when
        its name is also listed in into.
    convert:
        whether to attempt to convert the split columns to numerics. Columns
        that cannot be converted are left unchanged.
    extra:
        what to do when more splits than into names.  One of ("warn", "drop" or "merge").
        "warn" discards the additional pieces and produces a warning; "drop"
        discards them silently; "merge" is currently not implemented.
    fill:
        what to do when fewer splits than into names. Currently not implemented:
        a ValueError is raised whenever the split produces fewer columns than
        into names.

    Returns
    -------
    A copy of __data with one new column per name in into.

    Raises
    ------
    ValueError
        If the split produces fewer columns than len(into), or if extra is not
        a recognized option when there are additional pieces.
    NotImplementedError
        If extra is "merge" and there are additional pieces.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import separate

    >>> df = pd.DataFrame({"label": ["S1-1", "S2-2"]})

    Split into two columns:

    >>> separate(df, "label", into = ["season", "episode"])
      season episode
    0     S1       1
    1     S2       2

    Split, and try to convert columns to numerics:

    >>> separate(df, "label", into = ["season", "episode"], convert = True)
      season  episode
    0     S1        1
    1     S2        2

    """

    n_into = len(into)
    col_name = simple_varname(col)

    # splitting column ----
    all_splits = __data[col_name].str.split(sep, expand = True)
    n_split_cols = len(all_splits.columns)

    # handling too many or too few splits ----
    if n_split_cols < n_into:
        # too few columns
        raise ValueError("Expected %s split cols, found %s" %(n_into, n_split_cols))
    elif n_split_cols > n_into:
        # Extra argument controls how we deal with too many splits
        if extra == "warn":
            # Take every surplus column (not just the first one). The index is
            # reset so the reported row numbers are positions in the data,
            # independent of whatever index labels the DataFrame carries.
            df_extra_cols = all_splits.iloc[:, n_into:].reset_index(drop=True)
            # rows where at least one surplus piece exists
            bad_rows = df_extra_cols.dropna(how="all")
            n_extra = bad_rows.shape[0]

            # NOTE: adjacent f-strings are concatenated verbatim, so each
            # sentence needs its own trailing space.
            warnings.warn(
                f"Expected {n_into} pieces. "
                f"Additional pieces discarded in {n_extra} rows. "
                f"Row numbers: {bad_rows.index.values}",
                UserWarning
            )
        elif extra == "drop":
            # surplus pieces are simply never copied into the output below
            pass
        elif extra == "merge":
            raise NotImplementedError("TODO: separate extra = 'merge'")
        else:
            raise ValueError("Invalid extra argument: %s" %extra)

    # create new columns in data ----
    out = __data.copy()

    # only the first len(into) split columns are kept
    for ii, name in enumerate(into):
        out[name] = all_splits.iloc[:, ii]

    # attempt to convert columns to numeric ----
    if convert:
        # TODO: better strategy here?
        for k in into:
            try:
                out[k] = pd.to_numeric(out[k])
            except ValueError:
                # best effort: a column that is not fully numeric stays as-is
                pass

    # don't drop the source column if one of the new columns reuses its name
    if remove and col_name not in into:
        return out.drop(columns = col_name)

    return out

unite(__data, col, *args, sep='_', remove=True)

Combine multiple columns into a single column. Return a DataFrame with that column included.

Parameters:

Name Type Description Default
__data

a DataFrame

required
col

name of the to-be-created column (string).

required
*args

names of each column to combine.

()
sep

separator joining each column being combined.

'_'
remove

whether to remove the combined columns from the returned DataFrame.

True
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def unite(__data, col, *args, sep = "_", remove = True):
    """Combine multiple columns into a single column. Return a DataFrame with that column included.

    Parameters
    ----------
    __data:
        a DataFrame
    col:
        name of the to-be-created column (string).
    *args:
        names of each column to combine.
    sep:
        separator joining each column being combined.
    remove:
        whether to remove the combined columns from the returned DataFrame.

    Returns
    -------
    A copy of __data with the united column added.

    Raises
    ------
    ValueError
        If col or any of *args is not a string or simple column name, if a
        column to combine is missing from the data, or if col already exists
        in the data.
    TypeError
        If no columns to combine are given.

    """
    unite_col_names = list(map(simple_varname, args))
    out_col_name = simple_varname(col)

    # validations ----
    if None in unite_col_names:
        raise ValueError("*args must be string, or simple column name, e.g. _.col_name")

    missing_cols = set(unite_col_names) - set(__data.columns)
    if missing_cols:
        raise ValueError("columns %s not in DataFrame.columns" %missing_cols)

    # Without this check a non-simple col would create a column labelled None.
    if out_col_name is None:
        raise ValueError("col must be string, or simple column name, e.g. _.col_name")

    if out_col_name in __data:
        raise ValueError("col argument %s already a column in data" % out_col_name)

    # reduce() over an empty list raises an opaque TypeError; keep the
    # exception type but say what is actually wrong.
    if not unite_col_names:
        raise TypeError("unite requires at least one column to combine")

    # perform unite ----
    # each column is passed through _coerce_to_str before being joined with sep
    unite_cols = [_coerce_to_str(__data[col_name]) for col_name in unite_col_names]

    # TODO: this is probably not very efficient. Maybe try with transform or apply?
    res = reduce(lambda x,y: x + sep + y, unite_cols)

    out_df = __data.copy()
    out_df[out_col_name] = res

    if remove:
        return out_df.drop(columns = unite_col_names)

    return out_df