separate, unite

`separate(__data, col, into, sep='[^a-zA-Z0-9]', remove=True, convert=False, extra='warn', fill='warn')`

Split col into len(into) pieces. Return a DataFrame with a column added for each piece.

Parameters:

Name	Description	Default
`__data`	a DataFrame.	required
`col`	name of column to split (either string, or siu expression).	required
`into`	names of resulting columns holding each entry in split.	required
`sep`	regular expression used to split col. Passed to col.str.split method.	`'[^a-zA-Z0-9]'`
`remove`	whether to remove col from the returned DataFrame.	`True`
`convert`	whether to attempt to convert the split columns to numerics.	`False`
`extra`	what to do when more splits than into names. One of ("warn", "drop" or "merge"). "warn" discards the additional pieces and produces a warning; "drop" discards them silently; "merge" is currently not implemented.	`'warn'`
`fill`	what to do when fewer splits than into names. Currently not implemented; a ValueError is raised when the split yields fewer columns than into names.	`'warn'`

Examples:

>>> import pandas as pd
>>> from siuba import separate

>>> df = pd.DataFrame({"label": ["S1-1", "S2-2"]})

Split into two columns:

>>> separate(df, "label", into = ["season", "episode"])
  season episode
0     S1       1
1     S2       2

Split, and try to convert columns to numerics:

>>> separate(df, "label", into = ["season", "episode"], convert = True)
  season  episode
0     S1        1
1     S2        2

Source code in siuba/dply/verbs.py

@singledispatch2(pd.DataFrame)
def separate(__data, col, into, sep = r"[^a-zA-Z0-9]",
             remove = True, convert = False,
             extra = "warn", fill = "warn"
            ):
    """Split col into len(into) pieces. Return DataFrame with a column added for each piece.

    Parameters
    ----------
    __data:
        a DataFrame.
    col:
        name of column to split (either string, or siu expression).
    into:
        names of resulting columns holding each entry in split.
    sep:
        regular expression used to split col. Passed to col.str.split method.
    remove:
        whether to remove col from the returned DataFrame. col is not dropped
        when its name also appears in into (it then holds the split piece).
    convert:
        whether to attempt to convert the split columns to numerics. A column
        for which pd.to_numeric raises ValueError is left unchanged.
    extra:
        what to do when more splits than into names.  One of ("warn", "drop" or "merge").
        "warn" discards the additional pieces and produces a warning; "drop"
        discards them silently; "merge" is currently not implemented.
    fill:
        what to do when fewer splits than into names. Currently not implemented
        (the argument is ignored).

    Raises
    ------
    ValueError
        If the split produces fewer columns than len(into), or if it produces
        more and extra is not one of "warn", "drop" or "merge".
    NotImplementedError
        If the split produces more columns than len(into) and extra is "merge".

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import separate

    >>> df = pd.DataFrame({"label": ["S1-1", "S2-2"]})

    Split into two columns:

    >>> separate(df, "label", into = ["season", "episode"])
      season episode
    0     S1       1
    1     S2       2

    Split, and try to convert columns to numerics:

    >>> separate(df, "label", into = ["season", "episode"], convert = True)
      season  episode
    0     S1        1
    1     S2        2

    """

    n_into = len(into)
    col_name = simple_varname(col)

    # splitting column ----
    # expand = True yields one column per piece
    all_splits = __data[col_name].str.split(sep, expand = True)
    n_split_cols = len(all_splits.columns)

    # handling too many or too few splits ----
    if n_split_cols < n_into:
        # too few columns
        raise ValueError("Expected %s split cols, found %s" %(n_into, n_split_cols))
    elif n_split_cols > n_into:
        # Extra argument controls how we deal with too many splits
        if extra == "warn":
            # Look at *every* surplus column (slice, not a single column): a row
            # lost data when at least one of its surplus pieces is non-missing.
            # The index is reset so the reported row numbers are positional.
            df_extra_cols = all_splits.iloc[:, n_into:].reset_index(drop=True)
            bad_rows = df_extra_cols.dropna(how="all")
            n_extra = bad_rows.shape[0]

            # trailing spaces keep the three sentences from running together
            warnings.warn(
                f"Expected {n_into} pieces. "
                f"Additional pieces discarded in {n_extra} rows. "
                f"Row numbers: {bad_rows.index.values}",
                UserWarning
            )
        elif extra == "drop":
            # surplus pieces are simply never copied into the result below
            pass
        elif extra == "merge":
            raise NotImplementedError("TODO: separate extra = 'merge'")
        else:
            raise ValueError("Invalid extra argument: %s" %extra)

    # create new columns in data ----
    # work on a copy so the caller's DataFrame is left untouched
    out = __data.copy()

    for ii, name in enumerate(into):
        out[name] = all_splits.iloc[:, ii]

    # attempt to convert columns to numeric ----
    if convert:
        # TODO: better strategy here?
        for k in into:
            try:
                out[k] = pd.to_numeric(out[k])
            except ValueError:
                # best effort: non-numeric columns keep their split values
                pass

    # only drop the source column if it was not reused as an output name
    if remove and col_name not in into:
        return out.drop(columns = col_name)

    return out

`unite(__data, col, *args, sep='_', remove=True)`

Combine multiple columns into a single column. Return a DataFrame with that column included.

Parameters:

Name	Description	Default
`__data`	a DataFrame	required
`col`	name of the to-be-created column (string).	required
`*args`	names of each column to combine.	`()`
`sep`	separator joining each column being combined.	`'_'`
`remove`	whether to remove the combined columns from the returned DataFrame.	`True`

Source code in siuba/dply/verbs.py

@singledispatch2(pd.DataFrame)
def unite(__data, col, *args, sep = "_", remove = True):
    """Paste several columns together into one new column of a copied DataFrame.

    Parameters
    ----------
    __data:
        a DataFrame
    col:
        name of the column to create (string).
    *args:
        names of the columns whose values are joined, in order.
    sep:
        text placed between the joined values.
    remove:
        when true, the joined source columns are dropped from the result.

    Raises
    ------
    ValueError
        If an entry of *args is not a simple column name, if a named column
        is not in the DataFrame, or if col already names a column.

    """
    source_names = [simple_varname(arg) for arg in args]
    target_name = simple_varname(col)

    # validations ----
    if None in source_names:
        raise ValueError("*args must be string, or simple column name, e.g. _.col_name")

    absent = set(source_names) - set(__data.columns)
    if absent:
        raise ValueError("columns %s not in DataFrame.columns" %absent)

    # each source column goes through _coerce_to_str before being joined
    pieces = [_coerce_to_str(__data[name]) for name in source_names]

    if target_name in __data:
        raise ValueError("col argument %s already a column in data" % target_name)

    # perform unite ----
    # TODO: this is probably not very efficient. Maybe try with transform or apply?
    def _paste(left, right):
        # join two columns with the separator in between
        return left + sep + right

    united = reduce(_paste, pieces)

    # assign onto a copy so the input DataFrame is not modified
    result = __data.copy()
    result[target_name] = united

    return result.drop(columns = source_names) if remove else result

separate, unite

separate(__data, col, into, sep='[^a-zA-Z0-9]', remove=True, convert=False, extra='warn', fill='warn')

unite(__data, col, *args, sep='_', remove=True)

`separate(__data, col, into, sep='[^a-zA-Z0-9]', remove=True, convert=False, extra='warn', fill='warn')`

`unite(__data, col, *args, sep='_', remove=True)`