separate, unite
separate(__data, col, into, sep='[^a-zA-Z0-9]', remove=True, convert=False, extra='warn', fill='warn')
Split col into len(into) pieces. Return DataFrame with a column added for each piece.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
a DataFrame. |
required | |
col |
name of column to split (either string, or siu expression). |
required | |
into |
names of resulting columns holding each entry in split. |
required | |
sep |
regular expression used to split col. Passed to col.str.split method. |
'[^a-zA-Z0-9]' |
|
remove |
whether to remove col from the returned DataFrame. |
True |
|
convert |
whether to attempt to convert the split columns to numerics. |
False |
|
extra |
what to do when more splits than into names. One of ("warn", "drop" or "merge"). "warn" discards the extra pieces and produces a warning; "drop" discards them silently; "merge" is currently not implemented. |
'warn' |
|
fill |
what to do when fewer splits than into names. Currently not implemented. |
'warn' |
Examples:
>>> import pandas as pd
>>> from siuba import separate
>>> df = pd.DataFrame({"label": ["S1-1", "S2-2"]})
Split into two columns:
>>> separate(df, "label", into = ["season", "episode"])
season episode
0 S1 1
1 S2 2
Split, and try to convert columns to numerics:
>>> separate(df, "label", into = ["season", "episode"], convert = True)
season episode
0 S1 1
1 S2 2
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def separate(__data, col, into, sep = r"[^a-zA-Z0-9]",
             remove = True, convert = False,
             extra = "warn", fill = "warn"
             ):
    """Split col into len(into) pieces. Return DataFrame with a column added for each piece.

    Parameters
    ----------
    __data:
        a DataFrame.
    col:
        name of column to split (either string, or siu expression).
    into:
        names of resulting columns holding each entry in split.
    sep:
        regular expression used to split col. Passed to col.str.split method.
    remove:
        whether to remove col from the returned DataFrame. The column is kept
        when its name also appears in into (it then holds the split piece).
    convert:
        whether to attempt to convert the split columns to numerics. Columns
        that cannot be converted are left unchanged.
    extra:
        what to do when more splits than into names. One of ("warn", "drop" or "merge").
        "warn" discards the extra pieces and produces a warning; "drop" discards
        them silently; "merge" is currently not implemented.
    fill:
        what to do when fewer splits than into names. Currently not implemented
        (the argument is unused; too few splits always raises).

    Raises
    ------
    ValueError
        If col is not a string or simple column expression, if the split
        produces fewer columns than len(into), or if extra is not a
        recognized option when there are too many splits.
    NotImplementedError
        If extra is "merge" and there are too many splits.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import separate
    >>> df = pd.DataFrame({"label": ["S1-1", "S2-2"]})

    Split into two columns:

    >>> separate(df, "label", into = ["season", "episode"])
      season episode
    0     S1       1
    1     S2       2

    Split, and try to convert columns to numerics:

    >>> separate(df, "label", into = ["season", "episode"], convert = True)
      season  episode
    0     S1        1
    1     S2        2

    """
    n_into = len(into)
    col_name = simple_varname(col)

    # Mirror unite's validation: simple_varname yields None when col is not
    # a plain string or simple column expression.
    if col_name is None:
        raise ValueError("col must be string, or simple column name, e.g. _.col_name")

    # splitting column ----
    all_splits = __data[col_name].str.split(sep, expand = True)
    n_split_cols = len(all_splits.columns)

    # handling too many or too few splits ----
    if n_split_cols < n_into:
        # too few columns
        raise ValueError("Expected %s split cols, found %s" %(n_into, n_split_cols))
    elif n_split_cols > n_into:
        # Extra argument controls how we deal with too many splits
        if extra == "warn":
            # Look at every surplus column; the index is reset so the reported
            # row numbers are positional rather than index labels.
            df_extra_cols = all_splits.iloc[:, n_into:].reset_index(drop=True)
            # Rows whose surplus columns are all missing had no extra pieces.
            bad_rows = df_extra_cols.dropna(how="all")
            n_extra = bad_rows.shape[0]

            warnings.warn(
                f"Expected {n_into} pieces. "
                f"Additional pieces discarded in {n_extra} rows. "
                f"Row numbers: {bad_rows.index.values}",
                UserWarning
            )
        elif extra == "drop":
            # Nothing to do: only the first n_into columns are copied below.
            pass
        elif extra == "merge":
            raise NotImplementedError("TODO: separate extra = 'merge'")
        else:
            raise ValueError("Invalid extra argument: %s" %extra)

    # create new columns in data ----
    out = __data.copy()
    for ii, name in enumerate(into):
        out[name] = all_splits.iloc[:, ii]

    # attempt to convert columns to numeric ----
    if convert:
        # TODO: better strategy here?
        for k in into:
            try:
                out[k] = pd.to_numeric(out[k])
            except ValueError:
                # Deliberate best-effort: non-numeric columns stay as they are.
                pass

    # Do not drop col if one of the new columns reused its name.
    if remove and col_name not in into:
        return out.drop(columns = col_name)

    return out
unite(__data, col, *args, sep='_', remove=True)
Combine multiple columns into a single column. Return DataFrame with that column included.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
__data |
a DataFrame |
required | |
col |
name of the to-be-created column (string). |
required | |
*args |
names of each column to combine. |
() |
|
sep |
separator joining each column being combined. |
'_' |
|
remove |
whether to remove the combined columns from the returned DataFrame. |
True |
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def unite(__data, col, *args, sep = "_", remove = True):
    """Combine multiple columns into a single column. Return DataFrame with that column included.

    Parameters
    ----------
    __data:
        a DataFrame
    col:
        name of the to-be-created column (string).
    *args:
        names of each column to combine. At least one is required.
    sep:
        separator joining each column being combined.
    remove:
        whether to remove the combined columns from the returned DataFrame.

    Raises
    ------
    ValueError
        If col or any of *args is not a string or simple column expression,
        if no columns are given, if a named column is missing from the
        DataFrame, or if col already exists in the DataFrame.

    """
    unite_col_names = list(map(simple_varname, args))
    out_col_name = simple_varname(col)

    # validations ----
    if out_col_name is None:
        # Otherwise the result would be stored under a column named None.
        raise ValueError("col must be string, or simple column name, e.g. _.col_name")

    if not unite_col_names:
        # reduce() below has no initial value, so it cannot handle zero columns.
        raise ValueError("*args must name at least one column to combine")

    if None in unite_col_names:
        raise ValueError("*args must be string, or simple column name, e.g. _.col_name")

    missing_cols = set(unite_col_names) - set(__data.columns)
    if missing_cols:
        raise ValueError("columns %s not in DataFrame.columns" %missing_cols)

    if out_col_name in __data:
        raise ValueError("col argument %s already a column in data" % out_col_name)

    # perform unite ----
    # Each column goes through _coerce_to_str first (presumably so that
    # non-string columns can be concatenated with sep -- confirm in helper).
    unite_cols = [_coerce_to_str(__data[col_name]) for col_name in unite_col_names]

    # TODO: this is probably not very efficient. Maybe try with transform or apply?
    res = reduce(lambda x, y: x + sep + y, unite_cols)

    out_df = __data.copy()
    out_df[out_col_name] = res

    if remove:
        return out_df.drop(columns = unite_col_names)

    return out_df