extract
extract(__data, col, into, regex='(\\w+)', remove=True, convert=False, flags=0)
Pull out len(into) fields from character strings.
Returns a DataFrame with a column added for each piece.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `__data` | | a DataFrame | required |
| `col` | | name of column to split (either string, or siu expression). | required |
| `into` | | names of resulting columns holding each entry in pulled out fields. | required |
| `regex` | | regular expression used to extract field. Passed to col.str.extract method. | `'(\\w+)'` |
| `remove` | | whether to remove col from the returned DataFrame. | `True` |
| `convert` | | whether to attempt to convert the split columns to numerics. | `False` |
| `flags` | | flags from the re module, passed to col.str.extract. | `0` |
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def extract(
        __data, col, into, regex = r"(\w+)",
        remove = True, convert = False,
        flags = 0
        ):
    """Pull out len(into) fields from character strings.

    Returns a DataFrame with a column added for each piece.

    Parameters
    ----------
    __data:
        a DataFrame
    col:
        name of column to split (either string, or siu expression).
    into:
        names of resulting columns holding each entry in pulled out fields.
        A single string is treated as one column name.
    regex:
        regular expression used to extract field. Passed to col.str.extract
        method. It must contain one capture group per name in into.
    remove:
        whether to remove col from the returned DataFrame. The column is kept
        when one of the names in into is the same as col, since in that case
        it now holds an extracted field.
    convert:
        whether to attempt to convert the split columns to numerics.
    flags:
        flags from the re module, passed to col.str.extract.

    Returns
    -------
    A copy of __data with the extracted columns added; __data itself is
    not modified.

    Raises
    ------
    ValueError
        If the number of capture groups in regex differs from len(into).

    """
    col_name = simple_varname(col)

    # A bare string would otherwise be iterated character by character.
    if isinstance(into, str):
        into = [into]

    n_into = len(into)

    # expand=True guarantees a DataFrame (one column per capture group),
    # even when the pattern has a single group.
    all_splits = __data[col_name].str.extract(regex, flags=flags, expand=True)
    n_split_cols = len(all_splits.columns)

    if n_split_cols != n_into:
        raise ValueError("Split into %s pieces, but expected %s" % (n_split_cols, n_into))

    # attempt to convert columns to numeric ----
    if convert:
        # TODO: better strategy here?
        for k in all_splits:
            try:
                all_splits[k] = pd.to_numeric(all_splits[k])
            except ValueError:
                # best-effort: columns that are not fully numeric stay as-is
                pass

    out = __data.copy()
    # Assign by position so named capture groups do not affect the result.
    for ii, name in enumerate(into):
        out[name] = all_splits.iloc[:, ii]

    # Do not drop the source column if it was just overwritten with an
    # extracted field; dropping it would discard the requested output.
    if remove and col_name not in into:
        return out.drop(columns = col_name)

    return out