Skip to content

extract

extract(__data, col, into, regex='(\\w+)', remove=True, convert=False, flags=0)

Pull out len(into) fields from character strings.

Returns a DataFrame with a column added for each piece.

Parameters:

Name Type Description Default
__data

a DataFrame

required
col

name of column to split (either string, or siu expression).

required
into

names of resulting columns holding each entry in pulled out fields.

required
regex

regular expression whose capture groups define the fields to extract (one group per name in into). Passed to the col.str.extract method.

'(\\w+)'
remove

whether to remove col from the returned DataFrame.

True
convert

whether to attempt to convert each extracted column to a numeric type; columns that cannot be converted are left unchanged.

False
flags

flags from the re module, passed to col.str.extract.

0
Source code in siuba/dply/verbs.py
@singledispatch2(pd.DataFrame)
def extract(
        __data, col, into, regex = r"(\w+)",
        remove = True, convert = False,
        flags = 0
        ):
    """Pull out len(into) fields from character strings.

    Returns a DataFrame with a column added for each piece.

    Parameters
    ----------
    __data:
        a DataFrame
    col:
        name of column to split (either string, or siu expression).
    into:
        names of resulting columns holding each entry in pulled out fields.
        There must be exactly one name per capture group in regex.
    regex:
        regular expression used to extract field. Passed to col.str.extract method.
    remove:
        whether to remove col from the returned DataFrame. If one of the
        names in into is the same as col, that column holds the extracted
        values and is kept.
    convert:
        whether to attempt to convert the split columns to numerics. Columns
        that cannot be converted are left unchanged.
    flags:
        flags from the re module, passed to col.str.extract.

    Returns
    -------
    pd.DataFrame
        A copy of __data with the extracted columns assigned; the input
        DataFrame is not modified.

    Raises
    ------
    ValueError
        If the number of columns produced by regex differs from len(into).

    """

    col_name = simple_varname(col)
    n_into = len(into)

    # Pass flags by keyword so the call does not depend on the positional
    # order of str.extract's parameters; expand = True guarantees a DataFrame
    # result even when the pattern has a single capture group.
    all_splits = __data[col_name].str.extract(regex, flags = flags, expand = True)
    n_split_cols = all_splits.shape[1]

    if n_split_cols != n_into:
        raise ValueError("Split into %s pieces, but expected %s" % (n_split_cols, n_into))

    # attempt to convert columns to numeric ----
    if convert:
        # Best effort: a column that cannot be parsed keeps its extracted values.
        for k in all_splits:
            try:
                all_splits[k] = pd.to_numeric(all_splits[k])
            except ValueError:
                pass

    out = __data.copy()
    # Assign by position: the extracted frame's labels are group numbers (or
    # group names for named groups), not the names requested in `into`.
    for ii, name in enumerate(into):
        out[name] = all_splits.iloc[:, ii]

    # If the source column's name was reused in `into`, it now holds extracted
    # values; dropping it would discard a field the caller asked for.
    if remove and col_name not in into:
        return out.drop(columns = col_name)

    return out