Skip to content

filter joins (anti, semi)

anti_join

anti_join(left, right=None, on=None, *args, *, by=None)

Return the left table with every row that would not be kept in an inner join.

Parameters:

Name Type Description Default
left

The left-hand table.

required
right

The right-hand table.

None
on

How to match them. By default it uses matches all columns with the same name across the two tables.

None

Examples:

>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join
>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})
>>> df1 >> semi_join(_, df2)
   id  x
1   2  b
2   3  c
>>> df1 >> anti_join(_, df2)
   id  x
0   1  a

Generally, it's a good idea to explicitly specify the on argument.

>>> df1 >> anti_join(_, df2, on="id")
   id  x
0   1  a
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
@_bounce_groupby
def anti_join(left, right = None, on = None, *args, by = None):
    """Return the left table with every row that would *not* be kept in an inner join.

    Parameters
    ----------
    left :
        The left-hand table.
    right :
        The right-hand table.
    on :
        How to match them. By default it uses matches all columns with the same
        name across the two tables.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, semi_join, anti_join

    >>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
    >>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})

    >>> df1 >> semi_join(_, df2)
       id  x
    1   2  b
    2   3  c

    >>> df1 >> anti_join(_, df2)
       id  x
    0   1  a

    Generally, it's a good idea to explicitly specify the on argument.

    >>> df1 >> anti_join(_, df2, on="id")
       id  x
    0   1  a
    """

    if on is None and by is not None:
        on = by

    # copied from semi_join
    if isinstance(on, Mapping):
        left_on, right_on = zip(*on.items())
    else: 
        left_on = right_on = on

    if isinstance(right, DataFrameGroupBy):
        right = right.obj

    # manually perform merge, up to getting pieces need for indexing
    merger = _MergeOperation(left, right, left_on = left_on, right_on = right_on)
    _, l_indx, _ = merger._get_join_info()

    # use the left table's indexer to exclude those rows
    range_indx = pd.RangeIndex(len(left))
    return left.iloc[range_indx.difference(l_indx),:]

semi_join

semi_join(left, right=None, on=None, *args, *, by=None)

Return the left table with every row that would be kept in an inner join.

Parameters:

Name Type Description Default
left

The left-hand table.

required
right

The right-hand table.

None
on

How to match them. By default it uses matches all columns with the same name across the two tables.

None

Examples:

>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join
>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})
>>> df1 >> semi_join(_, df2)
   id  x
1   2  b
2   3  c
>>> df1 >> anti_join(_, df2)
   id  x
0   1  a

Generally, it's a good idea to explicitly specify the on argument.

>>> df1 >> anti_join(_, df2, on="id")
   id  x
0   1  a
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
@_bounce_groupby
def semi_join(left, right = None, on = None, *args, by = None):
    """Return the left table with every row that would be kept in an inner join.

    Parameters
    ----------
    left :
        The left-hand table.
    right :
        The right-hand table.
    on :
        How to match them. By default it uses matches all columns with the same
        name across the two tables.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, semi_join, anti_join

    >>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
    >>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})

    >>> df1 >> semi_join(_, df2)
       id  x
    1   2  b
    2   3  c

    >>> df1 >> anti_join(_, df2)
       id  x
    0   1  a

    Generally, it's a good idea to explicitly specify the on argument.

    >>> df1 >> anti_join(_, df2, on="id")
       id  x
    0   1  a
    """

    if on is None and by is not None:
        on = by

    if isinstance(on, Mapping):
        # coerce colnames to list, to avoid indexing with tuples
        on_cols, right_on = map(list, zip(*on.items()))
        right = right[right_on].rename(dict(zip(right_on, on_cols)))
    elif on is None:
        warnings.warn(
            "No on column passed to join. "
            "Inferring join columns instead using shared column names."
        )

        on_cols = list(set(left.columns).intersection(set(right.columns)))
        if not len(on_cols):
            raise Exception("No join column specified, and no shared column names")

        warnings.warn("Detected shared columns: %s" % on_cols)
    elif isinstance(on, str):
        on_cols = [on]
    else:
        on_cols = on

    # get our semi join on ----
    if len(on_cols) == 1:
        col_name = on_cols[0]
        indx = left[col_name].isin(right[col_name])
        return left.loc[indx]

    # Not a super efficient approach. Effectively, an inner join with what would
    # be duplicate rows removed.
    merger = _MergeOperation(left, right, left_on = on_cols, right_on = on_cols)
    _, l_indx, _ = merger._get_join_info()


    range_indx = pd.RangeIndex(len(left))
    return left.loc[range_indx.isin(l_indx)]