filter joins (anti, semi)

anti_join

`anti_join(left, right=None, on=None, args, , by=None)`

Return the left table with every row that would not be kept in an inner join.

Parameters:

Name	Description	Default
`left`	The left-hand table.	required
`right`	The right-hand table.	`None`
`on`	How to match them. By default it uses matches all columns with the same name across the two tables.	`None`

Examples:

>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join

>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})

>>> df1 >> semi_join(_, df2)
   id  x
1   2  b
2   3  c

>>> df1 >> anti_join(_, df2)
   id  x
0   1  a

Generally, it's a good idea to explicitly specify the on argument.

>>> df1 >> anti_join(_, df2, on="id")
   id  x
0   1  a

Source code in siuba/dply/verbs.py

@singledispatch2((pd.DataFrame, DataFrameGroupBy))
@_bounce_groupby
def anti_join(left, right = None, on = None, *args, by = None):
    """Return the left table with every row that would *not* be kept in an inner join.

    Parameters
    ----------
    left :
        The left-hand table.
    right :
        The right-hand table.
    on :
        How to match them. By default it uses matches all columns with the same
        name across the two tables.

    Examples
    --------
    >>> import pandas as pd
    >>> from siuba import _, semi_join, anti_join

    >>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
    >>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})

    >>> df1 >> semi_join(_, df2)
       id  x
    1   2  b
    2   3  c

    >>> df1 >> anti_join(_, df2)
       id  x
    0   1  a

    Generally, it's a good idea to explicitly specify the on argument.

    >>> df1 >> anti_join(_, df2, on="id")
       id  x
    0   1  a
    """

    if on is None and by is not None:
        on = by

    # copied from semi_join
    if isinstance(on, Mapping):
        left_on, right_on = zip(*on.items())
    else: 
        left_on = right_on = on

    if isinstance(right, DataFrameGroupBy):
        right = right.obj

    # manually perform merge, up to getting pieces need for indexing
    merger = _MergeOperation(left, right, left_on = left_on, right_on = right_on)
    _, l_indx, _ = merger._get_join_info()

    # use the left table's indexer to exclude those rows
    range_indx = pd.RangeIndex(len(left))
    return left.iloc[range_indx.difference(l_indx),:]

semi_join

`semi_join(left, right=None, on=None, args, , by=None)`

Return the left table with every row that would be kept in an inner join.