filter joins (anti, semi)
anti_join
anti_join(left, right=None, on=None, *args, *, by=None)
Return the left table with every row that would not be kept in an inner join.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
left |
The left-hand table. |
required | |
right |
The right-hand table. |
None |
|
on |
How to match them. By default it uses matches all columns with the same name across the two tables. |
None |
Examples:
>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join
>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})
>>> df1 >> semi_join(_, df2)
id x
1 2 b
2 3 c
>>> df1 >> anti_join(_, df2)
id x
0 1 a
Generally, it's a good idea to explicitly specify the on argument.
>>> df1 >> anti_join(_, df2, on="id")
id x
0 1 a
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
@_bounce_groupby
def anti_join(left, right = None, on = None, *args, by = None):
"""Return the left table with every row that would *not* be kept in an inner join.
Parameters
----------
left :
The left-hand table.
right :
The right-hand table.
on :
How to match them. By default it uses matches all columns with the same
name across the two tables.
Examples
--------
>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join
>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})
>>> df1 >> semi_join(_, df2)
id x
1 2 b
2 3 c
>>> df1 >> anti_join(_, df2)
id x
0 1 a
Generally, it's a good idea to explicitly specify the on argument.
>>> df1 >> anti_join(_, df2, on="id")
id x
0 1 a
"""
if on is None and by is not None:
on = by
# copied from semi_join
if isinstance(on, Mapping):
left_on, right_on = zip(*on.items())
else:
left_on = right_on = on
if isinstance(right, DataFrameGroupBy):
right = right.obj
# manually perform merge, up to getting pieces need for indexing
merger = _MergeOperation(left, right, left_on = left_on, right_on = right_on)
_, l_indx, _ = merger._get_join_info()
# use the left table's indexer to exclude those rows
range_indx = pd.RangeIndex(len(left))
return left.iloc[range_indx.difference(l_indx),:]
semi_join
semi_join(left, right=None, on=None, *args, *, by=None)
Return the left table with every row that would be kept in an inner join.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
left |
The left-hand table. |
required | |
right |
The right-hand table. |
None |
|
on |
How to match them. By default it uses matches all columns with the same name across the two tables. |
None |
Examples:
>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join
>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})
>>> df1 >> semi_join(_, df2)
id x
1 2 b
2 3 c
>>> df1 >> anti_join(_, df2)
id x
0 1 a
Generally, it's a good idea to explicitly specify the on argument.
>>> df1 >> anti_join(_, df2, on="id")
id x
0 1 a
Source code in siuba/dply/verbs.py
@singledispatch2((pd.DataFrame, DataFrameGroupBy))
@_bounce_groupby
def semi_join(left, right = None, on = None, *args, by = None):
"""Return the left table with every row that would be kept in an inner join.
Parameters
----------
left :
The left-hand table.
right :
The right-hand table.
on :
How to match them. By default it uses matches all columns with the same
name across the two tables.
Examples
--------
>>> import pandas as pd
>>> from siuba import _, semi_join, anti_join
>>> df1 = pd.DataFrame({"id": [1, 2, 3], "x": ["a", "b", "c"]})
>>> df2 = pd.DataFrame({"id": [2, 3, 3], "y": ["l", "m", "n"]})
>>> df1 >> semi_join(_, df2)
id x
1 2 b
2 3 c
>>> df1 >> anti_join(_, df2)
id x
0 1 a
Generally, it's a good idea to explicitly specify the on argument.
>>> df1 >> anti_join(_, df2, on="id")
id x
0 1 a
"""
if on is None and by is not None:
on = by
if isinstance(on, Mapping):
# coerce colnames to list, to avoid indexing with tuples
on_cols, right_on = map(list, zip(*on.items()))
right = right[right_on].rename(dict(zip(right_on, on_cols)))
elif on is None:
warnings.warn(
"No on column passed to join. "
"Inferring join columns instead using shared column names."
)
on_cols = list(set(left.columns).intersection(set(right.columns)))
if not len(on_cols):
raise Exception("No join column specified, and no shared column names")
warnings.warn("Detected shared columns: %s" % on_cols)
elif isinstance(on, str):
on_cols = [on]
else:
on_cols = on
# get our semi join on ----
if len(on_cols) == 1:
col_name = on_cols[0]
indx = left[col_name].isin(right[col_name])
return left.loc[indx]
# Not a super efficient approach. Effectively, an inner join with what would
# be duplicate rows removed.
merger = _MergeOperation(left, right, left_on = on_cols, right_on = on_cols)
_, l_indx, _ = merger._get_join_info()
range_indx = pd.RangeIndex(len(left))
return left.loc[range_indx.isin(l_indx)]