3
3
See source https://github.com/tidyverse/dplyr/blob/master/R/distinct.R
4
4
"""
5
5
from pipda import register_verb
6
+ from pipda .symbolic import Reference
6
7
7
8
from ..core .backends .pandas import DataFrame
8
9
from ..core .backends .pandas .core .groupby import GroupBy
11
12
from ..core .factory import func_factory
12
13
from ..core .utils import regcall
13
14
from ..core .tibble import Tibble , TibbleGrouped , reconstruct_tibble
14
- from ..base import union , setdiff , intersect
15
+ from ..base import union , setdiff , intersect , unique
15
16
from .mutate import mutate
16
17
17
18
@@ -33,31 +34,49 @@ def distinct(_data, *args, _keep_all=False, **kwargs):
33
34
A dataframe without duplicated rows in _data
34
35
"""
35
36
if not args and not kwargs :
36
- uniq = _data .drop_duplicates ()
37
+ out = _data .drop_duplicates ()
37
38
else :
38
- # keep_none_prefers_new_order
39
- uniq = (
40
- regcall (
41
- mutate ,
42
- _data ,
43
- * args ,
44
- ** kwargs ,
45
- _keep = "none" ,
39
+ if (
40
+ not kwargs
41
+ # Optimization for calls such as:
42
+ # iris >> distinct(f.Species, f.Sepal_Length)
43
+ # No mutation is needed when every argument is a plain reference to an existing column
44
+ and all (
45
+ isinstance (expr , Reference )
46
+ and expr ._pipda_level == 1
47
+ and expr ._pipda_ref in _data .columns
48
+ for expr in args
46
49
)
47
- ).drop_duplicates ()
50
+ ):
51
+ subset = [expr ._pipda_ref for expr in args ]
52
+ ucols = getattr (_data , "group_vars" , [])
53
+ ucols .extend (subset )
54
+ ucols = regcall (unique , ucols )
55
+ uniq = _data .drop_duplicates (subset = subset )[ucols ]
56
+ else :
57
+ # keep_none_prefers_new_order
58
+ uniq = (
59
+ regcall (
60
+ mutate ,
61
+ _data ,
62
+ * args ,
63
+ ** kwargs ,
64
+ _keep = "none" ,
65
+ )
66
+ ).drop_duplicates ()
48
67
49
- if not _keep_all :
50
- # keep original order
51
- out = uniq [
52
- regcall (
53
- union ,
54
- regcall (intersect , _data .columns , uniq .columns ),
55
- regcall (setdiff , uniq .columns , _data .columns ),
56
- )
57
- ]
58
- else :
59
- out = _data .loc [uniq .index , :].copy ()
60
- out [uniq .columns .tolist ()] = uniq
68
+ if not _keep_all :
69
+ # keep original order
70
+ out = uniq [
71
+ regcall (
72
+ union ,
73
+ regcall (intersect , _data .columns , uniq .columns ),
74
+ regcall (setdiff , uniq .columns , _data .columns ),
75
+ )
76
+ ]
77
+ else :
78
+ out = _data .loc [uniq .index , :].copy ()
79
+ out [uniq .columns .tolist ()] = uniq
61
80
62
81
return reconstruct_tibble (_data , Tibble (out , copy = False ))
63
82
0 commit comments