🔖 0.8.6 (#134)

pwwang · web-flow · commit cd2595b4b4c2 · 2022-08-25T10:30:48.000-07:00
diff --git a/datar/__init__.py b/datar/__init__.py
@@ -13,7 +13,7 @@
 )
 
 __all__ = ("f", "get_versions")
-__version__ = "0.8.5"
+__version__ = "0.8.6"
 
 apply_init_callbacks()
 
diff --git a/datar/base/arithmetic.py b/datar/base/arithmetic.py
@@ -1,5 +1,6 @@
 """Arithmetic or math functions"""
 
+from functools import singledispatch
 import inspect
 from typing import TYPE_CHECKING, Union
 
@@ -883,18 +884,42 @@ def std(
 sd = std
 
 
-@func_factory("transform", {"x", "w"})
-def weighted_mean(
-    x: Series, w: Series = 1, na_rm=True, __args_raw=None
-) -> Series:
-    """Calculate weighted mean"""
-    if __args_raw["w"] is not None and np.nansum(w) == 0:
+@singledispatch
+def _weighted_mean(
+    df: DataFrame,
+    has_w: bool = True,
+    na_rm: bool = True,
+) -> np.ndarray:
+    if not has_w:
+        return np.nanmean(df["x"]) if na_rm else np.mean(df["x"])
+
+    if np.nansum(df["w"]) == 0:
         return np.nan
 
     if na_rm:
-        na_mask = pd.isnull(x)
-        x = x[~na_mask.values]
-        w = w[~na_mask.values]
+        na_mask = pd.isnull(df["x"])
+        x = df["x"][~na_mask.values]
+        w = df["w"][~na_mask.values]
         return np.average(x, weights=w)
 
-    return np.average(x, weights=w)
+    return np.average(df["x"], weights=df["w"])
+
+
+@_weighted_mean.register(TibbleGrouped)
+def _(
+    df: TibbleGrouped,
+    has_w: bool = True,
+    na_rm: bool = True,
+) -> Series:
+    return df._datar["grouped"].apply(
+        lambda subdf: _weighted_mean(subdf, has_w, na_rm)
+    )
+
+
+@func_factory(None, {"x", "w"})
+def weighted_mean(
+    x: Series, w: Series = 1, na_rm=True, __args_raw=None, __args_frame=None,
+) -> Series:
+    """Calculate weighted mean"""
+    has_w = __args_raw["w"] is not None
+    return _weighted_mean(__args_frame, has_w, na_rm)
diff --git a/datar/base/verbs.py b/datar/base/verbs.py
@@ -234,7 +234,7 @@ def union(x, y):
 
 @register_verb(context=Context.EVAL)
 def unique(x):
-    """Union of two iterables"""
+    """Get unique elements from an iterable and keep their order"""
     # np.unique() is not used because it does not keep the original order:
     # return np.unique(x)
     if is_scalar(x):
diff --git a/datar/dplyr/distinct.py b/datar/dplyr/distinct.py
@@ -3,6 +3,7 @@
 See source https://github.com/tidyverse/dplyr/blob/master/R/distinct.R
 """
 from pipda import register_verb
+from pipda.symbolic import Reference
 
 from ..core.backends.pandas import DataFrame
 from ..core.backends.pandas.core.groupby import GroupBy
@@ -11,7 +12,7 @@
 from ..core.factory import func_factory
 from ..core.utils import regcall
 from ..core.tibble import Tibble, TibbleGrouped, reconstruct_tibble
-from ..base import union, setdiff, intersect
+from ..base import union, setdiff, intersect, unique
 from .mutate import mutate
 
 
@@ -33,31 +34,49 @@ def distinct(_data, *args, _keep_all=False, **kwargs):
         A dataframe without duplicated rows in _data
     """
     if not args and not kwargs:
-        uniq = _data.drop_duplicates()
+        out = _data.drop_duplicates()
     else:
-        # keep_none_prefers_new_order
-        uniq = (
-            regcall(
-                mutate,
-                _data,
-                *args,
-                **kwargs,
-                _keep="none",
+        if (
+            not kwargs
+            # optimize:
+            # iris >> distinct(f.Species, f.Sepal_Length)
+            # We don't need to do mutation
+            and all(
+                isinstance(expr, Reference)
+                and expr._pipda_level == 1
+                and expr._pipda_ref in _data.columns
+                for expr in args
             )
-        ).drop_duplicates()
+        ):
+            subset = [expr._pipda_ref for expr in args]
+            ucols = getattr(_data, "group_vars", [])
+            ucols.extend(subset)
+            ucols = regcall(unique, ucols)
+            uniq = _data.drop_duplicates(subset=subset)[ucols]
+        else:
+            # keep_none_prefers_new_order
+            uniq = (
+                regcall(
+                    mutate,
+                    _data,
+                    *args,
+                    **kwargs,
+                    _keep="none",
+                )
+            ).drop_duplicates()
 
-    if not _keep_all:
-        # keep original order
-        out = uniq[
-            regcall(
-                union,
-                regcall(intersect, _data.columns, uniq.columns),
-                regcall(setdiff, uniq.columns, _data.columns),
-            )
-        ]
-    else:
-        out = _data.loc[uniq.index, :].copy()
-        out[uniq.columns.tolist()] = uniq
+        if not _keep_all:
+            # keep original order
+            out = uniq[
+                regcall(
+                    union,
+                    regcall(intersect, _data.columns, uniq.columns),
+                    regcall(setdiff, uniq.columns, _data.columns),
+                )
+            ]
+        else:
+            out = _data.loc[uniq.index, :].copy()
+            out[uniq.columns.tolist()] = uniq
 
     return reconstruct_tibble(_data, Tibble(out, copy=False))
 
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.8.6
+
+- 🐛 Fix weighted_mean not working for grouped data (#133)
+- ✅ Add tests for weighted_mean on grouped data
+- ⚡️ Optimize distinct on existing columns (#128)
+
 ## 0.8.5
 
 - 🐛 Fix columns missing after Join by same columns using mapping (#122)
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,12 +1,9 @@
 # use_directory_urls doesn't work for newer versions of mkdocs (formerly pinned to 1.1.2)
-mkdocs==1.1.2
-# AttributeError: module 'jinja2' has no attribute 'contextfilter'
-# jinja2==3.1.0
-jinja2==3.0.3
-mkdocs-material==7.2.3
-pymdown-extensions==8.2
+mkdocs
+mkdocs-material
+pymdown-extensions
 mkapi-fix
-mkdocs-jupyter==0.17.3
+mkdocs-jupyter
 ipykernel
 ipython_genutils
 # to compile readme.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datar"
-version = "0.8.5"
+version = "0.8.6"
 description = "Port of dplyr and other related R packages in python, using pipda."
 authors = ["pwwang <pwwang@pwwang.com>"]
 readme = "README.md"
diff --git a/tests/base/test_stats.py b/tests/base/test_stats.py
@@ -16,6 +16,11 @@ def test_weighted_mean():
     with pytest.raises(ValueError):
         weighted_mean([1,2], [1,2,3])
 
+    df = tibble(g=[1, 1, 2, 2], x=[1, 2, 3, 4], w=[1, 3, 3, 3]).group_by('g')
+    assert weighted_mean(df.g.obj, w=None) == 1.5
+    assert_iterable_equal(weighted_mean(df.g), [1, 2])
+    assert_iterable_equal(weighted_mean(df.x, w=df.w), [1.75, 3.5])
+
 
 def test_quantile():
     df = tibble(x=[1, 2, 3], g=[1, 2, 2])
diff --git a/tests/dplyr/test_distinct.py b/tests/dplyr/test_distinct.py
@@ -23,6 +23,7 @@
 )
 from datar.tibble import tibble
 from datar.datasets import iris
+from datar.testing import assert_frame_equal
 
 
 def test_single_column():
@@ -51,7 +52,7 @@ def test_keeps_only_specified_cols():
     df = tibble(x=c(1, 1, 1), y=c(1, 1, 1))
     expect = tibble(x=1)
     out = df >> distinct(f.x)
-    assert out.equals(expect)
+    assert_frame_equal(out, expect)
 
 
 def test_unless_keep_all_true():

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@`
`13`	`13`	`)`
`14`	`14`
`15`	`15`	`__all__ = ("f", "get_versions")`
`16`		`-__version__ = "0.8.5"`
	`16`	`+__version__ = "0.8.6"`
`17`	`17`
`18`	`18`	`apply_init_callbacks()`
`19`	`19`