From 1705532ea839380f3984476a872a74a991155ca9 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 18:43:53 +0000 Subject: [PATCH 01/10] add NullKeyWarning --- doc/source/reference/testing.rst | 1 + pandas/errors/__init__.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/doc/source/reference/testing.rst b/doc/source/reference/testing.rst index 1f164d1aa98b4..8d918503c0004 100644 --- a/doc/source/reference/testing.rst +++ b/doc/source/reference/testing.rst @@ -46,6 +46,7 @@ Exceptions and warnings errors.MergeError errors.NoBufferPresent errors.NullFrequencyError + errors.NullKeyWarning errors.NumbaUtilError errors.NumExprClobberingError errors.OptionError diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2b5bc450e41d6..5556a2b6b5491 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -878,6 +878,29 @@ class CategoricalConversionWarning(Warning): """ +class NullKeyWarning(Warning): + """ + Warning raised when grouping on null/NA keys with default `dropna` argument. + + This warning helps ensure data integrity and alerts users to potential issues + during grouping/aggregating when the default value of `dropna` would lead to + null keys being dropped from the output. + + For more information, see discussion of [PDEP-11](#53094) + + See Also + -------- + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({"A": ["a", None], "B": [1, 2]}) + >>> df.groupby(["A"]).sum() # doctest: +SKIP + ... # NullKeyWarning: ... + """ + + class LossySetitemError(Exception): """ Raised when trying to do a __setitem__ on an np.ndarray that is not lossless. 
@@ -927,6 +950,7 @@ class InvalidComparison(Exception): "MergeError", "NoBufferPresent", "NullFrequencyError", + "NullKeyWarning", "NumExprClobberingError", "NumbaUtilError", "OptionError", From bfa58460874f31b7b0b321405cc5c021ddf0a1e8 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 19:22:22 +0000 Subject: [PATCH 02/10] Add tests --- pandas/tests/groupby/test_groupby_dropna.py | 27 +++++++++++++++++++++ pandas/tests/groupby/test_missing.py | 6 +++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 8c4ab42b7be7a..1a410dee8ac7a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -384,6 +384,33 @@ def test_groupby_nan_included(): assert list(result.keys())[0:2] == ["g1", "g2"] +@pytest.mark.parametrize( + "by", + [ + pytest.param("group", id="column"), + pytest.param(pd.Series(["g1", np.nan, "g1", "g2", np.nan]), id="Series"), + pytest.param("_index", id="index"), + ], +) +@pytest.mark.parametrize("dropna", [True, False, None]) +def test_groupby_nan_included_warns(by, dropna): + # GH 61339 + data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} + df = pd.DataFrame(data) + if by == "_index": + df = df.set_index("group") + + kwargs = {} + warning_type = pd.errors.NullKeyWarning + if dropna is not None: + kwargs = {"dropna": dropna} + warning_type = None + + with tm.assert_produces_warning(warning_type): + grouped = df.groupby(by, **kwargs) + result = grouped.indices # noqa:F841 + + def test_groupby_drop_nan_with_multi_index(): # GH 39895 df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"]) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 2b590c50371e9..1d742850b807c 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -83,7 +83,9 @@ def test_min_count(func, min_count, 
value): def test_indices_with_missing(): # GH 9304 df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]}) - g = df.groupby(["a", "b"]) - result = g.indices + # GH 61339 + with tm.assert_produces_warning(pd.errors.NullKeyWarning): + g = df.groupby(["a", "b"]) + result = g.indices expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])} assert result == expected From 992eafff3eebbc514a6cdad298456263b77c6a2c Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 20:34:00 +0000 Subject: [PATCH 03/10] fix index and Series tests --- pandas/tests/groupby/test_groupby_dropna.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1a410dee8ac7a..92f8dd4b0d6ca 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -397,8 +397,9 @@ def test_groupby_nan_included_warns(by, dropna): # GH 61339 data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} df = pd.DataFrame(data) - if by == "_index": + if isinstance(by, str) and by == "_index": df = df.set_index("group") + by = "group" kwargs = {} warning_type = pd.errors.NullKeyWarning From d1c5053017f96625adf65b3196a7dd45759da4e9 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 20:36:53 +0000 Subject: [PATCH 04/10] add multi-index and categorical tests --- pandas/tests/groupby/test_groupby_dropna.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 92f8dd4b0d6ca..4ff97f7827a8b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -389,13 +389,22 @@ def test_groupby_nan_included(): [ pytest.param("group", id="column"), pytest.param(pd.Series(["g1", np.nan, "g1", "g2", np.nan]), id="Series"), + pytest.param( + pd.Series(["g1", np.nan, 
"g1", "g2", np.nan]).astype("category"), + id="Categorical", + ), pytest.param("_index", id="index"), + pytest.param(["group", "group2"], id="multikey"), ], ) @pytest.mark.parametrize("dropna", [True, False, None]) def test_groupby_nan_included_warns(by, dropna): # GH 61339 - data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]} + data = { + "group": ["g1", np.nan, "g1", "g2", np.nan], + "group2": ["g1", "g2", np.nan, "g2", np.nan], + "B": [0, 1, 2, 3, 4], + } df = pd.DataFrame(data) if isinstance(by, str) and by == "_index": df = df.set_index("group") From 47cabb285ea550b0ab70cc7ca4cc203c571633f4 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 20:39:28 +0000 Subject: [PATCH 05/10] implement dropna null key warning --- pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 17 ++++++--- pandas/core/groupby/grouper.py | 64 ++++++++++++++++++++++++++-------- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6158e19737185..6e4df26b2bd1d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9148,7 +9148,7 @@ def groupby( sort: bool = True, group_keys: bool = True, observed: bool = True, - dropna: bool = True, + dropna: bool | lib.NoDefault = lib.no_default, ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 33539e8d294c1..428135df6cce1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -486,6 +486,12 @@ def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) + @property + def dropna(self) -> bool: + if self._dropna is lib.no_default: + return True + return self._dropna + @final @property def groups(self) -> dict[Hashable, Index]: @@ -1053,7 +1059,7 @@ def __init__( sort: bool = True, group_keys: bool = True, observed: bool = False, - dropna: bool = True, + dropna: bool | 
lib.NoDefault = lib.no_default, ) -> None: self._selection = selection @@ -1064,7 +1070,7 @@ def __init__( self.keys = keys self.sort = sort self.group_keys = group_keys - self.dropna = dropna + self._dropna = dropna if grouper is None: grouper, exclusions, obj = get_grouper( @@ -1073,7 +1079,7 @@ def __init__( level=level, sort=sort, observed=observed, - dropna=self.dropna, + dropna=self._dropna, ) self.observed = observed @@ -2664,7 +2670,8 @@ def _value_counts( groupings, sort=False, observed=self.observed, - dropna=self.dropna, + # TODO: Should we pass through lib.no_default? + dropna=self._dropna, ) result_series = cast(Series, gb.size()) result_series.name = name @@ -2695,7 +2702,7 @@ def _value_counts( indexed_group_size = result_series.groupby( result_series.index.droplevel(levels), sort=self.sort, - dropna=self.dropna, + dropna=self._dropna, # GH#43999 - deprecation of observed=False observed=False, ).transform("sum") diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c9d874fc08dbe..81791219868d4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -9,12 +9,18 @@ TYPE_CHECKING, final, ) +import warnings import numpy as np +from pandas._libs import lib from pandas._libs.tslibs import OutOfBoundsDatetime -from pandas.errors import InvalidIndexError +from pandas.errors import ( + InvalidIndexError, + NullKeyWarning, +) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_list_like, @@ -55,6 +61,13 @@ from pandas.core.generic import NDFrame +_NULL_KEY_MESSAGE = ( + "`dropna` is not specified but grouper encountered null group keys. These keys " + "will be dropped from the result by default. To keep null keys, set `dropna=False`, " + "or to hide this warning and drop null keys, set `dropna=True`." +) + + class Grouper: """ A Grouper allows the user to specify a groupby instruction for an object.
@@ -246,7 +259,7 @@ class Grouper: """ sort: bool - dropna: bool + dropna: bool | lib.NoDefault _grouper: Index | None _attributes: tuple[str, ...] = ("key", "level", "freq", "sort", "dropna") @@ -264,7 +277,7 @@ def __init__( level=None, freq=None, sort: bool = False, - dropna: bool = True, + dropna: bool | lib.NoDefault = lib.no_default, ) -> None: self.key = key self.level = level @@ -442,7 +455,7 @@ def __init__( sort: bool = True, observed: bool = False, in_axis: bool = False, - dropna: bool = True, + dropna: bool | lib.NoDefault = lib.no_default, uniques: ArrayLike | None = None, ) -> None: self.level = level @@ -599,6 +612,12 @@ def codes(self) -> npt.NDArray[np.signedinteger]: def uniques(self) -> ArrayLike: return self._codes_and_uniques[1] + @property + def dropna(self) -> bool: + if self._dropna is lib.no_default: + return True + return self._dropna + @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike @@ -617,11 +636,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: else: ucodes = np.arange(len(categories)) - has_dropped_na = False - if not self._dropna: - na_mask = cat.isna() - if np.any(na_mask): - has_dropped_na = True + has_na_values = False + na_mask = cat.isna() + if np.any(na_mask): + has_na_values = True + if not self.dropna: if self._sort: # NA goes at the end, gets `largest non-NA code + 1` na_code = len(categories) @@ -637,11 +656,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: ) codes = cat.codes - if has_dropped_na: - if not self._sort: - # NA code is based on first appearance, increment higher codes - codes = np.where(codes >= na_code, codes + 1, codes) - codes = np.where(na_mask, na_code, codes) + if has_na_values: + if not self.dropna: + if not self._sort: + # NA code is based on first appearance, increment higher codes + codes = np.where(codes >= na_code, codes + 1, codes) + codes = 
np.where(na_mask, na_code, codes) + elif self._dropna is lib.no_default: + warnings.warn( + _NULL_KEY_MESSAGE, + NullKeyWarning, + stacklevel=find_stack_level(), + ) return codes, uniques @@ -660,8 +686,16 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: # error: Incompatible types in assignment (expression has type "Union[ # ndarray[Any, Any], Index]", variable has type "Categorical") codes, uniques = algorithms.factorize( # type: ignore[assignment] - self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna + self.grouping_vector, sort=self._sort, use_na_sentinel=self.dropna ) + # TODO: Is `min(codes)` or `-1 in codes` faster? + if self._dropna is lib.no_default and (codes == -1).any(): + warnings.warn( + _NULL_KEY_MESSAGE, + NullKeyWarning, + stacklevel=find_stack_level(), + ) + return codes, uniques @cache_readonly From 0bf986ec904529f62a91992ce9b4324b4b451099 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 20:54:25 +0000 Subject: [PATCH 06/10] add test for `Series.groupby` --- pandas/tests/groupby/test_groupby_dropna.py | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 4ff97f7827a8b..5bb49eb0b2b38 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -421,6 +421,34 @@ def test_groupby_nan_included_warns(by, dropna): result = grouped.indices # noqa:F841 +@pytest.mark.parametrize( + "by_type", + [ + "level", + "argument", + ], +) +@pytest.mark.parametrize("dropna", [True, False, None]) +def test_groupby_series_nan_included_warns(by_type, dropna): + # GH 61339 + index = ["a", "a", "b", np.nan] + ser = pd.Series([1, 2, 3, 3]) + + if by_type == "level": + ser = ser.set_axis(index, axis=0) + kwargs = {"level": 0} + elif by_type == "argument": + kwargs = {"by": index} + + warning_type = pd.errors.NullKeyWarning + if dropna is not None: 
+ kwargs["dropna"] = dropna + warning_type = None + + with tm.assert_produces_warning(warning_type): + ser.groupby(**kwargs).sum() + + def test_groupby_drop_nan_with_multi_index(): # GH 39895 df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"]) From cee2378f2cc4b6d60fb82ea89c082a47d4bb6515 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Thu, 24 Apr 2025 20:55:57 +0000 Subject: [PATCH 07/10] implement for `Series.groupby` --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index d6a982c65e9fd..6b3033e17973c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1972,7 +1972,7 @@ def groupby( sort: bool = True, group_keys: bool = True, observed: bool = False, - dropna: bool = True, + dropna: bool | lib.NoDefault = lib.no_default, ) -> SeriesGroupBy: from pandas.core.groupby.generic import SeriesGroupBy From 41131a14324ababc5c81f194de3d9a239d120f27 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Fri, 25 Apr 2025 13:50:29 +0000 Subject: [PATCH 08/10] add mode.null_grouper_warning option --- pandas/core/config_init.py | 16 ++++++++++++++++ pandas/core/groupby/grouper.py | 12 +++++++++--- pandas/tests/groupby/test_groupby_dropna.py | 6 ++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 20fe8cbab1c9f..1cb1750af0092 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -457,6 +457,22 @@ def is_terminal() -> bool: ) +null_grouper_warning = """ +: bool + Whether to show or hide NullKeyWarning if default grouping would result in a + null group key being dropped. + The default is False +""" + +with cf.config_prefix("mode"): + cf.register_option( + "null_grouper_warning", + False, + null_grouper_warning, + validator=is_bool, + ) + + string_storage_doc = """ : string The default storage for StringDtype.
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 81791219868d4..e6072dd518fd5 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -13,6 +13,8 @@ import numpy as np +from pandas._config.config import get_option + from pandas._libs import lib from pandas._libs.tslibs import OutOfBoundsDatetime from pandas.errors import ( @@ -621,6 +623,7 @@ def dropna(self) -> bool: @cache_readonly def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: uniques: ArrayLike + unspecified_dropna = self._dropna is lib.no_default if self._passed_categorical: # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes; @@ -662,7 +665,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: # NA code is based on first appearance, increment higher codes codes = np.where(codes >= na_code, codes + 1, codes) codes = np.where(na_mask, na_code, codes) - elif self._dropna is lib.no_default: + elif get_option("null_grouper_warning") and unspecified_dropna: warnings.warn( _NULL_KEY_MESSAGE, NullKeyWarning, @@ -688,8 +691,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: codes, uniques = algorithms.factorize( # type: ignore[assignment] self.grouping_vector, sort=self._sort, use_na_sentinel=self.dropna ) - # TODO: Is `min(codes)` or `-1 in codes` faster? 
- if self._dropna is lib.no_default and (codes == -1).any(): + if ( + get_option("null_grouper_warning") + and unspecified_dropna + and codes.min() == -1 + ): warnings.warn( _NULL_KEY_MESSAGE, NullKeyWarning, diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 5bb49eb0b2b38..6311e89e61d44 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -10,6 +10,12 @@ from pandas.tests.groupby import get_groupby_method_args +@pytest.fixture(scope="module", autouse=True) +def setup_warnings(): + with pd.option_context("mode.null_grouper_warning", True): + yield + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From 692c153a6bcfac2ac4306cfc8d12481f8a1e9f6b Mon Sep 17 00:00:00 2001 From: Thomas H Date: Fri, 25 Apr 2025 21:14:43 +0000 Subject: [PATCH 09/10] fix tests which trigger NullKeyWarning this will help with PDEP-11 (#53094) as an intermediate step to identify tests that will fail under the default value --- pandas/tests/groupby/test_categorical.py | 14 ++++++++------ pandas/tests/groupby/test_groupby.py | 21 +++++++++++---------- pandas/tests/groupby/test_grouping.py | 4 ++-- pandas/tests/groupby/test_indexing.py | 1 + pandas/tests/groupby/test_reductions.py | 19 +++++++++++-------- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e49be8c00b426..f6c12b677d615 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -192,7 +192,7 @@ def test_basic_cut_grouping(): # GH 9603 df = DataFrame({"a": [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) - result = df.groupby(c, observed=False).apply(len) + result = df.groupby(c, observed=False, dropna=True).apply(len) exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], 
index=exp_index) @@ -568,7 +568,7 @@ def test_observed_groups_with_nan(observed): "vals": [1, 2, 3], } ) - g = df.groupby("cat", observed=observed) + g = df.groupby("cat", observed=observed, dropna=True) result = g.groups if observed: expected = {"a": Index([0, 2], dtype="int64")} @@ -587,7 +587,7 @@ def test_observed_nth(): ser = Series([1, 2, 3]) df = DataFrame({"cat": cat, "ser": ser}) - result = df.groupby("cat", observed=False)["ser"].nth(0) + result = df.groupby("cat", observed=False, dropna=True)["ser"].nth(0) expected = df["ser"].iloc[[0]] tm.assert_series_equal(result, expected) @@ -597,7 +597,7 @@ def test_dataframe_categorical_with_nan(observed): s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"]) s2 = Series([1, 2, 3, 4]) df = DataFrame({"s1": s1, "s2": s2}) - result = df.groupby("s1", observed=observed).first().reset_index() + result = df.groupby("s1", observed=observed, dropna=True).first().reset_index() if observed: expected = DataFrame( {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]} @@ -768,7 +768,9 @@ def test_categorical_series(series, data): # Group the given series by a series with categorical data type such that group A # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in # the given data. 
- groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False) + groupby = series.groupby( + Series(list("ABBA"), dtype="category"), observed=False, dropna=True + ) result = groupby.aggregate(list) expected = Series(data, index=CategoricalIndex(data.keys())) tm.assert_series_equal(result, expected) @@ -973,7 +975,7 @@ def test_groupby_empty_with_category(): # test fix for when group by on None resulted in # coercion of dtype categorical -> float df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])}) - result = df.groupby("A").first()["B"] + result = df.groupby("A", dropna=True).first()["B"] expected = Series( Categorical([], categories=["test", "train"]), index=Series([], dtype="object", name="A"), diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4b1f23c1f755e..cbbd4b5fb33d5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -140,9 +140,9 @@ def test_len(): def test_len_nan_group(): # issue 11016 df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]}) - assert len(df.groupby("a")) == 0 + assert len(df.groupby("a", dropna=True)) == 0 assert len(df.groupby("b")) == 3 - assert len(df.groupby(["a", "b"])) == 0 + assert len(df.groupby(["a", "b"], dropna=True)) == 0 def test_groupby_timedelta_median(): @@ -922,6 +922,7 @@ def test_groupby_complex_numbers(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning") def test_groupby_series_indexed_differently(): s1 = Series( [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], @@ -1215,7 +1216,7 @@ def test_groupby_nat_exclude(): "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"], } ) - grouped = df.groupby("dt") + grouped = df.groupby("dt", dropna=True) expected = [ RangeIndex(start=1, stop=13, step=6), @@ -1253,7 +1254,7 @@ def test_groupby_nat_exclude(): assert nan_df["nat"].dtype == "datetime64[s]" for key in ["nan", "nat"]: - grouped = 
nan_df.groupby(key) + grouped = nan_df.groupby(key, dropna=True) assert grouped.groups == {} assert grouped.ngroups == 0 assert grouped.indices == {} @@ -1266,7 +1267,7 @@ def test_groupby_nat_exclude(): def test_groupby_two_group_keys_all_nan(): # GH #36842: Grouping over two group keys shouldn't raise an error df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]}) - result = df.groupby(["a", "b"]).indices + result = df.groupby(["a", "b"], dropna=True).indices assert result == {} @@ -2050,7 +2051,7 @@ def test_groupby_only_none_group(): # see GH21624 # this was crashing with "ValueError: Length of passed values is 1, index implies 0" df = DataFrame({"g": [None], "x": 1}) - actual = df.groupby("g")["x"].transform("sum") + actual = df.groupby("g", dropna=True)["x"].transform("sum") expected = Series([np.nan], name="x") tm.assert_series_equal(actual, expected) @@ -2295,7 +2296,7 @@ def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex): def test_groupby_all_nan_groups_drop(): # GH 15036 s = Series([1, 2, 3], [np.nan, np.nan, np.nan]) - result = s.groupby(s.index).sum() + result = s.groupby(s.index, dropna=True).sum() expected = Series([], index=Index([], dtype=np.float64), dtype=np.int64) tm.assert_series_equal(result, expected) @@ -2459,7 +2460,7 @@ def test_groupby_none_in_first_mi_level(): # GH#47348 arr = [[None, 1, 0, 1], [2, 3, 2, 3]] ser = Series(1, index=MultiIndex.from_arrays(arr, names=["a", "b"])) - result = ser.groupby(level=[0, 1]).sum() + result = ser.groupby(level=[0, 1], dropna=True).sum() expected = Series( [1, 2], MultiIndex.from_tuples([(0.0, 2), (1.0, 3)], names=["a", "b"]) ) @@ -2632,9 +2633,9 @@ def test_groupby_method_drop_na(method): df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)}) if method == "nth": - result = getattr(df.groupby("A"), method)(n=0) + result = getattr(df.groupby("A", dropna=True), method)(n=0) else: - result = getattr(df.groupby("A"), method)() + result = 
getattr(df.groupby("A", dropna=True), method)() if method in ["first", "last"]: expected = DataFrame({"B": [0, 2, 4]}).set_index( diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 53e9c53efebf7..fd86f6150215a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -691,7 +691,7 @@ def test_groupby_level_with_nas(self, sort): # factorizing doesn't confuse things s = Series(np.arange(8.0), index=index) - result = s.groupby(level=0, sort=sort).sum() + result = s.groupby(level=0, sort=sort, dropna=True).sum() expected = Series([6.0, 18.0], index=[0.0, 1.0]) tm.assert_series_equal(result, expected) @@ -817,7 +817,7 @@ def test_groupby_level_index_value_all_na(self): df = DataFrame( [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"] ).set_index(["A", "B"]) - result = df.groupby(level=["A", "B"]).sum() + result = df.groupby(level=["A", "B"], dropna=True).sum() expected = DataFrame( data=[], index=MultiIndex( diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py index a3d3f509e186a..60856c94c3b46 100644 --- a/pandas/tests/groupby/test_indexing.py +++ b/pandas/tests/groupby/test_indexing.py @@ -294,6 +294,7 @@ def test_groupby_duplicated_columns(func): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning") def test_groupby_get_nonexisting_groups(): # GH#32492 df = pd.DataFrame( diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 014558bbf4bba..06986d73b72cd 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -656,7 +656,7 @@ def test_multifunc_skipna(func, values, dtype, result_dtype, skipna): tm.assert_series_equal(result, expected) -def test_cython_median(): +def test_cython_median(dropna): arr = np.random.default_rng(2).standard_normal(1000) arr[::2] = np.nan df = DataFrame(arr) @@ 
-664,24 +664,26 @@ def test_cython_median(): labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float) labels[::17] = np.nan - result = df.groupby(labels).median() - exp = df.groupby(labels).agg(np.nanmedian) + result = df.groupby(labels, dropna=dropna).median() + exp = df.groupby(labels, dropna=dropna).agg(np.nanmedian) tm.assert_frame_equal(result, exp) df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5))) - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() + rs = df.groupby(labels, dropna=dropna).agg(np.median) + xp = df.groupby(labels, dropna=dropna).median() tm.assert_frame_equal(rs, xp) -def test_median_empty_bins(observed): +def test_median_empty_bins(observed, dropna): df = DataFrame(np.random.default_rng(2).integers(0, 44, 500)) grps = range(0, 55, 5) bins = pd.cut(df[0], grps) - result = df.groupby(bins, observed=observed).median() - expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) + result = df.groupby(bins, observed=observed, dropna=dropna).median() + expected = df.groupby(bins, observed=observed, dropna=dropna).agg( + lambda x: x.median() + ) tm.assert_frame_equal(result, expected) @@ -1069,6 +1071,7 @@ def test_max_nan_bug(): @pytest.mark.slow +@pytest.mark.filterwarnings("ignore::pandas.errors.NullKeyWarning") @pytest.mark.parametrize("with_nan", [True, False]) @pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]]) def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys): From 9822a66cb62d48898b6b703164d331f375295672 Mon Sep 17 00:00:00 2001 From: Thomas H Date: Fri, 25 Apr 2025 21:16:56 +0000 Subject: [PATCH 10/10] resolve repr change and empty grouper bug --- pandas/core/groupby/grouper.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e6072dd518fd5..a4a5d075cfc43 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ 
-285,13 +285,19 @@ def __init__( self.level = level self.freq = freq self.sort = sort - self.dropna = dropna + self._dropna = dropna self._indexer_deprecated: npt.NDArray[np.intp] | None = None self.binner = None self._grouper = None self._indexer: npt.NDArray[np.intp] | None = None + @property + def dropna(self): + if self._dropna is lib.no_default: + return True + return self._dropna + def _get_grouper( self, obj: NDFrameT, validate: bool = True ) -> tuple[ops.BaseGrouper, NDFrameT]: @@ -694,7 +700,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: if ( get_option("null_grouper_warning") and unspecified_dropna - and codes.min() == -1 + and codes.min(initial=0) == -1 ): warnings.warn( _NULL_KEY_MESSAGE,