Skip to content

Add warning to .groupby when null keys would be dropped due to default dropna #61351

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions doc/source/reference/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Exceptions and warnings
errors.MergeError
errors.NoBufferPresent
errors.NullFrequencyError
errors.NullKeyWarning
errors.NumbaUtilError
errors.NumExprClobberingError
errors.OptionError
Expand Down
16 changes: 16 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,22 @@ def is_terminal() -> bool:
)


# User-facing description for ``mode.null_grouper_warning`` (surfaced via
# ``pd.describe_option``).  The option gates NullKeyWarning, which fires when a
# groupby with an unspecified ``dropna`` would silently drop null group keys.
# NOTE: the option is a boolean (registered with ``is_bool`` below), so the
# doc header is ``: bool`` — the previous ``: string`` label was incorrect.
null_grouper_warning = """
: bool
    Whether to show a NullKeyWarning when default grouping would result in a
    null group key being dropped from the result.
    The default is False.
"""

with cf.config_prefix("mode"):
    cf.register_option(
        "null_grouper_warning",
        False,
        null_grouper_warning,
        validator=is_bool,
    )


string_storage_doc = """
: string
The default storage for StringDtype.
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9148,7 +9148,7 @@ def groupby(
sort: bool = True,
group_keys: bool = True,
observed: bool = True,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> DataFrameGroupBy:
from pandas.core.groupby.generic import DataFrameGroupBy

Expand Down
17 changes: 12 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,12 @@ def __repr__(self) -> str:
# TODO: Better repr for GroupBy object
return object.__repr__(self)

@property
def dropna(self) -> bool:
    """Resolved ``dropna`` flag: an unspecified value behaves as True."""
    # ``_dropna`` holds the raw constructor argument; the ``no_default``
    # sentinel marks "user did not pass dropna", which keeps the historical
    # default behavior of dropping null group keys.
    requested = self._dropna
    return True if requested is lib.no_default else requested
Comment on lines +489 to +493
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know the implementation is trivial, but this is redundant with Grouper. I'm not sure we can get around it while still being a class property, but should the default value be referenced as a constant defined just once?


@final
@property
def groups(self) -> dict[Hashable, Index]:
Expand Down Expand Up @@ -1053,7 +1059,7 @@ def __init__(
sort: bool = True,
group_keys: bool = True,
observed: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> None:
self._selection = selection

Expand All @@ -1064,7 +1070,7 @@ def __init__(
self.keys = keys
self.sort = sort
self.group_keys = group_keys
self.dropna = dropna
self._dropna = dropna

if grouper is None:
grouper, exclusions, obj = get_grouper(
Expand All @@ -1073,7 +1079,7 @@ def __init__(
level=level,
sort=sort,
observed=observed,
dropna=self.dropna,
dropna=self._dropna,
)

self.observed = observed
Expand Down Expand Up @@ -2664,7 +2670,8 @@ def _value_counts(
groupings,
sort=False,
observed=self.observed,
dropna=self.dropna,
# TODO: Should we pass through lib.no_default?
dropna=self._dropna,
)
result_series = cast(Series, gb.size())
result_series.name = name
Expand Down Expand Up @@ -2695,7 +2702,7 @@ def _value_counts(
indexed_group_size = result_series.groupby(
result_series.index.droplevel(levels),
sort=self.sort,
dropna=self.dropna,
dropna=self._dropna,
# GH#43999 - deprecation of observed=False
observed=False,
).transform("sum")
Expand Down
78 changes: 62 additions & 16 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,20 @@
TYPE_CHECKING,
final,
)
import warnings

import numpy as np

from pandas._config.config import get_option

from pandas._libs import lib
from pandas._libs.tslibs import OutOfBoundsDatetime
from pandas.errors import InvalidIndexError
from pandas.errors import (
InvalidIndexError,
NullKeyWarning,
)
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_list_like,
Expand Down Expand Up @@ -55,6 +63,13 @@
from pandas.core.generic import NDFrame


_NULL_KEY_MESSAGE = (
"`dropna` is not specified but grouper encountered null group keys. These keys "
"will be dropped from the result by default. To keep null keys, set `dropna=True`, "
"or to hide this warning and drop null keys, set `dropna=False`."
)
Comment on lines +66 to +70
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a standard approach for a warning message that could be hit from two lines of code?



class Grouper:
"""
A Grouper allows the user to specify a groupby instruction for an object.
Expand Down Expand Up @@ -246,7 +261,7 @@ class Grouper:
"""

sort: bool
dropna: bool
dropna: bool | lib.NoDefault
_grouper: Index | None

_attributes: tuple[str, ...] = ("key", "level", "freq", "sort", "dropna")
Expand All @@ -264,19 +279,25 @@ def __init__(
level=None,
freq=None,
sort: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> None:
self.key = key
self.level = level
self.freq = freq
self.sort = sort
self.dropna = dropna
self._dropna = dropna

self._indexer_deprecated: npt.NDArray[np.intp] | None = None
self.binner = None
self._grouper = None
self._indexer: npt.NDArray[np.intp] | None = None

@property
def dropna(self):
    """Return the effective ``dropna`` setting, defaulting to True."""
    # An explicit user value wins; the ``no_default`` sentinel falls back to
    # the historical default of dropping null group keys.
    if self._dropna is not lib.no_default:
        return self._dropna
    return True

def _get_grouper(
self, obj: NDFrameT, validate: bool = True
) -> tuple[ops.BaseGrouper, NDFrameT]:
Expand Down Expand Up @@ -442,7 +463,7 @@ def __init__(
sort: bool = True,
observed: bool = False,
in_axis: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
uniques: ArrayLike | None = None,
) -> None:
self.level = level
Expand Down Expand Up @@ -599,9 +620,16 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
def uniques(self) -> ArrayLike:
return self._codes_and_uniques[1]

@property
def dropna(self) -> bool:
    """Effective ``dropna`` flag; the ``no_default`` sentinel means True."""
    value = self._dropna
    return value if value is not lib.no_default else True

@cache_readonly
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques: ArrayLike
unspecified_dropna = self._dropna is lib.no_default
if self._passed_categorical:
# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes;
Expand All @@ -617,11 +645,11 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
else:
ucodes = np.arange(len(categories))

has_dropped_na = False
if not self._dropna:
na_mask = cat.isna()
if np.any(na_mask):
has_dropped_na = True
has_na_values = False
na_mask = cat.isna()
if np.any(na_mask):
has_na_values = True
if not self.dropna:
if self._sort:
# NA goes at the end, gets `largest non-NA code + 1`
na_code = len(categories)
Expand All @@ -637,11 +665,18 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
)
codes = cat.codes

if has_dropped_na:
if not self._sort:
# NA code is based on first appearance, increment higher codes
codes = np.where(codes >= na_code, codes + 1, codes)
codes = np.where(na_mask, na_code, codes)
if has_na_values:
if not self.dropna:
if not self._sort:
# NA code is based on first appearance, increment higher codes
codes = np.where(codes >= na_code, codes + 1, codes)
codes = np.where(na_mask, na_code, codes)
elif get_option("null_grouper_warning") and unspecified_dropna:
warnings.warn(
_NULL_KEY_MESSAGE,
NullKeyWarning,
stacklevel=find_stack_level(),
)

return codes, uniques

Expand All @@ -660,8 +695,19 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
# error: Incompatible types in assignment (expression has type "Union[
# ndarray[Any, Any], Index]", variable has type "Categorical")
codes, uniques = algorithms.factorize( # type: ignore[assignment]
self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
self.grouping_vector, sort=self._sort, use_na_sentinel=self.dropna
)
if (
get_option("null_grouper_warning")
and unspecified_dropna
and codes.min(initial=0) == -1
):
warnings.warn(
_NULL_KEY_MESSAGE,
NullKeyWarning,
stacklevel=find_stack_level(),
)

return codes, uniques

@cache_readonly
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1972,7 +1972,7 @@ def groupby(
sort: bool = True,
group_keys: bool = True,
observed: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
) -> SeriesGroupBy:
from pandas.core.groupby.generic import SeriesGroupBy

Expand Down
24 changes: 24 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,29 @@ class CategoricalConversionWarning(Warning):
"""


class NullKeyWarning(Warning):
    """
    Warning raised when grouping on null/NA keys with the default `dropna` argument.

    This warning helps ensure data integrity and alerts users to potential issues
    during grouping/aggregating when the default value of `dropna` would lead to
    null keys being dropped from the output.

    For more information, see the discussion of PDEP-11 in GH#53094.

    See Also
    --------
    DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns.
    DataFrame.pivot_table : Create a spreadsheet-style pivot table as a DataFrame.

    Examples
    --------
    >>> df = pd.DataFrame({"A": ["a", None], "B": [1, 2]})
    >>> df.groupby(["A"]).sum()  # doctest: +SKIP
    ... # NullKeyWarning: ...
    """


class LossySetitemError(Exception):
"""
Raised when trying to do a __setitem__ on an np.ndarray that is not lossless.
Expand Down Expand Up @@ -927,6 +950,7 @@ class InvalidComparison(Exception):
"MergeError",
"NoBufferPresent",
"NullFrequencyError",
"NullKeyWarning",
"NumExprClobberingError",
"NumbaUtilError",
"OptionError",
Expand Down
14 changes: 8 additions & 6 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def test_basic_cut_grouping():
# GH 9603
df = DataFrame({"a": [1, 0, 0, 0]})
c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
result = df.groupby(c, observed=False).apply(len)
result = df.groupby(c, observed=False, dropna=True).apply(len)

exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
expected = Series([1, 0, 0, 0], index=exp_index)
Expand Down Expand Up @@ -568,7 +568,7 @@ def test_observed_groups_with_nan(observed):
"vals": [1, 2, 3],
}
)
g = df.groupby("cat", observed=observed)
g = df.groupby("cat", observed=observed, dropna=True)
result = g.groups
if observed:
expected = {"a": Index([0, 2], dtype="int64")}
Expand All @@ -587,7 +587,7 @@ def test_observed_nth():
ser = Series([1, 2, 3])
df = DataFrame({"cat": cat, "ser": ser})

result = df.groupby("cat", observed=False)["ser"].nth(0)
result = df.groupby("cat", observed=False, dropna=True)["ser"].nth(0)
expected = df["ser"].iloc[[0]]
tm.assert_series_equal(result, expected)

Expand All @@ -597,7 +597,7 @@ def test_dataframe_categorical_with_nan(observed):
s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
s2 = Series([1, 2, 3, 4])
df = DataFrame({"s1": s1, "s2": s2})
result = df.groupby("s1", observed=observed).first().reset_index()
result = df.groupby("s1", observed=observed, dropna=True).first().reset_index()
if observed:
expected = DataFrame(
{"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
Expand Down Expand Up @@ -768,7 +768,9 @@ def test_categorical_series(series, data):
# Group the given series by a series with categorical data type such that group A
# takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
# the given data.
groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False)
groupby = series.groupby(
Series(list("ABBA"), dtype="category"), observed=False, dropna=True
)
result = groupby.aggregate(list)
expected = Series(data, index=CategoricalIndex(data.keys()))
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -973,7 +975,7 @@ def test_groupby_empty_with_category():
# test fix for when group by on None resulted in
# coercion of dtype categorical -> float
df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])})
result = df.groupby("A").first()["B"]
result = df.groupby("A", dropna=True).first()["B"]
expected = Series(
Categorical([], categories=["test", "train"]),
index=Series([], dtype="object", name="A"),
Expand Down
Loading
Loading