diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 8377f72cd3768..848b71363ce20 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -7589,6 +7589,7 @@ def drop( axis: Optional[Axis] = 0, index: Union[Name, List[Name]] = None, columns: Union[Name, List[Name]] = None, + errors: str = "raise", ) -> "DataFrame": """ Drop specified labels from columns. @@ -7614,6 +7615,10 @@ def drop( columns : single label or list-like Alternative to specifying axis (``labels, axis=1`` is equivalent to ``columns=labels``). + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are dropped. + + .. versionadded:: 4.1.0 Returns ------- @@ -7677,14 +7682,16 @@ def drop( ----- Currently, dropping rows of a MultiIndex DataFrame is not supported yet. """ + if errors not in ("raise", "ignore"): + raise ValueError("errors must be either 'raise' or 'ignore'") if labels is not None: if index is not None or columns is not None: raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") axis = validate_axis(axis) if axis == 1: - return self.drop(index=index, columns=labels) + return self.drop(index=index, columns=labels, errors=errors) else: - return self.drop(index=labels, columns=columns) + return self.drop(index=labels, columns=columns, errors=errors) else: if index is None and columns is None: raise ValueError("Need to specify at least one of 'labels' or 'columns' or 'index'") @@ -7737,8 +7744,17 @@ def drop( for col in columns if label[: len(col)] == col ) + if errors == "raise": + missing = [ + col + for col in columns + if not any(label[: len(col)] == col for label in internal.column_labels) + ] + if missing: + raise KeyError(missing) + if len(drop_column_labels) == 0: - raise KeyError(columns) + return DataFrame(internal) keep_columns_and_labels = [ (column, label) diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index 
4eddbe5ad8adc..cee9d2620277e 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -2554,6 +2554,7 @@ def drop( columns: Optional[Union[Name, List[Name]]] = None, level: Optional[int] = None, inplace: bool = False, + errors: str = "raise", ) -> "Series": """ Return Series with specified index labels removed. @@ -2577,6 +2578,10 @@ def drop( If True, do operation inplace and return None .. versionadded:: 3.4.0 + errors : {'ignore', 'raise'}, default 'raise' + If 'ignore', suppress error and only existing labels are dropped. + + .. versionadded:: 4.1.0 Returns ------- @@ -2685,6 +2690,8 @@ def drop( length 0.3 dtype: float64 """ + if errors not in ("raise", "ignore"): + raise ValueError("errors must be either 'raise' or 'ignore'") dropped = self._drop( labels=labels, index=index, level=level, inplace=inplace, columns=columns ) diff --git a/python/pyspark/pandas/tests/frame/test_reindexing.py b/python/pyspark/pandas/tests/frame/test_reindexing.py index cc1a7b6ed1c13..7bf63fdc80f83 100644 --- a/python/pyspark/pandas/tests/frame/test_reindexing.py +++ b/python/pyspark/pandas/tests/frame/test_reindexing.py @@ -318,6 +318,70 @@ def test_drop(self): lambda: psdf.drop(labels="A", axis=0, columns="X"), ) + def test_drop_with_errors(self): + pdf = pd.DataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}, index=np.random.rand(2)) + psdf = ps.from_pandas(pdf) + + # errors='ignore' with all-missing columns + self.assert_eq( + psdf.drop(columns=["a", "b"], errors="ignore"), + pdf.drop(columns=["a", "b"], errors="ignore"), + ) + + # errors='ignore' with some existing, some missing columns + self.assert_eq( + psdf.drop(columns=["x", "a"], errors="ignore"), + pdf.drop(columns=["x", "a"], errors="ignore"), + ) + + # errors='ignore' via labels + axis=1 + self.assert_eq( + psdf.drop(["x", "a"], axis=1, errors="ignore"), + pdf.drop(["x", "a"], axis=1, errors="ignore"), + ) + + # errors='raise' (explicit) should still raise for missing columns + 
self.assertRaises(KeyError, lambda: psdf.drop(columns=["a", "b"], errors="raise")) + + # errors='raise' with partial match (some exist, some don't) + self.assertRaises(KeyError, lambda: psdf.drop(columns=["x", "a"], errors="raise")) + + # errors='raise' is the default + self.assertRaises(KeyError, lambda: psdf.drop(columns=["x", "a"])) + + # errors='ignore' for row drops + pdf2 = pd.DataFrame({"X": [1, 2, 3], "Y": [4, 5, 6]}, index=["A", "B", "C"]) + psdf2 = ps.from_pandas(pdf2) + self.assert_eq( + psdf2.drop(index=["A", "Z"], errors="ignore"), + pdf2.drop(index=["A", "Z"], errors="ignore"), + ) + + # errors='ignore' for combined row and column drops + self.assert_eq( + psdf2.drop(index=["A"], columns=["X", "W"], errors="ignore"), + pdf2.drop(index=["A"], columns=["X", "W"], errors="ignore"), + ) + + # MultiIndex columns with errors='ignore' + columns = pd.MultiIndex.from_tuples([(1, "x"), (1, "y"), (2, "z")]) + pdf.columns = columns + psdf = ps.from_pandas(pdf) + self.assert_eq( + psdf.drop(columns=3, errors="ignore"), + pdf.drop(columns=3, errors="ignore"), + ) + self.assert_eq( + psdf.drop(columns=(1, "z"), errors="ignore"), + pdf.drop(columns=(1, "z"), errors="ignore"), + ) + + # Invalid errors value + self.assertRaises( + ValueError, + lambda: psdf.drop(columns=[1], errors="invalid"), + ) + def test_droplevel(self): pdf = ( pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) diff --git a/python/pyspark/pandas/tests/series/test_compute.py b/python/pyspark/pandas/tests/series/test_compute.py index 6dad66387dee6..76389c34c7988 100644 --- a/python/pyspark/pandas/tests/series/test_compute.py +++ b/python/pyspark/pandas/tests/series/test_compute.py @@ -240,6 +240,34 @@ def test_drop(self): self.assert_eq(psser, pser) self.assert_eq(psdf, pdf) + def test_drop_with_errors(self): + pser = pd.Series([10, 20, 30], index=["a", "b", "c"]) + psser = ps.from_pandas(pser) + + # errors='ignore' with missing index label + self.assert_eq( + psser.drop(["a", "x"], 
errors="ignore"), + pser.drop(["a", "x"], errors="ignore"), + ) + + # errors='ignore' with all-missing index labels + self.assert_eq( + psser.drop(["x", "y"], errors="ignore"), + pser.drop(["x", "y"], errors="ignore"), + ) + + # errors='ignore' with columns (no-op for Series) + self.assert_eq( + psser.drop(columns=["a"], errors="ignore"), + pser.drop(columns=["a"], errors="ignore"), + ) + + # Invalid errors value + self.assertRaises( + ValueError, + lambda: psser.drop("a", errors="invalid"), + ) + def test_pop(self): midx = pd.MultiIndex( [["lama", "cow", "falcon"], ["speed", "weight", "length"]],