Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7589,6 +7589,7 @@ def drop(
axis: Optional[Axis] = 0,
index: Union[Name, List[Name]] = None,
columns: Union[Name, List[Name]] = None,
errors: str = "raise",
) -> "DataFrame":
"""
Drop specified labels from columns.
Expand All @@ -7614,6 +7615,10 @@ def drop(
columns : single label or list-like
Alternative to specifying axis (``labels, axis=1``
is equivalent to ``columns=labels``).
errors : {{'ignore', 'raise'}}, default 'raise'
If 'ignore', suppress error and only existing labels are dropped.

.. versionadded:: 4.1.0

Returns
-------
Expand Down Expand Up @@ -7677,14 +7682,16 @@ def drop(
-----
Currently, dropping rows of a MultiIndex DataFrame is not supported yet.
"""
if errors not in ("raise", "ignore"):
raise ValueError("errors must be either 'raise' or 'ignore'")
if labels is not None:
if index is not None or columns is not None:
raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
axis = validate_axis(axis)
if axis == 1:
return self.drop(index=index, columns=labels)
return self.drop(index=index, columns=labels, errors=errors)
else:
return self.drop(index=labels, columns=columns)
return self.drop(index=labels, columns=columns, errors=errors)
else:
if index is None and columns is None:
raise ValueError("Need to specify at least one of 'labels' or 'columns' or 'index'")
Expand Down Expand Up @@ -7737,8 +7744,17 @@ def drop(
for col in columns
if label[: len(col)] == col
)
if errors == "raise":
missing = [
col
for col in columns
if not any(label[: len(col)] == col for label in internal.column_labels)
]
if missing:
raise KeyError(missing)

if len(drop_column_labels) == 0:
raise KeyError(columns)
return DataFrame(internal)

keep_columns_and_labels = [
(column, label)
Expand Down
7 changes: 7 additions & 0 deletions python/pyspark/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2554,6 +2554,7 @@ def drop(
columns: Optional[Union[Name, List[Name]]] = None,
level: Optional[int] = None,
inplace: bool = False,
errors: str = "raise",
) -> "Series":
"""
Return Series with specified index labels removed.
Expand All @@ -2577,6 +2578,10 @@ def drop(
If True, do operation inplace and return None

.. versionadded:: 3.4.0
errors : {{'ignore', 'raise'}}, default 'raise'
If 'ignore', suppress error and only existing labels are dropped.

.. versionadded:: 4.1.0

Returns
-------
Expand Down Expand Up @@ -2685,6 +2690,8 @@ def drop(
length 0.3
dtype: float64
"""
if errors not in ("raise", "ignore"):
raise ValueError("errors must be either 'raise' or 'ignore'")
dropped = self._drop(
labels=labels, index=index, level=level, inplace=inplace, columns=columns
)
Expand Down
64 changes: 64 additions & 0 deletions python/pyspark/pandas/tests/frame/test_reindexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,70 @@ def test_drop(self):
lambda: psdf.drop(labels="A", axis=0, columns="X"),
)

def test_drop_with_errors(self):
    """Compare ``DataFrame.drop`` with the ``errors`` argument against pandas."""
    pdf = pd.DataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]}, index=np.random.rand(2))
    psdf = ps.from_pandas(pdf)

    # errors='ignore' on columns: first every label is missing, then only some are.
    for column_labels in (["a", "b"], ["x", "a"]):
        self.assert_eq(
            psdf.drop(columns=column_labels, errors="ignore"),
            pdf.drop(columns=column_labels, errors="ignore"),
        )

    # Same partial-match case, spelled as positional labels with axis=1.
    actual = psdf.drop(["x", "a"], axis=1, errors="ignore")
    expected = pdf.drop(["x", "a"], axis=1, errors="ignore")
    self.assert_eq(actual, expected)

    # A KeyError is expected for missing columns when errors='raise' is explicit
    # (all missing, then partially missing) and when it is left at its default.
    raising_calls = (
        {"columns": ["a", "b"], "errors": "raise"},
        {"columns": ["x", "a"], "errors": "raise"},
        {"columns": ["x", "a"]},
    )
    for drop_kwargs in raising_calls:
        with self.assertRaises(KeyError):
            psdf.drop(**drop_kwargs)

    # Row drops, and combined row/column drops, with errors='ignore'.
    pdf2 = pd.DataFrame({"X": [1, 2, 3], "Y": [4, 5, 6]}, index=["A", "B", "C"])
    psdf2 = ps.from_pandas(pdf2)
    actual = psdf2.drop(index=["A", "Z"], errors="ignore")
    expected = pdf2.drop(index=["A", "Z"], errors="ignore")
    self.assert_eq(actual, expected)

    actual = psdf2.drop(index=["A"], columns=["X", "W"], errors="ignore")
    expected = pdf2.drop(index=["A"], columns=["X", "W"], errors="ignore")
    self.assert_eq(actual, expected)

    # MultiIndex columns with errors='ignore': a missing top-level key,
    # then a missing full tuple.
    pdf.columns = pd.MultiIndex.from_tuples([(1, "x"), (1, "y"), (2, "z")])
    psdf = ps.from_pandas(pdf)
    for absent_label in (3, (1, "z")):
        self.assert_eq(
            psdf.drop(columns=absent_label, errors="ignore"),
            pdf.drop(columns=absent_label, errors="ignore"),
        )

    # Any value other than 'raise' / 'ignore' is rejected with ValueError.
    with self.assertRaises(ValueError):
        psdf.drop(columns=[1], errors="invalid")

def test_droplevel(self):
pdf = (
pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
Expand Down
28 changes: 28 additions & 0 deletions python/pyspark/pandas/tests/series/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,34 @@ def test_drop(self):
self.assert_eq(psser, pser)
self.assert_eq(psdf, pdf)

def test_drop_with_errors(self):
    """Compare ``Series.drop`` with the ``errors`` argument against pandas."""
    pser = pd.Series([10, 20, 30], index=["a", "b", "c"])
    psser = ps.from_pandas(pser)

    # errors='ignore' on index labels: one label missing, then all labels missing.
    for index_labels in (["a", "x"], ["x", "y"]):
        self.assert_eq(
            psser.drop(index_labels, errors="ignore"),
            pser.drop(index_labels, errors="ignore"),
        )

    # errors='ignore' with columns (no-op for Series)
    actual = psser.drop(columns=["a"], errors="ignore")
    expected = pser.drop(columns=["a"], errors="ignore")
    self.assert_eq(actual, expected)

    # Any value other than 'raise' / 'ignore' is rejected with ValueError.
    with self.assertRaises(ValueError):
        psser.drop("a", errors="invalid")

def test_pop(self):
midx = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
Expand Down