Skip to content

Commit d1fc0cb

Browse files
committed
Add retry logic for transient errors during cluster deletion
1 parent bd80ca0 commit d1fc0cb

File tree

2 files changed

+78
-4
lines changed

2 files changed

+78
-4
lines changed

pkg/util/azureerrors/error.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,3 +291,28 @@ func detectVMProfile(errStr string) VMProfileType {
291291
}
292292
return VMProfileUnknown
293293
}
294+
295+
// IsRetryableError returns true if the error is a transient/retryable error
296+
// such as 429 Too Many Requests or contains RetryableError code
297+
func IsRetryableError(err error) bool {
298+
if err == nil {
299+
return false
300+
}
301+
302+
var responseError *azcore.ResponseError
303+
if errors.As(err, &responseError) {
304+
if responseError.StatusCode == http.StatusTooManyRequests {
305+
return true
306+
}
307+
}
308+
309+
var detailedErr autorest.DetailedError
310+
if errors.As(err, &detailedErr) {
311+
if detailedErr.StatusCode == http.StatusTooManyRequests {
312+
return true
313+
}
314+
}
315+
316+
// Check for RetryableError in error message (nested Azure errors)
317+
return strings.Contains(err.Error(), "RetryableError")
318+
}

pkg/util/cluster/cluster.go

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,7 +1354,31 @@ func (c *Cluster) deleteWimiRoleAssignments(ctx context.Context, vnetResourceGro
13541354

13551355
func (c *Cluster) deleteCluster(ctx context.Context, resourceGroup, clusterName string) error {
13561356
c.log.Printf("deleting cluster %s", clusterName)
1357-
if err := c.openshiftclusters.DeleteAndWait(ctx, resourceGroup, clusterName); err != nil {
1357+
1358+
timeoutCtx, cancel := context.WithTimeout(ctx, 30*time.Minute)
1359+
defer cancel()
1360+
1361+
var lastErr error
1362+
err := wait.PollImmediateUntil(30*time.Second, func() (bool, error) {
1363+
err := c.openshiftclusters.DeleteAndWait(timeoutCtx, resourceGroup, clusterName)
1364+
if err == nil {
1365+
return true, nil
1366+
}
1367+
if azureerrors.IsRetryableError(err) {
1368+
c.log.Warnf("retryable error deleting cluster %s, will retry: %v", clusterName, err)
1369+
lastErr = err
1370+
return false, nil
1371+
}
1372+
return false, err
1373+
}, timeoutCtx.Done())
1374+
1375+
if err != nil {
1376+
// If err is wait.ErrWaitTimeout, the timeout expired while retrying
1377+
// In that case, return lastErr if we have one (more informative)
1378+
if err == wait.ErrWaitTimeout && lastErr != nil {
1379+
return fmt.Errorf("error deleting cluster %s: %w", clusterName, lastErr)
1380+
}
1381+
// Otherwise return the actual error (e.g., non-retryable error from poll function)
13581382
return fmt.Errorf("error deleting cluster %s: %w", clusterName, err)
13591383
}
13601384
return nil
@@ -1365,14 +1389,39 @@ func (c *Cluster) ensureResourceGroupDeleted(ctx context.Context, resourceGroupN
13651389
timeoutCtx, cancel := context.WithTimeout(ctx, 10*time.Minute)
13661390
defer cancel()
13671391

1368-
return wait.PollImmediateUntil(5*time.Second, func() (bool, error) {
1369-
_, err := c.groups.Get(ctx, resourceGroupName)
1392+
var lastErr error
1393+
err := wait.PollImmediateUntil(5*time.Second, func() (bool, error) {
1394+
_, err := c.groups.Get(timeoutCtx, resourceGroupName)
13701395
if azureerrors.ResourceGroupNotFound(err) {
13711396
c.log.Infof("finished deleting resource group %s", resourceGroupName)
13721397
return true, nil
13731398
}
1374-
return false, fmt.Errorf("failed to delete resource group %s with %s", resourceGroupName, err)
1399+
if err != nil {
1400+
// Non-retryable errors should fail immediately
1401+
if !azureerrors.IsRetryableError(err) {
1402+
return false, fmt.Errorf("non-retryable error checking resource group %s: %w", resourceGroupName, err)
1403+
}
1404+
lastErr = err
1405+
c.log.Warnf("retryable error checking resource group %s, will retry: %v", resourceGroupName, err)
1406+
} else {
1407+
c.log.Infof("resource group %s still exists, waiting for deletion", resourceGroupName)
1408+
}
1409+
return false, nil
13751410
}, timeoutCtx.Done())
1411+
1412+
if err != nil {
1413+
// If err is wait.ErrWaitTimeout, the timeout expired while retrying
1414+
// In that case, return lastErr if we have one (more informative)
1415+
if err == wait.ErrWaitTimeout && lastErr != nil {
1416+
return fmt.Errorf("timed out waiting for resource group %s to be deleted, last error: %w", resourceGroupName, lastErr)
1417+
}
1418+
// Otherwise return the actual error (e.g., non-retryable error or timeout with no lastErr)
1419+
if err == wait.ErrWaitTimeout {
1420+
return fmt.Errorf("timed out waiting for resource group %s to be deleted", resourceGroupName)
1421+
}
1422+
return err
1423+
}
1424+
return nil
13761425
}
13771426

13781427
func (c *Cluster) deleteResourceGroup(ctx context.Context, resourceGroup string) error {

0 commit comments

Comments
 (0)