diff --git a/openedx/core/djangoapps/user_api/accounts/tests/test_retirement_views.py b/openedx/core/djangoapps/user_api/accounts/tests/test_retirement_views.py index 5bef4324d122..cc441d7adb6f 100644 --- a/openedx/core/djangoapps/user_api/accounts/tests/test_retirement_views.py +++ b/openedx/core/djangoapps/user_api/accounts/tests/test_retirement_views.py @@ -14,7 +14,9 @@ from django.conf import settings from django.core import mail from django.core.cache import cache +from django.db import connection from django.test import TestCase +from django.test.utils import CaptureQueriesContext from django.urls import reverse from enterprise.models import ( EnterpriseCourseEnrollment, @@ -1084,9 +1086,80 @@ def cleanup_and_assert_status(self, data=None, expected_status=status.HTTP_204_N assert response.status_code == expected_status return response - def test_simple_success(self): - self.cleanup_and_assert_status() - assert not UserRetirementStatus.objects.all() + def _assert_redacted_update_delete_queries(self, queries, redacted_username, redacted_email, redacted_name): + """ + Helper method to verify UPDATE and DELETE queries contain correct field-value assignments. + Args: + queries: List of captured query dicts from CaptureQueriesContext + redacted_username: Expected redacted username value + redacted_email: Expected redacted email value + redacted_name: Expected redacted name value + """ + update_queries = [q for q in queries if 'UPDATE' in q['sql'] and 'user_api_userretirementstatus' in q['sql']] + delete_queries = [q for q in queries if 'DELETE' in q['sql'] and 'user_api_userretirementstatus' in q['sql']] + # Should have exactly 1 bulk UPDATE and 1 bulk DELETE query (not individual per-record queries) + assert len(update_queries) == 1, f"Expected 1 UPDATE query, found {len(update_queries)}" + assert len(delete_queries) == 1, f"Expected 1 DELETE query, found {len(delete_queries)}" + + # Verify UPDATE query redacts records with the correct field-value assignments + update_query = update_queries[0] + sql_lower = update_query['sql'] + # Ensure original_username, original_email, and original_name are set to redacted values + assert f'"original_username" = \'{redacted_username}\'' in sql_lower, ( + f"UPDATE query missing '\"original_username\" = {redacted_username}': {sql_lower}" + ) + assert f'"original_email" = \'{redacted_email}\'' in sql_lower, ( + f"UPDATE query missing '\"original_email\" = {redacted_email}': {sql_lower}" + ) + assert f'"original_name" = \'{redacted_name}\'' in sql_lower, ( + f"UPDATE query missing '\"original_name\" = {redacted_name}': {sql_lower}" + ) + + # Verify DELETE is from the correct table + delete_query = delete_queries[0] + sql_lower = delete_query['sql'] + assert 'user_api_userretirementstatus' in sql_lower, ( + f"DELETE query against unexpected table: {sql_lower}" + ) + # Verify DELETE uses ID filtering (not field values) to prevent over-deletion bug + assert '"id" IN' in sql_lower or 'WHERE "id"' in sql_lower, ( + f"DELETE query should use ID filtering to prevent over-deletion, but got: {sql_lower}" + ) + + def test_default_redacted_values(self): + """ + Test basic cleanup with default redacted values. + Verify that redaction (UPDATE) happens before deletion (DELETE). + Captures actual SQL queries to ensure UPDATE queries contain correct field-value assignments. + """ + with CaptureQueriesContext(connection) as context: + self.cleanup_and_assert_status() + # Verify records are deleted after redaction + retirements = UserRetirementStatus.objects.all() + assert retirements.count() == 0 + # Verify UPDATE and DELETE queries with default 'redacted' value + self._assert_redacted_update_delete_queries(context.captured_queries, 'redacted', 'redacted', 'redacted') + + def test_custom_redacted_values(self): + """Test that custom redacted values are applied before deletion.""" + custom_username = 'username-redacted-12345' + custom_email = 'email-redacted-67890' + custom_name = 'name-redacted-abcde' + data = { + 'usernames': self.usernames, + 'redacted_username': custom_username, + 'redacted_email': custom_email, + 'redacted_name': custom_name + } + with CaptureQueriesContext(connection) as context: + self.cleanup_and_assert_status(data=data) + # Verify records are deleted after redaction + retirements = UserRetirementStatus.objects.all() + assert retirements.count() == 0 + # Verify UPDATE and DELETE queries with custom redacted values + self._assert_redacted_update_delete_queries( + context.captured_queries, custom_username, custom_email, custom_name + ) def test_leaves_other_users(self): remaining_usernames = [] @@ -1123,6 +1196,31 @@ def test_username_bad_state(self): self.cleanup_and_assert_status(expected_status=status.HTTP_400_BAD_REQUEST) + def test_does_not_delete_unrelated_redacted_records(self): + """ + Verify cleanup doesn't delete unrelated records with coincidental redacted values. + Regression test for over-deletion bug where deletion was filtered by field values + (original_username='redacted') instead of by primary key. + """ + # Create an unrelated record that already has redacted field values + other_user = UserFactory() + other_retirement = create_retirement_status(other_user, state=self.complete_state) + other_retirement.original_username = 'redacted' + other_retirement.original_email = 'redacted' + other_retirement.original_name = 'redacted' + other_retirement.save() + other_id = other_retirement.id + + # Clean up only self.usernames records + self.cleanup_and_assert_status() + + # Verify target records were deleted + target_count = UserRetirementStatus.objects.filter(user__username__in=self.usernames).count() + assert target_count == 0, f"Expected 0 target records, found {target_count}" + + # Verify unrelated record was NOT deleted (not a target of cleanup) + assert UserRetirementStatus.objects.filter(id=other_id).exists() + @ddt.ddt @skip_unless_lms diff --git a/openedx/core/djangoapps/user_api/accounts/views.py b/openedx/core/djangoapps/user_api/accounts/views.py index 5180c1adb0ec..3ca4853dde3f 100644 --- a/openedx/core/djangoapps/user_api/accounts/views.py +++ b/openedx/core/djangoapps/user_api/accounts/views.py @@ -1029,19 +1029,26 @@ def cleanup(self, request): ``` { - 'usernames': ['user1', 'user2', ...] + 'usernames': ['user1', 'user2', ...], + 'redacted_username': 'Value to store in username field', + 'redacted_email': 'Value to store in email field', + 'redacted_name': 'Value to store in name field' } ``` - Deletes a batch of retirement requests by username. + Redacts and then deletes a batch of retirement requests by username. """ try: usernames = request.data["usernames"] + redacted_username = request.data.get("redacted_username", "redacted") + redacted_email = request.data.get("redacted_email", "redacted") + redacted_name = request.data.get("redacted_name", "redacted") if not isinstance(usernames, list): raise TypeError("Usernames should be an array.") complete_state = RetirementState.objects.get(state_name="COMPLETE") + # Get the retirement records to redact and delete retirements = UserRetirementStatus.objects.filter( original_username__in=usernames, current_state=complete_state ) @@ -1050,7 +1057,21 @@ def cleanup(self, request): if len(usernames) != len(retirements): raise UserRetirementStatus.DoesNotExist("Not all usernames exist in the COMPLETE state.") - retirements.delete() + # Capture IDs before update to ensure we only delete the records we changed + retirement_ids = list(retirements.values_list('id', flat=True)) + + # Redact PII fields first, then delete. In case an ETL tool is syncing data + # to a downstream data warehouse, and treats the deletes as soft-deletes, + # the data will have first been redacted, protecting the sensitive PII. + with transaction.atomic(): + UserRetirementStatus.objects.filter(id__in=retirement_ids).update( + original_username=redacted_username, + original_email=redacted_email, + original_name=redacted_name + ) + + # Delete only the records we updated + UserRetirementStatus.objects.filter(id__in=retirement_ids).delete() return Response(status=status.HTTP_204_NO_CONTENT) except (RetirementStateError, UserRetirementStatus.DoesNotExist, TypeError) as exc: return Response(str(exc), status=status.HTTP_400_BAD_REQUEST) diff --git a/scripts/user_retirement/retirement_archive_and_cleanup.py b/scripts/user_retirement/retirement_archive_and_cleanup.py index d44a5e9fa8c9..5909962c3b5f 100644 --- a/scripts/user_retirement/retirement_archive_and_cleanup.py +++ b/scripts/user_retirement/retirement_archive_and_cleanup.py @@ -196,16 +196,19 @@ def _archive_retirements_or_exit(config, learners, dry_run=False): FAIL_EXCEPTION(ERR_ARCHIVING, 'Unexpected error occurred archiving retirements!', exc) -def _cleanup_retirements_or_exit(config, learners): +def _cleanup_retirements_or_exit(config, learners, redacted_username='redacted', + redacted_email='redacted', redacted_name='redacted'): """ - Bulk deletes the retirements for this run + Bulk deletes the retirements for this run after redacting PII fields """ LOG('Cleaning up retirements for {} learners'.format(len(learners))) try: usernames = [l['original_username'] for l in learners] - config['LMS'].bulk_cleanup_retirements(usernames) + config['LMS'].bulk_cleanup_retirements( + usernames, redacted_username, redacted_email, redacted_name + ) except Exception as exc: # pylint: disable=broad-except - FAIL_EXCEPTION(ERR_DELETING, 'Unexpected error occurred deleting retirements!', exc) + FAIL_EXCEPTION(ERR_DELETING, 'Unexpected error occurred redacting/deleting retirements!', exc) def _get_utc_now(): @@ -259,7 +262,26 @@ def _get_utc_now(): help='Number of user retirements to process', type=int ) -def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_date, batch_size): +@click.option( + '--redacted_username', + help='Value to use for redacted username field', + type=str, + default='redacted' +) +@click.option( + '--redacted_email', + help='Value to use for redacted email field', + type=str, + default='redacted' +) +@click.option( + '--redacted_name', + help='Value to use for redacted name field', + type=str, + default='redacted' +) +def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_date, batch_size, + redacted_username, redacted_email, redacted_name): """ Cleans up UserRetirementStatus rows in LMS by: 1- Getting all rows currently in COMPLETE that were created --cool_off_days ago or more, @@ -314,7 +336,9 @@ def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_dat if dry_run: LOG('This is a dry-run. Exiting before any retirements are cleaned up') else: - _cleanup_retirements_or_exit(config, batch) + _cleanup_retirements_or_exit( + config, batch, redacted_username, redacted_email, redacted_name + ) LOG('Archive and cleanup complete for batch #{}'.format(str(index + 1))) time.sleep(DELAY) else: diff --git a/scripts/user_retirement/tests/test_retirement_archive_and_cleanup.py b/scripts/user_retirement/tests/test_retirement_archive_and_cleanup.py index 12c48dda9ff1..edcc3f833531 100644 --- a/scripts/user_retirement/tests/test_retirement_archive_and_cleanup.py +++ b/scripts/user_retirement/tests/test_retirement_archive_and_cleanup.py @@ -28,7 +28,8 @@ FAKE_BUCKET_NAME = "fake_test_bucket" -def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=None, end_date=None): +def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=None, end_date=None, + redacted_username=None, redacted_email=None, redacted_name=None): """ Call the archive script with the given params and a generic config file. Returns the CliRunner.invoke results @@ -50,6 +51,12 @@ def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=Non base_args += ['--start_date', start_date] if end_date: base_args += ['--end_date', end_date] + if redacted_username: + base_args += ['--redacted_username', redacted_username] + if redacted_email: + base_args += ['--redacted_email', redacted_email] + if redacted_name: + base_args += ['--redacted_name', redacted_name] result = runner.invoke(archive_and_cleanup, args=base_args) print(result) @@ -106,7 +113,7 @@ def test_successful(*args, **kwargs): assert mock_get_access_token.call_count == 1 mock_get_learners.assert_called_once() mock_bulk_cleanup_retirements.assert_called_once_with( - ['test1', 'test2', 'test3']) + ['test1', 'test2', 'test3'], 'redacted', 'redacted', 'redacted') assert result.exit_code == 0 assert 'Archive and cleanup complete' in result.output @@ -134,7 +141,8 @@ def test_successful_with_batching(*args, **kwargs): # Called once to get the LMS token assert mock_get_access_token.call_count == 1 mock_get_learners.assert_called_once() - get_learner_calls = [call(['test1', 'test2']), call(['test3'])] + get_learner_calls = [call(['test1', 'test2'], 'redacted', 'redacted', 'redacted'), + call(['test3'], 'redacted', 'redacted', 'redacted')] mock_bulk_cleanup_retirements.assert_has_calls(get_learner_calls) assert result.exit_code == 0 @@ -142,6 +150,39 @@ def test_successful_with_batching(*args, **kwargs): assert 'Archive and cleanup complete for batch #2' in result.output +@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None)) +@patch.multiple( + 'scripts.user_retirement.utils.edx_api.LmsApi', + get_learners_by_date_and_status=DEFAULT, + bulk_cleanup_retirements=DEFAULT +) +@mock_aws +def test_successful_with_custom_redaction_values(*args, **kwargs): + conn = boto3.resource('s3') + conn.create_bucket(Bucket=FAKE_BUCKET_NAME) + + mock_get_access_token = args[0] + mock_get_learners = kwargs['get_learners_by_date_and_status'] + mock_bulk_cleanup_retirements = kwargs['bulk_cleanup_retirements'] + + mock_get_learners.return_value = fake_learners_to_retire() + + result = _call_script( + redacted_username='custom_user', + redacted_email='custom@example.com', + redacted_name='Custom Name' + ) + + # Called once to get the LMS token + assert mock_get_access_token.call_count == 1 + mock_get_learners.assert_called_once() + mock_bulk_cleanup_retirements.assert_called_once_with( + ['test1', 'test2', 'test3'], 'custom_user', 'custom@example.com', 'Custom Name') + + assert result.exit_code == 0 + assert 'Archive and cleanup complete' in result.output + + @patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None)) @patch.multiple( 'scripts.user_retirement.utils.edx_api.LmsApi', @@ -216,7 +257,7 @@ def test_bad_fetch(*_): def test_bad_lms_deletion(*_): result = _call_script() assert result.exit_code == ERR_DELETING - assert 'Unexpected error occurred deleting retirements!' in result.output + assert 'Unexpected error occurred redacting/deleting retirements!' in result.output @patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None)) diff --git a/scripts/user_retirement/utils/edx_api.py b/scripts/user_retirement/utils/edx_api.py index e891f04019a9..fdd20c92f832 100644 --- a/scripts/user_retirement/utils/edx_api.py +++ b/scripts/user_retirement/utils/edx_api.py @@ -352,11 +352,19 @@ def retirement_retire_proctoring_backend_data(self, learner): return self._request("POST", api_url) @_retry_lms_api() - def bulk_cleanup_retirements(self, usernames): - """ - Deletes the retirements for all given usernames - """ - data = {"usernames": usernames} + def bulk_cleanup_retirements(self, usernames, redacted_username=None, + redacted_email=None, redacted_name=None): + """ + Redacts and then deletes the retirements for all given usernames. + Optionally pass caller-defined redacted values for each PII field before deletion. + """ + data = {'usernames': usernames} + if redacted_username is not None: + data['redacted_username'] = redacted_username + if redacted_email is not None: + data['redacted_email'] = redacted_email + if redacted_name is not None: + data['redacted_name'] = redacted_name api_url = self.get_api_url("api/user/v1/accounts/retirement_cleanup") return self._request("POST", api_url, json=data)