Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
from django.conf import settings
from django.core import mail
from django.core.cache import cache
from django.db import connection
from django.test import TestCase
from django.test.utils import CaptureQueriesContext
from django.urls import reverse
from enterprise.models import (
EnterpriseCourseEnrollment,
Expand Down Expand Up @@ -1084,9 +1086,80 @@ def cleanup_and_assert_status(self, data=None, expected_status=status.HTTP_204_N
assert response.status_code == expected_status
return response

def test_simple_success(self):
self.cleanup_and_assert_status()
assert not UserRetirementStatus.objects.all()
def _assert_redacted_update_delete_queries(self, queries, redacted_username, redacted_email, redacted_name):
"""
Helper method to verify UPDATE and DELETE queries contain correct field-value assignments.
Args:
queries: List of captured query dicts from CaptureQueriesContext
redacted_username: Expected redacted username value
redacted_email: Expected redacted email value
redacted_name: Expected redacted name value
"""
update_queries = [q for q in queries if 'UPDATE' in q['sql'] and 'user_api_userretirementstatus' in q['sql']]
delete_queries = [q for q in queries if 'DELETE' in q['sql'] and 'user_api_userretirementstatus' in q['sql']]
# Should have exactly 1 bulk UPDATE and 1 bulk DELETE query (not individual per-record queries)
assert len(update_queries) == 1, f"Expected 1 UPDATE query, found {len(update_queries)}"
assert len(delete_queries) == 1, f"Expected 1 DELETE query, found {len(delete_queries)}"

# Verify UPDATE query redacts records with the correct field-value assignments
update_query = update_queries[0]
sql_lower = update_query['sql']
# Ensure original_username, original_email, and original_name are set to redacted values
assert f'"original_username" = \'{redacted_username}\'' in sql_lower, (
f"UPDATE query missing '\"original_username\" = {redacted_username}': {sql_lower}"
)
assert f'"original_email" = \'{redacted_email}\'' in sql_lower, (
f"UPDATE query missing '\"original_email\" = {redacted_email}': {sql_lower}"
)
assert f'"original_name" = \'{redacted_name}\'' in sql_lower, (
f"UPDATE query missing '\"original_name\" = {redacted_name}': {sql_lower}"
)

# Verify DELETE is from the correct table
delete_query = delete_queries[0]
sql_lower = delete_query['sql']
assert 'user_api_userretirementstatus' in sql_lower, (
f"DELETE query against unexpected table: {sql_lower}"
)
# Verify DELETE uses ID filtering (not field values) to prevent over-deletion bug
assert '"id" IN' in sql_lower or 'WHERE "id"' in sql_lower, (
f"DELETE query should use ID filtering to prevent over-deletion, but got: {sql_lower}"
)

def test_default_redacted_values(self):
"""
Test basic cleanup with default redacted values.
Verify that redaction (UPDATE) happens before deletion (DELETE).
Captures actual SQL queries to ensure UPDATE queries contain correct field-value assignments.
"""
with CaptureQueriesContext(connection) as context:
self.cleanup_and_assert_status()
# Verify records are deleted after redaction
retirements = UserRetirementStatus.objects.all()
assert retirements.count() == 0
# Verify UPDATE and DELETE queries with default 'redacted' value
self._assert_redacted_update_delete_queries(context.captured_queries, 'redacted', 'redacted', 'redacted')

def test_custom_redacted_values(self):
"""Test that custom redacted values are applied before deletion."""
custom_username = 'username-redacted-12345'
custom_email = 'email-redacted-67890'
custom_name = 'name-redacted-abcde'
data = {
'usernames': self.usernames,
'redacted_username': custom_username,
'redacted_email': custom_email,
'redacted_name': custom_name
}
with CaptureQueriesContext(connection) as context:
self.cleanup_and_assert_status(data=data)
# Verify records are deleted after redaction
retirements = UserRetirementStatus.objects.all()
assert retirements.count() == 0
# Verify UPDATE and DELETE queries with custom redacted values
self._assert_redacted_update_delete_queries(
context.captured_queries, custom_username, custom_email, custom_name
)

def test_leaves_other_users(self):
remaining_usernames = []
Expand Down Expand Up @@ -1123,6 +1196,31 @@ def test_username_bad_state(self):

self.cleanup_and_assert_status(expected_status=status.HTTP_400_BAD_REQUEST)

def test_does_not_delete_unrelated_redacted_records(self):
"""
Verify cleanup doesn't delete unrelated records with coincidental redacted values.
Regression test for over-deletion bug where deletion was filtered by field values
(original_username='redacted') instead of by primary key.
"""
# Create an unrelated record that already has redacted field values
other_user = UserFactory()
other_retirement = create_retirement_status(other_user, state=self.complete_state)
other_retirement.original_username = 'redacted'
other_retirement.original_email = 'redacted'
other_retirement.original_name = 'redacted'
other_retirement.save()
other_id = other_retirement.id

# Clean up only self.usernames records
self.cleanup_and_assert_status()

# Verify target records were deleted
target_count = UserRetirementStatus.objects.filter(user__username__in=self.usernames).count()
assert target_count == 0, f"Expected 0 target records, found {target_count}"

# Verify unrelated record was NOT deleted (not a target of cleanup)
assert UserRetirementStatus.objects.filter(id=other_id).exists()


@ddt.ddt
@skip_unless_lms
Expand Down
27 changes: 24 additions & 3 deletions openedx/core/djangoapps/user_api/accounts/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,19 +1029,26 @@ def cleanup(self, request):

```
{
'usernames': ['user1', 'user2', ...]
'usernames': ['user1', 'user2', ...],
'redacted_username': 'Value to store in username field',
'redacted_email': 'Value to store in email field',
'redacted_name': 'Value to store in name field'
}
```

Deletes a batch of retirement requests by username.
Redacts and then deletes a batch of retirement requests by username.
"""
try:
usernames = request.data["usernames"]
redacted_username = request.data.get("redacted_username", "redacted")
redacted_email = request.data.get("redacted_email", "redacted")
redacted_name = request.data.get("redacted_name", "redacted")

if not isinstance(usernames, list):
raise TypeError("Usernames should be an array.")

complete_state = RetirementState.objects.get(state_name="COMPLETE")
# Get the retirement records to redact and delete
retirements = UserRetirementStatus.objects.filter(
original_username__in=usernames, current_state=complete_state
)
Expand All @@ -1050,7 +1057,21 @@ def cleanup(self, request):
if len(usernames) != len(retirements):
raise UserRetirementStatus.DoesNotExist("Not all usernames exist in the COMPLETE state.")

retirements.delete()
# Capture IDs before update to ensure we only delete the records we changed
retirement_ids = list(retirements.values_list('id', flat=True))

# Redact PII fields first, then delete. In case an ETL tool is syncing data
# to a downstream data warehouse, and treats the deletes as soft-deletes,
# the data will have first been redacted, protecting the sensitive PII.
with transaction.atomic():
UserRetirementStatus.objects.filter(id__in=retirement_ids).update(
original_username=redacted_username,
original_email=redacted_email,
original_name=redacted_name
)

# Delete only the records we updated
UserRetirementStatus.objects.filter(id__in=retirement_ids).delete()
return Response(status=status.HTTP_204_NO_CONTENT)
except (RetirementStateError, UserRetirementStatus.DoesNotExist, TypeError) as exc:
return Response(str(exc), status=status.HTTP_400_BAD_REQUEST)
Expand Down
36 changes: 30 additions & 6 deletions scripts/user_retirement/retirement_archive_and_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,19 @@ def _archive_retirements_or_exit(config, learners, dry_run=False):
FAIL_EXCEPTION(ERR_ARCHIVING, 'Unexpected error occurred archiving retirements!', exc)


def _cleanup_retirements_or_exit(config, learners):
def _cleanup_retirements_or_exit(config, learners, redacted_username='redacted',
redacted_email='redacted', redacted_name='redacted'):
"""
Bulk deletes the retirements for this run
Bulk deletes the retirements for this run after redacting PII fields
"""
LOG('Cleaning up retirements for {} learners'.format(len(learners)))
try:
usernames = [l['original_username'] for l in learners]
config['LMS'].bulk_cleanup_retirements(usernames)
config['LMS'].bulk_cleanup_retirements(
usernames, redacted_username, redacted_email, redacted_name
)
except Exception as exc: # pylint: disable=broad-except
FAIL_EXCEPTION(ERR_DELETING, 'Unexpected error occurred deleting retirements!', exc)
FAIL_EXCEPTION(ERR_DELETING, 'Unexpected error occurred redacting/deleting retirements!', exc)


def _get_utc_now():
Expand Down Expand Up @@ -259,7 +262,26 @@ def _get_utc_now():
help='Number of user retirements to process',
type=int
)
def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_date, batch_size):
@click.option(
'--redacted_username',
help='Value to use for redacted username field',
type=str,
default='redacted'
)
@click.option(
'--redacted_email',
help='Value to use for redacted email field',
type=str,
default='redacted'
)
@click.option(
'--redacted_name',
help='Value to use for redacted name field',
type=str,
default='redacted'
)
def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_date, batch_size,
redacted_username, redacted_email, redacted_name):
"""
Cleans up UserRetirementStatus rows in LMS by:
1- Getting all rows currently in COMPLETE that were created --cool_off_days ago or more,
Expand Down Expand Up @@ -314,7 +336,9 @@ def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_dat
if dry_run:
LOG('This is a dry-run. Exiting before any retirements are cleaned up')
else:
_cleanup_retirements_or_exit(config, batch)
_cleanup_retirements_or_exit(
config, batch, redacted_username, redacted_email, redacted_name
)
LOG('Archive and cleanup complete for batch #{}'.format(str(index + 1)))
time.sleep(DELAY)
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
FAKE_BUCKET_NAME = "fake_test_bucket"


def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=None, end_date=None):
def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=None, end_date=None,
redacted_username=None, redacted_email=None, redacted_name=None):
"""
Call the archive script with the given params and a generic config file.
Returns the CliRunner.invoke results
Expand All @@ -50,6 +51,12 @@ def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=Non
base_args += ['--start_date', start_date]
if end_date:
base_args += ['--end_date', end_date]
if redacted_username:
base_args += ['--redacted_username', redacted_username]
if redacted_email:
base_args += ['--redacted_email', redacted_email]
if redacted_name:
base_args += ['--redacted_name', redacted_name]

result = runner.invoke(archive_and_cleanup, args=base_args)
print(result)
Expand Down Expand Up @@ -106,7 +113,7 @@ def test_successful(*args, **kwargs):
assert mock_get_access_token.call_count == 1
mock_get_learners.assert_called_once()
mock_bulk_cleanup_retirements.assert_called_once_with(
['test1', 'test2', 'test3'])
['test1', 'test2', 'test3'], 'redacted', 'redacted', 'redacted')

assert result.exit_code == 0
assert 'Archive and cleanup complete' in result.output
Expand Down Expand Up @@ -134,14 +141,48 @@ def test_successful_with_batching(*args, **kwargs):
# Called once to get the LMS token
assert mock_get_access_token.call_count == 1
mock_get_learners.assert_called_once()
get_learner_calls = [call(['test1', 'test2']), call(['test3'])]
get_learner_calls = [call(['test1', 'test2'], 'redacted', 'redacted', 'redacted'),
call(['test3'], 'redacted', 'redacted', 'redacted')]
mock_bulk_cleanup_retirements.assert_has_calls(get_learner_calls)

assert result.exit_code == 0
assert 'Archive and cleanup complete for batch #1' in result.output
assert 'Archive and cleanup complete for batch #2' in result.output


@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None))
@patch.multiple(
'scripts.user_retirement.utils.edx_api.LmsApi',
get_learners_by_date_and_status=DEFAULT,
bulk_cleanup_retirements=DEFAULT
)
@mock_aws
def test_successful_with_custom_redaction_values(*args, **kwargs):
conn = boto3.resource('s3')
conn.create_bucket(Bucket=FAKE_BUCKET_NAME)

mock_get_access_token = args[0]
mock_get_learners = kwargs['get_learners_by_date_and_status']
mock_bulk_cleanup_retirements = kwargs['bulk_cleanup_retirements']

mock_get_learners.return_value = fake_learners_to_retire()

result = _call_script(
redacted_username='custom_user',
redacted_email='custom@example.com',
redacted_name='Custom Name'
)

# Called once to get the LMS token
assert mock_get_access_token.call_count == 1
mock_get_learners.assert_called_once()
mock_bulk_cleanup_retirements.assert_called_once_with(
['test1', 'test2', 'test3'], 'custom_user', 'custom@example.com', 'Custom Name')

assert result.exit_code == 0
assert 'Archive and cleanup complete' in result.output


@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None))
@patch.multiple(
'scripts.user_retirement.utils.edx_api.LmsApi',
Expand Down Expand Up @@ -216,7 +257,7 @@ def test_bad_fetch(*_):
def test_bad_lms_deletion(*_):
result = _call_script()
assert result.exit_code == ERR_DELETING
assert 'Unexpected error occurred deleting retirements!' in result.output
assert 'Unexpected error occurred redacting/deleting retirements!' in result.output


@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None))
Expand Down
18 changes: 13 additions & 5 deletions scripts/user_retirement/utils/edx_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,11 +352,19 @@ def retirement_retire_proctoring_backend_data(self, learner):
return self._request("POST", api_url)

@_retry_lms_api()
def bulk_cleanup_retirements(self, usernames):
"""
Deletes the retirements for all given usernames
"""
data = {"usernames": usernames}
def bulk_cleanup_retirements(self, usernames, redacted_username=None,
redacted_email=None, redacted_name=None):
"""
Redacts and then deletes the retirements for all given usernames.
Optionally pass caller-defined redacted values for each PII field before deletion.
"""
data = {'usernames': usernames}
if redacted_username is not None:
data['redacted_username'] = redacted_username
if redacted_email is not None:
data['redacted_email'] = redacted_email
if redacted_name is not None:
data['redacted_name'] = redacted_name
api_url = self.get_api_url("api/user/v1/accounts/retirement_cleanup")
return self._request("POST", api_url, json=data)

Expand Down
Loading