Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/aks-agent/HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ To release a new version, please select a new version number (usually plus 1 to
Pending
+++++++

1.0.0b16
++++++++
* Fix: client mode now uses AzureCLICredential to authenticate with Azure
* Fix: correct wrong prompt message for init and cleanup
* Fix: prompted commands now show the full set of flags, including --resource-group, --name and optional --namespace, for the az aks agent commands
* Enhancement: cluster mode cleanup will wait for pods to be removed after deletion

1.0.0b15
++++++++
* Feature: Add local mode support - run AKS agent in Docker container on local machine as an alternative to cluster deployment
Expand Down
2 changes: 1 addition & 1 deletion src/aks-agent/azext_aks_agent/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
This command allows you to ask questions about your Azure Kubernetes cluster and get answers using AI models.

Prerequisites:
- Run 'az aks agent-init' first to configure the LLM provider and deployment mode
- Run 'az aks agent-init -n {name} -g {resource_group_name}' first to configure the LLM provider and deployment mode
- For client mode: Docker must be installed and running
parameters:
- name: --name -n
Expand Down
118 changes: 115 additions & 3 deletions src/aks-agent/azext_aks_agent/agent/k8s/aks_agent_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,23 @@ def exec_aks_agent(self, command_flags: str = "") -> bool:
AzCLIError: If execution fails
"""

@abstractmethod
def command_flags(self) -> str:
    """
    Return the CLI flags used for general aks-agent commands.

    Returns:
        str: Flag string suited to the concrete implementation.
    """

@abstractmethod
def init_command_flags(self) -> str:
    """
    Return the CLI flags used for the init command (namespace is not included).

    Returns:
        str: Flags in the form '-n {cluster_name} -g {resource_group_name}'
    """


class AKSAgentManager(AKSAgentManagerLLMConfigBase): # pylint: disable=too-many-instance-attributes
"""
Expand Down Expand Up @@ -413,6 +430,75 @@ def _run_helm_command(self, args: List[str], check: bool = True) -> tuple[bool,
"""
return self.helm_manager.run_command(args, check=check)

def command_flags(self) -> str:
    """
    Build the flag string for CLI commands, including the namespace.

    Returns:
        str: Flags in the form
            '-n {cluster_name} -g {resource_group_name} --namespace {namespace}'
    """
    template = "-n {} -g {} --namespace {}"
    return template.format(self.cluster_name, self.resource_group_name, self.namespace)

def init_command_flags(self) -> str:
    """
    Build the flag string for the init command; the namespace is left out.

    Returns:
        str: Flags in the form '-n {cluster_name} -g {resource_group_name}'
    """
    template = "-n {} -g {}"
    return template.format(self.cluster_name, self.resource_group_name)

def _wait_for_pods_removed(self, timeout: int = 60, interval: int = 2) -> bool:
    """
    Wait for all AKS agent pods to be removed from the namespace.

    Polls the namespace for pods matching either the agent or the AKS MCP
    label selector until none remain or the timeout elapses. Errors raised
    while polling are logged and the check is retried after `interval`.

    Args:
        timeout: Maximum time to wait in seconds (default: 60)
        interval: Time to wait between checks in seconds (default: 2)

    Returns:
        bool: True if all pods are removed within timeout (or the API reports
            404 for the namespace), False otherwise
    """
    import time

    logger.info("Waiting for pods to be removed from namespace '%s'", self.namespace)
    # Use the monotonic clock so that system clock adjustments cannot
    # shorten or stretch the wait.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        try:
            # Count pods matching either label selector
            total_pods = sum(
                len(
                    self.core_v1.list_namespaced_pod(
                        namespace=self.namespace,
                        label_selector=selector,
                    ).items
                )
                for selector in (AGENT_LABEL_SELECTOR, AKS_MCP_LABEL_SELECTOR)
            )

            if total_pods == 0:
                logger.info("All pods removed successfully")
                return True

            logger.debug("Still %d pod(s) remaining, waiting...", total_pods)

        except ApiException as e:
            if e.status == 404:
                # Namespace might have been deleted, consider this as success
                logger.info("Namespace not found, pods are considered removed")
                return True
            logger.warning("Error checking pod status: %s", e)
        except Exception as e:  # pylint: disable=broad-exception-caught
            # Best-effort wait: keep polling rather than failing the cleanup
            logger.warning("Unexpected error checking pod status: %s", e)

        # Every non-returning path waits before the next check
        time.sleep(interval)

    logger.warning("Timeout waiting for pods to be removed")
    return False

def deploy_agent(self, chart_version: Optional[str] = None) -> Tuple[bool, str]:
"""
Deploy AKS agent using helm chart.
Expand Down Expand Up @@ -678,6 +764,13 @@ def uninstall_agent(self, delete_secret: bool = True) -> bool:
# Delete the LLM configuration secret if requested
if delete_secret:
self.delete_llm_config_secret()

# Wait for pods to be removed
logger.info("Waiting for pods to be removed...")
pods_removed = self._wait_for_pods_removed(timeout=60)
if not pods_removed:
logger.warning("Timeout waiting for all pods to be removed. Some pods may still be terminating.")

return True
raise AzCLIError(f"Failed to uninstall AKS agent: {output}")

Expand Down Expand Up @@ -945,6 +1038,24 @@ def _ensure_custom_toolset(self) -> None:
else:
logger.debug("custom_toolset.yaml already exists at: %s", custom_toolset_file)

def command_flags(self) -> str:
    """
    Build the flag string for CLI commands.

    Returns:
        str: Flags in the form '-n {cluster_name} -g {resource_group_name}'
    """
    template = "-n {} -g {}"
    return template.format(self.cluster_name, self.resource_group_name)

def init_command_flags(self) -> str:
    """
    Build the flag string for the init command; the namespace is left out.

    Returns:
        str: Flags in the form '-n {cluster_name} -g {resource_group_name}'
    """
    template = "-n {} -g {}"
    return template.format(self.cluster_name, self.resource_group_name)

def save_llm_config(self, provider: LLMProvider, params: dict) -> None:
"""
Save LLM configuration using the LLMConfigManager.
Expand Down Expand Up @@ -1036,11 +1147,12 @@ def exec_aks_agent(self, command_flags: str = "") -> bool:
# Mount custom_toolset.yaml
volumes.extend(["-v", f"{custom_toolset_file}:/etc/aks-agent/config/custom_toolset.yaml:ro"])

# Build environment variables for AKS context
# Build environment variables for AKS context and use AzureCLICredential to authenticate
env_vars = [
"-e", f"AKS_RESOURCE_GROUP={self.resource_group_name}",
"-e", f"AKS_RESOURCE_GROUP_NAME={self.resource_group_name}",
"-e", f"AKS_CLUSTER_NAME={self.cluster_name}",
"-e", f"AKS_SUBSCRIPTION_ID={self.subscription_id}"
"-e", f"AKS_SUBSCRIPTION_ID={self.subscription_id}",
"-e", "AZURE_TOKEN_CREDENTIALS=AzureCLICredential"
]

# Prepare the command
Expand Down
30 changes: 21 additions & 9 deletions src/aks-agent/azext_aks_agent/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,12 +226,14 @@ def _setup_helm_deployment(console, aks_agent_manager: AKSAgentManager):
aks_agent_manager.managed_identity_client_id = managed_identity_client_id
else:
# Handle non-standard helm status (failed, pending-install, pending-upgrade, etc.)
cmd_flags = aks_agent_manager.command_flags()
init_cmd_flags = aks_agent_manager.init_command_flags()
console.print(
f"⚠️ Detected unexpected helm status: {helm_status}\n"
f"The AKS agent deployment is in an unexpected state.\n\n"
f"To investigate, run: az aks agent --status\n"
f"To investigate, run: az aks agent --status {cmd_flags}\n"
f"To recover:\n"
f" 1. Clean up and reinitialize: az aks agent cleanup && az aks agent init\n"
f" 1. Clean up and reinitialize: az aks agent-cleanup {cmd_flags} && az aks agent-init {init_cmd_flags}\n"
f" 2. Check deployment logs for more details",
style=HELP_COLOR)
raise AzCLIError(f"Cannot proceed with initialization due to unexpected helm status: {helm_status}")
Expand All @@ -245,8 +247,9 @@ def _setup_helm_deployment(console, aks_agent_manager: AKSAgentManager):
else:
console.print("❌ Failed to deploy agent", style=ERROR_COLOR)
console.print(f"Error: {error_msg}", style=ERROR_COLOR)
cmd_flags = aks_agent_manager.command_flags()
console.print(
"Run 'az aks agent --status' to investigate the deployment issue.",
f"Run 'az aks agent --status {cmd_flags}' to investigate the deployment issue.",
style=INFO_COLOR)
raise AzCLIError("Failed to deploy agent")

Expand All @@ -261,7 +264,9 @@ def _setup_helm_deployment(console, aks_agent_manager: AKSAgentManager):
"⚠️ AKS agent is deployed but not yet ready. It may take a few moments to start.",
style=WARNING_COLOR)
if helm_status not in ["deployed", "superseded"]:
console.print("You can check the status later using 'az aks agent --status'", style="cyan")
cmd_flags = aks_agent_manager.command_flags()
console.print(
f"You can check the status later using 'az aks agent --status {cmd_flags}'", style="cyan")


def _prompt_managed_identity_configuration(console):
Expand Down Expand Up @@ -325,7 +330,8 @@ def _setup_and_create_llm_config(console, aks_agent_manager: AKSAgentManagerLLMC
raise AzCLIError(f"Failed to save LLM configuration: {str(e)}")

elif error is not None and action == "retry_input":
raise AzCLIError(f"Please re-run `az aks agent-init` to correct the input parameters. {error}")
cmd_flags = aks_agent_manager.init_command_flags()
raise AzCLIError(f"Please re-run `az aks agent-init {cmd_flags}` to correct the input parameters. {error}")
else:
raise AzCLIError(f"Please check your deployed model and network connectivity. {error}")

Expand Down Expand Up @@ -386,7 +392,9 @@ def _aks_agent_local_status(agent_manager: AKSAgentManagerClient):
console.print("\n✅ Client mode is configured and ready!", style=SUCCESS_COLOR)
else:
console.print("\n❌ No LLM configuration found", style=ERROR_COLOR)
console.print("Run 'az aks agent-init' to set up LLM configuration.", style=INFO_COLOR)
cmd_flags = agent_manager.init_command_flags()
console.print(
f"Run 'az aks agent-init {cmd_flags}' to set up LLM configuration.", style=INFO_COLOR)


def _aks_agent_status(agent_manager: AKSAgentManager):
Expand All @@ -402,7 +410,9 @@ def _aks_agent_status(agent_manager: AKSAgentManager):
console.print(f"\n✅ Helm Release: {helm_status}", style=SUCCESS_COLOR)
elif helm_status == "not_found":
console.print("\n❌ Helm Release: Not found", style=ERROR_COLOR)
console.print("The AKS agent is not installed. Run with az aks agent-init to install.", style=INFO_COLOR)
cmd_flags = agent_manager.init_command_flags()
console.print(
f"The AKS agent is not installed. Run 'az aks agent-init {cmd_flags}' to install.", style=INFO_COLOR)
return
else:
console.print(f"\n⚠️ Helm Release: {helm_status}", style=WARNING_COLOR)
Expand Down Expand Up @@ -520,8 +530,9 @@ def aks_agent_cleanup(
if success:
console.print("✅ Cleanup completed successfully! All resources have been removed.", style=SUCCESS_COLOR)
else:
cmd_flags = agent_manager.command_flags()
console.print(
"❌ Cleanup failed. Please run 'az aks agent --status' to verify cleanup completion.", style=ERROR_COLOR)
f"❌ Cleanup failed. Please run 'az aks agent --status {cmd_flags}' to verify cleanup completion.", style=ERROR_COLOR)


# pylint: disable=unused-argument
Expand Down Expand Up @@ -597,8 +608,9 @@ def aks_agent(
success, result = agent_manager.get_agent_pods()
if not success:
# get_agent_pods already logged the error, provide helpful message
cmd_flags = agent_manager.init_command_flags()
error_msg = f"Failed to find AKS agent pods: {result}\n"
error_msg += "The AKS agent may not be deployed. Run 'az aks agent-init' to initialize the deployment."
error_msg += f"The AKS agent may not be deployed. Run 'az aks agent-init {cmd_flags}' to initialize the deployment."
raise CLIError(error_msg)

# prepare CLI flags
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
import unittest
from unittest.mock import MagicMock, Mock, PropertyMock, patch

from azext_aks_agent.agent.k8s.aks_agent_manager import AKSAgentManager
from azext_aks_agent.agent.k8s.aks_agent_manager import (
AKSAgentManager,
AKSAgentManagerClient,
)
from kubernetes.client.rest import ApiException


Expand Down Expand Up @@ -190,14 +193,16 @@ def test_deploy_agent_failure(self, mock_helm_manager, mock_load_config, mock_in
self.assertFalse(success)
self.assertIn("deployment failed", error_msg)

@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._wait_for_pods_removed')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._run_helm_command')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._init_k8s_client')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._load_existing_helm_release_config')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.HelmManager')
def test_uninstall_agent_success(self, mock_helm_manager, mock_load_config,
mock_init_client, mock_helm_cmd):
mock_init_client, mock_helm_cmd, mock_wait_pods):
"""Test successful agent uninstallation."""
mock_helm_cmd.return_value = (True, "uninstalled successfully")
mock_wait_pods.return_value = True

manager = AKSAgentManager(
resource_group_name=self.resource_group,
Expand All @@ -208,6 +213,7 @@ def test_uninstall_agent_success(self, mock_helm_manager, mock_load_config,
result = manager.uninstall_agent()

self.assertTrue(result)
mock_wait_pods.assert_called_once_with(timeout=60)

@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._init_k8s_client')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._load_existing_helm_release_config')
Expand Down Expand Up @@ -290,6 +296,79 @@ def create_pod(name):
self.assertEqual(len(status["deployments"]), 2)
self.assertEqual(len(status["pods"]), 2)

@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._init_k8s_client')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._load_existing_helm_release_config')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.HelmManager')
def test_command_flags(self, mock_helm_manager, mock_load_config, mock_init_client):
    """Check that command_flags() yields name, resource group and namespace flags."""
    flags = AKSAgentManager(
        resource_group_name=self.resource_group,
        cluster_name=self.cluster_name,
        subscription_id=self.subscription_id,
        namespace=self.namespace
    ).command_flags()

    self.assertEqual(
        flags,
        f"-n {self.cluster_name} -g {self.resource_group} --namespace {self.namespace}",
    )

@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._init_k8s_client')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.AKSAgentManager._load_existing_helm_release_config')
@patch('azext_aks_agent.agent.k8s.aks_agent_manager.HelmManager')
def test_init_command_flags(self, mock_helm_manager, mock_load_config, mock_init_client):
    """Check that init_command_flags() yields name and resource group flags only."""
    flags = AKSAgentManager(
        resource_group_name=self.resource_group,
        cluster_name=self.cluster_name,
        subscription_id=self.subscription_id,
        namespace=self.namespace
    ).init_command_flags()

    self.assertEqual(
        flags,
        f"-n {self.cluster_name} -g {self.resource_group}",
    )


class TestAKSAgentManagerClient(unittest.TestCase):
    """Unit tests for the flag helpers of AKSAgentManagerClient."""

    def setUp(self):
        """Prepare the identifiers shared by every test."""
        self.resource_group = "test-rg"
        self.cluster_name = "test-cluster"
        self.subscription_id = "test-sub-id"
        self.kubeconfig_path = "/mock/kubeconfig"

    def _build_manager(self):
        """Create a client manager from the fixture values."""
        return AKSAgentManagerClient(
            resource_group_name=self.resource_group,
            cluster_name=self.cluster_name,
            subscription_id=self.subscription_id,
            kubeconfig_path=self.kubeconfig_path
        )

    def test_command_flags(self):
        """command_flags() returns the name and resource group flags."""
        flags = self._build_manager().command_flags()

        self.assertEqual(flags, f"-n {self.cluster_name} -g {self.resource_group}")

    def test_init_command_flags(self):
        """init_command_flags() returns the name and resource group flags."""
        flags = self._build_manager().init_command_flags()

        self.assertEqual(flags, f"-n {self.cluster_name} -g {self.resource_group}")


if __name__ == '__main__':
unittest.main()
2 changes: 1 addition & 1 deletion src/aks-agent/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from setuptools import find_packages, setup

VERSION = "1.0.0b15"
VERSION = "1.0.0b16"

CLASSIFIERS = [
"Development Status :: 4 - Beta",
Expand Down
Loading