diff --git a/backend/main.go b/backend/main.go index edb6ec13bf..e517552253 100644 --- a/backend/main.go +++ b/backend/main.go @@ -27,6 +27,7 @@ import ( "syscall" "time" + "github.com/Azure/ARO-HCP/backend/pkg/controllers/clusterprovisioningcontrollers" "github.com/go-logr/logr" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -366,6 +367,9 @@ func Run(cmd *cobra.Command, args []string) error { cosmosMatchingClusterController = controllerutils.NewClusterWatchingController( "CosmosMatchingClusters", dbClient, subscriptionLister, 60*time.Minute, mismatchcontrollers.NewCosmosClusterMatchingController(utilsclock.RealClock{}, dbClient, clusterServiceClient)) + dnsReservationController = controllerutils.NewClusterWatchingController( + "DNSReservation", dbClient, subscriptionLister, 1*time.Minute, + clusterprovisioningcontrollers.NewDNSReservationController(dbClient)) ) le, err := leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{ @@ -388,6 +392,7 @@ func Run(cmd *cobra.Command, args []string) error { go cosmosMatchingNodePoolController.Run(ctx, 20) go cosmosMatchingExternalAuthController.Run(ctx, 20) go cosmosMatchingClusterController.Run(ctx, 20) + go dnsReservationController.Run(ctx, 20) }, OnStoppedLeading: func() { operationsScanner.LeaderGauge.Set(0) diff --git a/backend/pkg/controllers/clusterprovisioningcontrollers/dns_reservation.go b/backend/pkg/controllers/clusterprovisioningcontrollers/dns_reservation.go new file mode 100644 index 0000000000..0c25530e63 --- /dev/null +++ b/backend/pkg/controllers/clusterprovisioningcontrollers/dns_reservation.go @@ -0,0 +1,107 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package clusterprovisioningcontrollers
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/Azure/ARO-HCP/internal/api"
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/lru"
+
+	"github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils"
+	"github.com/Azure/ARO-HCP/internal/database"
+	"github.com/Azure/ARO-HCP/internal/serverutils"
+	"github.com/Azure/ARO-HCP/internal/utils"
+)
+
+type dnsReservationController struct {
+	cosmosClient database.DBClient // used to read clusters and to create/update the reservation documents
+}
+
+// NewDNSReservationController returns a ClusterSyncer that reserves a kube-apiserver DNS name for every cluster that does not have one recorded yet.
+func NewDNSReservationController(cosmosClient database.DBClient) controllerutils.ClusterSyncer {
+	return &dnsReservationController{cosmosClient: cosmosClient}
+}
+
+// SyncOnce makes sure the cluster identified by key has a kube-apiserver DNS reservation recorded on its "default"
+// ServiceProviderCluster document, creating that document and a DNSReservation when either is missing. It returns nil
+// when the HCP cluster is gone or a reservation is already recorded; every other failure is wrapped and returned.
+func (c *dnsReservationController) SyncOnce(ctx context.Context, key controllerutils.HCPClusterKey) error {
+	logger := utils.LoggerFromContext(ctx)
+
+	// Only the cluster's existence matters here, so the returned document is discarded.
+	_, err := c.cosmosClient.HCPClusters(key.SubscriptionID, key.ResourceGroupName).Get(ctx, key.HCPClusterName)
+	if database.IsResponseError(err, http.StatusNotFound) {
+		return nil // no work to do
+	}
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to get HCP cluster: %w", err))
+	}
+
+	serviceProviderCluster, err := c.cosmosClient.ServiceProviderClusters(key.SubscriptionID, key.ResourceGroupName, key.HCPClusterName).Get(ctx, "default")
+	if database.IsResponseError(err, http.StatusNotFound) {
+		// create it
+		serviceProviderCluster, err = c.cosmosClient.ServiceProviderClusters(key.SubscriptionID, key.ResourceGroupName, key.HCPClusterName).Create(
+			ctx,
+			&api.ServiceProviderCluster{
+				CosmosMetadata:              api.CosmosMetadata{},
+				ResourceID:                  azcorearm.ResourceID{},
+				LoadBalancerResourceID:      nil,
+				KubeAPIServerDNSReservation: nil,
+			},
+			nil)
+	}
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to get or create service provider cluster: %w", err))
+	}
+
+	if serviceProviderCluster.KubeAPIServerDNSReservation != nil {
+		return nil // no work to do
+	}
+
+	// if we're here, we need to reserve a DNS name. Just create a random one. if it succeeds, the name is free and use it.
+	// if it fails, just return the error and the auto-retry will trigger us again soon. That handles both the conflict case
+	// and a general "it's down" case and we get free reporting.
+	// NOTE(review): nothing here generates a name yet (ResourceID is nil, MustBindByTime and OwningCluster are zero) - confirm Create fills these in or populate them here.
+	dnsReservation, err := c.cosmosClient.DNSReservations(key.SubscriptionID).Create(
+		ctx,
+		&api.DNSReservation{
+			CosmosMetadata: api.CosmosMetadata{},
+			ResourceID:     nil,
+			MustBindByTime: metav1.Time{},
+			OwningCluster:  nil,
+		},
+		nil)
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to reserve DNS name: %w", err))
+	}
+	logger.Info("reserved DNS name", "kubeAPIServerDNSName", dnsReservation.ResourceID)
+
+	serviceProviderCluster.KubeAPIServerDNSReservation = dnsReservation.ResourceID
+	_, err = c.cosmosClient.ServiceProviderClusters(key.SubscriptionID, key.ResourceGroupName, key.HCPClusterName).Replace(ctx, serviceProviderCluster, nil)
+	if err != nil {
+		return utils.TrackError(fmt.Errorf("failed to update service provider cluster: %w", err))
+	}
+
+	// from here we get choices about granularity. I'd be fine to see this controller go on and create azure stuff.
+	// I'd also be fine to see another controller create the azure stuff.
+
+	return nil
+}
diff --git a/internal/api/registry.go b/internal/api/registry.go
index 2b70a12fcc..cc49f1bf97 100644
--- a/internal/api/registry.go
+++ b/internal/api/registry.go
@@ -76,6 +76,7 @@ const (
 )
 
 var (
+	DNSReservationResourceType         = azcorearm.NewResourceType(ProviderNamespace, "dnsReservations")
 	OperationStatusResourceType        = azcorearm.NewResourceType(ProviderNamespace, OperationStatusResourceTypeName)
 	ClusterResourceType                = azcorearm.NewResourceType(ProviderNamespace, ClusterResourceTypeName)
 	ServiceProviderClusterResourceType = azcorearm.NewResourceType(ProviderNamespace, ClusterResourceTypeName+"/serviceProviderCluster")
diff --git a/internal/api/types_dnsreservation.go b/internal/api/types_dnsreservation.go
new file mode 100644
index 0000000000..6454fb528f
--- /dev/null
+++ b/internal/api/types_dnsreservation.go
@@ -0,0 +1,43 @@
+// Copyright 2025 Microsoft Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the
License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package api
+
+import (
+	azcorearm "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// DNSReservation is a logical (not real) resource that exists at a subscription level to provide a simple means of reserving a DNS name.
+// It logically belongs to the cluster recorded in OwningCluster, even though it is stored at the subscription level.
+type DNSReservation struct {
+	// CosmosMetadata is the embedded Cosmos metadata for this document. Unlike ServiceProviderCluster, the reservation is
+	// not nested under a cluster: it lives at the subscription level. NOTE(review): confirm the exact ResourceID shape.
+	CosmosMetadata `json:"cosmosMetadata"`
+
+	// this matches the resourcedocument and standard storage schema.
+	// we already store this field, but it's currently done in conversion trickery. Update to directly serialize it.
+	// all items previously stored will read out and have this filled in.
+	// we need to be sure that all new records have it too.
+	ResourceID *azcorearm.ResourceID `json:"resourceId,omitempty"`
+
+	// MustBindByTime is the time by which a ServiceProviderClusterStatus must have claimed this DNSReservation.
+	// If a cleanup thread finds a DNSReservation that is not listed in a ServiceProviderClusterStatus after this time,
+	// then the DNSReservation will be deleted.
+	MustBindByTime metav1.Time `json:"mustBindByTime"`
+
+	// OwningCluster is the resource ID of the cluster that this reservation is for. This allows for easy cleanup after
+	// MustBindByTime is expired.
+	OwningCluster *azcorearm.ResourceID `json:"owningCluster,omitempty"`
+}
diff --git a/internal/api/types_serviceprovider_cluster.go b/internal/api/types_serviceprovider_cluster.go
index e806786c38..7a1622d558 100644
--- a/internal/api/types_serviceprovider_cluster.go
+++ b/internal/api/types_serviceprovider_cluster.go
@@ -29,4 +29,6 @@ type ServiceProviderCluster struct {
 	ResourceID azcorearm.ResourceID `json:"resourceId"`
 
 	LoadBalancerResourceID *azcorearm.ResourceID `json:"loadBalancerResourceID,omitempty"`
+
+	KubeAPIServerDNSReservation *azcorearm.ResourceID `json:"kubeAPIServerDNSReservation,omitempty"`
 }
diff --git a/internal/database/crud_hcpcluster.go b/internal/database/crud_hcpcluster.go
index ded3853c1d..12c40eb346 100644
--- a/internal/database/crud_hcpcluster.go
+++ b/internal/database/crud_hcpcluster.go
@@ -45,6 +45,18 @@ type OperationCRUD interface {
 	ListActiveOperations(options *DBClientListActiveOperationDocsOptions) DBClientIterator[api.Operation]
 }
 
+// NewDNSReservationCRUD returns the CRUD client for DNSReservation documents. Reservations are parented directly by
+// the subscription ("/subscriptions/<lower-cased id>") and stored under api.DNSReservationResourceType.
+func NewDNSReservationCRUD(containerClient *azcosmos.ContainerClient, subscriptionID string) ResourceCRUD[api.DNSReservation] {
+	parts := []string{
+		"/subscriptions",
+		strings.ToLower(subscriptionID),
+	}
+	parentResourceID := api.Must(azcorearm.ParseResourceID(path.Join(parts...)))
+
+	return NewCosmosResourceCRUD[api.DNSReservation, GenericDocument[api.DNSReservation]](containerClient, parentResourceID, api.DNSReservationResourceType)
+}
+
 type operationCRUD struct {
 	*nestedCosmosResourceCRUD[api.Operation, Operation]
 }
diff --git a/internal/database/database.go b/internal/database/database.go
index 715b9e162e..95c9c95ebb 100644
--- a/internal/database/database.go
+++ b/internal/database/database.go
@@ -118,6 +118,9 @@ type DBClient interface {
 	// to end users via ARM. They must also survive the thing they are deleting, so they live under a subscription directly.
 	Operations(subscriptionID string) OperationCRUD
 
+	// DNSReservations returns the CRUD client for the DNS reservations stored under the given subscription.
+	DNSReservations(subscriptionID string) ResourceCRUD[api.DNSReservation]
+
 	Subscriptions() SubscriptionCRUD
 
 	ServiceProviderClusters(subscriptionID, resourceGroupName, clusterName string) ServiceProviderClusterCRUD
@@ -279,6 +282,11 @@ func (d *cosmosDBClient) ServiceProviderClusters(subscriptionID, resourceGroupNa
 		d.resources, clusterResourceID, api.ServiceProviderClusterResourceType)
 }
 
+// DNSReservations returns the CRUD client for DNSReservation documents under subscriptionID.
+func (d *cosmosDBClient) DNSReservations(subscriptionID string) ResourceCRUD[api.DNSReservation] {
+	return NewDNSReservationCRUD(d.resources, subscriptionID)
+}
+
 func (d *cosmosDBClient) UntypedCRUD(parentResourceID azcorearm.ResourceID) (UntypedResourceCRUD, error) {
 	return NewUntypedCRUD(d.resources, parentResourceID), nil
 }