From 8d41bc7ab308c5910d434380c6b9009578ced031 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Tue, 26 Aug 2025 17:25:04 +0200 Subject: [PATCH 01/14] adjust more log levels Signed-off-by: Junze Bao --- pkg/scheduler/actions/allocate/allocate.go | 2 +- pkg/scheduler/actions/reclaim/reclaim.go | 4 ++-- pkg/scheduler/cache/cache.go | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/scheduler/actions/allocate/allocate.go b/pkg/scheduler/actions/allocate/allocate.go index 7151c75689..c7fc38c969 100644 --- a/pkg/scheduler/actions/allocate/allocate.go +++ b/pkg/scheduler/actions/allocate/allocate.go @@ -179,7 +179,7 @@ func (alloc *Action) allocateResources(queues *util.PriorityQueue, jobsMap map[a continue } - klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>", + klog.V(4).Infof("Try to allocate resource to %d tasks of Job <%v/%v>", tasks.Len(), job.Namespace, job.Name) hardMode, highestAllowedTier := job.IsHardTopologyMode() diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index bb006ec73d..38cd8371fd 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -152,7 +152,7 @@ func (ra *Action) Execute(ssn *framework.Session) { continue } - klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", task.Namespace, task.Name, n.Name) + klog.V(4).Infof("Considering Task <%s/%s> on Node <%s>.", task.Namespace, task.Name, n.Name) var reclaimees []*api.TaskInfo for _, task := range n.Tasks { @@ -184,7 +184,7 @@ func (ra *Action) Execute(ssn *framework.Session) { victims := ssn.Reclaimable(task, reclaimees) if err := util.ValidateVictims(task, n, victims); err != nil { - klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err) + klog.V(4).Infof("No validated victims on Node <%s>: %v", n.Name, err) continue } diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index aee9212bce..f3926ae1a9 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -906,9 +906,9 @@ func (sc *SchedulerCache) Bind(ctx context.Context, bindContexts []*BindContext) tmp := time.Now() errMsg := sc.Binder.Bind(sc.kubeClient, readyToBindTasks) if len(errMsg) == 0 { - klog.V(3).Infof("bind ok, latency %v", time.Since(tmp)) + klog.V(4).Infof("bind ok, latency %v", time.Since(tmp)) } else { - klog.V(3).Infof("There are %d tasks in total and %d binds failed, latency %v", len(readyToBindTasks), len(errMsg), time.Since(tmp)) + klog.V(4).Infof("There are %d tasks in total and %d binds failed, latency %v", len(readyToBindTasks), len(errMsg), time.Since(tmp)) } for _, bindContext := range bindContexts { From a7e5a15e51892936b59767b3281d00b7e1626bfc Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Tue, 26 Aug 2025 17:32:23 +0200 Subject: [PATCH 02/14] add info Signed-off-by: Junze Bao --- pkg/scheduler/actions/preempt/preempt.go | 4 ++-- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index f237e3a162..5683f2b4da 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -359,7 +359,7 @@ func (pmpt *Action) normalPreempt( preemptee := victimsQueue.Pop().(*api.TaskInfo) klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name) - if err := stmt.Evict(preemptee, "preempt"); err != nil { + if err := stmt.Evict(preemptee, "preempt for task "+preemptor.Name); err != nil { klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err) continue @@ -501,7 +501,7 @@ func prepareCandidate(c *candidate, pod *v1.Pod, stmt *framework.Statement, ssn for _, victim := range c.Victims() { klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>", victim.Namespace, victim.Name, pod.Namespace, pod.Name) - if err := stmt.Evict(victim, "preempt"); err != nil { + if err := stmt.Evict(victim, "preempt for task "+pod.Name); err != nil { klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v", victim.Namespace, victim.Name, pod.Namespace, pod.Name, err) return api.AsStatus(err) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 38cd8371fd..3062fae6cf 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -198,7 +198,7 @@ func (ra *Action) Execute(ssn *framework.Session) { reclaimee := victimsQueue.Pop().(*api.TaskInfo) klog.Errorf("Try to reclaim Task <%s/%s> for Tasks <%s/%s>", reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name) - if err := ssn.Evict(reclaimee, "reclaim"); err != nil { + if err := ssn.Evict(reclaimee, "reclaim for task "+task.Name); err != nil { klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v", reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err) continue From 6f9f7936de886db452490445b7014f13b88242ce Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Tue, 26 Aug 2025 17:49:03 +0200 Subject: [PATCH 03/14] adjust preempt log level Signed-off-by: Junze Bao --- pkg/scheduler/actions/preempt/preempt.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 5683f2b4da..5f6a458366 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -319,7 +319,7 @@ func (pmpt *Action) normalPreempt( assigned := false for _, node := range selectedNodes { - klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", + klog.V(4).Infof("Considering Task <%s/%s> on Node <%s>.", preemptor.Namespace, preemptor.Name, node.Name) var preemptees []*api.TaskInfo @@ -334,7 +334,7 @@ func (pmpt *Action) normalPreempt( metrics.UpdatePreemptionVictimsCount(len(victims)) if err := util.ValidateVictims(preemptor, node, victims); err != nil { - klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err) + klog.V(4).Infof("No validated victims on Node <%s>: %v", node.Name, err) continue } @@ -373,7 +373,7 @@ func (pmpt *Action) normalPreempt( } metrics.RegisterPreemptionAttempts() - klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.", + klog.V(4).Infof("Try to preempt <%v> for Task <%s/%s> requested <%v>.", preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq) // If preemptor's queue is not allocatable, it means preemptor cannot be allocated. So no need care about the node idle resource @@ -720,7 +720,7 @@ func SelectVictimsOnNode( metrics.UpdatePreemptionVictimsCount(len(allVictims)) if err := util.ValidateVictims(preemptor, nodeInfo, allVictims); err != nil { - klog.V(3).Infof("No validated victims on Node <%s>: %v", nodeInfo.Name, err) + klog.V(4).Infof("No validated victims on Node <%s>: %v", nodeInfo.Name, err) return nil, api.AsStatus(fmt.Errorf("no validated victims on Node <%s>: %v", nodeInfo.Name, err)) } From 057d2e21bb69c76054ea7d258eb9e899275762d2 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Wed, 27 Aug 2025 10:06:37 +0200 Subject: [PATCH 04/14] adjust more Signed-off-by: Junze Bao --- pkg/scheduler/api/resource_info.go | 4 +++- pkg/scheduler/framework/session_plugins.go | 2 ++ pkg/scheduler/plugins/capacity/capacity.go | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index 5aae74c1bb..a2805597a1 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -103,8 +103,10 @@ func NewResource(rl v1.ResourceList) *Resource { if !ignore { r.AddScalar(rName, float64(rQuant.MilliValue())) } else { - klog.V(4).Infof("Ignoring resource %s", rName.String()) + klog.V(3).Infof("Ignoring resource %s", rName.String()) } + } else { + klog.V(3).Infof("non scalar resource %s", rName.String()) } } } diff --git a/pkg/scheduler/framework/session_plugins.go b/pkg/scheduler/framework/session_plugins.go index 100da748c8..919769a672 100644 --- a/pkg/scheduler/framework/session_plugins.go +++ b/pkg/scheduler/framework/session_plugins.go @@ -23,6 +23,7 @@ package framework import ( "context" + "k8s.io/klog/v2" k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" "volcano.sh/apis/pkg/apis/scheduling" @@ -204,6 +205,7 @@ func (ssn *Session) Reclaimable(reclaimer *api.TaskInfo, reclaimees []*api.TaskI victims = nil break } + klog.V(3).Infof("Victims from plugin %s, victims=%+v reclaimer=%s", plugin.Name, victims, reclaimer.Name) // first iteration - initialize victims list if victims == nil { victims = candidates diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go index d576537dd5..7529fe6380 100644 --- a/pkg/scheduler/plugins/capacity/capacity.go +++ b/pkg/scheduler/plugins/capacity/capacity.go @@ -141,7 +141,7 @@ func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) { allocated.Sub(reclaimee.Resreq) victims = append(victims, reclaimee) } - klog.V(4).Infof("Victims from capacity plugin, victims=%+v reclaimer=%s", victims, reclaimer) + klog.V(3).Infof("Victims from capacity plugin, victims=%+v reclaimer=%s", victims, reclaimer) return victims, util.Permit }) From ab5fccd6ef771e4f58034d24822ccd03af987fb4 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Wed, 27 Aug 2025 21:39:39 +0200 Subject: [PATCH 05/14] fix reclaim Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- pkg/scheduler/api/resource_info.go | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 3062fae6cf..b92b8675ef 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -142,6 +142,7 @@ func (ra *Action) Execute(ssn *framework.Session) { } assigned := false + reclaimed := api.EmptyResource() // we should filter out those nodes that are UnschedulableAndUnresolvable status got in allocate action totalNodes := ssn.FilterOutUnschedulableAndUnresolvableNodesForTask(task) for _, n := range totalNodes { @@ -191,7 +192,6 @@ func (ra *Action) Execute(ssn *framework.Session) { victimsQueue := ssn.BuildVictimsPriorityQueue(victims, task) resreq := task.InitResreq.Clone() - reclaimed := api.EmptyResource() // Reclaim victims for tasks. for !victimsQueue.Empty() { diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index a2805597a1..fe89a1a03f 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -105,8 +105,6 @@ func NewResource(rl v1.ResourceList) *Resource { } else { klog.V(3).Infof("Ignoring resource %s", rName.String()) } - } else { - klog.V(3).Infof("non scalar resource %s", rName.String()) } } } From 6c7ecd8620cb5199595fc73588ccb397f923d385 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Wed, 27 Aug 2025 22:43:26 +0200 Subject: [PATCH 06/14] fix add resource logs Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- pkg/scheduler/api/resource_info.go | 2 ++ pkg/scheduler/plugins/capacity/capacity.go | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index b92b8675ef..3062fae6cf 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -142,7 +142,6 @@ func (ra *Action) Execute(ssn *framework.Session) { } assigned := false - reclaimed := api.EmptyResource() // we should filter out those nodes that are UnschedulableAndUnresolvable status got in allocate action totalNodes := ssn.FilterOutUnschedulableAndUnresolvableNodesForTask(task) for _, n := range totalNodes { @@ -192,6 +191,7 @@ func (ra *Action) Execute(ssn *framework.Session) { victimsQueue := ssn.BuildVictimsPriorityQueue(victims, task) resreq := task.InitResreq.Clone() + reclaimed := api.EmptyResource() // Reclaim victims for tasks. for !victimsQueue.Empty() { diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index fe89a1a03f..a6f2b2986c 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -416,6 +416,7 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b if defaultValue == Infinity { for name := range rr.ScalarResources { if _, ok := r.ScalarResources[name]; !ok { + klog.V(3).Infof("Scalar resource %s is not defined in r, r: %v, rr: %v", name.String(), r, rr) return false } } @@ -424,6 +425,7 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b for resourceName, leftValue := range r.ScalarResources { rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { + klog.V(3).Infof("Scalar resource %s is not defined in rr, r: %v, rr: %v", resourceName.String(), r, rr) continue } diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go index 7529fe6380..06e0f5d74f 100644 --- a/pkg/scheduler/plugins/capacity/capacity.go +++ b/pkg/scheduler/plugins/capacity/capacity.go @@ -419,7 +419,7 @@ func (cp *capacityPlugin) buildQueueAttrs(ssn *framework.Session) { attr.realCapability = realCapability } cp.queueOpts[job.Queue] = attr - klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue) + klog.V(3).Infof("Added Queue <%s> attributes.", job.Queue) } attr := cp.queueOpts[job.Queue] From dc4a2c4457b07564e166b0d9a0c5f7ecde81085b Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 10:35:05 +0200 Subject: [PATCH 07/14] resource hack Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- pkg/scheduler/api/resource_info.go | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 3062fae6cf..c3367572a5 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -184,7 +184,7 @@ func (ra *Action) Execute(ssn *framework.Session) { victims := ssn.Reclaimable(task, reclaimees) if err := util.ValidateVictims(task, n, victims); err != nil { - klog.V(4).Infof("No validated victims on Node <%s>: %v", n.Name, err) + klog.V(3).Infof("No validated victims on Node <%s>: %v", n.Name, err) continue } diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index a6f2b2986c..8a714f9d4a 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -415,6 +415,9 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b if defaultValue == Infinity { for name := range rr.ScalarResources { + if IgnoreScalarResource(name) { + continue + } if _, ok := r.ScalarResources[name]; !ok { klog.V(3).Infof("Scalar resource %s is not defined in r, r: %v, rr: %v", name.String(), r, rr) return false @@ -423,6 +426,9 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b } for resourceName, leftValue := range r.ScalarResources { + if IgnoreScalarResource(resourceName) { + continue + } rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { klog.V(3).Infof("Scalar resource %s is not defined in rr, r: %v, rr: %v", resourceName.String(), r, rr) @@ -463,7 +469,7 @@ func (r *Resource) LessEqualWithDimension(rr *Resource, req *Resource) bool { } for name, quant := range req.ScalarResources { - if IsIgnoredScalarResource(name) { + if IsIgnoredScalarResource(name) || IgnoreScalarResource(name) { continue } rQuant := r.ScalarResources[name] @@ -792,3 +798,7 @@ func ExceededPart(left, right *Resource) *Resource { diff, _ := left.Diff(right, Zero) return diff } + +func IgnoreScalarResource(name v1.ResourceName) bool { + return name == "attachable-volumes-csi-fsx.csi.aws.com" || name == "efa.poolsi.de/infiniband" || name == "vpc.amazonaws.com/efa" || ignoredScalarResources.Has(string(name)) +} From 56b09f4174617f24e836f9fbf5eaca565ae7c5de Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 11:22:15 +0200 Subject: [PATCH 08/14] add node idle to reclaimed Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index c3367572a5..3af720ded7 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -191,7 +191,7 @@ func (ra *Action) Execute(ssn *framework.Session) { victimsQueue := ssn.BuildVictimsPriorityQueue(victims, task) resreq := task.InitResreq.Clone() - reclaimed := api.EmptyResource() + reclaimed := n.FutureIdle() // Reclaim victims for tasks. for !victimsQueue.Empty() { From 51dd721d5d15ffb2615d5eb1b1ce4edc3c055547 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 11:24:50 +0200 Subject: [PATCH 09/14] add log and victims Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- pkg/scheduler/plugins/capacity/capacity.go | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 3af720ded7..414f031813 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -101,7 +101,7 @@ func (ra *Action) Execute(ssn *framework.Session) { queue := queues.Pop().(*api.QueueInfo) if ssn.Overused(queue) { - klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name) + klog.V(3).Infof("Queue <%s> is overused <%v>, ignore it.", queue.Name, queue.Queue.Status.Allocated) continue } diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go index 06e0f5d74f..a06015b4c8 100644 --- a/pkg/scheduler/plugins/capacity/capacity.go +++ b/pkg/scheduler/plugins/capacity/capacity.go @@ -141,7 +141,11 @@ func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) { allocated.Sub(reclaimee.Resreq) victims = append(victims, reclaimee) } - klog.V(3).Infof("Victims from capacity plugin, victims=%+v reclaimer=%s", victims, reclaimer) + victimNames := []string{} + for _, victim := range victims { + victimNames = append(victimNames, victim.Name) + } + klog.V(3).Infof("Victims from capacity plugin, victims=%+v reclaimer=%s", victimNames, reclaimer) return victims, util.Permit }) @@ -419,7 +423,7 @@ func (cp *capacityPlugin) buildQueueAttrs(ssn *framework.Session) { attr.realCapability = realCapability } cp.queueOpts[job.Queue] = attr - klog.V(3).Infof("Added Queue <%s> attributes.", job.Queue) + klog.V(3).Infof("Added Queue <%s> attributes <%v>.", job.Queue, attr) } attr := cp.queueOpts[job.Queue] From 478c1169fb1b5a0952d4f30cde2271cae675ad46 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 11:41:36 +0200 Subject: [PATCH 10/14] use futureidle clone Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 414f031813..80fb1e971a 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -191,7 +191,7 @@ func (ra *Action) Execute(ssn *framework.Session) { victimsQueue := ssn.BuildVictimsPriorityQueue(victims, task) resreq := task.InitResreq.Clone() - reclaimed := n.FutureIdle() + reclaimed := n.FutureIdle().Clone() // Reclaim victims for tasks. for !victimsQueue.Empty() { From fac63f29ead5dc8d2652e68dbe3a4fdd927a0979 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 11:55:05 +0200 Subject: [PATCH 11/14] fix Signed-off-by: Junze Bao --- pkg/scheduler/actions/reclaim/reclaim.go | 2 +- pkg/scheduler/api/resource_info.go | 14 +------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 80fb1e971a..414f031813 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -191,7 +191,7 @@ func (ra *Action) Execute(ssn *framework.Session) { victimsQueue := ssn.BuildVictimsPriorityQueue(victims, task) resreq := task.InitResreq.Clone() - reclaimed := n.FutureIdle().Clone() + reclaimed := n.FutureIdle() // Reclaim victims for tasks. for !victimsQueue.Empty() { diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index 8a714f9d4a..fe89a1a03f 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -415,23 +415,15 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b if defaultValue == Infinity { for name := range rr.ScalarResources { - if IgnoreScalarResource(name) { - continue - } if _, ok := r.ScalarResources[name]; !ok { - klog.V(3).Infof("Scalar resource %s is not defined in r, r: %v, rr: %v", name.String(), r, rr) return false } } } for resourceName, leftValue := range r.ScalarResources { - if IgnoreScalarResource(resourceName) { - continue - } rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { - klog.V(3).Infof("Scalar resource %s is not defined in rr, r: %v, rr: %v", resourceName.String(), r, rr) continue } @@ -469,7 +461,7 @@ func (r *Resource) LessEqualWithDimension(rr *Resource, req *Resource) bool { } for name, quant := range req.ScalarResources { - if IsIgnoredScalarResource(name) || IgnoreScalarResource(name) { + if IsIgnoredScalarResource(name) { continue } rQuant := r.ScalarResources[name] @@ -798,7 +790,3 @@ func ExceededPart(left, right *Resource) *Resource { diff, _ := left.Diff(right, Zero) return diff } - -func IgnoreScalarResource(name v1.ResourceName) bool { - return name == "attachable-volumes-csi-fsx.csi.aws.com" || name == "efa.poolsi.de/infiniband" || name == "vpc.amazonaws.com/efa" || ignoredScalarResources.Has(string(name)) -} From 7a41a7ad2c2edd2bbd5bfc724b455664c37c08c9 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 15:12:52 +0200 Subject: [PATCH 12/14] adjust log Signed-off-by: Junze Bao --- pkg/scheduler/actions/preempt/preempt.go | 8 ++++---- pkg/scheduler/actions/reclaim/reclaim.go | 4 ++-- pkg/scheduler/api/resource_info.go | 2 +- pkg/scheduler/framework/session_plugins.go | 6 +++++- pkg/scheduler/plugins/capacity/capacity.go | 7 ++----- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 5f6a458366..d306cc6f14 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -319,7 +319,7 @@ func (pmpt *Action) normalPreempt( assigned := false for _, node := range selectedNodes { - klog.V(4).Infof("Considering Task <%s/%s> on Node <%s>.", + klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", preemptor.Namespace, preemptor.Name, node.Name) var preemptees []*api.TaskInfo @@ -334,7 +334,7 @@ func (pmpt *Action) normalPreempt( metrics.UpdatePreemptionVictimsCount(len(victims)) if err := util.ValidateVictims(preemptor, node, victims); err != nil { - klog.V(4).Infof("No validated victims on Node <%s>: %v", node.Name, err) + klog.V(3).Infof("No validated victims on Node <%s>: %v", node.Name, err) continue } @@ -373,7 +373,7 @@ func (pmpt *Action) normalPreempt( } metrics.RegisterPreemptionAttempts() - klog.V(4).Infof("Try to preempt <%v> for Task <%s/%s> requested <%v>.", + klog.V(3).Infof("Try to preempt <%v> for Task <%s/%s> requested <%v>.", preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq) // If preemptor's queue is not allocatable, it means preemptor cannot be allocated. So no need care about the node idle resource @@ -720,7 +720,7 @@ func SelectVictimsOnNode( metrics.UpdatePreemptionVictimsCount(len(allVictims)) if err := util.ValidateVictims(preemptor, nodeInfo, allVictims); err != nil { - klog.V(4).Infof("No validated victims on Node <%s>: %v", nodeInfo.Name, err) + klog.V(3).Infof("No validated victims on Node <%s>: %v", nodeInfo.Name, err) return nil, api.AsStatus(fmt.Errorf("no validated victims on Node <%s>: %v", nodeInfo.Name, err)) } diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 414f031813..a9c4ab5227 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -152,7 +152,7 @@ func (ra *Action) Execute(ssn *framework.Session) { continue } - klog.V(4).Infof("Considering Task <%s/%s> on Node <%s>.", task.Namespace, task.Name, n.Name) + klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", task.Namespace, task.Name, n.Name) var reclaimees []*api.TaskInfo for _, task := range n.Tasks { @@ -177,7 +177,7 @@ func (ra *Action) Execute(ssn *framework.Session) { } if len(reclaimees) == 0 { - klog.V(4).Infof("No reclaimees on Node <%s>.", n.Name) + klog.V(3).Infof("No reclaimees on Node <%s>.", n.Name) continue } diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index fe89a1a03f..5aae74c1bb 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -103,7 +103,7 @@ func NewResource(rl v1.ResourceList) *Resource { if !ignore { r.AddScalar(rName, float64(rQuant.MilliValue())) } else { - klog.V(3).Infof("Ignoring resource %s", rName.String()) + klog.V(4).Infof("Ignoring resource %s", rName.String()) } } } diff --git a/pkg/scheduler/framework/session_plugins.go b/pkg/scheduler/framework/session_plugins.go index 919769a672..67ff03b0d5 100644 --- a/pkg/scheduler/framework/session_plugins.go +++ b/pkg/scheduler/framework/session_plugins.go @@ -205,7 +205,11 @@ func (ssn *Session) Reclaimable(reclaimer *api.TaskInfo, reclaimees []*api.TaskI victims = nil break } - klog.V(3).Infof("Victims from plugin %s, victims=%+v reclaimer=%s", plugin.Name, victims, reclaimer.Name) + victimNames := []string{} + for _, victim := range victims { + victimNames = append(victimNames, victim.Name) + } + klog.V(3).Infof("Victims from plugin %s, victims=%+v reclaimer=%s", plugin.Name, victimNames, reclaimer.Name) // first iteration - initialize victims list if victims == nil { victims = candidates diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go index a06015b4c8..2269752755 100644 --- a/pkg/scheduler/plugins/capacity/capacity.go +++ b/pkg/scheduler/plugins/capacity/capacity.go @@ -138,14 +138,11 @@ func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) { if allocated.LessEqual(attr.deserved, api.Infinity) || !attr.guarantee.LessEqual(exceptReclaimee, api.Zero) { continue } + klog.V(3).Infof("reclaimee %s(%+v) becomes victim after comparison. allocated=%+v, deserved=%+v, exceptReclaimee=%+v", reclaimee.Name, reclaimee.Resreq, allocated, attr.deserved, exceptReclaimee) allocated.Sub(reclaimee.Resreq) victims = append(victims, reclaimee) } - victimNames := []string{} - for _, victim := range victims { - victimNames = append(victimNames, victim.Name) - } - klog.V(3).Infof("Victims from capacity plugin, victims=%+v reclaimer=%s", victimNames, reclaimer) + return victims, util.Permit }) From 297a589f246a93a44b04968f78f6a798244420a5 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Thu, 28 Aug 2025 16:05:50 +0200 Subject: [PATCH 13/14] add resource name ignore Signed-off-by: Junze Bao --- pkg/scheduler/api/resource_info.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index 5aae74c1bb..358c89701a 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -415,6 +415,9 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b if defaultValue == Infinity { for name := range rr.ScalarResources { + if IgnoreScalarResource(name) { + continue + } if _, ok := r.ScalarResources[name]; !ok { return false } @@ -422,6 +425,9 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b } for resourceName, leftValue := range r.ScalarResources { + if IgnoreScalarResource(resourceName) { + continue + } rightValue, ok := rr.ScalarResources[resourceName] if !ok && defaultValue == Infinity { continue @@ -461,7 +467,7 @@ func (r *Resource) LessEqualWithDimension(rr *Resource, req *Resource) bool { } for name, quant := range req.ScalarResources { - if IsIgnoredScalarResource(name) { + if IgnoreScalarResource(name) { continue } rQuant := r.ScalarResources[name] @@ -790,3 +796,7 @@ func ExceededPart(left, right *Resource) *Resource { diff, _ := left.Diff(right, Zero) return diff } + +func IgnoreScalarResource(name v1.ResourceName) bool { + return name == "attachable-volumes-csi-fsx.csi.aws.com" || name == "efa.poolsi.de/infiniband" || name == "vpc.amazonaws.com/efa" || ignoredScalarResources.Has(string(name)) +} From 0e71176844d2ee3c0b9216a5ccff7d73ed1ca821 Mon Sep 17 00:00:00 2001 From: Junze Bao Date: Fri, 29 Aug 2025 09:14:20 +0200 Subject: [PATCH 14/14] add queue Signed-off-by: Junze Bao --- pkg/scheduler/plugins/capacity/capacity.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go index 2269752755..051c20a1a0 100644 --- a/pkg/scheduler/plugins/capacity/capacity.go +++ b/pkg/scheduler/plugins/capacity/capacity.go @@ -138,7 +138,7 @@ func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) { if allocated.LessEqual(attr.deserved, api.Infinity) || !attr.guarantee.LessEqual(exceptReclaimee, api.Zero) { continue } - klog.V(3).Infof("reclaimee %s(%+v) becomes victim after comparison. allocated=%+v, deserved=%+v, exceptReclaimee=%+v", reclaimee.Name, reclaimee.Resreq, allocated, attr.deserved, exceptReclaimee) + klog.V(3).Infof("reclaimee %s/%s(%+v) becomes victim after comparison. allocated=%+v, deserved=%+v, exceptReclaimee=%+v", job.Queue, reclaimee.Name, reclaimee.Resreq, allocated, attr.deserved, exceptReclaimee) allocated.Sub(reclaimee.Resreq) victims = append(victims, reclaimee) }