diff --git a/pkg/scheduler/actions/allocate/allocate.go b/pkg/scheduler/actions/allocate/allocate.go
index 7151c75689..c7fc38c969 100644
--- a/pkg/scheduler/actions/allocate/allocate.go
+++ b/pkg/scheduler/actions/allocate/allocate.go
@@ -179,7 +179,7 @@ func (alloc *Action) allocateResources(queues *util.PriorityQueue, jobsMap map[a
 			continue
 		}
 
-		klog.V(3).Infof("Try to allocate resource to %d tasks of Job <%v/%v>",
+		klog.V(4).Infof("Try to allocate resource to %d tasks of Job <%v/%v>",
 			tasks.Len(), job.Namespace, job.Name)
 
 		hardMode, highestAllowedTier := job.IsHardTopologyMode()
diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go
index f237e3a162..d306cc6f14 100644
--- a/pkg/scheduler/actions/preempt/preempt.go
+++ b/pkg/scheduler/actions/preempt/preempt.go
@@ -359,7 +359,7 @@ func (pmpt *Action) normalPreempt(
 		preemptee := victimsQueue.Pop().(*api.TaskInfo)
 		klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>",
 			preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name)
-		if err := stmt.Evict(preemptee, "preempt"); err != nil {
+		if err := stmt.Evict(preemptee, "preempt for task "+preemptor.Name); err != nil {
 			klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v",
 				preemptee.Namespace, preemptee.Name, preemptor.Namespace, preemptor.Name, err)
 			continue
@@ -373,7 +373,7 @@ func (pmpt *Action) normalPreempt(
 	}
 
 	metrics.RegisterPreemptionAttempts()
-	klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.",
+	klog.V(3).Infof("Try to preempt <%v> for Task <%s/%s> requested <%v>.",
 		preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq)
 
 	// If preemptor's queue is not allocatable, it means preemptor cannot be allocated. So no need care about the node idle resource
@@ -501,7 +501,7 @@ func prepareCandidate(c *candidate, pod *v1.Pod, stmt *framework.Statement, ssn
 	for _, victim := range c.Victims() {
 		klog.V(3).Infof("Try to preempt Task <%s/%s> for Task <%s/%s>",
 			victim.Namespace, victim.Name, pod.Namespace, pod.Name)
-		if err := stmt.Evict(victim, "preempt"); err != nil {
+		if err := stmt.Evict(victim, "preempt for task "+pod.Name); err != nil {
 			klog.Errorf("Failed to preempt Task <%s/%s> for Task <%s/%s>: %v",
 				victim.Namespace, victim.Name, pod.Namespace, pod.Name, err)
 			return api.AsStatus(err)
diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go
index bb006ec73d..a9c4ab5227 100644
--- a/pkg/scheduler/actions/reclaim/reclaim.go
+++ b/pkg/scheduler/actions/reclaim/reclaim.go
@@ -101,7 +101,7 @@ func (ra *Action) Execute(ssn *framework.Session) {
 		queue := queues.Pop().(*api.QueueInfo)
 
 		if ssn.Overused(queue) {
-			klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name)
+			klog.V(3).Infof("Queue <%s> is overused <%v>, ignore it.", queue.Name, queue.Queue.Status.Allocated)
 			continue
 		}
 
@@ -177,7 +177,7 @@ func (ra *Action) Execute(ssn *framework.Session) {
 			}
 
 			if len(reclaimees) == 0 {
-				klog.V(4).Infof("No reclaimees on Node <%s>.", n.Name)
+				klog.V(3).Infof("No reclaimees on Node <%s>.", n.Name)
 				continue
 			}
 
@@ -191,14 +191,14 @@ func (ra *Action) Execute(ssn *framework.Session) {
 			victimsQueue := ssn.BuildVictimsPriorityQueue(victims, task)
 
 			resreq := task.InitResreq.Clone()
-			reclaimed := api.EmptyResource()
+			reclaimed := n.FutureIdle()
 
 			// Reclaim victims for tasks.
 			for !victimsQueue.Empty() {
 				reclaimee := victimsQueue.Pop().(*api.TaskInfo)
 				klog.Errorf("Try to reclaim Task <%s/%s> for Tasks <%s/%s>",
 					reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name)
-				if err := ssn.Evict(reclaimee, "reclaim"); err != nil {
+				if err := ssn.Evict(reclaimee, "reclaim for task "+task.Name); err != nil {
 					klog.Errorf("Failed to reclaim Task <%s/%s> for Tasks <%s/%s>: %v",
 						reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name, err)
 					continue
diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go
index 5aae74c1bb..358c89701a 100644
--- a/pkg/scheduler/api/resource_info.go
+++ b/pkg/scheduler/api/resource_info.go
@@ -415,6 +415,9 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b
 
 	if defaultValue == Infinity {
 		for name := range rr.ScalarResources {
+			if IgnoreScalarResource(name) {
+				continue
+			}
 			if _, ok := r.ScalarResources[name]; !ok {
 				return false
 			}
@@ -422,6 +425,9 @@ func (r *Resource) LessEqual(rr *Resource, defaultValue DimensionDefaultValue) b
 	}
 
 	for resourceName, leftValue := range r.ScalarResources {
+		if IgnoreScalarResource(resourceName) {
+			continue
+		}
 		rightValue, ok := rr.ScalarResources[resourceName]
 		if !ok && defaultValue == Infinity {
 			continue
@@ -461,7 +467,7 @@ func (r *Resource) LessEqualWithDimension(rr *Resource, req *Resource) bool {
 	}
 
 	for name, quant := range req.ScalarResources {
-		if IsIgnoredScalarResource(name) {
+		if IgnoreScalarResource(name) {
 			continue
 		}
 		rQuant := r.ScalarResources[name]
@@ -790,3 +796,7 @@ func ExceededPart(left, right *Resource) *Resource {
 	diff, _ := left.Diff(right, Zero)
 	return diff
 }
+
+func IgnoreScalarResource(name v1.ResourceName) bool {
+	return name == "attachable-volumes-csi-fsx.csi.aws.com" || name == "efa.poolsi.de/infiniband" || name == "vpc.amazonaws.com/efa" || ignoredScalarResources.Has(string(name))
+}
diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go
index aee9212bce..f3926ae1a9 100644
--- a/pkg/scheduler/cache/cache.go
+++ b/pkg/scheduler/cache/cache.go
@@ -906,9 +906,9 @@ func (sc *SchedulerCache) Bind(ctx context.Context, bindContexts []*BindContext)
 		tmp := time.Now()
 		errMsg := sc.Binder.Bind(sc.kubeClient, readyToBindTasks)
 		if len(errMsg) == 0 {
-			klog.V(3).Infof("bind ok, latency %v", time.Since(tmp))
+			klog.V(4).Infof("bind ok, latency %v", time.Since(tmp))
 		} else {
-			klog.V(3).Infof("There are %d tasks in total and %d binds failed, latency %v", len(readyToBindTasks), len(errMsg), time.Since(tmp))
+			klog.V(4).Infof("There are %d tasks in total and %d binds failed, latency %v", len(readyToBindTasks), len(errMsg), time.Since(tmp))
 		}
 
 		for _, bindContext := range bindContexts {
diff --git a/pkg/scheduler/framework/session_plugins.go b/pkg/scheduler/framework/session_plugins.go
index 100da748c8..67ff03b0d5 100644
--- a/pkg/scheduler/framework/session_plugins.go
+++ b/pkg/scheduler/framework/session_plugins.go
@@ -23,6 +23,7 @@ package framework
 import (
 	"context"
 
+	"k8s.io/klog/v2"
 	k8sframework "k8s.io/kubernetes/pkg/scheduler/framework"
 
 	"volcano.sh/apis/pkg/apis/scheduling"
@@ -204,6 +205,11 @@ func (ssn *Session) Reclaimable(reclaimer *api.TaskInfo, reclaimees []*api.TaskI
 				victims = nil
 				break
 			}
+			victimNames := []string{}
+			for _, victim := range victims {
+				victimNames = append(victimNames, victim.Name)
+			}
+			klog.V(3).Infof("Victims from plugin %s, victims=%+v reclaimer=%s", plugin.Name, victimNames, reclaimer.Name)
 			// first iteration - initialize victims list
 			if victims == nil {
 				victims = candidates
diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go
index d576537dd5..051c20a1a0 100644
--- a/pkg/scheduler/plugins/capacity/capacity.go
+++ b/pkg/scheduler/plugins/capacity/capacity.go
@@ -138,10 +138,11 @@ func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) {
 			if allocated.LessEqual(attr.deserved, api.Infinity) || !attr.guarantee.LessEqual(exceptReclaimee, api.Zero) {
 				continue
 			}
+			klog.V(3).Infof("reclaimee %s/%s(%+v) becomes victim after comparison. allocated=%+v, deserved=%+v, exceptReclaimee=%+v", job.Queue, reclaimee.Name, reclaimee.Resreq, allocated, attr.deserved, exceptReclaimee)
 			allocated.Sub(reclaimee.Resreq)
 			victims = append(victims, reclaimee)
 		}
-		klog.V(4).Infof("Victims from capacity plugin, victims=%+v reclaimer=%s", victims, reclaimer)
+
 		return victims, util.Permit
 	})
 
@@ -419,7 +420,7 @@ func (cp *capacityPlugin) buildQueueAttrs(ssn *framework.Session) {
 			attr.realCapability = realCapability
 		}
 		cp.queueOpts[job.Queue] = attr
-		klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue)
+		klog.V(3).Infof("Added Queue <%s> attributes <%v>.", job.Queue, attr)
 	}
 
 	attr := cp.queueOpts[job.Queue]
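
Note (not part of the patch): the new IgnoreScalarResource helper in resource_info.go is now consulted by both LessEqual and LessEqualWithDimension. A minimal table-driven test sketch is shown below; it assumes the helper lives in package api next to resource_info.go, that the file name resource_info_test.go is free, and that the package-level ignoredScalarResources set does not already contain the ordinary resource used as the negative case.

// resource_info_test.go (illustrative sketch only; file name and cases are assumptions)
package api

import (
	"testing"

	v1 "k8s.io/api/core/v1"
)

func TestIgnoreScalarResource(t *testing.T) {
	tests := []struct {
		name v1.ResourceName
		want bool
	}{
		// resource names hard-coded in IgnoreScalarResource are always skipped
		{"attachable-volumes-csi-fsx.csi.aws.com", true},
		{"efa.poolsi.de/infiniband", true},
		{"vpc.amazonaws.com/efa", true},
		// an ordinary extended resource is still compared
		// (assumes ignoredScalarResources does not list it)
		{"nvidia.com/gpu", false},
	}
	for _, tc := range tests {
		if got := IgnoreScalarResource(tc.name); got != tc.want {
			t.Errorf("IgnoreScalarResource(%q) = %v, want %v", tc.name, got, tc.want)
		}
	}
}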