Sched ext回调3——select_cpu(linux 6.15.7)
struct sched_ext_ops代表了一个调度器，里面定义了很多回调函数，本文分析select_cpu这个hook。sched ext的部分hook如下。

一、唤醒task时执行select_cpu的调用链

wake_up_process
└─ try_to_wake_up
   ├─ (场景A) p == current，特殊处理
   |  └─ ttwu_do_wakeup
   |     └─ WRITE_ONCE(p->__state, TASK_RUNNING)，继续执行p，退出
   |
   |  (场景B)
   ├─ select_task_rq，返回cpu
   |  ├─ cpu = p->sched_class->select_task_rq ← 即 select_task_rq_scx
   |  |  |  根据sched ext是否实现select_cpu回调，走下面2个场景
   |  |  ├─ (实现) SCX_CALL_OP_TASK_RET(…, select_cpu, …) ← 即 select_cpu 回调
   |  |  └─ (未实现) scx_select_cpu_dfl
   |
   ├─ ttwu_queue(p, cpu, wake_flags)
   |  ├─ rq = cpu_rq(cpu)，参数cpu是前面select_cpu返回的
   |  └─ ttwu_do_activate(rq, p, wake_flags, &rf)，将task放入到cpu代表的队列中
   |     ├─ activate_task，将task加入到调度器自己的queue
   |     |  ├─ enqueue_task
   |     |  |  └─ p->sched_class->enqueue_task ← 即 enqueue_task_scx
   |     |  |     ├─ p->scx.flags |= SCX_TASK_QUEUED
   |     |  |     ├─ SCX_CALL_OP_TASK(…, runnable, …) ← 即 runnable 回调
   |     |  |     └─ do_enqueue_task
   |     |  |        ├─ p不允许migration，goto local
   |     |  |        ├─ sched ext无enqueue回调，goto global
   |     |  |        ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID，goto direct
   |     |  |        ├─ SCX_CALL_OP_TASK(…, enqueue, …) ← 即 enqueue 回调
   |     |  |        ├─ p->scx.ddsp_dsq_id != SCX_DSQ_INVALID，goto direct
   |     |  |        direct: ├─ direct_dispatch(p, enq_flags)
   |     |  |        local:  ├─ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags)
   |     |  |        global: ├─ dispatch_enqueue(find_global_dsq(p), p, enq_flags)
   |     |  |
   |     |  └─ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED)
   |     |
   |     ├─ wakeup_preempt，enqueue的task是否抢占正在运行的task
   |     |  ├─ (场景1) donor->sched_class->wakeup_preempt，task属于同一个调度类
   |     |  └─ (场景2) resched_curr(rq)，enqueue的task调度类更高
   |     |
   |     └─ ttwu_do_wakeup，设置task的状态为running
   |        └─ WRITE_ONCE(p->__state, TASK_RUNNING)

二、exec时执行select_cpu的调用链

SYSCALL_DEFINE3(execve, …)
└─ do_execve
   └─ do_execveat_common
      ├─ alloc_bprm
      ├─ copy_string_kernel(bprm->filename, bprm)
      ├─ copy_strings(bprm->envc, envp, bprm)
      ├─ copy_strings(bprm->argc, argv, bprm)
      └─ bprm_execve
         ├─ sched_exec
         |  ├─ p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC) ← 注1
         |  |  ├─ (场景1) SCX_CALL_OP_TASK_RET(…, select_cpu, …) ← 即select_cpu
         |  |  └─ (场景2) scx_select_cpu_dfl，如果sched ext没有定义select_cpu
         |  |
         |  ├─ arg = (struct migration_arg){ p, dest_cpu }
         |  ├─ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg) ← 注2
         |
         ├─ exec_binprm

注1：对于sched ext，p->sched_class->select_task_rq即 select_task_rq_scx；另外，第2个参数task_cpu(p)是execve() 发生时当前正在运行的 CPU。注意：如第四节注3所述，select_task_rq_scx检测到WF_EXEC后会直接返回prev_cpu，因此在当前实现中exec路径实际不会走到上面的场景1/场景2，也不会触发迁移。

注2：由于exec_binprm后地址空间、代码、数据都会变，cache / TLB / 内存局部性几乎全部失效，这是一个“迁移成本极低”的好时机。如果注1处选择了不同于当前cpu的cpu，则立即做迁移。

三、唤醒流程中select cpu用到的p->wake_cpu

try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags)：

int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
	if (task_cpu(p) !=
cpu) {
		if (p->in_iowait) {
			delayacct_blkio_end(p);
			atomic_dec(&task_rq(p)->nr_iowait);
		}

		wake_flags |= WF_MIGRATED;
		psi_ttwu_dequeue(p);
		set_task_cpu(p, cpu);
	}
}

p->wake_cpu告诉调度器希望任务醒来时“应该尽量考虑这个 CPU”，但是否真的用这个CPU，由调度器根据策略（NUMA、能耗、负载、绑定关系等）决定。细节参考调度器选核函数select_task_rq_fair、select_task_rq_scx、select_task_rq_idle等等。

p->wake_cpu在commit ac66f5477239ebd3c4e2cbf2f591ef387aa09884中引入，用于解决numa balance迁移task时，task不在运行队列中（!p->on_rq）的情况：通过延迟迁移到唤醒时刻，保证了迁移交换操作的原子性和一致性，同时避免了复杂的锁竞争和状态管理。wake_cpu字段记录该任务应该被迁移到的目标CPU，try_to_wake_up --> select_task_rq会优先考虑选择该cpu。

task_numa_migrate --> migrate_swap：

int migrate_swap(struct task_struct *cur, struct task_struct *p,
		 int target_cpu, int curr_cpu)
{
	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = curr_cpu,
		.dst_task = p,
		.dst_cpu = target_cpu,
	};

	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
}

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;	/* migrate_swap -> stop_two_cpus的arg参数 */

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);
	return 0;
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		……
	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu;	/* task不在运行队列中，设置wake_cpu，try_to_wake_up优先选择该cpu */
	}
}

四、p->wake_cpu在sched ext中的使用

select_task_rq_scx代码：

static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
	/*
	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
	 * can be a good migration opportunity with low cache and memory
	 * footprint. Returning a CPU different than @prev_cpu triggers
	 * immediate rq migration. However, for SCX, as the current rq
	 * association doesn't dictate where the task is going to run, this
	 * doesn't fit well.
	 * If necessary, we can later add a dedicated method
	 * which can decide to preempt self to force it through the regular
	 * scheduling path.
	 */
	if (unlikely(wake_flags & WF_EXEC))	/* 注3 */
		return prev_cpu;

	if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
					   select_cpu, NULL, p, prev_cpu, wake_flags);	/* 注4 */
		p->scx.selected_cpu = cpu;
		*ddsp_taskp = NULL;
		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
			return cpu;
		else
			return prev_cpu;
	} else {
		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
		return cpu;
	}
}

注3：kernel_execve -> bprm_execve -> sched_exec -> p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC)传递的是WF_EXEC标记，select_task_rq_scx检测到该标记会直接返回prev_cpu。根据注释，exec时如果select_task_rq返回一个不同于当前cpu的cpu，会立即迁移task到新cpu上（参考第二节）；但对于sched ext调度器来说，返回的cpu并不代表task就会在该cpu上运行，dispatch阶段可以将task放到任何cpu上执行，所以立即迁过去没有意义。

注4：执行sched ext的select_cpu回调。下面代码中的prev_cpu就是p->wake_cpu：

SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, select_cpu, NULL, p, prev_cpu, wake_flags)
try_to_wake_up --> select_task_rq(p, p->wake_cpu, &wake_flags)

所以怎么使用p->wake_cpu，是由用户实现的select_cpu回调决定的。

五、select_cpu的注释说明

struct sched_ext_ops {
	/**
	 * @select_cpu: Pick the target CPU for a task which is being woken up
	 * @p: task being woken up
	 * @prev_cpu: the cpu @p was on before sleeping
	 * @wake_flags: SCX_WAKE_*
	 *
	 * Decision made here isn't final. @p may be moved to any CPU while it
	 * is getting dispatched for execution later. However, as @p is not on
	 * the rq at this point, getting the eventual execution CPU right here
	 * saves a small bit of overhead down the line. 注5
	 *
	 * If an idle CPU is returned, the CPU is kicked and will try to
	 * dispatch. While an explicit custom mechanism can be added,
	 * select_cpu() serves as the default way to wake up idle CPUs. 注6
	 *
	 * @p may be inserted into a DSQ directly by calling
	 * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
	 * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
	 * of the CPU returned by this operation.
	 * 注7
	 *
	 * Note that select_cpu() is never called for tasks that can only run
	 * on a single CPU or tasks with migration disabled, as they don't have
	 * the option to select a different CPU. See select_task_rq() for
	 * details. 注8
	 */
	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);

注5：当任务被唤醒时，select_cpu为即将唤醒的任务选择目标cpu，但task最终不一定运行在该cpu，因为dispatch可将task放到任意cpu上执行。

注6：内核默认行为是：如果返回的cpu是idle的，sched ext会立即kick这个cpu，让cpu退出idle状态，尝试从其本地或全局运行队列中取出任务执行。select_cpu是唤醒idle cpu的标准且推荐方式。只要select_cpu回调返回一个idle cpu，内核就会自动处理唤醒逻辑。用户也可以在eBPF程序中（比如enqueue回调中）调用scx_bpf_kick_cpu主动kick cpu。

注7：如果用户在eBPF程序的select_cpu回调中调用scx_bpf_dsq_insert将task直接加入dsq，那么enqueue回调就不会再执行了。

注8：如果cpumask只允许运行在一个cpu，或者关闭了迁移功能，select_cpu不会被执行。因为这个task没有机会运行在其他cpu上，所以就不需要select_cpu了。