Completely Fair Scheduler
History of Linux schedulers:
- Linux 1.2: circular queue, round-robin
- Linux 2.2: scheduling classes and symmetric multiprocessing
- Linux 2.4: O(N) simple scheduler
- Linux 2.6: O(1) scheduler
- Linux 2.6.23: CFS (Completely Fair Scheduler)
Principle
The CFS scheduler distributes CPU time fairly among all runnable tasks. Ignoring priorities, every task gets an equal share of CPU time. CFS uses a virtual runtime (vruntime) to track how much CPU time each task has already consumed. On a scheduling decision, CFS picks the runnable task with the smallest vruntime and runs it until it is interrupted or preempted. CFS does not use time slices directly.
A Linux task's priority is adjusted through its nice value, and CFS handles priorities in its own way: instead of maintaining one run queue per priority, it factors the priority in when updating vruntime. For a task with a high nice value (low priority), the vruntime added is the consumed CPU time scaled up by a factor; for a task with a low nice value (high priority), the vruntime added is the consumed CPU time scaled down by a factor.
actual runtime = scheduling period * task weight / total weight of all runnable tasks
vruntime = actual runtime * 1024 / task weight = scheduling period * 1024 / total weight of all runnable tasks
Note that the recorded vruntime is independent of the task's weight: within one scheduling period the tasks run for different amounts of real time, yet their vruntime grows by the same amount, so wall-clock scheduling time and vruntime advance in a fixed proportion.
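To make the weighting concrete, here is a small user-space C sketch (not kernel code) of the vruntime update described above. The weight values follow the kernel's prio_to_weight table, where nice 0 maps to a weight of 1024 and each nice step scales the weight by roughly 1.25x.

#include <stdio.h>
#include <stdint.h>

/* Excerpt of the nice-to-weight mapping (nice -5 .. +5):
 * nice 0 -> 1024 (NICE_0_LOAD); each step changes the weight by ~1.25x. */
static const uint32_t weight_of_nice[] = {
    3121, 2501, 1991, 1586, 1277, 1024, 820, 655, 526, 423, 335,
};

/* vruntime grows by delta_exec * NICE_0_LOAD / weight,
 * so low-weight (high-nice) tasks "age" faster in virtual time. */
static uint64_t vruntime_delta(uint64_t delta_exec_ns, uint32_t weight)
{
    return delta_exec_ns * 1024 / weight;
}

int main(void)
{
    uint64_t slice = 6000000; /* 6 ms of real CPU time */
    for (int nice = -5; nice <= 5; nice++) {
        uint32_t w = weight_of_nice[nice + 5];
        printf("nice %+d: weight %4u, vruntime grows by %8llu ns\n",
               nice, w, (unsigned long long)vruntime_delta(slice, w));
    }
    return 0;
}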
CFS keeps the runnable tasks in a red-black tree indexed by vruntime, so the leftmost node always has the smallest vruntime. Inserting or removing a task costs O(log n).
The CFS scheduler uses per-CPU run queues. Each CPU only maintains fairness among the tasks on its local run queue. To achieve fairness across CPUs, CFS must periodically perform load balancing, moving tasks from busy run queues to idle ones; this requires taking the other run queue's lock.
Data structures (Linux 3.9)
- Task unit: task_struct
- Scheduling entity: sched_entity (se), which embeds an rb_node red-black tree node
- CFS run queue: cfs_rq, the CFS sub-queue of the per-CPU rq
task_struct
 |
 |---> sched_entity
          |
          |---> rb_node ==> (Red-Black Tree) <== cfs_rq
task_struct:
// include/linux/sched.h
1201 struct task_struct {
...
1217 struct sched_entity se; // this task's scheduling entity
...
1580 };
sched_entity:
// include/linux/sched.h
1139 struct sched_entity {
1140 struct load_weight load; /* for load-balancing */
1141 struct rb_node run_node; // red-black tree node
1142 struct list_head group_node;
1143 unsigned int on_rq;
1144
1145 u64 exec_start;
1146 u64 sum_exec_runtime;
1147 u64 vruntime;
1148 u64 prev_sum_exec_runtime;
1149
1150 u64 nr_migrations;
1151
1156 #ifdef CONFIG_FAIR_GROUP_SCHED
1157 struct sched_entity *parent;
1158 /* rq on which this entity is (to be) queued: */
1159 struct cfs_rq *cfs_rq;
1160 /* rq "owned" by this entity/group: */
1161 struct cfs_rq *my_q;
1162 #endif
...
1173 };
cfs_rq:
// include/linux/sched.h
206 struct cfs_rq {
207 struct load_weight load;
208 unsigned int nr_running, h_nr_running; // number of runnable tasks on this queue
209
210 u64 exec_clock;
211 u64 min_vruntime; // smallest vruntime of all tasks on this queue
...
216 struct rb_root tasks_timeline; // root of the red-black tree
217 struct rb_node *rb_leftmost; // cached leftmost node (smallest vruntime)
218
219 /*
220 * 'curr' points to currently running entity on this cfs_rq.
221 * It is set to NULL otherwise (i.e when none are currently running).
222 */
223 struct sched_entity *curr, *next, *last, *skip;
...
287 };
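The three structures are tied together through the rb_node embedded in sched_entity: the tree only stores rb_node pointers, and the scheduler recovers the enclosing sched_entity with rb_entry(), a container_of wrapper. Below is a minimal user-space sketch of that pattern using simplified stand-in structs rather than the real kernel types.

#include <stdio.h>
#include <stddef.h>

/* container_of: recover the enclosing struct from a pointer to one of
 * its members; this is what the kernel's rb_entry() expands to. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct rb_node_stub { struct rb_node_stub *left, *right; };

struct sched_entity_stub {
    unsigned long long vruntime;
    struct rb_node_stub run_node;   /* embedded tree node, as in sched_entity */
};

int main(void)
{
    struct sched_entity_stub se = { .vruntime = 12345 };
    struct rb_node_stub *node = &se.run_node;     /* what the tree hands back */

    /* equivalent of: rb_entry(node, struct sched_entity, run_node) */
    struct sched_entity_stub *owner =
        container_of(node, struct sched_entity_stub, run_node);

    printf("vruntime of leftmost entity: %llu\n", owner->vruntime);
    return 0;
}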
Scheduler interface
The scheduling class sched_class defines the set of hooks that every scheduler type must implement, much like an interface in object-oriented programming (a toy sketch of this idea follows the listing below). The scheduling classes covered here are:
rt_sched_class
fair_sched_class
idle_sched_class
The interface they share is:
1041 struct sched_class {
1042 const struct sched_class *next; // singly linked list of classes, order fixed at compile time
1043
1044 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); // enqueue a task; called when it becomes TASK_RUNNING
1045 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); // dequeue a task
1046 void (*yield_task) (struct rq *rq); // current task voluntarily yields the CPU
1047 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); // yield the CPU to task p
1048
1049 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);// check whether the current task should be preempted by p
1050
1051 struct task_struct * (*pick_next_task) (struct rq *rq); // pick the next task to run
1052 void (*put_prev_task) (struct rq *rq, struct task_struct *p); // called before the running task is replaced by another
...
1070 void (*set_curr_task) (struct rq *rq);
1071 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1072 void (*task_fork) (struct task_struct *p);
1073
1074 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1075 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1076 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1077 int oldprio);
1078
1079 unsigned int (*get_rr_interval) (struct rq *rq,
1080 struct task_struct *task);
1081
1082 #ifdef CONFIG_FAIR_GROUP_SCHED
1083 void (*task_move_group) (struct task_struct *p, int on_rq);
1084 #endif
1085 };
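A toy user-space sketch of the same "interface class" idea: each scheduling class is a struct of function pointers, and the core scheduler only ever calls through those pointers. The names here are made up for illustration and are not kernel APIs.

#include <stdio.h>

struct task { const char *name; };

/* A stripped-down analogue of struct sched_class: just two hooks. */
struct toy_sched_class {
    const struct toy_sched_class *next;   /* class list, highest priority first */
    struct task *(*pick_next_task)(void);
    void (*task_tick)(struct task *t);
};

static struct task fair_task = { "fair-task" };

static struct task *fair_pick(void) { return &fair_task; }
static void fair_tick(struct task *t) { printf("tick for %s\n", t->name); }

static struct task *idle_pick(void) { return NULL; }
static void idle_tick(struct task *t) { (void)t; }

static const struct toy_sched_class toy_idle_class = { NULL, idle_pick, idle_tick };
static const struct toy_sched_class toy_fair_class = { &toy_idle_class, fair_pick, fair_tick };

int main(void)
{
    /* The core scheduler walks the classes in priority order and takes
     * the first task offered, mirroring how pick_next_task() iterates
     * over scheduling classes in core.c. */
    for (const struct toy_sched_class *c = &toy_fair_class; c; c = c->next) {
        struct task *t = c->pick_next_task();
        if (t) {
            c->task_tick(t);
            break;
        }
    }
    return 0;
}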
schedule(): performs the actual preemption (the task switch itself).
CFS implementation
vruntime update
sched_nr_latency = sysctl_sched_latency / sysctl_sched_min_granularity
scheduling period = sysctl_sched_latency                      (nr_running <= sched_nr_latency)
                  = nr_running * sysctl_sched_min_granularity (nr_running >  sched_nr_latency)
Both base values are tunable via /proc/sys/kernel/sched_latency_ns and /proc/sys/kernel/sched_min_granularity_ns.
When a task is placed on a run queue, place_entity() sets its vruntime roughly as follows:
vruntime = cfs_rq->min_vruntime
if (initial) {
vruntime += sched_vslice()
} else {
vruntime -= sysctl_sched_latency
}
vruntime = max(se->vruntime, vruntime)
se->vruntime = vruntime
- What does sched_vslice() do? It converts the entity's share of one scheduling period into virtual runtime.
- Why add sched_vslice()? So a newly created task cannot immediately monopolize the CPU.
- What is sysctl_sched_latency? One scheduling period, as computed by __sched_period() (see the sketch after this list).
- Why subtract sysctl_sched_latency? To give credit to a task that has slept for a long time.
- Why take the maximum of the two? So a task that only slept briefly does not gain extra credit.
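A user-space sketch of how the scheduling period and one task's slice come out of these tunables, mirroring __sched_period() and the weight-proportional split described earlier. The numbers assume the common base defaults (6 ms latency, 0.75 ms minimum granularity, hence sched_nr_latency = 8) before any CPU-count scaling.

#include <stdio.h>
#include <stdint.h>

/* Assumed base values, before CPU scaling. */
static const uint64_t sched_latency_ns   = 6000000;  /* 6 ms    */
static const uint64_t min_granularity_ns =  750000;  /* 0.75 ms */
static const unsigned sched_nr_latency   = 8;        /* latency / granularity */

/* Mirror of __sched_period(): stretch the period once there are more
 * runnable tasks than fit at the minimum granularity. */
static uint64_t sched_period(unsigned nr_running)
{
    if (nr_running > sched_nr_latency)
        return nr_running * min_granularity_ns;
    return sched_latency_ns;
}

/* One task's real-time slice: period * weight / total_weight. */
static uint64_t sched_slice(unsigned nr_running, uint32_t weight, uint32_t total_weight)
{
    return sched_period(nr_running) * weight / total_weight;
}

int main(void)
{
    /* 4 equally weighted tasks: each gets 6 ms / 4 = 1.5 ms per period. */
    printf("4 tasks : period %llu ns, slice %llu ns\n",
           (unsigned long long)sched_period(4),
           (unsigned long long)sched_slice(4, 1024, 4 * 1024));
    /* 16 tasks: the period stretches to 16 * 0.75 ms = 12 ms. */
    printf("16 tasks: period %llu ns, slice %llu ns\n",
           (unsigned long long)sched_period(16),
           (unsigned long long)sched_slice(16, 1024, 16 * 1024));
    return 0;
}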
check_preempt_wakeup()
3505 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3506 {
3507 struct task_struct *curr = rq->curr;
3508 struct sched_entity *se = &curr->se, *pse = &p->se;
3509 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3510 int scale = cfs_rq->nr_running >= sched_nr_latency;
3511 int next_buddy_marked = 0;
3512
3513 if (unlikely(se == pse))
3514 return;
3515
3516 /*
3517 * This is possible from callers such as move_task(), in which we
3518 * unconditionally check_prempt_curr() after an enqueue (which may have
3519 * lead to a throttle). This both saves work and prevents false
3520 * next-buddy nomination below.
3521 */
3522 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3523 return;
3524
3525 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3526 set_next_buddy(pse);
3527 next_buddy_marked = 1;
3528 }
3529
3530 /*
3531 * We can come here with TIF_NEED_RESCHED already set from new task
3532 * wake up path.
3533 *
3534 * Note: this also catches the edge-case of curr being in a throttled
3535 * group (e.g. via set_curr_task), since update_curr() (in the
3536 * enqueue of curr) will have resulted in resched being set. This
3537 * prevents us from potentially nominating it as a false LAST_BUDDY
3538 * below.
3539 */
3540 if (test_tsk_need_resched(curr))
3541 return;
3542
3543 /* Idle tasks are by definition preempted by non-idle tasks. */
3544 if (unlikely(curr->policy == SCHED_IDLE) &&
3545 likely(p->policy != SCHED_IDLE))
3546 goto preempt;
3547
3548 /*
3549 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3550 * is driven by the tick):
3551 */
3552 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3553 return;
3554
3555 find_matching_se(&se, &pse);
3556 update_curr(cfs_rq_of(se));
3557 BUG_ON(!pse);
3558 if (wakeup_preempt_entity(se, pse) == 1) {
3559 /*
3560 * Bias pick_next to pick the sched entity that is
3561 * triggering this preemption.
3562 */
3563 if (!next_buddy_marked)
3564 set_next_buddy(pse);
3565 goto preempt;
3566 }
3567
3568 return;
3569
3570 preempt:
3571 resched_task(curr);
3572 /*
3573 * Only set the backward buddy when the current task is still
3574 * on the rq. This can happen when a wakeup gets interleaved
3575 * with schedule on the ->pre_schedule() or idle_balance()
3576 * point, either of which can * drop the rq lock.
3577 *
3578 * Also, during early boot the idle thread is in the fair class,
3579 * for obvious reasons its a bad idea to schedule back to it.
3580 */
3581 if (unlikely(!se->on_rq || curr == rq->idle))
3582 return;
3583
3584 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3585 set_last_buddy(se);
3586 }
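The actual preemption decision at line 3558 is delegated to wakeup_preempt_entity(), which compares the vruntime gap against a wakeup granularity so that a tiny lead does not trigger a preemption. Below is a simplified user-space sketch of that test; the granularity constant assumes the default sysctl_sched_wakeup_granularity of 1 ms, and the weight scaling that the real wakeup_gran() applies is omitted.

#include <stdio.h>
#include <stdint.h>

/* Assumed default wakeup granularity (1 ms); the real wakeup_gran() also
 * scales this by the woken task's weight, which is omitted here. */
static const int64_t wakeup_gran_ns = 1000000;

/* Return 1 to preempt, 0 if the lead is too small, -1 if curr is behind.
 * Mirrors the structure of wakeup_preempt_entity() in fair.c. */
static int should_preempt(int64_t curr_vruntime, int64_t woken_vruntime)
{
    int64_t vdiff = curr_vruntime - woken_vruntime;

    if (vdiff <= 0)
        return -1;                 /* current task is "fairer", keep running */
    if (vdiff > wakeup_gran_ns)
        return 1;                  /* woken task is far enough behind: preempt */
    return 0;                      /* within the granularity: do not preempt */
}

int main(void)
{
    printf("%d\n", should_preempt(5000000, 2000000)); /* 3 ms lead   -> 1  */
    printf("%d\n", should_preempt(5000000, 4500000)); /* 0.5 ms lead -> 0  */
    printf("%d\n", should_preempt(5000000, 6000000)); /* behind      -> -1 */
    return 0;
}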
task_fork_fair()
5733 static void task_fork_fair(struct task_struct *p)
5734 {
5735 struct cfs_rq *cfs_rq;
5736 struct sched_entity *se = &p->se, *curr;
5737 int this_cpu = smp_processor_id();
5738 struct rq *rq = this_rq();
5739 unsigned long flags;
5740
5741 raw_spin_lock_irqsave(&rq->lock, flags);
5742
5743 update_rq_clock(rq);
5744
5745 cfs_rq = task_cfs_rq(current);
5746 curr = cfs_rq->curr;
5747
5748 if (unlikely(task_cpu(p) != this_cpu)) {
5749 rcu_read_lock();
5750 __set_task_cpu(p, this_cpu);
5751 rcu_read_unlock();
5752 }
5753
5754 update_curr(cfs_rq); // update the current task's and queue's accounting (e.g. vruntime)
5755
5756 if (curr) // curr is the parent task
5757 se->vruntime = curr->vruntime;
5758 place_entity(cfs_rq, se, 1);
5759
5760 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5761 /*
5762 * Upon rescheduling, sched_class::put_prev_task() will place
5763 * 'current' within the tree based on its new key value.
5764 */
5765 swap(curr->vruntime, se->vruntime);
5766 resched_task(rq->curr); // set the need_resched flag to force a reschedule
5767 }
5768
5769 se->vruntime -= cfs_rq->min_vruntime;
5770
5771 raw_spin_unlock_irqrestore(&rq->lock, flags);
5772 }
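The final se->vruntime -= cfs_rq->min_vruntime at line 5769 stores the child's vruntime relative to its current queue, because the child may eventually be enqueued on a different CPU whose min_vruntime differs; enqueue_entity() later adds the target queue's min_vruntime back. A toy sketch of that normalize/renormalize pair, with made-up queue values:

#include <stdio.h>
#include <stdint.h>

/* Two CPUs whose queues have drifted apart in virtual time. */
struct toy_cfs_rq { uint64_t min_vruntime; };

/* task_fork_fair()-style step: store vruntime relative to the old queue. */
static uint64_t normalize(uint64_t vruntime, const struct toy_cfs_rq *rq)
{
    return vruntime - rq->min_vruntime;
}

/* enqueue_entity()-style step: rebase onto the queue the task lands on. */
static uint64_t renormalize(uint64_t rel_vruntime, const struct toy_cfs_rq *rq)
{
    return rel_vruntime + rq->min_vruntime;
}

int main(void)
{
    struct toy_cfs_rq cpu0 = { .min_vruntime = 1000000 };
    struct toy_cfs_rq cpu1 = { .min_vruntime = 9000000 };

    uint64_t child = 1020000;                 /* copied from the parent on cpu0 */
    uint64_t rel   = normalize(child, &cpu0); /* 20000: lead over cpu0's minimum */

    /* Whichever CPU the child is enqueued on, it keeps only its small lead. */
    printf("enqueued on cpu0: %llu\n", (unsigned long long)renormalize(rel, &cpu0));
    printf("enqueued on cpu1: %llu\n", (unsigned long long)renormalize(rel, &cpu1));
    return 0;
}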
task_tick_fair()
5712 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5713 {
5714 struct cfs_rq *cfs_rq;
5715 struct sched_entity *se = &curr->se;
5716
5717 for_each_sched_entity(se) {
5718 cfs_rq = cfs_rq_of(se);
5719 entity_tick(cfs_rq, se, queued); // update this entity's time accounting
5720 }
5721
5722 if (sched_feat_numa(NUMA))
5723 task_tick_numa(rq, curr);
5724
5725 update_rq_runnable_avg(rq, 1);
5726 }
1953 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1954 {
1955 /*
1956 * Update run-time statistics of the 'current'.
1957 */
1958 update_curr(cfs_rq);
1959
1960 /*
1961 * Ensure that runnable average is periodically updated.
1962 */
1963 update_entity_load_avg(curr, 1);
1964 update_cfs_rq_blocked_load(cfs_rq, 1);
...
1983 if (cfs_rq->nr_running > 1) // if the task has run past its fair share, schedule it out
1984 check_preempt_tick(cfs_rq, curr);
1985 }
check_preempt_tick()
1819 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1820 {
1821 unsigned long ideal_runtime, delta_exec;
1822 struct sched_entity *se;
1823 s64 delta;
1824
1825 ideal_runtime = sched_slice(cfs_rq, curr);
1826 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1827 if (delta_exec > ideal_runtime) {
1828 resched_task(rq_of(cfs_rq)->curr);
1829 /*
1830 * The current task ran long enough, ensure it doesn't get
1831 * re-elected due to buddy favours.
1832 */
1833 clear_buddies(cfs_rq, curr);
1834 return;
1835 }
1836
1837 /*
1838 * Ensure that a task that missed wakeup preemption by a
1839 * narrow margin doesn't have to wait for a full slice.
1840 * This also mitigates buddy induced latencies under load.
1841 */
1842 if (delta_exec < sysctl_sched_min_granularity)
1843 return;
1844
1845 se = __pick_first_entity(cfs_rq);
1846 delta = curr->vruntime - se->vruntime;
1847
1848 if (delta < 0)
1849 return;
1850
1851 if (delta > ideal_runtime)
1852 resched_task(rq_of(cfs_rq)->curr);
1853 }
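A small numeric sketch of the two conditions above, using the same assumed defaults as before (6 ms latency, 0.75 ms minimum granularity) and a hypothetical two-task queue with equal weights, so the ideal slice is 3 ms.

#include <stdio.h>
#include <stdint.h>

static const uint64_t min_granularity_ns = 750000;  /* assumed default */

/* Mirror of the two tests in check_preempt_tick(): the task has used up
 * its slice, or it is far enough ahead of the leftmost entity in vruntime. */
static int tick_wants_resched(uint64_t delta_exec, uint64_t ideal_runtime,
                              int64_t curr_vruntime, int64_t leftmost_vruntime)
{
    if (delta_exec > ideal_runtime)
        return 1;                                   /* ran past its slice */
    if (delta_exec < min_granularity_ns)
        return 0;                                   /* too soon to preempt */
    int64_t delta = curr_vruntime - leftmost_vruntime;
    if (delta < 0)
        return 0;
    return delta > (int64_t)ideal_runtime;          /* too far ahead in vruntime */
}

int main(void)
{
    uint64_t ideal = 3000000; /* two equal-weight tasks: 6 ms / 2 */
    printf("%d\n", tick_wants_resched(3500000, ideal, 0, 0));        /* 1: slice used  */
    printf("%d\n", tick_wants_resched(1000000, ideal, 4000000, 0));  /* 1: far ahead   */
    printf("%d\n", tick_wants_resched(1000000, ideal,  500000, 0));  /* 0: keep going  */
    return 0;
}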
Scheduling trigger points
- A new task is created: task_fork() => task_fork_fair() (tunable: /proc/sys/kernel/sched_child_runs_first)
- Periodic tick: task_tick() => task_tick_fair()
- Wakeup preemption: check_preempt_curr() => check_preempt_wakeup() (tunable: /proc/sys/kernel/sched_min_granularity_ns)
Scheduling triggered by fork
In Linux, the POSIX system calls fork(), vfork() and clone() are all ultimately implemented through do_fork().
// kernel/fork.c
1558 long do_fork(unsigned long clone_flags,
1559 unsigned long stack_start,
1560 unsigned long stack_size,
1561 int __user *parent_tidptr,
1562 int __user *child_tidptr)
1563 {
...
1595 p = copy_process(clone_flags, stack_start, stack_size, // => task_fork()
1596 child_tidptr, NULL, trace);
...
1617 wake_up_new_task(p); // => check_preempt_wakeup()
...
1631 }
1123 /*
1124 * This creates a new process as a copy of the old one,
1125 * but does not actually start it yet.
1126 *
1127 * It copies the registers, and all the appropriate
1128 * parts of the process environment (as per the clone
1129 * flags). The actual kick-off is left to the caller.
1130 */
1131 static struct task_struct *copy_process(unsigned long clone_flags,
1132 unsigned long stack_start,
1133 unsigned long stack_size,
1134 int __user *child_tidptr,
1135 struct pid *pid,
1136 int trace)
1137 {
...
1308 sched_fork(p);
...
1528 }
1619 /*
1620 * fork()/clone()-time setup:
1621 */
1622 void sched_fork(struct task_struct *p)
1623 {
...
1664 if (p->sched_class->task_fork)
1665 p->sched_class->task_fork(p); // call the task's scheduling class's task_fork(); for CFS this is task_fork_fair()
...
1694 }
Scheduling triggered by the periodic tick
On every timer interrupt, the timer interrupt handler tick_periodic() calls update_process_times(), which updates the current task's runtime accounting (vruntime), sets need_resched when necessary, and balances the run queues. With HZ=100, for example, the tick period is 10 ms.
1342 /*
1343 * Called from the timer interrupt handler to charge one tick to the current
1344 * process. user_tick is 1 if the tick is user time, 0 for system.
1345 */
1346 void update_process_times(int user_tick)
1347 {
1348 struct task_struct *p = current;
1349 int cpu = smp_processor_id();
1350
1351 /* Note: this timer irq context must be accounted for as well. */
1352 account_process_tick(p, user_tick);
1353 run_local_timers();
1354 rcu_check_callbacks(cpu, user_tick);
1355 #ifdef CONFIG_IRQ_WORK
1356 if (in_irq())
1357 irq_work_run();
1358 #endif
1359 scheduler_tick(); // HERE!
1360 run_posix_cpu_timers(p);
1361 }
2684 void scheduler_tick(void)
2685 {
2686 int cpu = smp_processor_id();
2687 struct rq *rq = cpu_rq(cpu);
2688 struct task_struct *curr = rq->curr;
2689
2690 sched_clock_tick();
2691
2692 raw_spin_lock(&rq->lock);
2693 update_rq_clock(rq);
2694 update_cpu_load_active(rq);
2695 curr->sched_class->task_tick(rq, curr, 0); // call the current task's scheduling class's task_tick(); for CFS this is task_tick_fair()
2696 raw_spin_unlock(&rq->lock);
2697
2698 perf_event_task_tick();
...
2704 }
Scheduling triggered by wakeup preemption
// kernel/sched/core.c
1703 void wake_up_new_task(struct task_struct *p)
1704 {
1705 unsigned long flags;
1706 struct rq *rq;
1707
1708 raw_spin_lock_irqsave(&p->pi_lock, flags);
...
1717
1718 rq = __task_rq_lock(p);
1719 activate_task(rq, p, 0);
1720 p->on_rq = 1;
1721 trace_sched_wakeup_new(p, true);
1722 check_preempt_curr(rq, p, WF_FORK); // check_preempt_curr()
...
1727 task_rq_unlock(rq, p, &flags);
1728 }
check_preempt_curr() is called from several places:
909 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
910 {
911 const struct sched_class *class;
912
913 if (p->sched_class == rq->curr->sched_class) {
914 rq->curr->sched_class->check_preempt_curr(rq, p, flags); // call the current class's check_preempt_curr()
915 } else {
916 for_each_class(class) {
917 if (class == rq->curr->sched_class)
918 break;
919 if (class == p->sched_class) {
920 resched_task(rq->curr);
921 break;
922 }
923 }
924 }
...
932 }
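The else branch relies on for_each_class() walking the classes from highest to lowest priority: if curr's class is reached first, p is lower priority and nothing happens; if p's class is reached first, p is higher priority and the current task is marked for rescheduling. A toy sketch of that walk, restricted to the three classes named above:

#include <stdio.h>
#include <string.h>

/* Classes ordered from highest to lowest priority, as for_each_class() sees them. */
static const char *classes[] = { "rt", "fair", "idle" };

/* Mirrors the else branch of check_preempt_curr(): returns 1 if a task of
 * class p_class should preempt the current task of class curr_class. */
static int cross_class_preempt(const char *curr_class, const char *p_class)
{
    for (unsigned i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
        if (strcmp(classes[i], curr_class) == 0)
            return 0;   /* curr's class reached first: p is lower priority  */
        if (strcmp(classes[i], p_class) == 0)
            return 1;   /* p's class reached first: resched_task(rq->curr)  */
    }
    return 0;
}

int main(void)
{
    printf("%d\n", cross_class_preempt("fair", "rt"));   /* 1: rt preempts fair  */
    printf("%d\n", cross_class_preempt("rt", "fair"));   /* 0: fair waits for rt */
    return 0;
}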