Completely Fair Scheduler
History of Linux schedulers:
- Linux 1.2: circular queue, round-robin
- Linux 2.2: scheduling classes and symmetric multiprocessing
- Linux 2.4: O(N) simple scheduler
- Linux 2.6: O(1) scheduler
- Linux 2.6.23: CFS (Completely Fair Scheduler)
Principle
The CFS scheduler distributes CPU time fairly among all runnable tasks. Ignoring priorities, every task gets an equal share of CPU time. CFS uses a virtual runtime (vruntime) to track how much CPU time each task has already consumed. On a scheduling decision, CFS picks the runnable task with the smallest vruntime and runs it until it is interrupted or preempted. CFS does not use time slices directly.
A Linux task's priority is adjusted through its nice value, and CFS handles priorities in its own way: instead of maintaining one run queue per priority, it factors the priority in when updating vruntime. For a task with a high nice value (low priority), the vruntime added is the consumed CPU time scaled up by a factor; for a task with a low nice value (high priority), the vruntime added is the consumed CPU time scaled down by a factor.
actual runtime = scheduling period * task weight / total weight of all runnable tasks
vruntime = actual runtime * 1024 / task weight = scheduling period * 1024 / total weight of all runnable tasks
Note that the recorded vruntime is independent of the task's weight: within one scheduling period the tasks run for different amounts of real time, yet their vruntime grows by the same amount, so wall-clock scheduling time and vruntime advance in a fixed proportion.
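To make the weighting concrete, here is a small user-space C sketch (not kernel code) of the vruntime update described above. The weight values follow the kernel's prio_to_weight table, where nice 0 maps to a weight of 1024 and each nice step scales the weight by roughly 1.25x.

#include <stdio.h>
#include <stdint.h>

/* Excerpt of the nice-to-weight mapping (nice -5 .. +5):
 * nice 0 -> 1024 (NICE_0_LOAD); each step changes the weight by ~1.25x. */
static const uint32_t weight_of_nice[] = {
    3121, 2501, 1991, 1586, 1277, 1024, 820, 655, 526, 423, 335,
};

/* vruntime grows by delta_exec * NICE_0_LOAD / weight,
 * so low-weight (high-nice) tasks "age" faster in virtual time. */
static uint64_t vruntime_delta(uint64_t delta_exec_ns, uint32_t weight)
{
    return delta_exec_ns * 1024 / weight;
}

int main(void)
{
    uint64_t slice = 6000000; /* 6 ms of real CPU time */
    for (int nice = -5; nice <= 5; nice++) {
        uint32_t w = weight_of_nice[nice + 5];
        printf("nice %+d: weight %4u, vruntime grows by %8llu ns\n",
               nice, w, (unsigned long long)vruntime_delta(slice, w));
    }
    return 0;
}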
CFS keeps the runnable tasks in a red-black tree indexed by vruntime, so the leftmost node always has the smallest vruntime. Inserting or removing a task costs O(log n).
The CFS scheduler uses per-CPU run queues. Each CPU only maintains fairness among the tasks on its local run queue. To achieve fairness across CPUs, CFS must periodically perform load balancing, moving tasks from busy run queues to idle ones; this requires taking the other run queue's lock.
Data structures (Linux 3.9)
- Task unit: task_struct
- Scheduling entity: sched_entity (se), which embeds an rb_node red-black tree node
- CFS run queue: cfs_rq, the CFS sub-queue of the per-CPU rq
task_struct
 |
 |---> sched_entity
          |
          |---> rb_node ==> (Red-Black Tree) <== cfs_rq
task_struct:
// include/linux/sched.h
1201 struct task_struct {
...
1217 struct sched_entity se; // this task's scheduling entity
...
1580 };
sched_entity:
// include/linux/sched.h
1139 struct sched_entity {
1140 struct load_weight load; /* for load-balancing */
1141 struct rb_node run_node; // red-black tree node
1142 struct list_head group_node;
1143 unsigned int on_rq;
1144
1145 u64 exec_start;
1146 u64 sum_exec_runtime;
1147 u64 vruntime;
1148 u64 prev_sum_exec_runtime;
1149
1150 u64 nr_migrations;
1151
1156 #ifdef CONFIG_FAIR_GROUP_SCHED
1157 struct sched_entity *parent;
1158 /* rq on which this entity is (to be) queued: */
1159 struct cfs_rq *cfs_rq;
1160 /* rq "owned" by this entity/group: */
1161 struct cfs_rq *my_q;
1162 #endif
...
1173 };
cfs_rq:
// include/linux/sched.h
206 struct cfs_rq {
207 struct load_weight load;
208 unsigned int nr_running, h_nr_running; // number of runnable tasks on this queue
209
210 u64 exec_clock;
211 u64 min_vruntime; // smallest vruntime of all tasks on this queue
...
216 struct rb_root tasks_timeline; // root of the red-black tree
217 struct rb_node *rb_leftmost; // cached leftmost node (smallest vruntime)
218
219 /*
220 * 'curr' points to currently running entity on this cfs_rq.
221 * It is set to NULL otherwise (i.e when none are currently running).
222 */
223 struct sched_entity *curr, *next, *last, *skip;
...
287 };
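The three structures are tied together through the rb_node embedded in sched_entity: the tree only stores rb_node pointers, and the scheduler recovers the enclosing sched_entity with rb_entry(), a container_of wrapper. Below is a minimal user-space sketch of that pattern using simplified stand-in structs rather than the real kernel types.

#include <stdio.h>
#include <stddef.h>

/* container_of: recover the enclosing struct from a pointer to one of
 * its members; this is what the kernel's rb_entry() expands to. */
#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct rb_node_stub { struct rb_node_stub *left, *right; };

struct sched_entity_stub {
    unsigned long long vruntime;
    struct rb_node_stub run_node;   /* embedded tree node, as in sched_entity */
};

int main(void)
{
    struct sched_entity_stub se = { .vruntime = 12345 };
    struct rb_node_stub *node = &se.run_node;     /* what the tree hands back */

    /* equivalent of: rb_entry(node, struct sched_entity, run_node) */
    struct sched_entity_stub *owner =
        container_of(node, struct sched_entity_stub, run_node);

    printf("vruntime of leftmost entity: %llu\n", owner->vruntime);
    return 0;
}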
Scheduler interface
The scheduling class sched_class defines the set of hooks that every scheduler type must implement, much like an interface in object-oriented programming (a toy sketch of this idea follows the listing below). The scheduling classes covered here are:
rt_sched_class
fair_sched_class
idle_sched_class
The interface they share is:
1041 struct sched_class {
1042 const struct sched_class *next; // singly linked list of classes, order fixed at compile time
1043
1044 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); // enqueue a task; called when it becomes TASK_RUNNING
1045 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); // dequeue a task
1046 void (*yield_task) (struct rq *rq); // current task voluntarily yields the CPU
1047 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); // yield the CPU to task p
1048
1049 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);// check whether the current task should be preempted by p
1050
1051 struct task_struct * (*pick_next_task) (struct rq *rq); // pick the next task to run
1052 void (*put_prev_task) (struct rq *rq, struct task_struct *p); // called before the running task is replaced by another
...
1070 void (*set_curr_task) (struct rq *rq);
1071 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1072 void (*task_fork) (struct task_struct *p);
1073
1074 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1075 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1076 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1077 int oldprio);
1078
1079 unsigned int (*get_rr_interval) (struct rq *rq,
1080 struct task_struct *task);
1081
1082 #ifdef CONFIG_FAIR_GROUP_SCHED
1083 void (*task_move_group) (struct task_struct *p, int on_rq);
1084 #endif
1085 };
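A toy user-space sketch of the same "interface class" idea: each scheduling class is a struct of function pointers, and the core scheduler only ever calls through those pointers. The names here are made up for illustration and are not kernel APIs.

#include <stdio.h>

struct task { const char *name; };

/* A stripped-down analogue of struct sched_class: just two hooks. */
struct toy_sched_class {
    const struct toy_sched_class *next;   /* class list, highest priority first */
    struct task *(*pick_next_task)(void);
    void (*task_tick)(struct task *t);
};

static struct task fair_task = { "fair-task" };

static struct task *fair_pick(void) { return &fair_task; }
static void fair_tick(struct task *t) { printf("tick for %s\n", t->name); }

static struct task *idle_pick(void) { return NULL; }
static void idle_tick(struct task *t) { (void)t; }

static const struct toy_sched_class toy_idle_class = { NULL, idle_pick, idle_tick };
static const struct toy_sched_class toy_fair_class = { &toy_idle_class, fair_pick, fair_tick };

int main(void)
{
    /* The core scheduler walks the classes in priority order and takes
     * the first task offered, mirroring how pick_next_task() iterates
     * over scheduling classes in core.c. */
    for (const struct toy_sched_class *c = &toy_fair_class; c; c = c->next) {
        struct task *t = c->pick_next_task();
        if (t) {
            c->task_tick(t);
            break;
        }
    }
    return 0;
}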
schedule(): performs the actual preemption (the task switch itself).
CFS implementation
vruntime update
sched_nr_latency = sysctl_sched_latency / sysctl_sched_min_granularity
scheduling period = sysctl_sched_latency                      (nr_running <= sched_nr_latency)
                  = nr_running * sysctl_sched_min_granularity (nr_running >  sched_nr_latency)
Both base values are tunable via /proc/sys/kernel/sched_latency_ns and /proc/sys/kernel/sched_min_granularity_ns.
When a task is placed on a run queue, place_entity() sets its vruntime roughly as follows:
vruntime = cfs_rq->min_vruntime
if (initial) {
vruntime += sched_vslice()
} else {
vruntime -= sysctl_sched_latency
}
vruntime = max(se->vruntime, vruntime)
se->vruntime = vruntime
- What does sched_vslice() do? It converts the entity's share of one scheduling period into virtual runtime.
- Why add sched_vslice()? So a newly created task cannot immediately monopolize the CPU.
- What is sysctl_sched_latency? One scheduling period, as computed by __sched_period() (see the sketch after this list).
- Why subtract sysctl_sched_latency? To give credit to a task that has slept for a long time.
- Why take the maximum of the two? So a task that only slept briefly does not gain extra credit.
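A user-space sketch of how the scheduling period and one task's slice come out of these tunables, mirroring __sched_period() and the weight-proportional split described earlier. The numbers assume the common base defaults (6 ms latency, 0.75 ms minimum granularity, hence sched_nr_latency = 8) before any CPU-count scaling.

#include <stdio.h>
#include <stdint.h>

/* Assumed base values, before CPU scaling. */
static const uint64_t sched_latency_ns   = 6000000;  /* 6 ms    */
static const uint64_t min_granularity_ns =  750000;  /* 0.75 ms */
static const unsigned sched_nr_latency   = 8;        /* latency / granularity */

/* Mirror of __sched_period(): stretch the period once there are more
 * runnable tasks than fit at the minimum granularity. */
static uint64_t sched_period(unsigned nr_running)
{
    if (nr_running > sched_nr_latency)
        return nr_running * min_granularity_ns;
    return sched_latency_ns;
}

/* One task's real-time slice: period * weight / total_weight. */
static uint64_t sched_slice(unsigned nr_running, uint32_t weight, uint32_t total_weight)
{
    return sched_period(nr_running) * weight / total_weight;
}

int main(void)
{
    /* 4 equally weighted tasks: each gets 6 ms / 4 = 1.5 ms per period. */
    printf("4 tasks : period %llu ns, slice %llu ns\n",
           (unsigned long long)sched_period(4),
           (unsigned long long)sched_slice(4, 1024, 4 * 1024));
    /* 16 tasks: the period stretches to 16 * 0.75 ms = 12 ms. */
    printf("16 tasks: period %llu ns, slice %llu ns\n",
           (unsigned long long)sched_period(16),
           (unsigned long long)sched_slice(16, 1024, 16 * 1024));
    return 0;
}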
check_preempt_wakeup()
3505 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3506 {
3507 struct task_struct *curr = rq->curr;
3508 struct sched_entity *se = &curr->se, *pse = &p->se;
3509 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3510 int scale = cfs_rq->nr_running >= sched_nr_latency;
3511 int next_buddy_marked = 0;
3512
3513 if (unlikely(se == pse))
3514 return;
3515
3516 /*
3517 * This is possible from callers such as move_task(), in which we
3518 * unconditionally check_prempt_curr() after an enqueue (which may have
3519 * lead to a throttle). This both saves work and prevents false
3520 * next-buddy nomination below.
3521 */
3522 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3523 return;
3524
3525 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3526 set_next_buddy(pse);
3527 next_buddy_marked = 1;
3528 }
3529
3530 /*
3531 * We can come here with TIF_NEED_RESCHED already set from new task
3532 * wake up path.
3533 *
3534 * Note: this also catches the edge-case of curr being in a throttled
3535 * group (e.g. via set_curr_task), since update_curr() (in the
3536 * enqueue of curr) will have resulted in resched being set. This
3537 * prevents us from potentially nominating it as a false LAST_BUDDY
3538 * below.
3539 */
3540 if (test_tsk_need_resched(curr))
3541 return;
3542
3543 /* Idle tasks are by definition preempted by non-idle tasks. */
3544 if (unlikely(curr->policy == SCHED_IDLE) &&
3545 likely(p->policy != SCHED_IDLE))
3546 goto preempt;
3547
3548 /*
3549 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3550 * is driven by the tick):
3551 */
3552 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3553 return;
3554
3555 find_matching_se(&se, &pse);
3556 update_curr(cfs_rq_of(se));
3557 BUG_ON(!pse);
3558 if (wakeup_preempt_entity(se, pse) == 1) {
3559 /*
3560 * Bias pick_next to pick the sched entity that is
3561 * triggering this preemption.
3562 */
3563 if (!next_buddy_marked)
3564 set_next_buddy(pse);
3565 goto preempt;
3566 }
3567
3568 return;
3569
3570 preempt:
3571 resched_task(curr);
3572 /*
3573 * Only set the backward buddy when the current task is still
3574 * on the rq. This can happen when a wakeup gets interleaved
3575 * with schedule on the ->pre_schedule() or idle_balance()
3576 * point, either of which can * drop the rq lock.
3577 *
3578 * Also, during early boot the idle thread is in the fair class,
3579 * for obvious reasons its a bad idea to schedule back to it.
3580 */
3581 if (unlikely(!se->on_rq || curr == rq->idle))
3582 return;
3583
3584 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3585 set_last_buddy(se);
3586 }
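The actual preemption decision at line 3558 is delegated to wakeup_preempt_entity(), which compares the vruntime gap against a wakeup granularity so that a tiny lead does not trigger a preemption. Below is a simplified user-space sketch of that test; the granularity constant assumes the default sysctl_sched_wakeup_granularity of 1 ms, and the weight scaling that the real wakeup_gran() applies is omitted.

#include <stdio.h>
#include <stdint.h>

/* Assumed default wakeup granularity (1 ms); the real wakeup_gran() also
 * scales this by the woken task's weight, which is omitted here. */
static const int64_t wakeup_gran_ns = 1000000;

/* Return 1 to preempt, 0 if the lead is too small, -1 if curr is behind.
 * Mirrors the structure of wakeup_preempt_entity() in fair.c. */
static int should_preempt(int64_t curr_vruntime, int64_t woken_vruntime)
{
    int64_t vdiff = curr_vruntime - woken_vruntime;

    if (vdiff <= 0)
        return -1;                 /* current task is "fairer", keep running */
    if (vdiff > wakeup_gran_ns)
        return 1;                  /* woken task is far enough behind: preempt */
    return 0;                      /* within the granularity: do not preempt */
}

int main(void)
{
    printf("%d\n", should_preempt(5000000, 2000000)); /* 3 ms lead   -> 1  */
    printf("%d\n", should_preempt(5000000, 4500000)); /* 0.5 ms lead -> 0  */
    printf("%d\n", should_preempt(5000000, 6000000)); /* behind      -> -1 */
    return 0;
}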
task_fork_fair()
5733 static void task_fork_fair(struct task_struct *p)
5734 {
5735 struct cfs_rq *cfs_rq;
5736 struct sched_entity *se = &p->se, *curr;
5737 int this_cpu = smp_processor_id();
5738 struct rq *rq = this_rq();
5739 unsigned long flags;
5740
5741 raw_spin_lock_irqsave(&rq->lock, flags);
5742
5743 update_rq_clock(rq);
5744
5745 cfs_rq = task_cfs_rq(current);
5746 curr = cfs_rq->curr;
5747
5748 if (unlikely(task_cpu(p) != this_cpu)) {
5749 rcu_read_lock();
5750 __set_task_cpu(p, this_cpu);
5751 rcu_read_unlock();
5752 }
5753
5754 update_curr(cfs_rq); // update the current task's and queue's accounting (e.g. vruntime)
5755
5756 if (curr) // curr is the parent task
5757 se->vruntime = curr->vruntime;
5758 place_entity(cfs_rq, se, 1);
5759
5760 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5761 /*
5762 * Upon rescheduling, sched_class::put_prev_task() will place
5763 * 'current' within the tree based on its new key value.
5764 */
5765 swap(curr->vruntime, se->vruntime);
5766 resched_task(rq->curr); // set the need_resched flag to force a reschedule
5767 }
5768
5769 se->vruntime -= cfs_rq->min_vruntime;
5770
5771 raw_spin_unlock_irqrestore(&rq->lock, flags);
5772 }
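The final se->vruntime -= cfs_rq->min_vruntime at line 5769 stores the child's vruntime relative to its current queue, because the child may eventually be enqueued on a different CPU whose min_vruntime differs; enqueue_entity() later adds the target queue's min_vruntime back. A toy sketch of that normalize/renormalize pair, with made-up queue values:

#include <stdio.h>
#include <stdint.h>

/* Two CPUs whose queues have drifted apart in virtual time. */
struct toy_cfs_rq { uint64_t min_vruntime; };

/* task_fork_fair()-style step: store vruntime relative to the old queue. */
static uint64_t normalize(uint64_t vruntime, const struct toy_cfs_rq *rq)
{
    return vruntime - rq->min_vruntime;
}

/* enqueue_entity()-style step: rebase onto the queue the task lands on. */
static uint64_t renormalize(uint64_t rel_vruntime, const struct toy_cfs_rq *rq)
{
    return rel_vruntime + rq->min_vruntime;
}

int main(void)
{
    struct toy_cfs_rq cpu0 = { .min_vruntime = 1000000 };
    struct toy_cfs_rq cpu1 = { .min_vruntime = 9000000 };

    uint64_t child = 1020000;                 /* copied from the parent on cpu0 */
    uint64_t rel   = normalize(child, &cpu0); /* 20000: lead over cpu0's minimum */

    /* Whichever CPU the child is enqueued on, it keeps only its small lead. */
    printf("enqueued on cpu0: %llu\n", (unsigned long long)renormalize(rel, &cpu0));
    printf("enqueued on cpu1: %llu\n", (unsigned long long)renormalize(rel, &cpu1));
    return 0;
}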
task_tick_fair()
5712 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5713 {
5714 struct cfs_rq *cfs_rq;
5715 struct sched_entity *se = &curr->se;
5716
5717 for_each_sched_entity(se) {
5718 cfs_rq = cfs_rq_of(se);
5719 entity_tick(cfs_rq, se, queued); // update this entity's time accounting
5720 }
5721
5722 if (sched_feat_numa(NUMA))
5723 task_tick_numa(rq, curr);
5724
5725 update_rq_runnable_avg(rq, 1);
5726 }
1953 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1954 {
1955 /*
1956 * Update run-time statistics of the 'current'.
1957 */
1958 update_curr(cfs_rq);
1959
1960 /*
1961 * Ensure that runnable average is periodically updated.
1962 */
1963 update_entity_load_avg(curr, 1);
1964 update_cfs_rq_blocked_load(cfs_rq, 1);
...
1983 if (cfs_rq->nr_running > 1) // if the task has run past its fair share, schedule it out
1984 check_preempt_tick(cfs_rq, curr);
1985 }
check_preempt_tick()
1819 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1820 {
1821 unsigned long ideal_runtime, delta_exec;
1822 struct sched_entity *se;
1823 s64 delta;
1824
1825 ideal_runtime = sched_slice(cfs_rq, curr);
1826 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1827 if (delta_exec > ideal_runtime) {
1828 resched_task(rq_of(cfs_rq)->curr);
1829 /*
1830 * The current task ran long enough, ensure it doesn't get
1831 * re-elected due to buddy favours.
1832 */
1833 clear_buddies(cfs_rq, curr);
1834 return;
1835 }
1836
1837 /*
1838 * Ensure that a task that missed wakeup preemption by a
1839 * narrow margin doesn't have to wait for a full slice.
1840 * This also mitigates buddy induced latencies under load.
1841 */
1842 if (delta_exec < sysctl_sched_min_granularity)
1843 return;
1844
1845 se = __pick_first_entity(cfs_rq);
1846 delta = curr->vruntime - se->vruntime;
1847
1848 if (delta < 0)
1849 return;
1850
1851 if (delta > ideal_runtime)
1852 resched_task(rq_of(cfs_rq)->curr);
1853 }
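A small numeric sketch of the two conditions above, using the same assumed defaults as before (6 ms latency, 0.75 ms minimum granularity) and a hypothetical two-task queue with equal weights, so the ideal slice is 3 ms.

#include <stdio.h>
#include <stdint.h>

static const uint64_t min_granularity_ns = 750000;  /* assumed default */

/* Mirror of the two tests in check_preempt_tick(): the task has used up
 * its slice, or it is far enough ahead of the leftmost entity in vruntime. */
static int tick_wants_resched(uint64_t delta_exec, uint64_t ideal_runtime,
                              int64_t curr_vruntime, int64_t leftmost_vruntime)
{
    if (delta_exec > ideal_runtime)
        return 1;                                   /* ran past its slice */
    if (delta_exec < min_granularity_ns)
        return 0;                                   /* too soon to preempt */
    int64_t delta = curr_vruntime - leftmost_vruntime;
    if (delta < 0)
        return 0;
    return delta > (int64_t)ideal_runtime;          /* too far ahead in vruntime */
}

int main(void)
{
    uint64_t ideal = 3000000; /* two equal-weight tasks: 6 ms / 2 */
    printf("%d\n", tick_wants_resched(3500000, ideal, 0, 0));        /* 1: slice used  */
    printf("%d\n", tick_wants_resched(1000000, ideal, 4000000, 0));  /* 1: far ahead   */
    printf("%d\n", tick_wants_resched(1000000, ideal,  500000, 0));  /* 0: keep going  */
    return 0;
}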
Scheduling trigger points
- A new task is created: task_fork() => task_fork_fair() (tunable: /proc/sys/kernel/sched_child_runs_first)
- Periodic tick: task_tick() => task_tick_fair()
- Wakeup preemption: check_preempt_curr() => check_preempt_wakeup() (tunable: /proc/sys/kernel/sched_min_granularity_ns)
Scheduling triggered by fork
In Linux, the POSIX system calls fork(), vfork() and clone() are all ultimately implemented through do_fork().
// kernel/fork.c
1558 long do_fork(unsigned long clone_flags,
1559 unsigned long stack_start,
1560 unsigned long stack_size,
1561 int __user *parent_tidptr,
1562 int __user *child_tidptr)
1563 {
...
1595 p = copy_process(clone_flags, stack_start, stack_size, // => task_fork()
1596 child_tidptr, NULL, trace);
...
1617 wake_up_new_task(p); // => check_preempt_wakeup()
...
1631 }
1123 /*
1124 * This creates a new process as a copy of the old one,
1125 * but does not actually start it yet.
1126 *
1127 * It copies the registers, and all the appropriate
1128 * parts of the process environment (as per the clone
1129 * flags). The actual kick-off is left to the caller.
1130 */
1131 static struct task_struct *copy_process(unsigned long clone_flags,
1132 unsigned long stack_start,
1133 unsigned long stack_size,
1134 int __user *child_tidptr,
1135 struct pid *pid,
1136 int trace)
1137 {
...
1308 sched_fork(p);
...
1528 }
1619 /*
1620 * fork()/clone()-time setup:
1621 */
1622 void sched_fork(struct task_struct *p)
1623 {
...
1664 if (p->sched_class->task_fork)
1665 p->sched_class->task_fork(p); // call the task's scheduling class's task_fork(); for CFS this is task_fork_fair()
...
1694 }
Scheduling triggered by the periodic tick
On every timer interrupt, the timer interrupt handler tick_periodic() calls update_process_times(), which updates the current task's runtime accounting (vruntime), sets need_resched when necessary, and balances the run queues. With HZ=100, for example, the tick period is 10 ms.
1342 /*
1343 * Called from the timer interrupt handler to charge one tick to the current
1344 * process. user_tick is 1 if the tick is user time, 0 for system.
1345 */
1346 void update_process_times(int user_tick)
1347 {
1348 struct task_struct *p = current;
1349 int cpu = smp_processor_id();
1350
1351 /* Note: this timer irq context must be accounted for as well. */
1352 account_process_tick(p, user_tick);
1353 run_local_timers();
1354 rcu_check_callbacks(cpu, user_tick);
1355 #ifdef CONFIG_IRQ_WORK
1356 if (in_irq())
1357 irq_work_run();
1358 #endif
1359 scheduler_tick(); // HERE!
1360 run_posix_cpu_timers(p);
1361 }
2684 void scheduler_tick(void)
2685 {
2686 int cpu = smp_processor_id();
2687 struct rq *rq = cpu_rq(cpu);
2688 struct task_struct *curr = rq->curr;
2689
2690 sched_clock_tick();
2691
2692 raw_spin_lock(&rq->lock);
2693 update_rq_clock(rq);
2694 update_cpu_load_active(rq);
2695 curr->sched_class->task_tick(rq, curr, 0); // call the current task's scheduling class's task_tick(); for CFS this is task_tick_fair()
2696 raw_spin_unlock(&rq->lock);
2697
2698 perf_event_task_tick();
...
2704 }
Scheduling triggered by wakeup preemption
// kernel/sched/core.c
1703 void wake_up_new_task(struct task_struct *p)
1704 {
1705 unsigned long flags;
1706 struct rq *rq;
1707
1708 raw_spin_lock_irqsave(&p->pi_lock, flags);
...
1717
1718 rq = __task_rq_lock(p);
1719 activate_task(rq, p, 0);
1720 p->on_rq = 1;
1721 trace_sched_wakeup_new(p, true);
1722 check_preempt_curr(rq, p, WF_FORK); // check_preempt_curr()
...
1727 task_rq_unlock(rq, p, &flags);
1728 }
check_preempt_curr() is called from several places:
909 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
910 {
911 const struct sched_class *class;
912
913 if (p->sched_class == rq->curr->sched_class) {
914 rq->curr->sched_class->check_preempt_curr(rq, p, flags); // call the current class's check_preempt_curr()
915 } else {
916 for_each_class(class) {
917 if (class == rq->curr->sched_class)
918 break;
919 if (class == p->sched_class) {
920 resched_task(rq->curr);
921 break;
922 }
923 }
924 }
...
932 }
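The else branch relies on for_each_class() walking the classes from highest to lowest priority: if curr's class is reached first, p is lower priority and nothing happens; if p's class is reached first, p is higher priority and the current task is marked for rescheduling. A toy sketch of that walk, restricted to the three classes named above:

#include <stdio.h>
#include <string.h>

/* Classes ordered from highest to lowest priority, as for_each_class() sees them. */
static const char *classes[] = { "rt", "fair", "idle" };

/* Mirrors the else branch of check_preempt_curr(): returns 1 if a task of
 * class p_class should preempt the current task of class curr_class. */
static int cross_class_preempt(const char *curr_class, const char *p_class)
{
    for (unsigned i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
        if (strcmp(classes[i], curr_class) == 0)
            return 0;   /* curr's class reached first: p is lower priority  */
        if (strcmp(classes[i], p_class) == 0)
            return 1;   /* p's class reached first: resched_task(rq->curr)  */
    }
    return 0;
}

int main(void)
{
    printf("%d\n", cross_class_preempt("fair", "rt"));   /* 1: rt preempts fair  */
    printf("%d\n", cross_class_preempt("rt", "fair"));   /* 0: fair waits for rt */
    return 0;
}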