Reading the Linux schedule Source Code
This article is based on Linux v4.14.259.
First trace
This is my first pass through the whole scheduling flow. The kernel is a huge rabbit hole and many of the concepts are still unfamiliar to me, so I can only lay out the rough flow; I cannot yet explain well what many of the functions actually do. I hope to read more of the source code later and gradually fill in the related knowledge until it forms a complete picture.
schedule
To trace the scheduler we first need its entry point, schedule, defined in kernel/sched/core.c. The function is defined as follows:
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	// avoid deadlocks (flush any plugged block I/O before sleeping)
	sched_submit_work(tsk);
	do {
		// the scheduler itself must not be preempted; re-enable preemption once it is done
		preempt_disable();
		__schedule(false);
		// scheduling finished, re-enable preemption
		sched_preempt_enable_no_resched();
	} while (need_resched()); // if TIF_NEED_RESCHED was set again, reschedule
}
EXPORT_SYMBOL(schedule);
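The need_resched() tested by the loop condition is only a thin wrapper around the TIF_NEED_RESCHED thread flag. Roughly (a simplified sketch of the v4.14 helpers in include/linux/sched.h and include/linux/thread_info.h, not the literal source):

// simplified sketch, not the literal kernel source
#define tif_need_resched()	test_thread_flag(TIF_NEED_RESCHED)

static __always_inline bool need_resched(void)
{
	// true if someone has set TIF_NEED_RESCHED on the current task
	return unlikely(tif_need_resched());
}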
sched_submit_work
static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}
sched_submit_work first checks whether tsk->state is 0 (TASK_RUNNING, i.e. still runnable) and returns immediately if so. tsk_is_pi_blocked checks whether the task is currently blocked on an rt_mutex (the pi stands for priority inheritance); in that case the function also returns early. Otherwise, any plugged block I/O is flushed before the task goes to sleep, to avoid deadlocks.
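tsk_is_pi_blocked itself is just a tiny inline helper; roughly (a sketch, not the exact v4.14 source), it tests whether the task is recorded as waiting on an rt_mutex:

// simplified sketch of the helper
static inline int tsk_is_pi_blocked(struct task_struct *tsk)
{
	// non-NULL means the task is blocked on an rt_mutex (priority-inheritance chain)
	return tsk->pi_blocked_on != NULL;
}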
Note the __schedule(false) call inside schedule's do/while loop: __schedule is where scheduling actually happens, so let's keep tracing into it.
__schedule
Above the definition of __schedule() there is a comment describing when scheduling happens, i.e. under which circumstances a task_struct gets scheduled. A rough summary of those situations follows below.
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
This comment describes, roughly, the following scheduling scenarios and when they occur:
- Voluntary switches: the task blocks or sleeps, e.g. in timer-driven tasks or while waiting on a mutex, semaphore, waitqueue, and so on.
  - When do_exit is called, the task releases its resources and finally invokes the main scheduler one last time.
- Involuntary switches, also known as preemption:
  - TIF_NEED_RESCHED: the TIF prefix stands for thread information flags; the full list can be found in arch/x86/include/asm/thread_info.h. The kernel uses this flag to decide whether a task_struct should be preempted (see the sketch after this list for how the flag is set).
  - For more details on TIF_NEED_RESCHED, refer to this article.
  - When TIF_NEED_RESCHED is set, the task is not rescheduled immediately; the reschedule happens at the nearest scheduling point.
- A wake_up by itself only adds the task to the run queue; what happens afterwards depends on the preemption configuration.
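As a rough idea of how the flag gets set: in v4.14 it is normally set through resched_curr(), called from scheduler_tick() or from a wakeup's preemption check, which ultimately boils down to something like the following (a simplified sketch; the real code also handles remote CPUs and IPIs):

// simplified sketch, not the literal kernel source
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
	// mark the task so that the next scheduling point calls __schedule()
	set_tsk_thread_flag(tsk, TIF_NEED_RESCHED);
}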
static void __sched notrace __schedule(bool preempt)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq_flags rf;
	struct rq *rq;
	int cpu;

	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	local_irq_disable();
	rcu_note_context_switch(preempt);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 */
	rq_lock(rq, &rf);
	smp_mb__after_spinlock();

	/* Promote REQ to ACT */
	rq->clock_update_flags <<= 1;
	update_rq_clock(rq);

	switch_count = &prev->nivcsw;
	if (!preempt && prev->state) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
			prev->on_rq = 0;

			if (prev->in_iowait) {
				atomic_inc(&rq->nr_iowait);
				delayacct_blkio_start();
			}

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup, &rf);
			}
		}
		switch_count = &prev->nvcsw;
	}

	next = pick_next_task(rq, prev, &rf);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		/*
		 * The membarrier system call requires each architecture
		 * to have a full memory barrier after updating
		 * rq->curr, before returning to user-space. For TSO
		 * (e.g. x86), the architecture must provide its own
		 * barrier in switch_mm(). For weakly ordered machines
		 * for which spin_unlock() acts as a full memory
		 * barrier, finish_lock_switch() in common code takes
		 * care of this barrier. For weakly ordered machines for
		 * which spin_unlock() acts as a RELEASE barrier (only
		 * arm64 and PowerPC), arm64 has a full barrier in
		 * switch_to(), and PowerPC has
		 * smp_mb__after_unlock_lock() before
		 * finish_lock_switch().
		 */
		++*switch_count;

		trace_sched_switch(preempt, prev, next);

		/* Also unlocks the rq: */
		prev->cs_cnt++;
		rq = context_switch(rq, prev, next, &rf);
	} else {
		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
		rq_unlock_irq(rq, &rf);
	}

	balance_callback(rq);
}
__schedule uses the nivcsw field in task_struct to count involuntary (preempted) switches, while nvcsw counts voluntary (non-preempted) switches.
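Both are plain counters kept in task_struct (include/linux/sched.h); a trimmed excerpt:

struct task_struct {
	/* ... */
	unsigned long nvcsw;	/* voluntary context switches */
	unsigned long nivcsw;	/* involuntary context switches */
	/* ... */
};

With that in mind, here is the relevant portion of __schedule again, annotated: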
switch_count = &prev->nivcsw;
// the scheduler checks whether this call is a preemption and what state prev is in
if (!preempt && prev->state) {
	// if prev is, e.g., TASK_INTERRUPTIBLE and has a pending signal,
	// it should not be removed from the runqueue;
	// instead, set it back to TASK_RUNNING and let it wait for the next schedule
	if (unlikely(signal_pending_state(prev->state, prev))) {
		prev->state = TASK_RUNNING;
	} else {
		// remove prev from the runqueue
		deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
		// mark prev as no longer on a runqueue
		prev->on_rq = 0;

		if (prev->in_iowait) {
			atomic_inc(&rq->nr_iowait);
			delayacct_blkio_start();
		}

		/*
		 * If a worker went to sleep, notify and ask workqueue
		 * whether it wants to wake up a task to maintain
		 * concurrency.
		 */
		if (prev->flags & PF_WQ_WORKER) {
			struct task_struct *to_wakeup;

			to_wakeup = wq_worker_sleeping(prev);
			if (to_wakeup)
				try_to_wake_up_local(to_wakeup, &rf);
		}
	}
	// not a preemption: count this switch in nvcsw, i.e. a voluntary switch
	switch_count = &prev->nvcsw;
}
Next, the scheduler is asked to pick the highest-priority task from the run queue to run next.
// pick the highest-priority task to run next
next = pick_next_task(rq, prev, &rf);
// clear prev's TIF_NEED_RESCHED flag
clear_tsk_need_resched(prev);
Since Linux v2.6.23 the kernel has had the concept of scheduling classes, which greatly improves the scheduler's extensibility: a scheduling policy only has to implement the class interface to be plugged into the kernel. Also starting with v2.6.23, the Completely Fair Scheduler (CFS) replaced the old O(1) scheduler as the default. pick_next_task is where a concrete scheduling class actually picks the highest-priority task (a simplified sketch of that loop follows); for reasons of space, the scheduler implementations themselves are left for a later article.
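Conceptually, pick_next_task walks the scheduling classes in priority order and returns the first task any of them offers. The following is only a simplified sketch; the real v4.14 code in kernel/sched/core.c also has a fast path for the common case where only CFS tasks are runnable, and a retry loop for RETRY_TASK:

// simplified sketch, not the exact v4.14 source
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	const struct sched_class *class;
	struct task_struct *p;

	// walk the classes from highest priority: stop, dl, rt, fair, idle
	for_each_class(class) {
		p = class->pick_next_task(rq, prev, rf);
		if (p)
			return p;
	}

	BUG();	// the idle class always returns a task, so we never get here
}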
With prev handled and next chosen, what remains is the context switch itself.
// if prev != next (prev may have been put back on the runqueue and picked again)
if (likely(prev != next)) {
	rq->nr_switches++;	// count the number of context switches on this runqueue
	rq->curr = next;	// point curr at the newly scheduled task
	/*
	 * The membarrier system call requires each architecture
	 * to have a full memory barrier after updating
	 * rq->curr, before returning to user-space. For TSO
	 * (e.g. x86), the architecture must provide its own
	 * barrier in switch_mm(). For weakly ordered machines
	 * for which spin_unlock() acts as a full memory
	 * barrier, finish_lock_switch() in common code takes
	 * care of this barrier. For weakly ordered machines for
	 * which spin_unlock() acts as a RELEASE barrier (only
	 * arm64 and PowerPC), arm64 has a full barrier in
	 * switch_to(), and PowerPC has
	 * smp_mb__after_unlock_lock() before
	 * finish_lock_switch().
	 */
	// bump the per-task switch counter once;
	// from the branch above, a preemption bumps nivcsw, a voluntary switch bumps nvcsw
	++*switch_count;

	trace_sched_switch(preempt, prev, next);

	/* Also unlocks the rq: */
	prev->cs_cnt++;
	// the function that actually performs the context switch
	rq = context_switch(rq, prev, next, &rf);
} else {
	// prev == next: no context switch is needed
	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
	rq_unlock_irq(rq, &rf);
}
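For completeness, context_switch() itself roughly does the following (heavily simplified; the real v4.14 code in kernel/sched/core.c also handles the mm borrowing for kernel threads and other details omitted here):

// simplified sketch, not the exact v4.14 source
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next, struct rq_flags *rf)
{
	prepare_task_switch(rq, prev, next);

	/*
	 * Switch the address space: a user task ends up in
	 * switch_mm_irqs_off(); a kernel thread has no mm of its own and
	 * simply borrows prev's active_mm (omitted in this sketch).
	 */

	// switch registers and the kernel stack; after this we run as 'next',
	// and 'prev' resumes from here when it gets scheduled back in
	switch_to(prev, next, prev);
	barrier();

	// finish up on behalf of prev and drop the rq lock
	return finish_task_switch(prev);
}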
get the number of context switches
Coming back to the requirement of assignment problem (1), it turns out there is no need to add a counter of our own to count how many times a task has been scheduled: simply adding tsk->nvcsw + tsk->nivcsw from task_struct gives the number of context switches of that task_struct, and the nr_switches value can even provide the answer directly.
When verifying the correctness of our program we compare against nr_switches in /proc/{pid}/sched; looking at proc_sched_show_task shows that nr_switches is likewise computed as nvcsw + nivcsw.
void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
			  struct seq_file *m)
{
	unsigned long nr_switches;
	...
	nr_switches = p->nvcsw + p->nivcsw;
	...
	__P(nr_switches);
}
So the implementation for problem (1) is as follows:
kernel space
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/resource.h>
#include <asm/errno.h>

SYSCALL_DEFINE1(get_number_of_context_switches, unsigned int *, cnt)
{
	// voluntary + involuntary context switches of the calling task
	unsigned int cs_cnt = current->nvcsw + current->nivcsw;

	printk("context switches counter = %u\n", cs_cnt);
	if (copy_to_user(cnt, &cs_cnt, sizeof(unsigned int))) {
		return -EFAULT;
	}
	return 0;
}
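For the user-space test below to reach this handler, the new syscall also has to be registered in the syscall table under the number the test program uses (333 here). On x86-64 this would be, roughly, a line added to arch/x86/entry/syscalls/syscall_64.tbl (the exact free number depends on your tree):

333	common	get_number_of_context_switches		sys_get_number_of_context_switches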
The SYSCALL_DEFINE1 macro expands into the definition of our get_number_of_context_switches system call, and the task_struct fields nvcsw and nivcsw provide the number of context switches.
The reason for defining the system call through SYSCALL_DEFINE1 is CVE-2009-0029: on some 64-bit architectures, 32-bit syscall arguments that were not properly sign-extended could be abused to access illegal addresses. To guard against that kind of human error, the safe approach is to define syscalls with the SYSCALL_DEFINE macros; a rough idea of what the macro generates is sketched below.
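Very roughly, SYSCALL_DEFINE1(get_number_of_context_switches, unsigned int *, cnt) generates a wrapper that receives every argument as a long and casts it back to the declared type, so a 32-bit value is always correctly sign-extended before the real body runs. This is only a conceptual sketch; the actual expansion in include/linux/syscalls.h (v4.14) goes through the __MAP/__SC_* helper macros and also emits metadata plus an alias for the sys_ name:

// conceptual sketch of the expansion, not the literal preprocessor output
static inline long SYSC_get_number_of_context_switches(unsigned int *cnt);

asmlinkage long SyS_get_number_of_context_switches(long cnt)
{
	// the argument arrives as a long and is cast back to its declared type,
	// which guarantees proper sign extension on 64-bit kernels
	return SYSC_get_number_of_context_switches((unsigned int *)cnt);
}

// the body written inside SYSCALL_DEFINE1() becomes SYSC_get_number_of_context_switches()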
user space
#include <stdio.h>
#include <syscall.h>
#include <unistd.h>

#define NUMBER_OF_ITERATIONS 99999999

int main() {
	int i, t = 2, u = 3, v;
	unsigned int cnt;

	printf("pid = %d\n", getpid());

	// burn CPU for a while so the task accumulates some context switches
	for (i = 0; i < NUMBER_OF_ITERATIONS * 3; i++)
		v = (++t) * (u++) * 3;

	if (syscall(333, &cnt))
		printf("Error\n");
	else
		printf("context switch counter = %d\n", cnt);

	getchar();
}
getchar() is used at the end to keep the process around for easier observation, but it also makes cnt differ from the value in /proc/{pid}/sched by 1: blocking on getchar() triggers one more voluntary context switch after the syscall has already reported the count, as shown in the figure below: