Kernel mode & user mode switch

2024-12-31

Kernel mode & user mode switch

Author: 堇姬Naup

Usermode to Kernel mode

  • 執行 swapgs 指令(swapgs 是 CPU 指令,直接執行即可,並非用 call 呼叫的 function)

這裡先解釋幾個東西
GS register、IA32_GS_BASE、IA32_KERNEL_GS_BASE、swapgs 是甚麼

  • GS reg: 用途主要圍繞上下文切換、內核態與用戶態的切換以及訪問每個 CPU 的數據結構,在 64 位模式下,GS 指向一個base address,允許進行高效的offset訪問,例如通過gs:[offset] 訪問data。
  • IA32_GS_BASE: 用於保存當前 GS base(MSR 地址為 0xC0000101)
  • IA32_KERNEL_GS_BASE: 用來保存kernel mode GS base address(MSR 地址為 0xC0000102)
  • swapgs
    快速切換 GS 段寄存器的address value
    user mode的 GS base address(存儲在 IA32_GS_BASE register中)和 kernel mode的 GS base address(存儲在 IA32_KERNEL_GS_BASE register中)交換

現在要切換至kernel mode,gs存的是usermode的base address(IA32_GS_BASE),會和 kernel base address交換(IA32_KERNEL_GS_BASE)

  • 接下來,將當前stack 頂部(usermode的rsp)保存在per CPU變數內,並將per CPU中記錄的kernel stack頂部載入rsp

PS: per CPU 變數用於在多處理器(SMP, Symmetric Multi-Processing)環境下,為每個 CPU 分配一份獨立的變數空間

偷偷塞個東西(大致是這種概念):

1
2
3
4
5
6
7
8
9
; Conceptual sketch of a system-call entry path (illustrative only).
; SavedUserRSP, KernelStackPtr and CPUnumber are not defined in this snippet;
; they stand for slots reached through GS after the first SwapGS.
SystemCallEntryPoint:
SwapGS ; set up kernel pointer, save user's GS base
mov gs:[SavedUserRSP], rsp ; save user's stack pointer (no kernel stack yet, so it goes to a GS-relative slot)
mov rsp, gs:[KernelStackPtr] ; set up kernel stack
push rax ; now that we have a stack, save user's GPRs
mov rax, gs:[CPUnumber] ; get CPU number < or whatever >
. ; perform system service
.
SwapGS ; restore user's GS, save kernel pointer (mirror of the SwapGS at entry)
  • 之後通過push保存每個register的值

大概是通過以下方式,會先切換至kernel stack
將register壓進kernel stack(一整個形成一個struct,pt_regs)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Excerpt of the 64-bit syscall entry stub: swap GS, move from the user stack
 * to the kernel stack, then push registers so the kernel stack holds a
 * struct pt_regs. The excerpt ends once the frame has been laid out.
 */
ENTRY(entry_SYSCALL_64)
/* SWAPGS_UNSAFE_STACK is a macro, directly defined as the swapgs instruction in x86 */
SWAPGS_UNSAFE_STACK

/* Stash the user stack pointer in the per-CPU rsp_scratch slot, then load the kernel stack top */
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

/*
 * Construct struct pt_regs on stack. The pushes run from the last member
 * (ss) back towards the first, so the frame grows downward in member order.
 */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp: the user rsp stashed above */
pushq %r11 /* pt_regs->flags: r11 is stored as the user flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip: rcx is stored as the user return address */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax: slot is pre-filled with -ENOSYS rather than the incoming rax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved: six slots are reserved but left unwritten */

這是pt_regs struct

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * Register frame kept on the kernel stack at kernel entry. Members are
 * declared from the first slot (r15) to the last (ss); the end of the
 * structure coincides with the top of the stack page.
 */
struct pt_regs {
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
*/
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long rax;
unsigned long rcx;
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
unsigned long orig_rax;
/* Return frame for iretq */
unsigned long rip; /* instruction pointer to resume at */
unsigned long cs; /* code segment selector */
unsigned long eflags; /* flags register */
unsigned long rsp; /* stack pointer */
unsigned long ss; /* stack segment selector */
/* top of stack page */
};

補一下

  • unsigned long rip:指令指標寄存器(Instruction Pointer Register),指向當前執行的指令地址。
  • unsigned long cs:代碼段選擇器(Code Segment Selector),描述代碼段的特性。
  • unsigned long eflags:標誌寄存器(Flags Register),保存處理器狀態標誌。
  • unsigned long rsp:堆疊指標寄存器(Stack Pointer Register),指向堆疊頂部。
  • unsigned long ss:堆疊段選擇器(Stack Segment Selector),描述堆疊段的特性。

這樣就進入到kernel mode了

Kernel mode to usermode(No KPTI)

  • 執行 swapgs 指令
  • 通過swapgs gs會拿回usermode base address
  • 接下來通過 sysretq 或 iretq 恢復到user space
  • 若使用 iretq 需要再給出userspace額外的資訊,也就是trap_frame
1
2
3
4
5
6
7
/*
 * User-space context handed to iretq when returning from kernel mode.
 * Member order matches the "Return frame for iretq" tail of struct pt_regs:
 * rip, cs, flags, rsp, ss.
 */
struct trap_frame {
size_t user_rip; /* user instruction pointer to resume at */
size_t user_cs; /* user code segment selector */
size_t user_rflags; /* user flags register value */
size_t user_sp; /* user stack pointer */
size_t user_ss; /* user stack segment selector */
} __attribute__((packed));

iret

這是詳細的iret實現方式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * Return-to-user path: pop the saved registers, copy the IRET frame and user
 * RDI onto the stack loaded from cpu_tss_rw + TSS_sp0, run the user-CR3
 * switch macro, swap GS and leave through INTERRUPT_RETURN.
 */
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/*
 * Assert that pt_regs indicates user mode: ud2 is reached only when the
 * low two bits of the saved CS are both zero.
 */
testb $3, CS(%rsp)
jnz 1f
ud2
1:
#endif
/* Pop the saved registers; per the comment below, user RDI stays on the stack. */
POP_REGS pop_rdi=0

/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
* RDI keeps pointing at the old stack so the frame can be copied from it.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_EMPTY

/* Copy the IRET frame to the trampoline stack (orig_ax at 1*8 is skipped). */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */

/* Push user RDI on the trampoline stack. */
pushq (%rdi)

/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER

/* NOTE(review): judging by its name, this macro performs the switch to the user CR3; RDI is its scratch register. */
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

/* Restore RDI from the copy pushed on the trampoline stack. */
popq %rdi
SWAPGS
INTERRUPT_RETURN

Kernel mode to usermode(Open KPTI)

不能單純透過執行 swapgs; iretq 來切換
還需要去切換page table

首先是四層頁表(基本跟windows kernel沒有差,可以參考我的windows kernel exploitation那篇)

通過 PGD->PUD->PMD->PTE 找到對應的page,再透過offset找到該page內的正確address,來將virtual address 映射到 physical address
CR3 register則儲存了PGD位置
而KPTI讓user mode使用的page table中只保留少量必要的kernel映射(如中斷入口等)
簡單來說就是原本的一張表usermode、kernel mode用同一張
但是開啟KPTI後會拆成兩個如下圖

在切換mode時同時也需要去更改cr3存的page table address
另外在KPTI開啟時,kernel page table中映射的userspace memory會被標記為NX,使ret2usr不可用(因為你就算能寫上shellcode到userspace,但跳上去也不能執行shellcode)

知道了上述後來看有KPTI後會如何執行切換
詳細的可以去 swapgs_restore_regs_and_return_to_usermode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
0xffffffff81c00fb0 <swapgs_restore_regs_and_return_to_usermode>:	nop    DWORD PTR [rax+rax*1+0x0]
0xffffffff81c00fb5 <swapgs_restore_regs_and_return_to_usermode+5>: pop r15
0xffffffff81c00fb7 <swapgs_restore_regs_and_return_to_usermode+7>: pop r14
0xffffffff81c00fb9 <swapgs_restore_regs_and_return_to_usermode+9>: pop r13
0xffffffff81c00fbb <swapgs_restore_regs_and_return_to_usermode+11>: pop r12
0xffffffff81c00fbd <swapgs_restore_regs_and_return_to_usermode+13>: pop rbp
0xffffffff81c00fbe <swapgs_restore_regs_and_return_to_usermode+14>: pop rbx
0xffffffff81c00fbf <swapgs_restore_regs_and_return_to_usermode+15>: pop r11
0xffffffff81c00fc1 <swapgs_restore_regs_and_return_to_usermode+17>: pop r10
0xffffffff81c00fc3 <swapgs_restore_regs_and_return_to_usermode+19>: pop r9
0xffffffff81c00fc5 <swapgs_restore_regs_and_return_to_usermode+21>: pop r8
0xffffffff81c00fc7 <swapgs_restore_regs_and_return_to_usermode+23>: pop rax
0xffffffff81c00fc8 <swapgs_restore_regs_and_return_to_usermode+24>: pop rcx
0xffffffff81c00fc9 <swapgs_restore_regs_and_return_to_usermode+25>: pop rdx
0xffffffff81c00fca <swapgs_restore_regs_and_return_to_usermode+26>: pop rsi
0xffffffff81c00fcb <swapgs_restore_regs_and_return_to_usermode+27>: mov rdi,rsp
0xffffffff81c00fce <swapgs_restore_regs_and_return_to_usermode+30>: mov rsp,QWORD PTR gs:0x6004
0xffffffff81c00fd7 <swapgs_restore_regs_and_return_to_usermode+39>: push QWORD PTR [rdi+0x30]
0xffffffff81c00fda <swapgs_restore_regs_and_return_to_usermode+42>: push QWORD PTR [rdi+0x28]
0xffffffff81c00fdd <swapgs_restore_regs_and_return_to_usermode+45>: push QWORD PTR [rdi+0x20]
0xffffffff81c00fe0 <swapgs_restore_regs_and_return_to_usermode+48>: push QWORD PTR [rdi+0x18]
0xffffffff81c00fe3 <swapgs_restore_regs_and_return_to_usermode+51>: push QWORD PTR [rdi+0x10]
0xffffffff81c00fe6 <swapgs_restore_regs_and_return_to_usermode+54>: push QWORD PTR [rdi]
0xffffffff81c00fe8 <swapgs_restore_regs_and_return_to_usermode+56>: push rax
0xffffffff81c00fe9 <swapgs_restore_regs_and_return_to_usermode+57>: xchg ax,ax
0xffffffff81c00feb <swapgs_restore_regs_and_return_to_usermode+59>: mov rdi,cr3
0xffffffff81c00fee <swapgs_restore_regs_and_return_to_usermode+62>: jmp 0xffffffff81c01024

0xffffffff81c01024 <swapgs_restore_regs_and_return_to_usermode+116>: or rdi,0x1000
0xffffffff81c0102b <swapgs_restore_regs_and_return_to_usermode+123>: mov cr3,rdi
0xffffffff81c0102e <swapgs_restore_regs_and_return_to_usermode+126>: pop rax
0xffffffff81c0102f <swapgs_restore_regs_and_return_to_usermode+127>: pop rdi
0xffffffff81c01030 <swapgs_restore_regs_and_return_to_usermode+128>: swapgs
0xffffffff81c01033 <swapgs_restore_regs_and_return_to_usermode+131>: jmp 0xffffffff81c01060 <native_iret>

0xffffffff81c01060 <native_iret>: test BYTE PTR [rsp+0x20],0x4
0xffffffff81c01065 <native_iret+5>: jne 0xffffffff81c01069 <native_irq_return_ldt>
0xffffffff81c01067 <native_irq_return_iret>: iretq

讓執行流程跳到這個function(例如放進ROP chain),它也能直接幫你修好cr3,其核心部分可簡化如下:

1
2
3
4
5
6
7
# Condensed tail of swapgs_restore_regs_and_return_to_usermode (Intel syntax).
mov  rdi, cr3 # read the current CR3 value
or rdi, 0x1000 # set bit 12 (0b1000000000000): selects the user PGD, which sits 4 KB above the kernel PGD
mov cr3, rdi # write it back, switching to the user page table
pop rax # restore the rax that the full routine pushed before the CR3 switch
pop rdi # restore user RDI, which the full routine copied onto this stack
swapgs # swap GS back so user mode gets its own GS base
iretq # return to user mode using the frame left on the stack

cr3 可以參考該docs的2.5(p.3069)
docs

總之在Linux KPTI的實作中,cr3的第十三個bit(cr3[12])決定當前使用的是kernel還是user mode的page table(PGD address)

為了實現快速切換,因此將kernel mode PGD跟user mode PGD緊鄰著(4kb + 4kb = 8kb),kernel PGD在low address,user在high address
通過去 or cr3[12]就可以快速操作

cr3

1
2
3
4
5
6
7
8
9
cr3[12]
  0 -----------------+
  1 --------------+  |
                  |  |
+------------+    |  |
|  user PGD  | <--+  |
+------------+       |
| kernel PGD | <-----+
+------------+

after all

不得不說網路上的文章真的很亂www
另外kernel文章真的很難寫,寫一寫很容易寫亂,所以應該不會像heap一樣一篇完成,而是拆成小篇小篇(不然我已經寫壞了4、5篇了)
總之有心力就寫吧 OwOb