红联Linux门户
Linux帮助

系统调用入口函数源码分析system_call——X86_64

发布时间:2016-05-21 10:05:25来源:linux网站作者:伊人椅剑入凡尘

在实验中用到这一块,就去看源码分析整理了一下,全部为个人理解。有错误的地方,希望和大牛交流。


首先解释一下,我实验的目的是获得系统调用入口函数system_call的起始地址和函数大小。

在linux-3.10.1、x86 64位的系统下,系统调用的入口地址保存在MSR_LSTAR这个MSR寄存器中,通过rdmsrl(MSR_LSTAR, ksystem_call);便可读出系统调用的入口地址。对该入口地址进行符号解析,得到入口函数为system_call,其具体实现位于/linux-3.10.1/arch/x86/kernel/entry_64.S文件中。

entry_64.S是一个汇编文件,即system_call函数是由汇编语言实现的。与C语言程序的结构不同,在ENTRY(system_call)与END(system_call)之间还有很多其他的函数(标号)定义和调用,system_call以及它内部包含的所有这些内核函数(标号)的符号信息都保存在内核符号表kallsyms(/proc/kallsyms)中。因此,根据内核栈中的返回地址查询kallsyms得到的内核符号,可能只是真正被调用函数内部的一个中间函数(标号)。通过实验我们也验证了这个猜想:在实验中,根据内核函数中的一个地址本该获得的内核符号为system_call,结果得到的却是system_call_fastpath,原因就在于此。


对system_call的函数实现分析之后我们得到图1的处理流程:

系统调用入口函数源码分析system_call——X86_64

图1 system_call的处理过程


为了获取到正确的内核函数信息,在内核模块中以"system_call"为参数调用kallsyms_lookup_name()函数,可以获得它对应的内核符号地址;再得到system_call结束后的第一个函数对应的地址,两个地址相减,即可计算出system_call函数所占空间大小。

对于宿主机的系统调用表以及其他内核符号的信息,使用相同的方法获取。


下面是对system_call汇编源码的一些注释:

/*
* System call entry. Up to 6 arguments in registers are supported.
*
* SYSCALL does not save anything on the stack and does not change the
* stack pointer.  However, it does mask the flags register for us, so
* CLD and CLAC are not needed.
*/

/*
* Register setup:
* rax  system call number
* rdi  arg0
* rcx  return address for syscall/sysret, C arg3
* rsi  arg1
* rdx  arg2
* r10  arg3 (--> moved to rcx for C)
* r8   arg4
* r9   arg5
* r11  eflags for syscall/sysret, temporary for C
* r12-r15,rbp,rbx saved by C code, not touched.
*
* Interrupts are off on entry.
* Only called from user space.
*
* XXX: if we had a free scratch register we could save the RSP into the stack frame
*      and report it properly in ps. Unfortunately we haven't.
*
* When user can change the frames always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/

/*
 * system_call: entry point reached by the SYSCALL instruction.
 *
 * Flow, as visible in this block:
 *   1. swap the GS base, save the user %rsp and switch to the kernel stack;
 *   2. save the argument registers and check the thread flags for
 *      syscall-entry work (-> tracesys / auditsys);
 *   3. range-check %rax and dispatch through sys_call_table;
 *   4. return either through the SYSRET fast path (ret_from_sys_call) or,
 *      when work is pending, through the IRET path (int_ret_from_sys_call).
 */
ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	/*
	 * Execute swapgs. This only exchanges the current GS base with the
	 * kernel GS base MSR so that the PER_CPU_VAR() accesses below refer
	 * to kernel per-cpu data; it does not change the privilege level
	 * (the SYSCALL instruction has already done that).
	 */
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)

	movq %rsp,PER_CPU_VAR(old_rsp)		/* save the caller's (user) rsp */
	movq PER_CPU_VAR(kernel_stack),%rsp	/* switch to the kernel stack */
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)		/* interrupts back on */
	SAVE_ARGS 8,0
	movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	/*
	 * If any syscall-entry work flag is set for this thread (for example
	 * the syscall is being traced or audited), leave the fast path.
	 */
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja badsys			/* unsigned compare: number out of range */
	movq %r10,%rcx			/* arg3 arrives in r10; C expects it in rcx */
	/* Dispatch to the handler routine for this system call. */
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)	/* store the return value in the frame */
	/*
	 * Syscall return path ending with SYSRET (fast path)
	 * Has incomplete stack frame and undefined top of stack.
	 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_NONE)	/* interrupts off while checking flags */
	TRACE_IRQS_OFF
	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER	rip,rcx
	RESTORE_ARGS 1,-ARG_SKIP,0
	/*CFI_REGISTER	rflags,r11*/
	movq PER_CPU_VAR(old_rsp), %rsp	/* back to the saved user rsp */
	USERGS_SYSRET64

	CFI_RESTORE_STATE
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi			/* keep the work mask across the call */
	SCHEDULE_USER
	popq_cfi %rdi
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/*
	 * We have a signal, or exit tracing or single-step.
	 * These all wind up with the iret return path anyway,
	 * so just join that path right now.
	 */
	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
	/* Fast-path handling is finished; join the regular exit work. */
	jmp int_check_syscall_exit_work

	/* Everything above is the fast system call path. */
badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call

#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call __audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call __audit_syscall_entry
	LOAD_ARGS 0			/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
	cmpq $-MAX_ERRNO,%rsi		/* is it < -MAX_ERRNO? */
	setbe %al			/* 1 if so, 0 if not */
	movzbl %al,%edi			/* zero-extend that into %edi */
	call __audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL -- audit helpers used by the fast path */

	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	/* Only the audit flag set? Then the lighter auditsys path is enough. */
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp)		/* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi			/* rsp is passed as the first argument */
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
#if __SYSCALL_MASK == ~0
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx			/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */

	/*
	 * Syscall return path ending with IRET.
	 * Has correct top of stack, but partial stack frame.
	 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz   int_careful
	andl $~TS_COMPAT,TI_status(%rcx)
	/*
	 * No work left: all system call processing is done. Hand over to
	 * retint_swapgs (defined outside this block) to return to user space.
	 */
	jmp   retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi			/* keep the work mask across the call */
	SCHEDULE_USER
	popq_cfi %rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
	SAVE_REST
	/* Check for syscall exit trace */
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
	jz int_signal
	pushq_cfi %rdi
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
	call syscall_trace_leave
	popq_cfi %rdi
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp int_restore_rest

int_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)


本文永久更新地址:http://www.linuxdiyf.com/linux/20815.html