diff options
author | Yunhong Jiang <yunhong.jiang@intel.com> | 2015-08-04 12:17:53 -0700 |
---|---|---|
committer | Yunhong Jiang <yunhong.jiang@intel.com> | 2015-08-04 15:44:42 -0700 |
commit | 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 (patch) | |
tree | 1c9cafbcd35f783a87880a10f85d1a060db1a563 /kernel/arch/x86/ia32 | |
parent | 98260f3884f4a202f9ca5eabed40b1354c489b29 (diff) |
Add the rt linux 4.1.3-rt3 as base
Import the rt linux 4.1.3-rt3 as OPNFV kvm base.
It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and
the base is:
commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat Jul 25 12:13:34 2015 +0200
Prepare v4.1.3-rt3
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
We lose all the git history this way and it's not good. We
should apply another opnfv project repo in future.
Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Diffstat (limited to 'kernel/arch/x86/ia32')
-rw-r--r-- | kernel/arch/x86/ia32/Makefile | 10 | ||||
-rw-r--r-- | kernel/arch/x86/ia32/audit.c | 43 | ||||
-rw-r--r-- | kernel/arch/x86/ia32/ia32_aout.c | 479 | ||||
-rw-r--r-- | kernel/arch/x86/ia32/ia32_signal.c | 497 | ||||
-rw-r--r-- | kernel/arch/x86/ia32/ia32entry.S | 611 | ||||
-rw-r--r-- | kernel/arch/x86/ia32/sys_ia32.c | 231 |
6 files changed, 1871 insertions, 0 deletions
diff --git a/kernel/arch/x86/ia32/Makefile b/kernel/arch/x86/ia32/Makefile new file mode 100644 index 000000000..bb635c641 --- /dev/null +++ b/kernel/arch/x86/ia32/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the ia32 kernel emulation subsystem. +# + +obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o + +obj-$(CONFIG_IA32_AOUT) += ia32_aout.o + +audit-class-$(CONFIG_AUDIT) := audit.o +obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) diff --git a/kernel/arch/x86/ia32/audit.c b/kernel/arch/x86/ia32/audit.c new file mode 100644 index 000000000..2eccc8932 --- /dev/null +++ b/kernel/arch/x86/ia32/audit.c @@ -0,0 +1,43 @@ +#include <asm/unistd_32.h> + +unsigned ia32_dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +unsigned ia32_chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +unsigned ia32_write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +unsigned ia32_read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +unsigned ia32_signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int ia32_classify_syscall(unsigned syscall) +{ + switch (syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_socketcall: + return 4; + case __NR_execve: + case __NR_execveat: + return 5; + default: + return 1; + } +} diff --git a/kernel/arch/x86/ia32/ia32_aout.c b/kernel/arch/x86/ia32/ia32_aout.c new file mode 100644 index 000000000..ae6aad1d2 --- /dev/null +++ b/kernel/arch/x86/ia32/ia32_aout.c @@ -0,0 +1,479 @@ +/* + * a.out loader for x86-64 + * + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + * Hacked together by Andi Kleen + */ + +#include <linux/module.h> + +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/a.out.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/fcntl.h> +#include <linux/ptrace.h> +#include <linux/user.h> +#include <linux/binfmts.h> +#include <linux/personality.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/perf_event.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/user32.h> +#include <asm/ia32.h> + +#undef WARN_OLD + +static int load_aout_binary(struct linux_binprm *); +static int load_aout_library(struct file *); + +#ifdef CONFIG_COREDUMP +static int aout_core_dump(struct coredump_params *); + +static unsigned long get_dr(int n) +{ + struct perf_event *bp = current->thread.ptrace_bps[n]; + return bp ? bp->hw.info.address : 0; +} + +/* + * fill in the user structure for a core dump.. + */ +static void dump_thread32(struct pt_regs *regs, struct user32 *dump) +{ + u32 fs, gs; + memset(dump, 0, sizeof(*dump)); + +/* changed the size calculations - should hopefully work better. lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) + (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_debugreg[0] = get_dr(0); + dump->u_debugreg[1] = get_dr(1); + dump->u_debugreg[2] = get_dr(2); + dump->u_debugreg[3] = get_dr(3); + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.ptrace_dr7; + + if (dump->start_stack < 0xc0000000) { + unsigned long tmp; + + tmp = (unsigned long) (0xc0000000 - dump->start_stack); + dump->u_ssize = tmp >> PAGE_SHIFT; + } + + dump->regs.ebx = regs->bx; + dump->regs.ecx = regs->cx; + dump->regs.edx = regs->dx; + dump->regs.esi = regs->si; + dump->regs.edi = regs->di; + dump->regs.ebp = regs->bp; + dump->regs.eax = regs->ax; + dump->regs.ds = current->thread.ds; + dump->regs.es = current->thread.es; + savesegment(fs, fs); + dump->regs.fs = fs; + savesegment(gs, gs); + dump->regs.gs = gs; + dump->regs.orig_eax = regs->orig_ax; + dump->regs.eip = regs->ip; + dump->regs.cs = regs->cs; + dump->regs.eflags = regs->flags; + dump->regs.esp = regs->sp; + dump->regs.ss = regs->ss; + +#if 1 /* FIXME */ + dump->u_fpvalid = 0; +#else + dump->u_fpvalid = dump_fpu(regs, &dump->i387); +#endif +} + +#endif + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, +#ifdef CONFIG_COREDUMP + .core_dump = aout_core_dump, +#endif + .min_coredump = PAGE_SIZE +}; + +static void set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end <= start) + return; + vm_brk(start, end - start); +} + +#ifdef CONFIG_COREDUMP +/* + * These are the only things you should do on a core-file: use only these + * macros to write out all the necessary info. + */ + +#include <linux/coredump.h> + +#define START_DATA(u) (u.u_tsize << PAGE_SHIFT) +#define START_STACK(u) (u.start_stack) + +/* + * Routine writes a core dump image in the current directory. + * Currently only a stub-function. + * + * Note that setuid/setgid files won't make a core-dump if the uid/gid + * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" + * field, which also makes sure the core-dumps won't be recursive if the + * dumping of the process results in another error.. + */ + +static int aout_core_dump(struct coredump_params *cprm) +{ + mm_segment_t fs; + int has_dumped = 0; + unsigned long dump_start, dump_size; + struct user32 dump; + + fs = get_fs(); + set_fs(KERNEL_DS); + has_dumped = 1; + strncpy(dump.u_comm, current->comm, sizeof(current->comm)); + dump.u_ar0 = offsetof(struct user32, regs); + dump.signal = cprm->siginfo->si_signo; + dump_thread32(cprm->regs, &dump); + + /* + * If the size of the dump file exceeds the rlimit, then see + * what would happen if we wrote the stack, but not the data + * area. + */ + if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) + dump.u_dsize = 0; + + /* Make sure we have enough room to write the stack and data areas. */ + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) + dump.u_ssize = 0; + + /* make sure we actually have a data and stack area to dump */ + set_fs(USER_DS); + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), + dump.u_dsize << PAGE_SHIFT)) + dump.u_dsize = 0; + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), + dump.u_ssize << PAGE_SHIFT)) + dump.u_ssize = 0; + + set_fs(KERNEL_DS); + /* struct user */ + if (!dump_emit(cprm, &dump, sizeof(dump))) + goto end_coredump; + /* Now dump all of the user data. Include malloced stuff as well */ + if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump))) + goto end_coredump; + /* now we start writing out the user space info */ + set_fs(USER_DS); + /* Dump the data area */ + if (dump.u_dsize != 0) { + dump_start = START_DATA(dump); + dump_size = dump.u_dsize << PAGE_SHIFT; + if (!dump_emit(cprm, (void *)dump_start, dump_size)) + goto end_coredump; + } + /* Now prepare to dump the stack area */ + if (dump.u_ssize != 0) { + dump_start = START_STACK(dump); + dump_size = dump.u_ssize << PAGE_SHIFT; + if (!dump_emit(cprm, (void *)dump_start, dump_size)) + goto end_coredump; + } +end_coredump: + set_fs(fs); + return has_dumped; +} +#endif + +/* + * create_aout_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) +{ + u32 __user *argv, *envp, *sp; + int argc = bprm->argc, envc = bprm->envc; + + sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); + sp -= envc+1; + envp = sp; + sp -= argc+1; + argv = sp; + put_user((unsigned long) envp, --sp); + put_user((unsigned long) argv, --sp); + put_user(argc, --sp); + current->mm->arg_start = (unsigned long) p; + while (argc-- > 0) { + char c; + + put_user((u32)(unsigned long)p, argv++); + do { + get_user(c, p++); + } while (c); + } + put_user(0, argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-- > 0) { + char c; + + put_user((u32)(unsigned long)p, envp++); + do { + get_user(c, p++); + } while (c); + } + put_user(0, envp); + current->mm->env_end = (unsigned long) p; + return sp; +} + +/* + * These are the functions used to load a.out style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ +static int load_aout_binary(struct linux_binprm *bprm) +{ + unsigned long error, fd_offset, rlim; + struct pt_regs *regs = current_pt_regs(); + struct exec ex; + int retval; + + ex = *((struct exec *) bprm->buf); /* exec-header */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && + N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || + N_TRSIZE(ex) || N_DRSIZE(ex) || + i_size_read(file_inode(bprm->file)) < + ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + return -ENOEXEC; + } + + fd_offset = N_TXTOFF(ex); + + /* Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. + */ + rlim = rlimit(RLIMIT_DATA); + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (ex.a_data + ex.a_bss > rlim) + return -ENOMEM; + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + return retval; + + /* OK, This is the point of no return */ + set_personality(PER_LINUX); + set_personality_ia32(false); + + setup_new_exec(bprm); + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; + + current->mm->end_code = ex.a_text + + (current->mm->start_code = N_TXTADDR(ex)); + current->mm->end_data = ex.a_data + + (current->mm->start_data = N_DATADDR(ex)); + current->mm->brk = ex.a_bss + + (current->mm->start_brk = N_BSSADDR(ex)); + + retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) + return retval; + + install_exec_creds(bprm); + + if (N_MAGIC(ex) == OMAGIC) { + unsigned long text_addr, map_size; + + text_addr = N_TXTADDR(ex); + map_size = ex.a_text+ex.a_data; + + error = vm_brk(text_addr & PAGE_MASK, map_size); + + if (error != (text_addr & PAGE_MASK)) + return error; + + error = read_code(bprm->file, text_addr, 32, + ex.a_text + ex.a_data); + if ((signed long)error < 0) + return error; + } else { +#ifdef WARN_OLD + static unsigned long error_time, error_time2; + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && + time_after(jiffies, error_time2 + 5*HZ)) { + printk(KERN_NOTICE "executable not page aligned\n"); + error_time2 = jiffies; + } + + if ((fd_offset & ~PAGE_MASK) != 0 && + time_after(jiffies, error_time + 5*HZ)) { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert " + "program: %pD\n", + bprm->file); + error_time = jiffies; + } +#endif + + if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { + vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + read_code(bprm->file, N_TXTADDR(ex), fd_offset, + ex.a_text+ex.a_data); + goto beyond_if; + } + + error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_32BIT, + fd_offset); + + if (error != N_TXTADDR(ex)) + return error; + + error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_32BIT, + fd_offset + ex.a_text); + if (error != N_DATADDR(ex)) + return error; + } +beyond_if: + set_binfmt(&aout_format); + + set_brk(current->mm->start_brk, current->mm->brk); + + current->mm->start_stack = + (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); + /* start thread */ + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + load_gs_index(0); + (regs)->ip = ex.a_entry; + (regs)->sp = current->mm->start_stack; + (regs)->flags = 0x200; + (regs)->cs = __USER32_CS; + (regs)->ss = __USER32_DS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; + set_fs(USER_DS); + return 0; +} + +static int load_aout_library(struct file *file) +{ + unsigned long bss, start_addr, len, error; + int retval; + struct exec ex; + + + retval = -ENOEXEC; + error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + if (error != sizeof(ex)) + goto out; + + /* We come in here for the regular a.out style of shared libraries */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || + N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || + i_size_read(file_inode(file)) < + ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + goto out; + } + + if (N_FLAGS(ex)) + goto out; + + /* For QMAGIC, the starting address is 0x20 into the page. We mask + this off to get the starting address for the page */ + + start_addr = ex.a_entry & 0xfffff000; + + if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { +#ifdef WARN_OLD + static unsigned long error_time; + if (time_after(jiffies, error_time + 5*HZ)) { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert " + "library: %pD\n", + file); + error_time = jiffies; + } +#endif + vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + + read_code(file, start_addr, N_TXTOFF(ex), + ex.a_text + ex.a_data); + retval = 0; + goto out; + } + /* Now use mmap to map the library into memory. */ + error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, + N_TXTOFF(ex)); + retval = error; + if (error != start_addr) + goto out; + + len = PAGE_ALIGN(ex.a_text + ex.a_data); + bss = ex.a_text + ex.a_data + ex.a_bss; + if (bss > len) { + error = vm_brk(start_addr + len, bss - len); + retval = error; + if (error != start_addr + len) + goto out; + } + retval = 0; +out: + return retval; +} + +static int __init init_aout_binfmt(void) +{ + register_binfmt(&aout_format); + return 0; +} + +static void __exit exit_aout_binfmt(void) +{ + unregister_binfmt(&aout_format); +} + +module_init(init_aout_binfmt); +module_exit(exit_aout_binfmt); +MODULE_LICENSE("GPL"); diff --git a/kernel/arch/x86/ia32/ia32_signal.c b/kernel/arch/x86/ia32/ia32_signal.c new file mode 100644 index 000000000..c81d35e6c --- /dev/null +++ b/kernel/arch/x86/ia32/ia32_signal.c @@ -0,0 +1,497 @@ +/* + * linux/arch/x86_64/ia32/ia32_signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/compat.h> +#include <linux/binfmts.h> +#include <asm/ucontext.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/ptrace.h> +#include <asm/ia32_unistd.h> +#include <asm/user32.h> +#include <asm/sigcontext32.h> +#include <asm/proto.h> +#include <asm/vdso.h> +#include <asm/sigframe.h> +#include <asm/sighandling.h> +#include <asm/sys_ia32.h> +#include <asm/smap.h> + +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + int err = 0; + bool ia32 = test_thread_flag(TIF_IA32); + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + return -EFAULT; + + put_user_try { + /* If you change siginfo_t structure, please make sure that + this code is fixed accordingly. + It should never copy any pad contained in the structure + to avoid security leaks, but must copy the generic + 3 ints plus the relevant union member. */ + put_user_ex(from->si_signo, &to->si_signo); + put_user_ex(from->si_errno, &to->si_errno); + put_user_ex((short)from->si_code, &to->si_code); + + if (from->si_code < 0) { + put_user_ex(from->si_pid, &to->si_pid); + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); + } else { + /* + * First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) + */ + put_user_ex(from->_sifields._pad[0], + &to->_sifields._pad[0]); + switch (from->si_code >> 16) { + case __SI_FAULT >> 16: + break; + case __SI_SYS >> 16: + put_user_ex(from->si_syscall, &to->si_syscall); + put_user_ex(from->si_arch, &to->si_arch); + break; + case __SI_CHLD >> 16: + if (ia32) { + put_user_ex(from->si_utime, &to->si_utime); + put_user_ex(from->si_stime, &to->si_stime); + } else { + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); + } + put_user_ex(from->si_status, &to->si_status); + /* FALL THROUGH */ + default: + case __SI_KILL >> 16: + put_user_ex(from->si_uid, &to->si_uid); + break; + case __SI_POLL >> 16: + put_user_ex(from->si_fd, &to->si_fd); + break; + case __SI_TIMER >> 16: + put_user_ex(from->si_overrun, &to->si_overrun); + put_user_ex(ptr_to_compat(from->si_ptr), + &to->si_ptr); + break; + /* This is not generated by the kernel as of now. */ + case __SI_RT >> 16: + case __SI_MESGQ >> 16: + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(from->si_int, &to->si_int); + break; + } + } + } put_user_catch(err); + + return err; +} + +int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) +{ + int err = 0; + u32 ptr32; + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) + return -EFAULT; + + get_user_try { + get_user_ex(to->si_signo, &from->si_signo); + get_user_ex(to->si_errno, &from->si_errno); + get_user_ex(to->si_code, &from->si_code); + + get_user_ex(to->si_pid, &from->si_pid); + get_user_ex(to->si_uid, &from->si_uid); + get_user_ex(ptr32, &from->si_ptr); + to->si_ptr = compat_ptr(ptr32); + } get_user_catch(err); + + return err; +} + +/* + * Do a signal return; undo the signal stack. + */ +#define loadsegment_gs(v) load_gs_index(v) +#define loadsegment_fs(v) loadsegment(fs, v) +#define loadsegment_ds(v) loadsegment(ds, v) +#define loadsegment_es(v) loadsegment(es, v) + +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) +#define set_user_seg(seg, v) loadsegment_##seg(v) + +#define COPY(x) { \ + get_user_ex(regs->x, &sc->x); \ +} + +#define GET_SEG(seg) ({ \ + unsigned short tmp; \ + get_user_ex(tmp, &sc->seg); \ + tmp; \ +}) + +#define COPY_SEG_CPL3(seg) do { \ + regs->seg = GET_SEG(seg) | 3; \ +} while (0) + +#define RELOAD_SEG(seg) { \ + unsigned int pre = GET_SEG(seg); \ + unsigned int cur = get_user_seg(seg); \ + pre |= 3; \ + if (pre != cur) \ + set_user_seg(seg, pre); \ +} + +static int ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_ia32 __user *sc) +{ + unsigned int tmpflags, err = 0; + void __user *buf; + u32 tmp; + + /* Always make any pending restarted system calls return -EINTR */ + current->restart_block.fn = do_no_restart_syscall; + + get_user_try { + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + RELOAD_SEG(gs); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); + + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); COPY(ax); + /* Don't touch extended registers */ + + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); + + get_user_ex(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; + + get_user_ex(tmp, &sc->fpstate); + buf = compat_ptr(tmp); + } get_user_catch(err); + + err |= restore_xstate_sig(buf, 1); + + force_iret(); + + return err; +} + +asmlinkage long sys32_sigreturn(void) +{ + struct pt_regs *regs = current_pt_regs(); + struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); + sigset_t set; + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) + || (_COMPAT_NSIG_WORDS > 1 + && __copy_from_user((((char *) &set.sig) + 4), + &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + set_current_blocked(&set); + + if (ia32_restore_sigcontext(regs, &frame->sc)) + goto badframe; + return regs->ax; + +badframe: + signal_fault(regs, frame, "32bit sigreturn"); + return 0; +} + +asmlinkage long sys32_rt_sigreturn(void) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_ia32 __user *frame; + sigset_t set; + + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + set_current_blocked(&set); + + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return regs->ax; + +badframe: + signal_fault(regs, frame, "32bit rt sigreturn"); + return 0; +} + +/* + * Set up a signal frame. + */ + +static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, + void __user *fpstate, + struct pt_regs *regs, unsigned int mask) +{ + int err = 0; + + put_user_try { + put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs); + put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs); + put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds); + put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es); + + put_user_ex(regs->di, &sc->di); + put_user_ex(regs->si, &sc->si); + put_user_ex(regs->bp, &sc->bp); + put_user_ex(regs->sp, &sc->sp); + put_user_ex(regs->bx, &sc->bx); + put_user_ex(regs->dx, &sc->dx); + put_user_ex(regs->cx, &sc->cx); + put_user_ex(regs->ax, &sc->ax); + put_user_ex(current->thread.trap_nr, &sc->trapno); + put_user_ex(current->thread.error_code, &sc->err); + put_user_ex(regs->ip, &sc->ip); + put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); + put_user_ex(regs->flags, &sc->flags); + put_user_ex(regs->sp, &sc->sp_at_signal); + put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + + put_user_ex(ptr_to_compat(fpstate), &sc->fpstate); + + /* non-iBCS2 extensions.. */ + put_user_ex(mask, &sc->oldmask); + put_user_ex(current->thread.cr2, &sc->cr2); + } put_user_catch(err); + + return err; +} + +/* + * Determine which stack to use.. + */ +static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, + size_t frame_size, + void __user **fpstate) +{ + unsigned long sp; + + /* Default to using normal stack */ + sp = regs->sp; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ksig->ka.sa.sa_flags & SA_ONSTACK) + sp = sigsp(sp, ksig); + /* This is the legacy signal stack switching. */ + else if ((regs->ss & 0xffff) != __USER32_DS && + !(ksig->ka.sa.sa_flags & SA_RESTORER) && + ksig->ka.sa.sa_restorer) + sp = (unsigned long) ksig->ka.sa.sa_restorer; + + if (used_math()) { + unsigned long fx_aligned, math_size; + + sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); + *fpstate = (struct _fpstate_ia32 __user *) sp; + if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, + math_size) < 0) + return (void __user *) -1L; + } + + sp -= frame_size; + /* Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. */ + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; +} + +int ia32_setup_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs) +{ + struct sigframe_ia32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + /* copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u16 poplmovl; + u32 val; + u16 int80; + } __attribute__((packed)) code = { + 0xb858, /* popl %eax ; movl $...,%eax */ + __NR_ia32_sigreturn, + 0x80cd, /* int $0x80 */ + }; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (__put_user(sig, &frame->sig)) + return -EFAULT; + + if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; + + if (_COMPAT_NSIG_WORDS > 1) { + if (__copy_to_user(frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; + } + + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; + } else { + /* Return stub is in 32bit vsyscall page */ + if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; + else + restorer = &frame->retcode; + } + + put_user_try { + put_user_ex(ptr_to_compat(restorer), &frame->pretcode); + + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); + } put_user_catch(err); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* Make -mregparm=3 work */ + regs->ax = sig; + regs->dx = 0; + regs->cx = 0; + + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + + return 0; +} + +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe_ia32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + /* __copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; + } __attribute__((packed)) code = { + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, + 0, + }; + + frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + put_user_try { + put_user_ex(sig, &frame->sig); + put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo); + put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); + + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); + + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_rt_sigreturn; + put_user_ex(ptr_to_compat(restorer), &frame->pretcode); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. + */ + put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode); + } put_user_catch(err); + + err |= copy_siginfo_to_user32(&frame->info, &ksig->info); + err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* Make -mregparm=3 work */ + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + + return 0; +} diff --git a/kernel/arch/x86/ia32/ia32entry.S b/kernel/arch/x86/ia32/ia32entry.S new file mode 100644 index 000000000..72bf2680f --- /dev/null +++ b/kernel/arch/x86/ia32/ia32entry.S @@ -0,0 +1,611 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ + +#include <asm/dwarf2.h> +#include <asm/calling.h> +#include <asm/asm-offsets.h> +#include <asm/current.h> +#include <asm/errno.h> +#include <asm/ia32_unistd.h> +#include <asm/thread_info.h> +#include <asm/segment.h> +#include <asm/irqflags.h> +#include <asm/asm.h> +#include <asm/smap.h> +#include <linux/linkage.h> +#include <linux/err.h> + +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysexit_audit ia32_ret_from_sys_call +#define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax + xorl %eax,%eax + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) + .endm + + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + .macro LOAD_ARGS32 _r9=0 + .if \_r9 + movl R9(%rsp),%r9d + .endif + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi + movl %eax,%eax /* zero extension */ + .endm + + .macro CFI_STARTPROC32 simple + CFI_STARTPROC \simple + CFI_UNDEFINED r8 + CFI_UNDEFINED r9 + CFI_UNDEFINED r10 + CFI_UNDEFINED r11 + CFI_UNDEFINED r12 + CFI_UNDEFINED r13 + CFI_UNDEFINED r14 + CFI_UNDEFINED r15 + .endm + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) + +ENTRY(native_irq_enable_sysexit) + swapgs + sti + sysexit +ENDPROC(native_irq_enable_sysexit) +#endif + +/* + * 32bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,0 + CFI_REGISTER rsp,rbp + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + CFI_REGISTER rip,r10 + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER32_DS /* pt_regs->ss */ + pushq_cfi %rbp /* pt_regs->sp */ + CFI_REL_OFFSET rsp,0 + pushfq_cfi /* pt_regs->flags */ + pushq_cfi $__USER32_CS /* pt_regs->cs */ + pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi_reg rax /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%rbp),%ebp + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT,EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + CFI_REMEMBER_STATE + jnz sysenter_tracesys + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys +sysenter_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +sysenter_dispatch: + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp),%ecx /* User %eip */ + CFI_REGISTER rip,rcx + RESTORE_RSI_RDI + xorl %edx,%edx /* avoid info leaks */ + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + movl EFLAGS(%rsp),%r11d /* User eflags */ + /*CFI_RESTORE rflags*/ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp),%esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + + CFI_RESTORE_STATE + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r8d /* 5th arg: 4th syscall arg */ + movl %ecx,%r9d /*swap with edx*/ + movl %edx,%ecx /* 4th arg: 3rd syscall arg */ + movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ + movl %ebx,%esi /* 2nd arg: 1st syscall arg */ + movl %eax,%edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl RAX(%rsp),%eax /* reload syscall number */ + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax,%esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp),%rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + CLEAR_RREGS + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq_cfi + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ + jmp sysenter_do_call + CFI_ENDPROC +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,0 + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movl %esp,%r8d + CFI_REGISTER rsp,r8 + movq PER_CPU_VAR(kernel_stack),%rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER32_DS /* pt_regs->ss */ + pushq_cfi %r8 /* pt_regs->sp */ + CFI_REL_OFFSET rsp,0 + pushq_cfi %r11 /* pt_regs->flags */ + pushq_cfi $__USER32_CS /* pt_regs->cs */ + pushq_cfi %rcx /* pt_regs->ip */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rbp /* pt_regs->cx */ + movl %ebp,%ecx + pushq_cfi_reg rax /* pt_regs->ax */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%r8),%r9d + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + CFI_REMEMBER_STATE + jnz cstar_tracesys + cmpq $IA32_NR_syscalls-1,%rax + ja ia32_badsys +cstar_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +cstar_dispatch: + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp),%ecx + CFI_REGISTER rip,rcx + movl EFLAGS(%rsp),%r11d + /*CFI_REGISTER rflags,r11*/ + xorq %r10,%r10 + xorq %r9,%r9 + xorq %r8,%r8 + TRACE_IRQS_ON + movl RSP(%rsp),%esp + CFI_RESTORE rsp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + CFI_RESTORE_STATE + movl %r9d,R9(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_EXTRA_REGS + CLEAR_RREGS r9 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + xchgl %ebp,%r9d + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT,%rax + jmp ia32_sysret + CFI_ENDPROC + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,5*8 + /*CFI_REL_OFFSET ss,4*8 */ + CFI_REL_OFFSET rsp,3*8 + /*CFI_REL_OFFSET rflags,2*8 */ + /*CFI_REL_OFFSET cs,1*8 */ + CFI_REL_OFFSET rip,0*8 + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi_reg rax /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys +ia32_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX(%rsp) +ia32_ret_from_sys_call: + CLEAR_RREGS + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ + jmp ia32_do_call +END(ia32_syscall) + +ia32_badsys: + movq $0,ORIG_RAX(%rsp) + movq $-ENOSYS,%rax + jmp ia32_sysret + + CFI_ENDPROC + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip),%rax + jmp ia32_ptregs_common + .endm + + CFI_STARTPROC32 + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip),%rax + mov %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + CFI_ENDPROC + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SIZEOF_PTREGS + CFI_REL_OFFSET rax,RAX + CFI_REL_OFFSET rcx,RCX + CFI_REL_OFFSET rdx,RDX + CFI_REL_OFFSET rsi,RSI + CFI_REL_OFFSET rdi,RDI + CFI_REL_OFFSET rip,RIP +/* CFI_REL_OFFSET cs,CS*/ +/* CFI_REL_OFFSET rflags,EFLAGS*/ + CFI_REL_OFFSET rsp,RSP +/* CFI_REL_OFFSET ss,SS*/ + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret + CFI_ENDPROC +END(ia32_ptregs_common) diff --git a/kernel/arch/x86/ia32/sys_ia32.c b/kernel/arch/x86/ia32/sys_ia32.c new file mode 100644 index 000000000..719cd702b --- /dev/null +++ b/kernel/arch/x86/ia32/sys_ia32.c @@ -0,0 +1,231 @@ +/* + * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on + * sys_sparc32 + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> + * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * + * These routines maintain argument size conversion between 32bit and 64bit + * environment. In 2.5 most of this should be moved to a generic directory. + * + * This file assumes that there is a hole at the end of user address space. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/signal.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/utsname.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/poll.h> +#include <linux/personality.h> +#include <linux/stat.h> +#include <linux/rwsem.h> +#include <linux/compat.h> +#include <linux/vfs.h> +#include <linux/ptrace.h> +#include <linux/highuid.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <asm/mman.h> +#include <asm/types.h> +#include <asm/uaccess.h> +#include <linux/atomic.h> +#include <asm/vgtod.h> +#include <asm/sys_ia32.h> + +#define AA(__x) ((unsigned long)(__x)) + + +asmlinkage long sys32_truncate64(const char __user *filename, + unsigned long offset_low, + unsigned long offset_high) +{ + return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); +} + +asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low, + unsigned long offset_high) +{ + return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); +} + +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. + */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || + __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || + __put_user(stat->ino, &ubuf->__st_ino) || + __put_user(stat->ino, &ubuf->st_ino) || + __put_user(stat->mode, &ubuf->st_mode) || + __put_user(stat->nlink, &ubuf->st_nlink) || + __put_user(uid, &ubuf->st_uid) || + __put_user(gid, &ubuf->st_gid) || + __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) || + __put_user(stat->size, &ubuf->st_size) || + __put_user(stat->atime.tv_sec, &ubuf->st_atime) || + __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) || + __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) || + __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user(stat->blksize, &ubuf->st_blksize) || + __put_user(stat->blocks, &ubuf->st_blocks)) + return -EFAULT; + return 0; +} + +asmlinkage long sys32_stat64(const char __user *filename, + struct stat64 __user *statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +asmlinkage long sys32_lstat64(const char __user *filename, + struct stat64 __user *statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename, + struct stat64 __user *statbuf, int flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct32 { + unsigned int addr; + unsigned int len; + unsigned int prot; + unsigned int flags; + unsigned int fd; + unsigned int offset; +}; + +asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) +{ + struct mmap_arg_struct32 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset>>PAGE_SHIFT); +} + +asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr, + int options) +{ + return compat_sys_wait4(pid, stat_addr, options, NULL); +} + +/* warning: next two assume little endian */ +asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, + u32 poslo, u32 poshi) +{ + return sys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, + u32 count, u32 poslo, u32 poshi) +{ + return sys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +/* + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, + __u32 len_low, __u32 len_high, int advice) +{ + return sys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, + size_t count) +{ + return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); +} + +asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, + unsigned n_low, unsigned n_hi, int flags) +{ + return sys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); +} + +asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, + size_t len, int advice) +{ + return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); +} + +asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, + unsigned offset_hi, unsigned len_lo, + unsigned len_hi) +{ + return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} |