diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/samples/bpf | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
Collisions have been removed because its logic was found on the
source already.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/samples/bpf')
23 files changed, 1949 insertions, 53 deletions
diff --git a/kernel/samples/bpf/Makefile b/kernel/samples/bpf/Makefile index 76e3458a5..edd638b58 100644 --- a/kernel/samples/bpf/Makefile +++ b/kernel/samples/bpf/Makefile @@ -4,47 +4,76 @@ obj- := dummy.o # List of programs to build hostprogs-y := test_verifier test_maps hostprogs-y += sock_example +hostprogs-y += fds_example hostprogs-y += sockex1 hostprogs-y += sockex2 +hostprogs-y += sockex3 hostprogs-y += tracex1 hostprogs-y += tracex2 hostprogs-y += tracex3 hostprogs-y += tracex4 +hostprogs-y += tracex5 +hostprogs-y += tracex6 +hostprogs-y += trace_output +hostprogs-y += lathist test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o sock_example-objs := sock_example.o libbpf.o +fds_example-objs := bpf_load.o libbpf.o fds_example.o sockex1-objs := bpf_load.o libbpf.o sockex1_user.o sockex2-objs := bpf_load.o libbpf.o sockex2_user.o +sockex3-objs := bpf_load.o libbpf.o sockex3_user.o tracex1-objs := bpf_load.o libbpf.o tracex1_user.o tracex2-objs := bpf_load.o libbpf.o tracex2_user.o tracex3-objs := bpf_load.o libbpf.o tracex3_user.o tracex4-objs := bpf_load.o libbpf.o tracex4_user.o +tracex5-objs := bpf_load.o libbpf.o tracex5_user.o +tracex6-objs := bpf_load.o libbpf.o tracex6_user.o +trace_output-objs := bpf_load.o libbpf.o trace_output_user.o +lathist-objs := bpf_load.o libbpf.o lathist_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) always += sockex1_kern.o always += sockex2_kern.o +always += sockex3_kern.o always += tracex1_kern.o always += tracex2_kern.o always += tracex3_kern.o always += tracex4_kern.o +always += tracex5_kern.o +always += tracex6_kern.o +always += trace_output_kern.o always += tcbpf1_kern.o +always += lathist_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable +HOSTLOADLIBES_fds_example += -lelf HOSTLOADLIBES_sockex1 += -lelf HOSTLOADLIBES_sockex2 += -lelf +HOSTLOADLIBES_sockex3 += -lelf HOSTLOADLIBES_tracex1 += -lelf HOSTLOADLIBES_tracex2 += -lelf HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt +HOSTLOADLIBES_tracex5 += -lelf +HOSTLOADLIBES_tracex6 += -lelf +HOSTLOADLIBES_trace_output += -lelf -lrt +HOSTLOADLIBES_lathist += -lelf # point this to your LLVM backend with bpf support LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc -%.o: %.c +# asm/sysreg.h inline assmbly used by it is incompatible with llvm. +# But, ehere is not easy way to fix it, so just exclude it since it is +# useless for BPF samples. +$(obj)/%.o: $(src)/%.c clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ - -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ + -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ + clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ + -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=asm -o $@.s diff --git a/kernel/samples/bpf/bpf_helpers.h b/kernel/samples/bpf/bpf_helpers.h index f960b5fb3..7ad19e1db 100644 --- a/kernel/samples/bpf/bpf_helpers.h +++ b/kernel/samples/bpf/bpf_helpers.h @@ -21,6 +21,24 @@ static unsigned long long (*bpf_ktime_get_ns)(void) = (void *) BPF_FUNC_ktime_get_ns; static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) = (void *) BPF_FUNC_trace_printk; +static void (*bpf_tail_call)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_tail_call; +static unsigned long long (*bpf_get_smp_processor_id)(void) = + (void *) BPF_FUNC_get_smp_processor_id; +static unsigned long long (*bpf_get_current_pid_tgid)(void) = + (void *) BPF_FUNC_get_current_pid_tgid; +static unsigned long long (*bpf_get_current_uid_gid)(void) = + (void *) BPF_FUNC_get_current_uid_gid; +static int (*bpf_get_current_comm)(void *buf, int buf_size) = + (void *) BPF_FUNC_get_current_comm; +static int (*bpf_perf_event_read)(void *map, int index) = + (void *) BPF_FUNC_perf_event_read; +static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = + (void *) BPF_FUNC_clone_redirect; +static int (*bpf_redirect)(int ifindex, int flags) = + (void *) BPF_FUNC_redirect; +static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, int size) = + (void *) BPF_FUNC_perf_event_output; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions @@ -50,4 +68,41 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = (void *) BPF_FUNC_l4_csum_replace; +#if defined(__x86_64__) + +#define PT_REGS_PARM1(x) ((x)->di) +#define PT_REGS_PARM2(x) ((x)->si) +#define PT_REGS_PARM3(x) ((x)->dx) +#define PT_REGS_PARM4(x) ((x)->cx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->sp) +#define PT_REGS_FP(x) ((x)->bp) +#define PT_REGS_RC(x) ((x)->ax) +#define PT_REGS_SP(x) ((x)->sp) + +#elif defined(__s390x__) + +#define PT_REGS_PARM1(x) ((x)->gprs[2]) +#define PT_REGS_PARM2(x) ((x)->gprs[3]) +#define PT_REGS_PARM3(x) ((x)->gprs[4]) +#define PT_REGS_PARM4(x) ((x)->gprs[5]) +#define PT_REGS_PARM5(x) ((x)->gprs[6]) +#define PT_REGS_RET(x) ((x)->gprs[14]) +#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->gprs[2]) +#define PT_REGS_SP(x) ((x)->gprs[15]) + +#elif defined(__aarch64__) + +#define PT_REGS_PARM1(x) ((x)->regs[0]) +#define PT_REGS_PARM2(x) ((x)->regs[1]) +#define PT_REGS_PARM3(x) ((x)->regs[2]) +#define PT_REGS_PARM4(x) ((x)->regs[3]) +#define PT_REGS_PARM5(x) ((x)->regs[4]) +#define PT_REGS_RET(x) ((x)->regs[30]) +#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[0]) +#define PT_REGS_SP(x) ((x)->sp) + +#endif #endif diff --git a/kernel/samples/bpf/bpf_load.c b/kernel/samples/bpf/bpf_load.c index 38dac5a53..da86a8e0a 100644 --- a/kernel/samples/bpf/bpf_load.c +++ b/kernel/samples/bpf/bpf_load.c @@ -16,6 +16,7 @@ #include <sys/ioctl.h> #include <sys/mman.h> #include <poll.h> +#include <ctype.h> #include "libbpf.h" #include "bpf_helpers.h" #include "bpf_load.h" @@ -29,6 +30,19 @@ int map_fd[MAX_MAPS]; int prog_fd[MAX_PROGS]; int event_fd[MAX_PROGS]; int prog_cnt; +int prog_array_fd = -1; + +static int populate_prog_array(const char *event, int prog_fd) +{ + int ind = atoi(event), err; + + err = bpf_update_elem(prog_array_fd, &ind, &prog_fd, BPF_ANY); + if (err < 0) { + printf("failed to store prog_fd in prog_array\n"); + return -1; + } + return 0; +} static int load_and_attach(const char *event, struct bpf_insn *prog, int size) { @@ -54,12 +68,40 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return -1; } + fd = bpf_prog_load(prog_type, prog, size, license, kern_version); + if (fd < 0) { + printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); + return -1; + } + + prog_fd[prog_cnt++] = fd; + + if (is_socket) { + event += 6; + if (*event != '/') + return 0; + event++; + if (!isdigit(*event)) { + printf("invalid prog number\n"); + return -1; + } + return populate_prog_array(event, fd); + } + if (is_kprobe || is_kretprobe) { if (is_kprobe) event += 7; else event += 10; + if (*event == 0) { + printf("event name cannot be empty\n"); + return -1; + } + + if (isdigit(*event)) + return populate_prog_array(event, fd); + snprintf(buf, sizeof(buf), "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", is_kprobe ? 'p' : 'r', event, event); @@ -71,18 +113,6 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) } } - fd = bpf_prog_load(prog_type, prog, size, license, kern_version); - - if (fd < 0) { - printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); - return -1; - } - - prog_fd[prog_cnt++] = fd; - - if (is_socket) - return 0; - strcpy(buf, DEBUGFS); strcat(buf, "events/kprobes/"); strcat(buf, event); @@ -130,6 +160,9 @@ static int load_maps(struct bpf_map_def *maps, int len) maps[i].max_entries); if (map_fd[i] < 0) return 1; + + if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + prog_array_fd = map_fd[i]; } return 0; } diff --git a/kernel/samples/bpf/fds_example.c b/kernel/samples/bpf/fds_example.c new file mode 100644 index 000000000..e2fd16c3d --- /dev/null +++ b/kernel/samples/bpf/fds_example.c @@ -0,0 +1,183 @@ +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <string.h> +#include <assert.h> +#include <errno.h> + +#include <sys/types.h> +#include <sys/socket.h> + +#include "bpf_load.h" +#include "libbpf.h" + +#define BPF_F_PIN (1 << 0) +#define BPF_F_GET (1 << 1) +#define BPF_F_PIN_GET (BPF_F_PIN | BPF_F_GET) + +#define BPF_F_KEY (1 << 2) +#define BPF_F_VAL (1 << 3) +#define BPF_F_KEY_VAL (BPF_F_KEY | BPF_F_VAL) + +#define BPF_M_UNSPEC 0 +#define BPF_M_MAP 1 +#define BPF_M_PROG 2 + +static void usage(void) +{ + printf("Usage: fds_example [...]\n"); + printf(" -F <file> File to pin/get object\n"); + printf(" -P |- pin object\n"); + printf(" -G `- get object\n"); + printf(" -m eBPF map mode\n"); + printf(" -k <key> |- map key\n"); + printf(" -v <value> `- map value\n"); + printf(" -p eBPF prog mode\n"); + printf(" -o <object> `- object file\n"); + printf(" -h Display this help.\n"); +} + +static int bpf_map_create(void) +{ + return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), + sizeof(uint32_t), 1024); +} + +static int bpf_prog_create(const char *object) +{ + static const struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + + if (object) { + assert(!load_bpf_file((char *)object)); + return prog_fd[0]; + } else { + return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, + insns, sizeof(insns), "GPL", 0); + } +} + +static int bpf_do_map(const char *file, uint32_t flags, uint32_t key, + uint32_t value) +{ + int fd, ret; + + if (flags & BPF_F_PIN) { + fd = bpf_map_create(); + printf("bpf: map fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + + ret = bpf_obj_pin(fd, file); + printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno)); + assert(ret == 0); + } else { + fd = bpf_obj_get(file); + printf("bpf: get fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + } + + if ((flags & BPF_F_KEY_VAL) == BPF_F_KEY_VAL) { + ret = bpf_update_elem(fd, &key, &value, 0); + printf("bpf: fd:%d u->(%u:%u) ret:(%d,%s)\n", fd, key, value, + ret, strerror(errno)); + assert(ret == 0); + } else if (flags & BPF_F_KEY) { + ret = bpf_lookup_elem(fd, &key, &value); + printf("bpf: fd:%d l->(%u):%u ret:(%d,%s)\n", fd, key, value, + ret, strerror(errno)); + assert(ret == 0); + } + + return 0; +} + +static int bpf_do_prog(const char *file, uint32_t flags, const char *object) +{ + int fd, sock, ret; + + if (flags & BPF_F_PIN) { + fd = bpf_prog_create(object); + printf("bpf: prog fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + + ret = bpf_obj_pin(fd, file); + printf("bpf: pin ret:(%d,%s)\n", ret, strerror(errno)); + assert(ret == 0); + } else { + fd = bpf_obj_get(file); + printf("bpf: get fd:%d (%s)\n", fd, strerror(errno)); + assert(fd > 0); + } + + sock = open_raw_sock("lo"); + assert(sock > 0); + + ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &fd, sizeof(fd)); + printf("bpf: sock:%d <- fd:%d attached ret:(%d,%s)\n", sock, fd, + ret, strerror(errno)); + assert(ret == 0); + + return 0; +} + +int main(int argc, char **argv) +{ + const char *file = NULL, *object = NULL; + uint32_t key = 0, value = 0, flags = 0; + int opt, mode = BPF_M_UNSPEC; + + while ((opt = getopt(argc, argv, "F:PGmk:v:po:")) != -1) { + switch (opt) { + /* General args */ + case 'F': + file = optarg; + break; + case 'P': + flags |= BPF_F_PIN; + break; + case 'G': + flags |= BPF_F_GET; + break; + /* Map-related args */ + case 'm': + mode = BPF_M_MAP; + break; + case 'k': + key = strtoul(optarg, NULL, 0); + flags |= BPF_F_KEY; + break; + case 'v': + value = strtoul(optarg, NULL, 0); + flags |= BPF_F_VAL; + break; + /* Prog-related args */ + case 'p': + mode = BPF_M_PROG; + break; + case 'o': + object = optarg; + break; + default: + goto out; + } + } + + if (!(flags & BPF_F_PIN_GET) || !file) + goto out; + + switch (mode) { + case BPF_M_MAP: + return bpf_do_map(file, flags, key, value); + case BPF_M_PROG: + return bpf_do_prog(file, flags, object); + } +out: + usage(); + return -1; +} diff --git a/kernel/samples/bpf/lathist_kern.c b/kernel/samples/bpf/lathist_kern.c new file mode 100644 index 000000000..18fa08847 --- /dev/null +++ b/kernel/samples/bpf/lathist_kern.c @@ -0,0 +1,99 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2015 BMW Car IT GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +#define MAX_ENTRIES 20 +#define MAX_CPU 4 + +/* We need to stick to static allocated memory (an array instead of + * hash table) because managing dynamic memory from the + * trace_preempt_[on|off] tracepoints hooks is not supported. + */ + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u64), + .max_entries = MAX_CPU, +}; + +SEC("kprobe/trace_preempt_off") +int bpf_prog1(struct pt_regs *ctx) +{ + int cpu = bpf_get_smp_processor_id(); + u64 *ts = bpf_map_lookup_elem(&my_map, &cpu); + + if (ts) + *ts = bpf_ktime_get_ns(); + + return 0; +} + +static unsigned int log2(unsigned int v) +{ + unsigned int r; + unsigned int shift; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + + return r; +} + +static unsigned int log2l(unsigned long v) +{ + unsigned int hi = v >> 32; + + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +struct bpf_map_def SEC("maps") my_lat = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(long), + .max_entries = MAX_CPU * MAX_ENTRIES, +}; + +SEC("kprobe/trace_preempt_on") +int bpf_prog2(struct pt_regs *ctx) +{ + u64 *ts, cur_ts, delta; + int key, cpu; + long *val; + + cpu = bpf_get_smp_processor_id(); + ts = bpf_map_lookup_elem(&my_map, &cpu); + if (!ts) + return 0; + + cur_ts = bpf_ktime_get_ns(); + delta = log2l(cur_ts - *ts); + + if (delta > MAX_ENTRIES - 1) + delta = MAX_ENTRIES - 1; + + key = cpu * MAX_ENTRIES + delta; + val = bpf_map_lookup_elem(&my_lat, &key); + if (val) + __sync_fetch_and_add((long *)val, 1); + + return 0; + +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/kernel/samples/bpf/lathist_user.c b/kernel/samples/bpf/lathist_user.c new file mode 100644 index 000000000..65da8c157 --- /dev/null +++ b/kernel/samples/bpf/lathist_user.c @@ -0,0 +1,103 @@ +/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2015 BMW Car IT GmbH + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <signal.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_ENTRIES 20 +#define MAX_CPU 4 +#define MAX_STARS 40 + +struct cpu_hist { + long data[MAX_ENTRIES]; + long max; +}; + +static struct cpu_hist cpu_hist[MAX_CPU]; + +static void stars(char *str, long val, long max, int width) +{ + int i; + + for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++) + str[i] = '*'; + if (val > max) + str[i - 1] = '+'; + str[i] = '\0'; +} + +static void print_hist(void) +{ + char starstr[MAX_STARS]; + struct cpu_hist *hist; + int i, j; + + /* clear screen */ + printf("\033[2J"); + + for (j = 0; j < MAX_CPU; j++) { + hist = &cpu_hist[j]; + + /* ignore CPUs without data (maybe offline?) */ + if (hist->max == 0) + continue; + + printf("CPU %d\n", j); + printf(" latency : count distribution\n"); + for (i = 1; i <= MAX_ENTRIES; i++) { + stars(starstr, hist->data[i - 1], hist->max, MAX_STARS); + printf("%8ld -> %-8ld : %-8ld |%-*s|\n", + (1l << i) >> 1, (1l << i) - 1, + hist->data[i - 1], MAX_STARS, starstr); + } + } +} + +static void get_data(int fd) +{ + long key, value; + int c, i; + + for (i = 0; i < MAX_CPU; i++) + cpu_hist[i].max = 0; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_ENTRIES; i++) { + key = c * MAX_ENTRIES + i; + bpf_lookup_elem(fd, &key, &value); + + cpu_hist[c].data[i] = value; + if (value > cpu_hist[c].max) + cpu_hist[c].max = value; + } + } +} + +int main(int argc, char **argv) +{ + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + while (1) { + get_data(map_fd[1]); + print_hist(); + sleep(5); + } + + return 0; +} diff --git a/kernel/samples/bpf/libbpf.c b/kernel/samples/bpf/libbpf.c index 7e1efa7e2..65a8d48d2 100644 --- a/kernel/samples/bpf/libbpf.c +++ b/kernel/samples/bpf/libbpf.c @@ -103,6 +103,25 @@ int bpf_prog_load(enum bpf_prog_type prog_type, return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } +int bpf_obj_pin(int fd, const char *pathname) +{ + union bpf_attr attr = { + .pathname = ptr_to_u64((void *)pathname), + .bpf_fd = fd, + }; + + return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr)); +} + +int bpf_obj_get(const char *pathname) +{ + union bpf_attr attr = { + .pathname = ptr_to_u64((void *)pathname), + }; + + return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr)); +} + int open_raw_sock(const char *name) { struct sockaddr_ll sll; diff --git a/kernel/samples/bpf/libbpf.h b/kernel/samples/bpf/libbpf.h index 7235e292a..014aacf91 100644 --- a/kernel/samples/bpf/libbpf.h +++ b/kernel/samples/bpf/libbpf.h @@ -15,6 +15,9 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_len, const char *license, int kern_version); +int bpf_obj_pin(int fd, const char *pathname); +int bpf_obj_get(const char *pathname); + #define LOG_BUF_SIZE 65536 extern char bpf_log_buf[LOG_BUF_SIZE]; @@ -64,6 +67,14 @@ extern char bpf_log_buf[LOG_BUF_SIZE]; .off = 0, \ .imm = 0 }) +#define BPF_MOV32_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ diff --git a/kernel/samples/bpf/sockex3_kern.c b/kernel/samples/bpf/sockex3_kern.c new file mode 100644 index 000000000..41ae2fd21 --- /dev/null +++ b/kernel/samples/bpf/sockex3_kern.c @@ -0,0 +1,290 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" +#include <uapi/linux/in.h> +#include <uapi/linux/if.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/if_tunnel.h> +#include <uapi/linux/mpls.h> +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F + +struct bpf_map_def SEC("maps") jmp_table = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 8, +}; + +#define PARSE_VLAN 1 +#define PARSE_MPLS 2 +#define PARSE_IP 3 +#define PARSE_IPV6 4 + +/* protocol dispatch routine. + * It tail-calls next BPF program depending on eth proto + * Note, we could have used: + * bpf_tail_call(skb, &jmp_table, proto); + * but it would need large prog_array + */ +static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto) +{ + switch (proto) { + case ETH_P_8021Q: + case ETH_P_8021AD: + bpf_tail_call(skb, &jmp_table, PARSE_VLAN); + break; + case ETH_P_MPLS_UC: + case ETH_P_MPLS_MC: + bpf_tail_call(skb, &jmp_table, PARSE_MPLS); + break; + case ETH_P_IP: + bpf_tail_call(skb, &jmp_table, PARSE_IP); + break; + case ETH_P_IPV6: + bpf_tail_call(skb, &jmp_table, PARSE_IPV6); + break; + } +} + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct flow_keys { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u32 ip_proto; +}; + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) +{ + __u64 w0 = load_word(ctx, off); + __u64 w1 = load_word(ctx, off + 4); + __u64 w2 = load_word(ctx, off + 8); + __u64 w3 = load_word(ctx, off + 12); + + return (__u32)(w0 ^ w1 ^ w2 ^ w3); +} + +struct globals { + struct flow_keys flow; +}; + +struct bpf_map_def SEC("maps") percpu_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct globals), + .max_entries = 32, +}; + +/* user poor man's per_cpu until native support is ready */ +static struct globals *this_cpu_globals(void) +{ + u32 key = bpf_get_smp_processor_id(); + + return bpf_map_lookup_elem(&percpu_map, &key); +} + +/* some simple stats for user space consumption */ +struct pair { + __u64 packets; + __u64 bytes; +}; + +struct bpf_map_def SEC("maps") hash_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct flow_keys), + .value_size = sizeof(struct pair), + .max_entries = 1024, +}; + +static void update_stats(struct __sk_buff *skb, struct globals *g) +{ + struct flow_keys key = g->flow; + struct pair *value; + + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) { + __sync_fetch_and_add(&value->packets, 1); + __sync_fetch_and_add(&value->bytes, skb->len); + } else { + struct pair val = {1, skb->len}; + + bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); + } +} + +static __always_inline void parse_ip_proto(struct __sk_buff *skb, + struct globals *g, __u32 ip_proto) +{ + __u32 nhoff = skb->cb[0]; + int poff; + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + }; + + __u32 gre_flags = load_half(skb, + nhoff + offsetof(struct gre_hdr, flags)); + __u32 gre_proto = load_half(skb, + nhoff + offsetof(struct gre_hdr, proto)); + + if (gre_flags & (GRE_VERSION|GRE_ROUTING)) + break; + + nhoff += 4; + if (gre_flags & GRE_CSUM) + nhoff += 4; + if (gre_flags & GRE_KEY) + nhoff += 4; + if (gre_flags & GRE_SEQ) + nhoff += 4; + + skb->cb[0] = nhoff; + parse_eth_proto(skb, gre_proto); + break; + } + case IPPROTO_IPIP: + parse_eth_proto(skb, ETH_P_IP); + break; + case IPPROTO_IPV6: + parse_eth_proto(skb, ETH_P_IPV6); + break; + case IPPROTO_TCP: + case IPPROTO_UDP: + g->flow.ports = load_word(skb, nhoff); + case IPPROTO_ICMP: + g->flow.ip_proto = ip_proto; + update_stats(skb, g); + break; + default: + break; + } +} + +PROG(PARSE_IP)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, verlen, ip_proto; + + if (!g) + return 0; + + nhoff = skb->cb[0]; + + if (unlikely(ip_is_fragment(skb, nhoff))) + return 0; + + ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); + + if (ip_proto != IPPROTO_GRE) { + g->flow.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); + g->flow.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); + } + + verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); + nhoff += (verlen & 0xF) << 2; + + skb->cb[0] = nhoff; + parse_ip_proto(skb, g, ip_proto); + return 0; +} + +PROG(PARSE_IPV6)(struct __sk_buff *skb) +{ + struct globals *g = this_cpu_globals(); + __u32 nhoff, ip_proto; + + if (!g) + return 0; + + nhoff = skb->cb[0]; + + ip_proto = load_byte(skb, + nhoff + offsetof(struct ipv6hdr, nexthdr)); + g->flow.src = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, saddr)); + g->flow.dst = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, daddr)); + nhoff += sizeof(struct ipv6hdr); + + skb->cb[0] = nhoff; + parse_ip_proto(skb, g, ip_proto); + return 0; +} + +PROG(PARSE_VLAN)(struct __sk_buff *skb) +{ + __u32 nhoff, proto; + + nhoff = skb->cb[0]; + + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + skb->cb[0] = nhoff; + + parse_eth_proto(skb, proto); + + return 0; +} + +PROG(PARSE_MPLS)(struct __sk_buff *skb) +{ + __u32 nhoff, label; + + nhoff = skb->cb[0]; + + label = load_word(skb, nhoff); + nhoff += sizeof(struct mpls_label); + skb->cb[0] = nhoff; + + if (label & MPLS_LS_S_MASK) { + __u8 verlen = load_byte(skb, nhoff); + if ((verlen & 0xF0) == 4) + parse_eth_proto(skb, ETH_P_IP); + else + parse_eth_proto(skb, ETH_P_IPV6); + } else { + parse_eth_proto(skb, ETH_P_MPLS_UC); + } + + return 0; +} + +SEC("socket/0") +int main_prog(struct __sk_buff *skb) +{ + __u32 nhoff = ETH_HLEN; + __u32 proto = load_half(skb, 12); + + skb->cb[0] = nhoff; + parse_eth_proto(skb, proto); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/kernel/samples/bpf/sockex3_user.c b/kernel/samples/bpf/sockex3_user.c new file mode 100644 index 000000000..2617772d0 --- /dev/null +++ b/kernel/samples/bpf/sockex3_user.c @@ -0,0 +1,66 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <unistd.h> +#include <arpa/inet.h> + +struct flow_keys { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u32 ip_proto; +}; + +struct pair { + __u64 packets; + __u64 bytes; +}; + +int main(int argc, char **argv) +{ + char filename[256]; + FILE *f; + int i, sock; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4], + sizeof(__u32)) == 0); + + if (argc > 1) + f = popen("ping -c5 localhost", "r"); + else + f = popen("netperf -l 4 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + struct flow_keys key = {}, next_key; + struct pair value; + + sleep(1); + printf("IP src.port -> dst.port bytes packets\n"); + while (bpf_get_next_key(map_fd[2], &key, &next_key) == 0) { + bpf_lookup_elem(map_fd[2], &next_key, &value); + printf("%s.%05d -> %s.%05d %12lld %12lld\n", + inet_ntoa((struct in_addr){htonl(next_key.src)}), + next_key.port16[0], + inet_ntoa((struct in_addr){htonl(next_key.dst)}), + next_key.port16[1], + value.bytes, value.packets); + key = next_key; + } + } + return 0; +} diff --git a/kernel/samples/bpf/tcbpf1_kern.c b/kernel/samples/bpf/tcbpf1_kern.c index 7c27710f8..fa051b3d5 100644 --- a/kernel/samples/bpf/tcbpf1_kern.c +++ b/kernel/samples/bpf/tcbpf1_kern.c @@ -5,7 +5,7 @@ #include <uapi/linux/in.h> #include <uapi/linux/tcp.h> #include <uapi/linux/filter.h> - +#include <uapi/linux/pkt_cls.h> #include "bpf_helpers.h" /* compiler workaround */ @@ -21,7 +21,7 @@ static inline void set_dst_mac(struct __sk_buff *skb, char *mac) static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) { - __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); + __u8 old_tos = load_byte(skb, TOS_OFF); bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); @@ -34,7 +34,7 @@ static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) { - __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); + __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); @@ -44,7 +44,7 @@ static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) { - __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); + __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); @@ -53,7 +53,7 @@ static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) SEC("classifier") int bpf_prog1(struct __sk_buff *skb) { - __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol)); + __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; if (proto == IPPROTO_TCP) { @@ -64,4 +64,26 @@ int bpf_prog1(struct __sk_buff *skb) return 0; } +SEC("redirect_xmit") +int _redirect_xmit(struct __sk_buff *skb) +{ + return bpf_redirect(skb->ifindex + 1, 0); +} +SEC("redirect_recv") +int _redirect_recv(struct __sk_buff *skb) +{ + return bpf_redirect(skb->ifindex + 1, 1); +} +SEC("clone_redirect_xmit") +int _clone_redirect_xmit(struct __sk_buff *skb) +{ + bpf_clone_redirect(skb, skb->ifindex + 1, 0); + return TC_ACT_SHOT; +} +SEC("clone_redirect_recv") +int _clone_redirect_recv(struct __sk_buff *skb) +{ + bpf_clone_redirect(skb, skb->ifindex + 1, 1); + return TC_ACT_SHOT; +} char _license[] SEC("license") = "GPL"; diff --git a/kernel/samples/bpf/test_verifier.c b/kernel/samples/bpf/test_verifier.c index 12f3780af..563c507c0 100644 --- a/kernel/samples/bpf/test_verifier.c +++ b/kernel/samples/bpf/test_verifier.c @@ -15,20 +15,28 @@ #include <string.h> #include <linux/filter.h> #include <stddef.h> +#include <stdbool.h> +#include <sys/resource.h> #include "libbpf.h" #define MAX_INSNS 512 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define MAX_FIXUPS 8 + struct bpf_test { const char *descr; struct bpf_insn insns[MAX_INSNS]; - int fixup[32]; + int fixup[MAX_FIXUPS]; + int prog_array_fixup[MAX_FIXUPS]; const char *errstr; + const char *errstr_unpriv; enum { + UNDEF, ACCEPT, REJECT - } result; + } result, result_unpriv; + enum bpf_prog_type prog_type; }; static struct bpf_test tests[] = { @@ -95,6 +103,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .errstr = "invalid BPF_LD_IMM insn", + .errstr_unpriv = "R1 pointer comparison", .result = REJECT, }, { @@ -108,6 +117,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .errstr = "invalid BPF_LD_IMM insn", + .errstr_unpriv = "R1 pointer comparison", .result = REJECT, }, { @@ -218,6 +228,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .errstr = "R0 !read_ok", + .errstr_unpriv = "R1 pointer comparison", .result = REJECT, }, { @@ -293,7 +304,9 @@ static struct bpf_test tests[] = { BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R0 leaks addr", .result = ACCEPT, + .result_unpriv = REJECT, }, { "check corrupted spill/fill", @@ -309,6 +322,7 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, + .errstr_unpriv = "attempt to corrupt spilled", .errstr = "corrupted spill", .result = REJECT, }, @@ -472,6 +486,7 @@ static struct bpf_test tests[] = { }, .fixup = {3}, .errstr = "R0 invalid mem access", + .errstr_unpriv = "R0 leaks addr", .result = REJECT, }, { @@ -494,6 +509,8 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 pointer comparison", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -520,6 +537,8 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 pointer comparison", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -554,6 +573,8 @@ static struct bpf_test tests[] = { BPF_EXIT_INSN(), }, .fixup = {24}, + .errstr_unpriv = "R1 pointer comparison", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -602,6 +623,8 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 pointer comparison", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -641,6 +664,8 @@ static struct bpf_test tests[] = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 pointer comparison", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -698,6 +723,7 @@ static struct bpf_test tests[] = { }, .fixup = {4}, .errstr = "different pointers", + .errstr_unpriv = "R1 pointer comparison", .result = REJECT, }, { @@ -719,6 +745,7 @@ static struct bpf_test tests[] = { }, .fixup = {6}, .errstr = "different pointers", + .errstr_unpriv = "R1 pointer comparison", .result = REJECT, }, { @@ -741,7 +768,417 @@ static struct bpf_test tests[] = { }, .fixup = {7}, .errstr = "different pointers", + .errstr_unpriv = "R1 pointer comparison", + .result = REJECT, + }, + { + "check skb->mark is not writeable by sockets", + .insns = { + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .errstr_unpriv = "R1 leaks addr", + .result = REJECT, + }, + { + "check skb->tc_index is not writeable by sockets", + .insns = { + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .errstr_unpriv = "R1 leaks addr", + .result = REJECT, + }, + { + "check non-u32 access to cb", + .insns = { + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .errstr_unpriv = "R1 leaks addr", + .result = REJECT, + }, + { + "check out of range skb->cb access", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0]) + 256), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .errstr_unpriv = "", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_ACT, + }, + { + "write skb fields from socket prog", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .errstr_unpriv = "R1 leaks addr", + .result_unpriv = REJECT, + }, + { + "write skb fields from tc_cls_act prog", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, tc_index)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "", + .result_unpriv = REJECT, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, + { + "PTR_TO_STACK store/load", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -10), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 2, 0xfaceb00c), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "PTR_TO_STACK store/load - bad alignment on off", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 2, 0xfaceb00c), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 2), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "misaligned access off -6 size 8", + }, + { + "PTR_TO_STACK store/load - bad alignment on reg", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -10), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 0xfaceb00c), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "misaligned access off -2 size 8", + }, + { + "PTR_TO_STACK store/load - out of bounds low", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -80000), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 0xfaceb00c), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8), + BPF_EXIT_INSN(), + }, .result = REJECT, + .errstr = "invalid stack off=-79992 size=8", + }, + { + "PTR_TO_STACK store/load - out of bounds high", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 0xfaceb00c), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .errstr = "invalid stack off=0 size=8", + }, + { + "unpriv: return pointer", + .insns = { + BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R0 leaks addr", + }, + { + "unpriv: add const to pointer", + .insns = { + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 pointer arithmetic", + }, + { + "unpriv: add pointer to pointer", + .insns = { + BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_10), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 pointer arithmetic", + }, + { + "unpriv: neg pointer", + .insns = { + BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 pointer arithmetic", + }, + { + "unpriv: cmp pointer with const", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 pointer comparison", + }, + { + "unpriv: cmp pointer with pointer", + .insns = { + BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "R10 pointer comparison", + }, + { + "unpriv: check that printk is disallowed", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_1), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_trace_printk), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "unknown func 6", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: pass pointer to helper function", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr_unpriv = "R4 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: indirectly pass pointer on stack to helper function", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "invalid indirect read from stack off -8+0 size 8", + .result = REJECT, + }, + { + "unpriv: mangle pointer on stack 1", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), + BPF_ST_MEM(BPF_W, BPF_REG_10, -8, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "attempt to corrupt spilled", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: mangle pointer on stack 2", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), + BPF_ST_MEM(BPF_B, BPF_REG_10, -1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "attempt to corrupt spilled", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: read pointer from stack in small chunks", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid size", + .result = REJECT, + }, + { + "unpriv: write pointer into ctx", + .insns = { + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R1 leaks addr", + .result_unpriv = REJECT, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "unpriv: write pointer into map elem value", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: partial copy of pointer", + .insns = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_10), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R10 partial copy", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: pass pointer to tail_call", + .insns = { + BPF_MOV64_REG(BPF_REG_3, BPF_REG_1), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_array_fixup = {1}, + .errstr_unpriv = "R3 leaks addr into helper", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: cmp map pointer with zero", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup = {1}, + .errstr_unpriv = "R1 pointer comparison", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: write into frame pointer", + .insns = { + BPF_MOV64_REG(BPF_REG_10, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "frame pointer is read only", + .result = REJECT, + }, + { + "unpriv: cmp of frame pointer", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_10, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R10 pointer comparison", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: cmp of stack pointer", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R2 pointer comparison", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "unpriv: obfuscate stack pointer", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R2 pointer arithmetic", + .result_unpriv = REJECT, + .result = ACCEPT, }, }; @@ -758,13 +1195,24 @@ static int probe_filter_length(struct bpf_insn *fp) static int create_map(void) { - long long key, value = 0; int map_fd; - map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024); - if (map_fd < 0) { + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, + sizeof(long long), sizeof(long long), 1024); + if (map_fd < 0) printf("failed to create map '%s'\n", strerror(errno)); - } + + return map_fd; +} + +static int create_prog_array(void) +{ + int map_fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, + sizeof(int), sizeof(int), 4); + if (map_fd < 0) + printf("failed to create prog_array '%s'\n", strerror(errno)); return map_fd; } @@ -772,12 +1220,17 @@ static int create_map(void) static int test(void) { int prog_fd, i, pass_cnt = 0, err_cnt = 0; + bool unpriv = geteuid() != 0; for (i = 0; i < ARRAY_SIZE(tests); i++) { struct bpf_insn *prog = tests[i].insns; + int prog_type = tests[i].prog_type; int prog_len = probe_filter_length(prog); int *fixup = tests[i].fixup; - int map_fd = -1; + int *prog_array_fixup = tests[i].prog_array_fixup; + int expected_result; + const char *expected_errstr; + int map_fd = -1, prog_array_fd = -1; if (*fixup) { map_fd = create_map(); @@ -787,13 +1240,31 @@ static int test(void) fixup++; } while (*fixup); } + if (*prog_array_fixup) { + prog_array_fd = create_prog_array(); + + do { + prog[*prog_array_fixup].imm = prog_array_fd; + prog_array_fixup++; + } while (*prog_array_fixup); + } printf("#%d %s ", i, tests[i].descr); - prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, - prog_len * sizeof(struct bpf_insn), + prog_fd = bpf_prog_load(prog_type ?: BPF_PROG_TYPE_SOCKET_FILTER, + prog, prog_len * sizeof(struct bpf_insn), "GPL", 0); - if (tests[i].result == ACCEPT) { + if (unpriv && tests[i].result_unpriv != UNDEF) + expected_result = tests[i].result_unpriv; + else + expected_result = tests[i].result; + + if (unpriv && tests[i].errstr_unpriv) + expected_errstr = tests[i].errstr_unpriv; + else + expected_errstr = tests[i].errstr; + + if (expected_result == ACCEPT) { if (prog_fd < 0) { printf("FAIL\nfailed to load prog '%s'\n", strerror(errno)); @@ -808,7 +1279,7 @@ static int test(void) err_cnt++; goto fail; } - if (strstr(bpf_log_buf, tests[i].errstr) == 0) { + if (strstr(bpf_log_buf, expected_errstr) == 0) { printf("FAIL\nunexpected error message: %s", bpf_log_buf); err_cnt++; @@ -821,6 +1292,8 @@ static int test(void) fail: if (map_fd >= 0) close(map_fd); + if (prog_array_fd >= 0) + close(prog_array_fd); close(prog_fd); } @@ -831,5 +1304,8 @@ fail: int main(void) { + struct rlimit r = {1 << 20, 1 << 20}; + + setrlimit(RLIMIT_MEMLOCK, &r); return test(); } diff --git a/kernel/samples/bpf/trace_output_kern.c b/kernel/samples/bpf/trace_output_kern.c new file mode 100644 index 000000000..8d8d1ec42 --- /dev/null +++ b/kernel/samples/bpf/trace_output_kern.c @@ -0,0 +1,31 @@ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = 2, +}; + +SEC("kprobe/sys_write") +int bpf_prog1(struct pt_regs *ctx) +{ + struct S { + u64 pid; + u64 cookie; + } data; + + memset(&data, 0, sizeof(data)); + data.pid = bpf_get_current_pid_tgid(); + data.cookie = 0x12345678; + + bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data)); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/kernel/samples/bpf/trace_output_user.c b/kernel/samples/bpf/trace_output_user.c new file mode 100644 index 000000000..661a7d052 --- /dev/null +++ b/kernel/samples/bpf/trace_output_user.c @@ -0,0 +1,196 @@ +/* This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <fcntl.h> +#include <poll.h> +#include <sys/ioctl.h> +#include <linux/perf_event.h> +#include <linux/bpf.h> +#include <errno.h> +#include <assert.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <time.h> +#include <signal.h> +#include "libbpf.h" +#include "bpf_load.h" + +static int pmu_fd; + +int page_size; +int page_cnt = 8; +volatile struct perf_event_mmap_page *header; + +typedef void (*print_fn)(void *data, int size); + +static int perf_event_mmap(int fd) +{ + void *base; + int mmap_size; + + page_size = getpagesize(); + mmap_size = page_size * (page_cnt + 1); + + base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (base == MAP_FAILED) { + printf("mmap err\n"); + return -1; + } + + header = base; + return 0; +} + +static int perf_event_poll(int fd) +{ + struct pollfd pfd = { .fd = fd, .events = POLLIN }; + + return poll(&pfd, 1, 1000); +} + +struct perf_event_sample { + struct perf_event_header header; + __u32 size; + char data[]; +}; + +void perf_event_read(print_fn fn) +{ + __u64 data_tail = header->data_tail; + __u64 data_head = header->data_head; + __u64 buffer_size = page_cnt * page_size; + void *base, *begin, *end; + char buf[256]; + + asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ + if (data_head == data_tail) + return; + + base = ((char *)header) + page_size; + + begin = base + data_tail % buffer_size; + end = base + data_head % buffer_size; + + while (begin != end) { + struct perf_event_sample *e; + + e = begin; + if (begin + e->header.size > base + buffer_size) { + long len = base + buffer_size - begin; + + assert(len < e->header.size); + memcpy(buf, begin, len); + memcpy(buf + len, base, e->header.size - len); + e = (void *) buf; + begin = base + e->header.size - len; + } else if (begin + e->header.size == base + buffer_size) { + begin = base; + } else { + begin += e->header.size; + } + + if (e->header.type == PERF_RECORD_SAMPLE) { + fn(e->data, e->size); + } else if (e->header.type == PERF_RECORD_LOST) { + struct { + struct perf_event_header header; + __u64 id; + __u64 lost; + } *lost = (void *) e; + printf("lost %lld events\n", lost->lost); + } else { + printf("unknown event type=%d size=%d\n", + e->header.type, e->header.size); + } + } + + __sync_synchronize(); /* smp_mb() */ + header->data_tail = data_head; +} + +static __u64 time_get_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ull + ts.tv_nsec; +} + +static __u64 start_time; + +#define MAX_CNT 100000ll + +static void print_bpf_output(void *data, int size) +{ + static __u64 cnt; + struct { + __u64 pid; + __u64 cookie; + } *e = data; + + if (e->cookie != 0x12345678) { + printf("BUG pid %llx cookie %llx sized %d\n", + e->pid, e->cookie, size); + kill(0, SIGINT); + } + + cnt++; + + if (cnt == MAX_CNT) { + printf("recv %lld events per sec\n", + MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + kill(0, SIGINT); + } +} + +static void test_bpf_perf_event(void) +{ + struct perf_event_attr attr = { + .sample_type = PERF_SAMPLE_RAW, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_BPF_OUTPUT, + }; + int key = 0; + + pmu_fd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); + + assert(pmu_fd >= 0); + assert(bpf_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0); + ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); +} + +int main(int argc, char **argv) +{ + char filename[256]; + FILE *f; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + test_bpf_perf_event(); + + if (perf_event_mmap(pmu_fd) < 0) + return 1; + + f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r"); + (void) f; + + start_time = time_get_ns(); + for (;;) { + perf_event_poll(pmu_fd); + perf_event_read(print_bpf_output); + } + + return 0; +} diff --git a/kernel/samples/bpf/tracex1_kern.c b/kernel/samples/bpf/tracex1_kern.c index 316204637..3f450a8fa 100644 --- a/kernel/samples/bpf/tracex1_kern.c +++ b/kernel/samples/bpf/tracex1_kern.c @@ -29,7 +29,7 @@ int bpf_prog1(struct pt_regs *ctx) int len; /* non-portable! works for the given kernel only */ - skb = (struct sk_buff *) ctx->di; + skb = (struct sk_buff *) PT_REGS_PARM1(ctx); dev = _(skb->dev); diff --git a/kernel/samples/bpf/tracex2_kern.c b/kernel/samples/bpf/tracex2_kern.c index 19ec1cfc4..b32367cfb 100644 --- a/kernel/samples/bpf/tracex2_kern.c +++ b/kernel/samples/bpf/tracex2_kern.c @@ -27,10 +27,10 @@ int bpf_prog2(struct pt_regs *ctx) long init_val = 1; long *value; - /* x64 specific: read ip of kfree_skb caller. + /* x64/s390x specific: read ip of kfree_skb caller. * non-portable version of __builtin_return_address(0) */ - bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp); + bpf_probe_read(&loc, sizeof(loc), (void *)PT_REGS_RET(ctx)); value = bpf_map_lookup_elem(&my_map, &loc); if (value) @@ -62,24 +62,38 @@ static unsigned int log2l(unsigned long v) return log2(v); } +struct hist_key { + char comm[16]; + u64 pid_tgid; + u64 uid_gid; + u32 index; +}; + struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct hist_key), .value_size = sizeof(long), - .max_entries = 64, + .max_entries = 1024, }; SEC("kprobe/sys_write") int bpf_prog3(struct pt_regs *ctx) { - long write_size = ctx->dx; /* arg3 */ + long write_size = PT_REGS_PARM3(ctx); long init_val = 1; long *value; - u32 index = log2l(write_size); + struct hist_key key = {}; + + key.index = log2l(write_size); + key.pid_tgid = bpf_get_current_pid_tgid(); + key.uid_gid = bpf_get_current_uid_gid(); + bpf_get_current_comm(&key.comm, sizeof(key.comm)); - value = bpf_map_lookup_elem(&my_hist_map, &index); + value = bpf_map_lookup_elem(&my_hist_map, &key); if (value) __sync_fetch_and_add(value, 1); + else + bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY); return 0; } char _license[] SEC("license") = "GPL"; diff --git a/kernel/samples/bpf/tracex2_user.c b/kernel/samples/bpf/tracex2_user.c index 91b8d0896..cd0241c14 100644 --- a/kernel/samples/bpf/tracex2_user.c +++ b/kernel/samples/bpf/tracex2_user.c @@ -3,6 +3,7 @@ #include <stdlib.h> #include <signal.h> #include <linux/bpf.h> +#include <string.h> #include "libbpf.h" #include "bpf_load.h" @@ -20,23 +21,42 @@ static void stars(char *str, long val, long max, int width) str[i] = '\0'; } -static void print_hist(int fd) +struct task { + char comm[16]; + __u64 pid_tgid; + __u64 uid_gid; +}; + +struct hist_key { + struct task t; + __u32 index; +}; + +#define SIZE sizeof(struct task) + +static void print_hist_for_pid(int fd, void *task) { - int key; + struct hist_key key = {}, next_key; + char starstr[MAX_STARS]; long value; long data[MAX_INDEX] = {}; - char starstr[MAX_STARS]; - int i; int max_ind = -1; long max_value = 0; + int i, ind; - for (key = 0; key < MAX_INDEX; key++) { - bpf_lookup_elem(fd, &key, &value); - data[key] = value; - if (value && key > max_ind) - max_ind = key; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + if (memcmp(&next_key, task, SIZE)) { + key = next_key; + continue; + } + bpf_lookup_elem(fd, &next_key, &value); + ind = next_key.index; + data[ind] = value; + if (value && ind > max_ind) + max_ind = ind; if (value > max_value) max_value = value; + key = next_key; } printf(" syscall write() stats\n"); @@ -48,6 +68,35 @@ static void print_hist(int fd) MAX_STARS, starstr); } } + +static void print_hist(int fd) +{ + struct hist_key key = {}, next_key; + static struct task tasks[1024]; + int task_cnt = 0; + int i; + + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + int found = 0; + + for (i = 0; i < task_cnt; i++) + if (memcmp(&tasks[i], &next_key, SIZE) == 0) + found = 1; + if (!found) + memcpy(&tasks[task_cnt++], &next_key, SIZE); + key = next_key; + } + + for (i = 0; i < task_cnt; i++) { + printf("\npid %d cmd %s uid %d\n", + (__u32) tasks[i].pid_tgid, + tasks[i].comm, + (__u32) tasks[i].uid_gid); + print_hist_for_pid(fd, &tasks[i]); + } + +} + static void int_exit(int sig) { print_hist(map_fd[1]); diff --git a/kernel/samples/bpf/tracex3_kern.c b/kernel/samples/bpf/tracex3_kern.c index 255ff2792..bf337fbb0 100644 --- a/kernel/samples/bpf/tracex3_kern.c +++ b/kernel/samples/bpf/tracex3_kern.c @@ -23,7 +23,7 @@ struct bpf_map_def SEC("maps") my_map = { SEC("kprobe/blk_mq_start_request") int bpf_prog1(struct pt_regs *ctx) { - long rq = ctx->di; + long rq = PT_REGS_PARM1(ctx); u64 val = bpf_ktime_get_ns(); bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY); @@ -51,7 +51,7 @@ struct bpf_map_def SEC("maps") lat_map = { SEC("kprobe/blk_update_request") int bpf_prog2(struct pt_regs *ctx) { - long rq = ctx->di; + long rq = PT_REGS_PARM1(ctx); u64 *value, l, base; u32 index; diff --git a/kernel/samples/bpf/tracex4_kern.c b/kernel/samples/bpf/tracex4_kern.c index 126b80512..ac4671420 100644 --- a/kernel/samples/bpf/tracex4_kern.c +++ b/kernel/samples/bpf/tracex4_kern.c @@ -27,7 +27,7 @@ struct bpf_map_def SEC("maps") my_map = { SEC("kprobe/kmem_cache_free") int bpf_prog1(struct pt_regs *ctx) { - long ptr = ctx->si; + long ptr = PT_REGS_PARM2(ctx); bpf_map_delete_elem(&my_map, &ptr); return 0; @@ -36,11 +36,11 @@ int bpf_prog1(struct pt_regs *ctx) SEC("kretprobe/kmem_cache_alloc_node") int bpf_prog2(struct pt_regs *ctx) { - long ptr = ctx->ax; + long ptr = PT_REGS_RC(ctx); long ip = 0; /* get ip address of kmem_cache_alloc_node() caller */ - bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip))); + bpf_probe_read(&ip, sizeof(ip), (void *)(PT_REGS_FP(ctx) + sizeof(ip))); struct pair v = { .val = bpf_ktime_get_ns(), diff --git a/kernel/samples/bpf/tracex5_kern.c b/kernel/samples/bpf/tracex5_kern.c new file mode 100644 index 000000000..b3f4295bf --- /dev/null +++ b/kernel/samples/bpf/tracex5_kern.c @@ -0,0 +1,75 @@ +/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/seccomp.h> +#include "bpf_helpers.h" + +#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F + +struct bpf_map_def SEC("maps") progs = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1024, +}; + +SEC("kprobe/seccomp_phase1") +int bpf_prog1(struct pt_regs *ctx) +{ + struct seccomp_data sd = {}; + + bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx)); + + /* dispatch into next BPF program depending on syscall number */ + bpf_tail_call(ctx, &progs, sd.nr); + + /* fall through -> unknown syscall */ + if (sd.nr >= __NR_getuid && sd.nr <= __NR_getsid) { + char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n"; + bpf_trace_printk(fmt, sizeof(fmt), sd.nr); + } + return 0; +} + +/* we jump here when syscall number == __NR_write */ +PROG(__NR_write)(struct pt_regs *ctx) +{ + struct seccomp_data sd = {}; + + bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx)); + if (sd.args[2] == 512) { + char fmt[] = "write(fd=%d, buf=%p, size=%d)\n"; + bpf_trace_printk(fmt, sizeof(fmt), + sd.args[0], sd.args[1], sd.args[2]); + } + return 0; +} + +PROG(__NR_read)(struct pt_regs *ctx) +{ + struct seccomp_data sd = {}; + + bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx)); + if (sd.args[2] > 128 && sd.args[2] <= 1024) { + char fmt[] = "read(fd=%d, buf=%p, size=%d)\n"; + bpf_trace_printk(fmt, sizeof(fmt), + sd.args[0], sd.args[1], sd.args[2]); + } + return 0; +} + +PROG(__NR_mmap)(struct pt_regs *ctx) +{ + char fmt[] = "mmap\n"; + bpf_trace_printk(fmt, sizeof(fmt)); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/kernel/samples/bpf/tracex5_user.c b/kernel/samples/bpf/tracex5_user.c new file mode 100644 index 000000000..a04dd3cd4 --- /dev/null +++ b/kernel/samples/bpf/tracex5_user.c @@ -0,0 +1,46 @@ +#include <stdio.h> +#include <linux/bpf.h> +#include <unistd.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <sys/prctl.h> +#include "libbpf.h" +#include "bpf_load.h" + +/* install fake seccomp program to enable seccomp code path inside the kernel, + * so that our kprobe attached to seccomp_phase1() can be triggered + */ +static void install_accept_all_seccomp(void) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + if (prctl(PR_SET_SECCOMP, 2, &prog)) + perror("prctl"); +} + +int main(int ac, char **argv) +{ + FILE *f; + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + install_accept_all_seccomp(); + + f = popen("dd if=/dev/zero of=/dev/null count=5", "r"); + (void) f; + + read_trace_pipe(); + + return 0; +} diff --git a/kernel/samples/bpf/tracex6_kern.c b/kernel/samples/bpf/tracex6_kern.c new file mode 100644 index 000000000..be479c4af --- /dev/null +++ b/kernel/samples/bpf/tracex6_kern.c @@ -0,0 +1,27 @@ +#include <linux/ptrace.h> +#include <linux/version.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(u32), + .max_entries = 32, +}; + +SEC("kprobe/sys_write") +int bpf_prog1(struct pt_regs *ctx) +{ + u64 count; + u32 key = bpf_get_smp_processor_id(); + char fmt[] = "CPU-%d %llu\n"; + + count = bpf_perf_event_read(&my_map, key); + bpf_trace_printk(fmt, sizeof(fmt), key, count); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/kernel/samples/bpf/tracex6_user.c b/kernel/samples/bpf/tracex6_user.c new file mode 100644 index 000000000..8ea4976cf --- /dev/null +++ b/kernel/samples/bpf/tracex6_user.c @@ -0,0 +1,72 @@ +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <fcntl.h> +#include <poll.h> +#include <sys/ioctl.h> +#include <linux/perf_event.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" + +#define SAMPLE_PERIOD 0x7fffffffffffffffULL + +static void test_bpf_perf_event(void) +{ + int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + int *pmu_fd = malloc(nr_cpus * sizeof(int)); + int status, i; + + struct perf_event_attr attr_insn_pmu = { + .freq = 0, + .sample_period = SAMPLE_PERIOD, + .inherit = 0, + .type = PERF_TYPE_HARDWARE, + .read_format = 0, + .sample_type = 0, + .config = 0,/* PMU: cycles */ + }; + + for (i = 0; i < nr_cpus; i++) { + pmu_fd[i] = perf_event_open(&attr_insn_pmu, -1/*pid*/, i/*cpu*/, -1/*group_fd*/, 0); + if (pmu_fd[i] < 0) { + printf("event syscall failed\n"); + goto exit; + } + + bpf_update_elem(map_fd[0], &i, &pmu_fd[i], BPF_ANY); + ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0); + } + + status = system("ls > /dev/null"); + if (status) + goto exit; + status = system("sleep 2"); + if (status) + goto exit; + +exit: + for (i = 0; i < nr_cpus; i++) + close(pmu_fd[i]); + close(map_fd[0]); + free(pmu_fd); +} + +int main(int argc, char **argv) +{ + char filename[256]; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + test_bpf_perf_event(); + read_trace_pipe(); + + return 0; +} |