diff options
Diffstat (limited to 'kernel/samples/bpf')
-rw-r--r-- | kernel/samples/bpf/Makefile | 50 | ||||
-rw-r--r-- | kernel/samples/bpf/bpf_helpers.h | 53 | ||||
-rw-r--r-- | kernel/samples/bpf/bpf_load.c | 312 | ||||
-rw-r--r-- | kernel/samples/bpf/bpf_load.h | 27 | ||||
-rw-r--r-- | kernel/samples/bpf/libbpf.c | 135 | ||||
-rw-r--r-- | kernel/samples/bpf/libbpf.h | 190 | ||||
-rw-r--r-- | kernel/samples/bpf/sock_example.c | 101 | ||||
-rw-r--r-- | kernel/samples/bpf/sockex1_kern.c | 29 | ||||
-rw-r--r-- | kernel/samples/bpf/sockex1_user.c | 49 | ||||
-rw-r--r-- | kernel/samples/bpf/sockex2_kern.c | 221 | ||||
-rw-r--r-- | kernel/samples/bpf/sockex2_user.c | 49 | ||||
-rw-r--r-- | kernel/samples/bpf/tcbpf1_kern.c | 67 | ||||
-rw-r--r-- | kernel/samples/bpf/test_maps.c | 291 | ||||
-rw-r--r-- | kernel/samples/bpf/test_verifier.c | 835 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex1_kern.c | 50 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex1_user.c | 25 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex2_kern.c | 86 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex2_user.c | 95 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex3_kern.c | 89 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex3_user.c | 150 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex4_kern.c | 54 | ||||
-rw-r--r-- | kernel/samples/bpf/tracex4_user.c | 69 |
22 files changed, 3027 insertions, 0 deletions
diff --git a/kernel/samples/bpf/Makefile b/kernel/samples/bpf/Makefile new file mode 100644 index 000000000..76e3458a5 --- /dev/null +++ b/kernel/samples/bpf/Makefile @@ -0,0 +1,50 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := test_verifier test_maps +hostprogs-y += sock_example +hostprogs-y += sockex1 +hostprogs-y += sockex2 +hostprogs-y += tracex1 +hostprogs-y += tracex2 +hostprogs-y += tracex3 +hostprogs-y += tracex4 + +test_verifier-objs := test_verifier.o libbpf.o +test_maps-objs := test_maps.o libbpf.o +sock_example-objs := sock_example.o libbpf.o +sockex1-objs := bpf_load.o libbpf.o sockex1_user.o +sockex2-objs := bpf_load.o libbpf.o sockex2_user.o +tracex1-objs := bpf_load.o libbpf.o tracex1_user.o +tracex2-objs := bpf_load.o libbpf.o tracex2_user.o +tracex3-objs := bpf_load.o libbpf.o tracex3_user.o +tracex4-objs := bpf_load.o libbpf.o tracex4_user.o + +# Tell kbuild to always build the programs +always := $(hostprogs-y) +always += sockex1_kern.o +always += sockex2_kern.o +always += tracex1_kern.o +always += tracex2_kern.o +always += tracex3_kern.o +always += tracex4_kern.o +always += tcbpf1_kern.o + +HOSTCFLAGS += -I$(objtree)/usr/include + +HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable +HOSTLOADLIBES_sockex1 += -lelf +HOSTLOADLIBES_sockex2 += -lelf +HOSTLOADLIBES_tracex1 += -lelf +HOSTLOADLIBES_tracex2 += -lelf +HOSTLOADLIBES_tracex3 += -lelf +HOSTLOADLIBES_tracex4 += -lelf -lrt + +# point this to your LLVM backend with bpf support +LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc + +%.o: %.c + clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \ + -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ + -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/kernel/samples/bpf/bpf_helpers.h b/kernel/samples/bpf/bpf_helpers.h new file mode 100644 index 000000000..f960b5fb3 --- /dev/null +++ b/kernel/samples/bpf/bpf_helpers.h @@ -0,0 +1,53 @@ +#ifndef __BPF_HELPERS_H +#define __BPF_HELPERS_H + +/* helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by elf_bpf loader + */ +#define SEC(NAME) __attribute__((section(NAME), used)) + +/* helper functions called from eBPF programs written in C */ +static void *(*bpf_map_lookup_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_lookup_elem; +static int (*bpf_map_update_elem)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_update_elem; +static int (*bpf_map_delete_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_delete_elem; +static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read; +static unsigned long long (*bpf_ktime_get_ns)(void) = + (void *) BPF_FUNC_ktime_get_ns; +static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= + (void *) BPF_FUNC_trace_printk; + +/* llvm builtin functions that eBPF C program may use to + * emit BPF_LD_ABS and BPF_LD_IND instructions + */ +struct sk_buff; +unsigned long long load_byte(void *skb, + unsigned long long off) asm("llvm.bpf.load.byte"); +unsigned long long load_half(void *skb, + unsigned long long off) asm("llvm.bpf.load.half"); +unsigned long long load_word(void *skb, + unsigned long long off) asm("llvm.bpf.load.word"); + +/* a helper structure used by eBPF C program + * to describe map attributes to elf_bpf loader + */ +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; +}; + +static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = + (void *) BPF_FUNC_skb_store_bytes; +static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l3_csum_replace; +static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l4_csum_replace; + +#endif diff --git a/kernel/samples/bpf/bpf_load.c b/kernel/samples/bpf/bpf_load.c new file mode 100644 index 000000000..38dac5a53 --- /dev/null +++ b/kernel/samples/bpf/bpf_load.c @@ -0,0 +1,312 @@ +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <libelf.h> +#include <gelf.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> +#include <stdbool.h> +#include <stdlib.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/perf_event.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <poll.h> +#include "libbpf.h" +#include "bpf_helpers.h" +#include "bpf_load.h" + +#define DEBUGFS "/sys/kernel/debug/tracing/" + +static char license[128]; +static int kern_version; +static bool processed_sec[128]; +int map_fd[MAX_MAPS]; +int prog_fd[MAX_PROGS]; +int event_fd[MAX_PROGS]; +int prog_cnt; + +static int load_and_attach(const char *event, struct bpf_insn *prog, int size) +{ + bool is_socket = strncmp(event, "socket", 6) == 0; + bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; + bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; + enum bpf_prog_type prog_type; + char buf[256]; + int fd, efd, err, id; + struct perf_event_attr attr = {}; + + attr.type = PERF_TYPE_TRACEPOINT; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = 1; + attr.wakeup_events = 1; + + if (is_socket) { + prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + } else if (is_kprobe || is_kretprobe) { + prog_type = BPF_PROG_TYPE_KPROBE; + } else { + printf("Unknown event '%s'\n", event); + return -1; + } + + if (is_kprobe || is_kretprobe) { + if (is_kprobe) + event += 7; + else + event += 10; + + snprintf(buf, sizeof(buf), + "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", + is_kprobe ? 
'p' : 'r', event, event); + err = system(buf); + if (err < 0) { + printf("failed to create kprobe '%s' error '%s'\n", + event, strerror(errno)); + return -1; + } + } + + fd = bpf_prog_load(prog_type, prog, size, license, kern_version); + + if (fd < 0) { + printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf); + return -1; + } + + prog_fd[prog_cnt++] = fd; + + if (is_socket) + return 0; + + strcpy(buf, DEBUGFS); + strcat(buf, "events/kprobes/"); + strcat(buf, event); + strcat(buf, "/id"); + + efd = open(buf, O_RDONLY, 0); + if (efd < 0) { + printf("failed to open event %s\n", event); + return -1; + } + + err = read(efd, buf, sizeof(buf)); + if (err < 0 || err >= sizeof(buf)) { + printf("read from '%s' failed '%s'\n", event, strerror(errno)); + return -1; + } + + close(efd); + + buf[err] = 0; + id = atoi(buf); + attr.config = id; + + efd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0); + if (efd < 0) { + printf("event %d fd %d err %s\n", id, efd, strerror(errno)); + return -1; + } + event_fd[prog_cnt - 1] = efd; + ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); + ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); + + return 0; +} + +static int load_maps(struct bpf_map_def *maps, int len) +{ + int i; + + for (i = 0; i < len / sizeof(struct bpf_map_def); i++) { + + map_fd[i] = bpf_create_map(maps[i].type, + maps[i].key_size, + maps[i].value_size, + maps[i].max_entries); + if (map_fd[i] < 0) + return 1; + } + return 0; +} + +static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, + GElf_Shdr *shdr, Elf_Data **data) +{ + Elf_Scn *scn; + + scn = elf_getscn(elf, i); + if (!scn) + return 1; + + if (gelf_getshdr(scn, shdr) != shdr) + return 2; + + *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name); + if (!*shname || !shdr->sh_size) + return 3; + + *data = elf_getdata(scn, 0); + if (!*data || elf_getdata(scn, *data) != NULL) + return 4; + + return 0; +} + +static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, + GElf_Shdr *shdr, struct bpf_insn *insn) +{ + int i, nrels; + + nrels = shdr->sh_size / shdr->sh_entsize; + + for (i = 0; i < nrels; i++) { + GElf_Sym sym; + GElf_Rel rel; + unsigned int insn_idx; + + gelf_getrel(data, i, &rel); + + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + + gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym); + + if (insn[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { + printf("invalid relo for insn[%d].code 0x%x\n", + insn_idx, insn[insn_idx].code); + return 1; + } + insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)]; + } + + return 0; +} + +int load_bpf_file(char *path) +{ + int fd, i; + Elf *elf; + GElf_Ehdr ehdr; + GElf_Shdr shdr, shdr_prog; + Elf_Data *data, *data_prog, *symbols = NULL; + char *shname, *shname_prog; + + if (elf_version(EV_CURRENT) == EV_NONE) + return 1; + + fd = open(path, O_RDONLY, 0); + if (fd < 0) + return 1; + + elf = elf_begin(fd, ELF_C_READ, NULL); + + if (!elf) + return 1; + + if (gelf_getehdr(elf, &ehdr) != &ehdr) + return 1; + + /* clear all kprobes */ + i = system("echo \"\" > /sys/kernel/debug/tracing/kprobe_events"); + + /* scan over all elf sections to get license and map info */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (0) /* helpful for llvm debugging */ + printf("section %d:%s data %p size %zd link %d flags %d\n", + i, shname, data->d_buf, data->d_size, + shdr.sh_link, (int) shdr.sh_flags); + + if (strcmp(shname, "license") == 0) { + processed_sec[i] = true; 
+ memcpy(license, data->d_buf, data->d_size); + } else if (strcmp(shname, "version") == 0) { + processed_sec[i] = true; + if (data->d_size != sizeof(int)) { + printf("invalid size of version section %zd\n", + data->d_size); + return 1; + } + memcpy(&kern_version, data->d_buf, sizeof(int)); + } else if (strcmp(shname, "maps") == 0) { + processed_sec[i] = true; + if (load_maps(data->d_buf, data->d_size)) + return 1; + } else if (shdr.sh_type == SHT_SYMTAB) { + symbols = data; + } + } + + /* load programs that need map fixup (relocations) */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + if (shdr.sh_type == SHT_REL) { + struct bpf_insn *insns; + + if (get_sec(elf, shdr.sh_info, &ehdr, &shname_prog, + &shdr_prog, &data_prog)) + continue; + + insns = (struct bpf_insn *) data_prog->d_buf; + + processed_sec[shdr.sh_info] = true; + processed_sec[i] = true; + + if (parse_relo_and_apply(data, symbols, &shdr, insns)) + continue; + + if (memcmp(shname_prog, "kprobe/", 7) == 0 || + memcmp(shname_prog, "kretprobe/", 10) == 0 || + memcmp(shname_prog, "socket", 6) == 0) + load_and_attach(shname_prog, insns, data_prog->d_size); + } + } + + /* load programs that don't use maps */ + for (i = 1; i < ehdr.e_shnum; i++) { + + if (processed_sec[i]) + continue; + + if (get_sec(elf, i, &ehdr, &shname, &shdr, &data)) + continue; + + if (memcmp(shname, "kprobe/", 7) == 0 || + memcmp(shname, "kretprobe/", 10) == 0 || + memcmp(shname, "socket", 6) == 0) + load_and_attach(shname, data->d_buf, data->d_size); + } + + close(fd); + return 0; +} + +void read_trace_pipe(void) +{ + int trace_fd; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) + return; + + while (1) { + static char buf[4096]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf)); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + } + } +} diff --git a/kernel/samples/bpf/bpf_load.h b/kernel/samples/bpf/bpf_load.h new file mode 100644 index 000000000..cbd7c2b53 --- /dev/null +++ b/kernel/samples/bpf/bpf_load.h @@ -0,0 +1,27 @@ +#ifndef __BPF_LOAD_H +#define __BPF_LOAD_H + +#define MAX_MAPS 32 +#define MAX_PROGS 32 + +extern int map_fd[MAX_MAPS]; +extern int prog_fd[MAX_PROGS]; +extern int event_fd[MAX_PROGS]; + +/* parses elf file compiled by llvm .c->.o + * . parses 'maps' section and creates maps via BPF syscall + * . parses 'license' section and passes it to syscall + * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by + * storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD + * . 
loads eBPF programs via BPF syscall + * + * One ELF file can contain multiple BPF programs which will be loaded + * and their FDs stored stored in prog_fd array + * + * returns zero on success + */ +int load_bpf_file(char *path); + +void read_trace_pipe(void); + +#endif diff --git a/kernel/samples/bpf/libbpf.c b/kernel/samples/bpf/libbpf.c new file mode 100644 index 000000000..7e1efa7e2 --- /dev/null +++ b/kernel/samples/bpf/libbpf.c @@ -0,0 +1,135 @@ +/* eBPF mini library */ +#include <stdlib.h> +#include <stdio.h> +#include <linux/unistd.h> +#include <unistd.h> +#include <string.h> +#include <linux/netlink.h> +#include <linux/bpf.h> +#include <errno.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <linux/if_packet.h> +#include <arpa/inet.h> +#include "libbpf.h" + +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries) +{ + union bpf_attr attr = { + .map_type = map_type, + .key_size = key_size, + .value_size = value_size, + .max_entries = max_entries + }; + + return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + +int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .value = ptr_to_u64(value), + .flags = flags, + }; + + return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + +int bpf_lookup_elem(int fd, void *key, void *value) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .value = ptr_to_u64(value), + }; + + return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + +int bpf_delete_elem(int fd, void *key) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + }; + + return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); +} + +int bpf_get_next_key(int fd, void *key, void *next_key) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .next_key = ptr_to_u64(next_key), + }; + + return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); +} + +#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u)) + +char bpf_log_buf[LOG_BUF_SIZE]; + +int bpf_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, int prog_len, + const char *license, int kern_version) +{ + union bpf_attr attr = { + .prog_type = prog_type, + .insns = ptr_to_u64((void *) insns), + .insn_cnt = prog_len / sizeof(struct bpf_insn), + .license = ptr_to_u64((void *) license), + .log_buf = ptr_to_u64(bpf_log_buf), + .log_size = LOG_BUF_SIZE, + .log_level = 1, + }; + + /* assign one field outside of struct init to make sure any + * padding is zero initialized + */ + attr.kern_version = kern_version; + + bpf_log_buf[0] = 0; + + return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); +} + +int open_raw_sock(const char *name) +{ + struct sockaddr_ll sll; + int sock; + + sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL)); + if (sock < 0) { + printf("cannot create raw socket\n"); + return -1; + } + + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = if_nametoindex(name); + sll.sll_protocol = htons(ETH_P_ALL); + if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) { + printf("bind to %s: %s\n", name, strerror(errno)); + close(sock); + return -1; + } + + return sock; +} + +int perf_event_open(struct perf_event_attr *attr, int pid, int cpu, + int group_fd, unsigned long flags) +{ + return 
syscall(__NR_perf_event_open, attr, pid, cpu, + group_fd, flags); +} diff --git a/kernel/samples/bpf/libbpf.h b/kernel/samples/bpf/libbpf.h new file mode 100644 index 000000000..7235e292a --- /dev/null +++ b/kernel/samples/bpf/libbpf.h @@ -0,0 +1,190 @@ +/* eBPF mini library */ +#ifndef __LIBBPF_H +#define __LIBBPF_H + +struct bpf_insn; + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries); +int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); +int bpf_lookup_elem(int fd, void *key, void *value); +int bpf_delete_elem(int fd, void *key); +int bpf_get_next_key(int fd, void *key, void *next_key); + +int bpf_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, int insn_len, + const char *license, int kern_version); + +#define LOG_BUF_SIZE 65536 +extern char bpf_log_buf[LOG_BUF_SIZE]; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#ifndef BPF_PSEUDO_MAP_FD +# define BPF_PSEUDO_MAP_FD 1 +#endif + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ + +#define BPF_LD_ABS(SIZE, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, 
\ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +#define BPF_JMP_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +/* create RAW socket and bind to interface 'name' */ +int open_raw_sock(const char *name); + +struct perf_event_attr; +int perf_event_open(struct perf_event_attr *attr, int pid, int cpu, + int group_fd, unsigned long flags); +#endif diff --git a/kernel/samples/bpf/sock_example.c b/kernel/samples/bpf/sock_example.c new file mode 100644 index 000000000..a0ce251c5 --- /dev/null +++ b/kernel/samples/bpf/sock_example.c @@ -0,0 +1,101 @@ +/* eBPF example program: + * - creates arraymap in kernel with key 4 bytes and value 8 bytes + * + * - loads eBPF program: + * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)]; + * *(u32*)(fp - 4) = r0; + * // assuming packet is IPv4, lookup ip->proto in a map + * value = bpf_map_lookup_elem(map_fd, fp - 4); + * if (value) + * (*(u64*)value) += 1; + * + * - attaches this program to eth0 raw socket + * + * - every second user space reads map[tcp], map[udp], map[icmp] to see + * how many packets of given protocol were seen on eth0 + */ +#include <stdio.h> +#include <unistd.h> +#include <assert.h> +#include <linux/bpf.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <stddef.h> +#include "libbpf.h" + +static int test_sock(void) +{ + int sock = -1, map_fd, prog_fd, i, key; + long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), + 256); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + goto cleanup; + } + + struct bpf_insn prog[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ + BPF_EXIT_INSN(), + }; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, 
sizeof(prog), + "GPL", 0); + if (prog_fd < 0) { + printf("failed to load prog '%s'\n", strerror(errno)); + goto cleanup; + } + + sock = open_raw_sock("lo"); + + if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) < 0) { + printf("setsockopt %s\n", strerror(errno)); + goto cleanup; + } + + for (i = 0; i < 10; i++) { + key = IPPROTO_TCP; + assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0); + + key = IPPROTO_UDP; + assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0); + + key = IPPROTO_ICMP; + assert(bpf_lookup_elem(map_fd, &key, &icmp_cnt) == 0); + + printf("TCP %lld UDP %lld ICMP %lld packets\n", + tcp_cnt, udp_cnt, icmp_cnt); + sleep(1); + } + +cleanup: + /* maps, programs, raw sockets will auto cleanup on process exit */ + return 0; +} + +int main(void) +{ + FILE *f; + + f = popen("ping -c5 localhost", "r"); + (void)f; + + return test_sock(); +} diff --git a/kernel/samples/bpf/sockex1_kern.c b/kernel/samples/bpf/sockex1_kern.c new file mode 100644 index 000000000..ed18e9a49 --- /dev/null +++ b/kernel/samples/bpf/sockex1_kern.c @@ -0,0 +1,29 @@ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include "bpf_helpers.h" + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = 256, +}; + +SEC("socket1") +int bpf_prog1(struct __sk_buff *skb) +{ + int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + if (skb->pkt_type != PACKET_OUTGOING) + return 0; + + value = bpf_map_lookup_elem(&my_map, &index); + if (value) + __sync_fetch_and_add(value, skb->len); + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/kernel/samples/bpf/sockex1_user.c b/kernel/samples/bpf/sockex1_user.c new file mode 100644 index 000000000..678ce4693 --- /dev/null +++ b/kernel/samples/bpf/sockex1_user.c @@ -0,0 +1,49 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <unistd.h> +#include <arpa/inet.h> + +int main(int ac, char **argv) +{ + char filename[256]; + FILE *f; + int i, sock; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, + sizeof(prog_fd[0])) == 0); + + f = popen("ping -c5 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + long long tcp_cnt, udp_cnt, icmp_cnt; + int key; + + key = IPPROTO_TCP; + assert(bpf_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0); + + key = IPPROTO_UDP; + assert(bpf_lookup_elem(map_fd[0], &key, &udp_cnt) == 0); + + key = IPPROTO_ICMP; + assert(bpf_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); + + printf("TCP %lld UDP %lld ICMP %lld bytes\n", + tcp_cnt, udp_cnt, icmp_cnt); + sleep(1); + } + + return 0; +} diff --git a/kernel/samples/bpf/sockex2_kern.c b/kernel/samples/bpf/sockex2_kern.c new file mode 100644 index 000000000..ba0e177ff --- /dev/null +++ b/kernel/samples/bpf/sockex2_kern.c @@ -0,0 +1,221 @@ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" +#include <uapi/linux/in.h> +#include <uapi/linux/if.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/if_tunnel.h> +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct 
flow_keys { + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + __u16 thoff; + __u8 ip_proto; +}; + +static inline int proto_ports_offset(__u64 proto) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_DCCP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + case IPPROTO_UDPLITE: + return 0; + case IPPROTO_AH: + return 4; + default: + return 0; + } +} + +static inline int ip_is_fragment(struct __sk_buff *ctx, __u64 nhoff) +{ + return load_half(ctx, nhoff + offsetof(struct iphdr, frag_off)) + & (IP_MF | IP_OFFSET); +} + +static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) +{ + __u64 w0 = load_word(ctx, off); + __u64 w1 = load_word(ctx, off + 4); + __u64 w2 = load_word(ctx, off + 8); + __u64 w3 = load_word(ctx, off + 12); + + return (__u32)(w0 ^ w1 ^ w2 ^ w3); +} + +static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, + struct flow_keys *flow) +{ + __u64 verlen; + + if (unlikely(ip_is_fragment(skb, nhoff))) + *ip_proto = 0; + else + *ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol)); + + if (*ip_proto != IPPROTO_GRE) { + flow->src = load_word(skb, nhoff + offsetof(struct iphdr, saddr)); + flow->dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr)); + } + + verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); + if (likely(verlen == 0x45)) + nhoff += 20; + else + nhoff += (verlen & 0xF) << 2; + + return nhoff; +} + +static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, + struct flow_keys *flow) +{ + *ip_proto = load_byte(skb, + nhoff + offsetof(struct ipv6hdr, nexthdr)); + flow->src = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, saddr)); + flow->dst = ipv6_addr_hash(skb, + nhoff + offsetof(struct ipv6hdr, daddr)); + nhoff += sizeof(struct ipv6hdr); + + return nhoff; +} + +static inline bool flow_dissector(struct __sk_buff *skb, struct flow_keys *flow) +{ + __u64 nhoff = ETH_HLEN; + __u64 ip_proto; + __u64 proto = load_half(skb, 12); + int poff; + + if (proto == ETH_P_8021AD) { + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (proto == ETH_P_8021Q) { + proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (likely(proto == ETH_P_IP)) + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + else if (proto == ETH_P_IPV6) + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + else + return false; + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + }; + + __u64 gre_flags = load_half(skb, + nhoff + offsetof(struct gre_hdr, flags)); + __u64 gre_proto = load_half(skb, + nhoff + offsetof(struct gre_hdr, proto)); + + if (gre_flags & (GRE_VERSION|GRE_ROUTING)) + break; + + proto = gre_proto; + nhoff += 4; + if (gre_flags & GRE_CSUM) + nhoff += 4; + if (gre_flags & GRE_KEY) + nhoff += 4; + if (gre_flags & GRE_SEQ) + nhoff += 4; + + if (proto == ETH_P_8021Q) { + proto = load_half(skb, + nhoff + offsetof(struct vlan_hdr, + h_vlan_encapsulated_proto)); + nhoff += sizeof(struct vlan_hdr); + } + + if (proto == ETH_P_IP) + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + else if (proto == ETH_P_IPV6) + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + else + return false; + break; + } + case IPPROTO_IPIP: + nhoff = parse_ip(skb, nhoff, &ip_proto, flow); + break; + case IPPROTO_IPV6: + nhoff = parse_ipv6(skb, nhoff, &ip_proto, flow); + 
break; + default: + break; + } + + flow->ip_proto = ip_proto; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += poff; + flow->ports = load_word(skb, nhoff); + } + + flow->thoff = (__u16) nhoff; + + return true; +} + +struct pair { + long packets; + long bytes; +}; + +struct bpf_map_def SEC("maps") hash_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__be32), + .value_size = sizeof(struct pair), + .max_entries = 1024, +}; + +SEC("socket2") +int bpf_prog2(struct __sk_buff *skb) +{ + struct flow_keys flow; + struct pair *value; + u32 key; + + if (!flow_dissector(skb, &flow)) + return 0; + + key = flow.dst; + value = bpf_map_lookup_elem(&hash_map, &key); + if (value) { + __sync_fetch_and_add(&value->packets, 1); + __sync_fetch_and_add(&value->bytes, skb->len); + } else { + struct pair val = {1, skb->len}; + + bpf_map_update_elem(&hash_map, &key, &val, BPF_ANY); + } + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/kernel/samples/bpf/sockex2_user.c b/kernel/samples/bpf/sockex2_user.c new file mode 100644 index 000000000..29a276d76 --- /dev/null +++ b/kernel/samples/bpf/sockex2_user.c @@ -0,0 +1,49 @@ +#include <stdio.h> +#include <assert.h> +#include <linux/bpf.h> +#include "libbpf.h" +#include "bpf_load.h" +#include <unistd.h> +#include <arpa/inet.h> + +struct pair { + __u64 packets; + __u64 bytes; +}; + +int main(int ac, char **argv) +{ + char filename[256]; + FILE *f; + int i, sock; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + sock = open_raw_sock("lo"); + + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, + sizeof(prog_fd[0])) == 0); + + f = popen("ping -c5 localhost", "r"); + (void) f; + + for (i = 0; i < 5; i++) { + int key = 0, next_key; + struct pair value; + + while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) { + bpf_lookup_elem(map_fd[0], &next_key, &value); + printf("ip %s bytes %lld packets %lld\n", + inet_ntoa((struct in_addr){htonl(next_key)}), + value.bytes, value.packets); + key = next_key; + } + sleep(1); + } + return 0; +} diff --git a/kernel/samples/bpf/tcbpf1_kern.c b/kernel/samples/bpf/tcbpf1_kern.c new file mode 100644 index 000000000..7c27710f8 --- /dev/null +++ b/kernel/samples/bpf/tcbpf1_kern.c @@ -0,0 +1,67 @@ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> + +#include "bpf_helpers.h" + +/* compiler workaround */ +#define _htonl __builtin_bswap32 + +static inline void set_dst_mac(struct __sk_buff *skb, char *mac) +{ + bpf_skb_store_bytes(skb, 0, mac, ETH_ALEN, 1); +} + +#define IP_CSUM_OFF (ETH_HLEN + offsetof(struct iphdr, check)) +#define TOS_OFF (ETH_HLEN + offsetof(struct iphdr, tos)) + +static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) +{ + __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); + + bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); + bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); +} + +#define TCP_CSUM_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, check)) +#define IP_SRC_OFF (ETH_HLEN + offsetof(struct iphdr, saddr)) + +#define IS_PSEUDO 0x10 + +static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) +{ + __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, 
IS_PSEUDO | sizeof(new_ip)); + bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); + bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0); +} + +#define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) +static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) +{ + __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); + + bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); + bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); +} + +SEC("classifier") +int bpf_prog1(struct __sk_buff *skb) +{ + __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol)); + long *value; + + if (proto == IPPROTO_TCP) { + set_ip_tos(skb, 8); + set_tcp_ip_src(skb, 0xA010101); + set_tcp_dest_port(skb, 5001); + } + + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/kernel/samples/bpf/test_maps.c b/kernel/samples/bpf/test_maps.c new file mode 100644 index 000000000..6299ee95c --- /dev/null +++ b/kernel/samples/bpf/test_maps.c @@ -0,0 +1,291 @@ +/* + * Testsuite for eBPF maps + * + * Copyright (c) 2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <stdio.h> +#include <unistd.h> +#include <linux/bpf.h> +#include <errno.h> +#include <string.h> +#include <assert.h> +#include <sys/wait.h> +#include <stdlib.h> +#include "libbpf.h" + +/* sanity tests for map API */ +static void test_hashmap_sanity(int i, void *data) +{ + long long key, next_key, value; + int map_fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2); + if (map_fd < 0) { + printf("failed to create hashmap '%s'\n", strerror(errno)); + exit(1); + } + + key = 1; + value = 1234; + /* insert key=1 element */ + assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0); + + value = 0; + /* BPF_NOEXIST means: add new element if it doesn't exist */ + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 && + /* key=1 already exists */ + errno == EEXIST); + + assert(bpf_update_elem(map_fd, &key, &value, -1) == -1 && errno == EINVAL); + + /* check that key=1 can be found */ + assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234); + + key = 2; + /* check that key=2 is not found */ + assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT); + + /* BPF_EXIST means: update existing element */ + assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 && + /* key=2 is not there */ + errno == ENOENT); + + /* insert key=2 element */ + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0); + + /* key=1 and key=2 were inserted, check that key=0 cannot be inserted + * due to max_entries limit + */ + key = 0; + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 && + errno == E2BIG); + + /* check that key = 0 doesn't exist */ + assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT); + + /* iterate over two elements */ + assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 && + (next_key == 1 || next_key == 2)); + assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 && + (next_key == 1 || next_key == 2)); + assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 && + errno == ENOENT); + + /* delete both elements */ + key = 1; + assert(bpf_delete_elem(map_fd, &key) == 0); + key = 2; + 
assert(bpf_delete_elem(map_fd, &key) == 0); + assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT); + + key = 0; + /* check that map is empty */ + assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 && + errno == ENOENT); + close(map_fd); +} + +static void test_arraymap_sanity(int i, void *data) +{ + int key, next_key, map_fd; + long long value; + + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 2); + if (map_fd < 0) { + printf("failed to create arraymap '%s'\n", strerror(errno)); + exit(1); + } + + key = 1; + value = 1234; + /* insert key=1 element */ + assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0); + + value = 0; + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 && + errno == EEXIST); + + /* check that key=1 can be found */ + assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234); + + key = 0; + /* check that key=0 is also found and zero initialized */ + assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0); + + + /* key=0 and key=1 were inserted, check that key=2 cannot be inserted + * due to max_entries limit + */ + key = 2; + assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 && + errno == E2BIG); + + /* check that key = 2 doesn't exist */ + assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT); + + /* iterate over two elements */ + assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 && + next_key == 0); + assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 && + next_key == 1); + assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 && + errno == ENOENT); + + /* delete shouldn't succeed */ + key = 1; + assert(bpf_delete_elem(map_fd, &key) == -1 && errno == EINVAL); + + close(map_fd); +} + +#define MAP_SIZE (32 * 1024) +static void test_map_large(void) +{ + struct bigkey { + int a; + char b[116]; + long long c; + } key; + int map_fd, i, value; + + /* allocate 4Mbyte of memory */ + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), + MAP_SIZE); + if (map_fd < 0) { + printf("failed to create large map '%s'\n", strerror(errno)); + exit(1); + } + + for (i = 0; i < MAP_SIZE; i++) { + key = (struct bigkey) {.c = i}; + value = i; + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0); + } + key.c = -1; + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 && + errno == E2BIG); + + /* iterate through all elements */ + for (i = 0; i < MAP_SIZE; i++) + assert(bpf_get_next_key(map_fd, &key, &key) == 0); + assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT); + + key.c = 0; + assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0); + key.a = 1; + assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT); + + close(map_fd); +} + +/* fork N children and wait for them to complete */ +static void run_parallel(int tasks, void (*fn)(int i, void *data), void *data) +{ + pid_t pid[tasks]; + int i; + + for (i = 0; i < tasks; i++) { + pid[i] = fork(); + if (pid[i] == 0) { + fn(i, data); + exit(0); + } else if (pid[i] == -1) { + printf("couldn't spawn #%d process\n", i); + exit(1); + } + } + for (i = 0; i < tasks; i++) { + int status; + + assert(waitpid(pid[i], &status, 0) == pid[i]); + assert(status == 0); + } +} + +static void test_map_stress(void) +{ + run_parallel(100, test_hashmap_sanity, NULL); + run_parallel(100, test_arraymap_sanity, NULL); +} + +#define TASKS 1024 +#define DO_UPDATE 1 +#define DO_DELETE 0 +static void do_work(int fn, void *data) +{ + int map_fd = ((int *)data)[0]; + int 
do_update = ((int *)data)[1]; + int i; + int key, value; + + for (i = fn; i < MAP_SIZE; i += TASKS) { + key = value = i; + if (do_update) + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0); + else + assert(bpf_delete_elem(map_fd, &key) == 0); + } +} + +static void test_map_parallel(void) +{ + int i, map_fd, key = 0, value = 0; + int data[2]; + + map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), + MAP_SIZE); + if (map_fd < 0) { + printf("failed to create map for parallel test '%s'\n", + strerror(errno)); + exit(1); + } + + data[0] = map_fd; + data[1] = DO_UPDATE; + /* use the same map_fd in children to add elements to this map + * child_0 adds key=0, key=1024, key=2048, ... + * child_1 adds key=1, key=1025, key=2049, ... + * child_1023 adds key=1023, ... + */ + run_parallel(TASKS, do_work, data); + + /* check that key=0 is already there */ + assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 && + errno == EEXIST); + + /* check that all elements were inserted */ + key = -1; + for (i = 0; i < MAP_SIZE; i++) + assert(bpf_get_next_key(map_fd, &key, &key) == 0); + assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT); + + /* another check for all elements */ + for (i = 0; i < MAP_SIZE; i++) { + key = MAP_SIZE - i - 1; + assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && + value == key); + } + + /* now let's delete all elemenets in parallel */ + data[1] = DO_DELETE; + run_parallel(TASKS, do_work, data); + + /* nothing should be left */ + key = -1; + assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT); +} + +int main(void) +{ + test_hashmap_sanity(0, NULL); + test_arraymap_sanity(0, NULL); + test_map_large(); + test_map_parallel(); + test_map_stress(); + printf("test_maps: OK\n"); + return 0; +} diff --git a/kernel/samples/bpf/test_verifier.c b/kernel/samples/bpf/test_verifier.c new file mode 100644 index 000000000..12f3780af --- /dev/null +++ b/kernel/samples/bpf/test_verifier.c @@ -0,0 +1,835 @@ +/* + * Testsuite for eBPF verifier + * + * Copyright (c) 2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <stdio.h> +#include <unistd.h> +#include <linux/bpf.h> +#include <errno.h> +#include <linux/unistd.h> +#include <string.h> +#include <linux/filter.h> +#include <stddef.h> +#include "libbpf.h" + +#define MAX_INSNS 512 +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +struct bpf_test { + const char *descr; + struct bpf_insn insns[MAX_INSNS]; + int fixup[32]; + const char *errstr; + enum { + ACCEPT, + REJECT + } result; +}; + +static struct bpf_test tests[] = { + { + "add+sub+mul", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 2), + BPF_MOV64_IMM(BPF_REG_2, 3), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 3), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "unreachable", + .insns = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }, + .errstr = "unreachable", + .result = REJECT, + }, + { + "unreachable2", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unreachable", + .result = REJECT, + }, + { + "out of range jump", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "out of range jump2", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, -2), + BPF_EXIT_INSN(), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "test1 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM insn", + .result = REJECT, + }, + { + "test2 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM insn", + .result = REJECT, + }, + { + "test3 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "test4 ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "test5 ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "no bpf_exit", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "loop (back-edge)", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, -1), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "loop2 (back-edge)", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JA, 0, 0, -4), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "conditional loop", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = 
REJECT, + }, + { + "read uninitialized register", + .insns = { + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "read invalid register", + .insns = { + BPF_MOV64_REG(BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R15 is invalid", + .result = REJECT, + }, + { + "program doesn't init R0 before exit", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, + }, + { + "program doesn't init R0 before exit in all branches", + .insns = { + BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, + }, + { + "stack out of bounds", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack", + .result = REJECT, + }, + { + "invalid call insn1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_CALL uses reserved", + .result = REJECT, + }, + { + "invalid call insn2", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_CALL uses reserved", + .result = REJECT, + }, + { + "invalid function call", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 1234567), + BPF_EXIT_INSN(), + }, + .errstr = "invalid func 1234567", + .result = REJECT, + }, + { + "uninitialized stack1", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), + }, + .fixup = {2}, + .errstr = "invalid indirect read from stack", + .result = REJECT, + }, + { + "uninitialized stack2", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid read from stack", + .result = REJECT, + }, + { + "check valid spill/fill", + .insns = { + /* spill R1(ctx) into stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + + /* fill it back into R2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), + + /* should be able to access R0 = *(R2 + 8) */ + /* BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), */ + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check corrupted spill/fill", + .insns = { + /* spill R1(ctx) into stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + + /* mess up with R1 pointer on stack */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23), + + /* fill back into R0 should fail */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + + BPF_EXIT_INSN(), + }, + .errstr = "corrupted spill", + .result = REJECT, + }, + { + "invalid src register in STX", + .insns = { + BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R15 is invalid", + .result = REJECT, + }, + { + "invalid dst register in STX", + .insns = { + BPF_STX_MEM(BPF_B, 14, BPF_REG_10, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R14 is invalid", + .result = REJECT, + }, + { + "invalid dst register in ST", + .insns = { + BPF_ST_MEM(BPF_B, 14, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R14 is invalid", + .result = REJECT, + }, + { + "invalid src register in LDX", + .insns = { + BPF_LDX_MEM(BPF_B, BPF_REG_0, 12, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R12 is invalid", + .result = REJECT, + }, + { + "invalid dst register in LDX", + .insns = { + 
BPF_LDX_MEM(BPF_B, 11, BPF_REG_1, 0),
+		BPF_EXIT_INSN(),
+		},
+		.errstr = "R11 is invalid",
+		.result = REJECT,
+	},
+	{
+		"junk insn",
+		.insns = {
+			BPF_RAW_INSN(0, 0, 0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid BPF_LD_IMM",
+		.result = REJECT,
+	},
+	{
+		"junk insn2",
+		.insns = {
+			BPF_RAW_INSN(1, 0, 0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "BPF_LDX uses reserved fields",
+		.result = REJECT,
+	},
+	{
+		"junk insn3",
+		.insns = {
+			BPF_RAW_INSN(-1, 0, 0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid BPF_ALU opcode f0",
+		.result = REJECT,
+	},
+	{
+		"junk insn4",
+		.insns = {
+			BPF_RAW_INSN(-1, -1, -1, -1, -1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid BPF_ALU opcode f0",
+		.result = REJECT,
+	},
+	{
+		"junk insn5",
+		.insns = {
+			BPF_RAW_INSN(0x7f, -1, -1, -1, -1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "BPF_ALU uses reserved fields",
+		.result = REJECT,
+	},
+	{
+		"misaligned read from stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"invalid map_fd for function call",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "fd 0 is not pointing to valid bpf_map",
+		.result = REJECT,
+	},
+	{
+		"don't check return value before access",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {3},
+		.errstr = "R0 invalid mem access 'map_value_or_null'",
+		.result = REJECT,
+	},
+	{
+		"access memory with incorrect alignment",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {3},
+		.errstr = "misaligned access",
+		.result = REJECT,
+	},
+	{
+		"sometimes access memory with incorrect alignment",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {3},
+		.errstr = "R0 invalid mem access",
+		.result = REJECT,
+	},
+	{
+		"jump test 1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -8),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 2, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 5, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -32, 5),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"jump test 2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 14),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 11),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 2, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -32, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 8),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -40, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 5),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 2),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -48, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 5, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"jump test 3",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 19),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 3),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 15),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 2, 3),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -32, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -32),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 11),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 3),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -40, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -40),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 7),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 3),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -48, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 5, 0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -56),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {24},
+		.result = ACCEPT,
+	},
+	{
+		"jump test 4",
+		.insns = {
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"jump test 5",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+			BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"access skb fields ok",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, pkt_type)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, queue_mapping)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, protocol)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, vlan_present)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, vlan_tci)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"access skb fields bad1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -4),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"access skb fields bad2",
+		.insns = {
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 9),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, pkt_type)),
+			BPF_EXIT_INSN(),
+		},
+		.fixup = {4},
+		.errstr = "different pointers",
+		.result = REJECT,
+	},
+	{
+		"access skb fields bad3",
+		.insns = {
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, pkt_type)),
+			BPF_EXIT_INSN(),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -12),
+		},
+		.fixup = {6},
+		.errstr = "different pointers",
+		.result = REJECT,
+	},
+	{
+		"access skb fields bad4",
+		.insns = {
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 3),
+			BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, -13),
+		},
+		.fixup = {7},
+		.errstr = "different pointers",
+		.result = REJECT,
+	},
+};
+
+static int probe_filter_length(struct bpf_insn *fp)
+{
+	int len = 0;
+
+	for (len = MAX_INSNS - 1; len > 0; --len)
+		if (fp[len].code != 0 || fp[len].imm != 0)
+			break;
+
+	return len + 1;
+}
+
+static int create_map(void)
+{
+	long long key, value = 0;
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+	}
+
+	return map_fd;
+}
+
+static int test(void)
+{
+	int prog_fd, i, pass_cnt = 0, err_cnt = 0;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		struct bpf_insn *prog = tests[i].insns;
+		int prog_len = probe_filter_length(prog);
+		int *fixup = tests[i].fixup;
+		int map_fd = -1;
+
+		if (*fixup) {
+			map_fd = create_map();
+
+			do {
+				prog[*fixup].imm = map_fd;
+				fixup++;
+			} while (*fixup);
+		}
+		printf("#%d %s ", i, tests[i].descr);
+
+		prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+					prog_len * sizeof(struct bpf_insn),
+					"GPL", 0);
+
+		if (tests[i].result == ACCEPT) {
+			if (prog_fd < 0) {
+				printf("FAIL\nfailed to load prog '%s'\n",
+				       strerror(errno));
+				printf("%s", bpf_log_buf);
+				err_cnt++;
+				goto fail;
+			}
+		} else {
+			if (prog_fd >= 0) {
+				printf("FAIL\nunexpected success to load\n");
+				printf("%s", bpf_log_buf);
+				err_cnt++;
+				goto fail;
+			}
+			if (strstr(bpf_log_buf, tests[i].errstr) == 0) {
+				printf("FAIL\nunexpected error message: %s",
+				       bpf_log_buf);
+				err_cnt++;
+				goto fail;
+			}
+		}
+
+		pass_cnt++;
+		printf("OK\n");
+fail:
+		if (map_fd >= 0)
+			close(map_fd);
+		close(prog_fd);
+
+	}
+	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, err_cnt);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test();
+}
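The harness above reduces every case to the same three steps: hand-assemble a struct bpf_insn array, optionally patch real map fds into BPF_LD_MAP_FD slots via .fixup, then hand the bytes to bpf_prog_load() from libbpf.c and compare the verifier's verdict against .result. Outside the table-driven harness the same check fits in a dozen lines. The sketch below is not part of the patch; it assumes samples/bpf/libbpf.h supplies the BPF_* instruction macros, bpf_prog_load() and bpf_log_buf exactly as test_verifier.c uses them:

/* sketch: the smallest program the verifier accepts -- set r0 (the
 * return value) and exit; exiting with r0 uninitialized is rejected
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <linux/bpf.h>
#include "libbpf.h"

int main(void)
{
	struct bpf_insn prog[] = {
		BPF_MOV64_IMM(BPF_REG_0, 0),
		BPF_EXIT_INSN(),
	};
	int fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog,
			       sizeof(prog), "GPL", 0);

	if (fd < 0)
		printf("rejected: %s\n%s", strerror(errno), bpf_log_buf);
	return fd < 0;
}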
diff --git a/kernel/samples/bpf/tracex1_kern.c b/kernel/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000..316204637
--- /dev/null
+++ b/kernel/samples/bpf/tracex1_kern.c
@@ -0,0 +1,50 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+/* kprobe is NOT a stable ABI:
+ * kernel functions can be removed, renamed or completely change semantics.
+ * The number of arguments and their positions can change, etc.
+ * In such cases this bpf+kprobe example will no longer be meaningful.
+ */
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	/* attaches to the kprobe on __netif_receive_skb_core(),
+	 * looks for packets on the loopback device and prints them
+	 */
+	char devname[IFNAMSIZ] = {};
+	struct net_device *dev;
+	struct sk_buff *skb;
+	int len;
+
+	/* non-portable! works for the given kernel only */
+	skb = (struct sk_buff *) ctx->di;
+
+	dev = _(skb->dev);
+
+	len = _(skb->len);
+
+	bpf_probe_read(devname, sizeof(devname), dev->name);
+
+	if (devname[0] == 'l' && devname[1] == 'o') {
+		char fmt[] = "skb %p len %d\n";
+		/* using bpf_trace_printk() for DEBUG ONLY */
+		bpf_trace_printk(fmt, sizeof(fmt), skb, len);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/kernel/samples/bpf/tracex1_user.c b/kernel/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000..31a48183b
--- /dev/null
+++ b/kernel/samples/bpf/tracex1_user.c
@@ -0,0 +1,25 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
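Every pointer chase in bpf_prog1() has to funnel through bpf_probe_read(); the _(P) macro merely hides that. Written out by hand inside bpf_prog1(), where skb is in scope, dev = _(skb->dev) is roughly the statement expression below (a sketch of the macro expansion, not new code in the patch):

/* rough manual expansion of: dev = _(skb->dev); */
struct net_device *dev = ({
	typeof(skb->dev) val = 0;			/* stays 0 if the read faults */
	bpf_probe_read(&val, sizeof(val), &skb->dev);	/* checked kernel-memory copy */
	val;						/* result of the ({ ... }) block */
});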
diff --git a/kernel/samples/bpf/tracex2_kern.c b/kernel/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000..19ec1cfc4
--- /dev/null
+++ b/kernel/samples/bpf/tracex2_kern.c
@@ -0,0 +1,86 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kfree_skb")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long loc = 0;
+	long init_val = 1;
+	long *value;
+
+	/* x64 specific: read ip of kfree_skb caller.
+	 * non-portable version of __builtin_return_address(0)
+	 */
+	bpf_probe_read(&loc, sizeof(loc), (void *)ctx->sp);
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog3(struct pt_regs *ctx)
+{
+	long write_size = ctx->dx; /* arg3 */
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/kernel/samples/bpf/tracex2_user.c b/kernel/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000..91b8d0896
--- /dev/null
+++ b/kernel/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
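tracex2's second program builds a power-of-two histogram: bpf_prog3() picks the bucket with the branchless log2/log2l helpers, and print_hist() labels row i with the range [2^(i-1), 2^i - 1] via the (1l << i) >> 1 and (1l << i) - 1 expressions. A standalone sanity check of those row labels (sketch only, not part of the patch):

#include <stdio.h>

int main(void)
{
	int i;

	/* the same row labels print_hist() computes in tracex2_user.c */
	for (i = 1; i <= 8; i++)
		printf("%8ld -> %-8ld\n", (1l << i) >> 1, (1l << i) - 1);
	return 0;	/* 1 -> 1, 2 -> 3, 4 -> 7, ... 128 -> 255 */
}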
diff --git a/kernel/samples/bpf/tracex3_kern.c b/kernel/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000..255ff2792
--- /dev/null
+++ b/kernel/samples/bpf/tracex3_kern.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(u64),
+	.max_entries = 4096,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/blk_mq_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 val = bpf_ktime_get_ns();
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2l(unsigned long long n)
+{
+#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
+	int i = -(n == 0);
+	S(32); S(16); S(8); S(4); S(2); S(1);
+	return i;
+#undef S
+}
+
+#define SLOTS 100
+
+struct bpf_map_def SEC("maps") lat_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u64),
+	.max_entries = SLOTS,
+};
+
+SEC("kprobe/blk_update_request")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long rq = ctx->di;
+	u64 *value, l, base;
+	u32 index;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = cur_time - *value;
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	/* the lines below are computing index = log10(delta)*10
+	 * using integer arithmetic:
+	 * index = 29 ~ 1 usec
+	 * index = 59 ~ 1 msec
+	 * index = 89 ~ 1 sec
+	 * index = 99 ~ 10 sec or more
+	 * log10(x)*10 = log2(x)*10/log2(10) ~= log2(x)*3
+	 */
+	l = log2l(delta);
+	base = 1ll << l;
+	index = (l * 64 + (delta - base) * 64 / base) * 3 / 64;
+
+	if (index >= SLOTS)
+		index = SLOTS - 1;
+
+	value = bpf_map_lookup_elem(&lat_map, &index);
+	if (value)
+		__sync_fetch_and_add((long *)value, 1);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
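The slot arithmetic in bpf_prog2() deserves a second look: l * 64 + (delta - base) * 64 / base is log2(delta) in 1/64ths (integer log2 plus a linear interpolation between neighboring powers of two), and the trailing * 3 / 64 applies 10/log2(10) ~= 3 to turn it into tenths of a decade. Recompiled as plain user-space C (a sketch, not part of the patch), it reproduces the slot numbers the comment promises:

#include <stdio.h>

/* same integer log2 as tracex3_kern.c */
static unsigned int log2l(unsigned long long n)
{
#define S(k) if (n >= (1ull << k)) { i += k; n >>= k; }
	int i = -(n == 0);
	S(32); S(16); S(8); S(4); S(2); S(1);
	return i;
#undef S
}

int main(void)
{
	unsigned long long deltas[] = { 1000ull, 1000000ull, 1000000000ull };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long long l = log2l(deltas[i]);
		unsigned long long base = 1ull << l;
		unsigned int index = (l * 64 + (deltas[i] - base) * 64 / base) * 3 / 64;

		printf("%llu ns -> slot %u\n", deltas[i], index);
	}
	return 0;	/* prints slots 29, 59 and 89, matching the comment */
}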
diff --git a/kernel/samples/bpf/tracex3_user.c b/kernel/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000..0aaa933ab
--- /dev/null
+++ b/kernel/samples/bpf/tracex3_user.c
@@ -0,0 +1,150 @@
+/* Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+#define SLOTS 100
+
+static void clear_stats(int fd)
+{
+	__u32 key;
+	__u64 value = 0;
+
+	for (key = 0; key < SLOTS; key++)
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);
+
+const char nocolor[] = "\033[00m";
+
+const char *sym[] = {
+	" ",
+	" ",
+	".",
+	".",
+	"*",
+	"*",
+	"o",
+	"o",
+	"O",
+	"O",
+	"#",
+	"#",
+};
+
+bool full_range = false;
+bool text_only = false;
+
+static void print_banner(void)
+{
+	if (full_range)
+		printf("|1ns |10ns |100ns |1us |10us |100us"
+		       " |1ms |10ms |100ms |1s |10s\n");
+	else
+		printf("|1us |10us |100us |1ms |10ms "
+		       "|100ms |1s |10s\n");
+}
+
+static void print_hist(int fd)
+{
+	__u32 key;
+	__u64 value;
+	__u64 cnt[SLOTS];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+
+	for (key = 0; key < SLOTS; key++) {
+		value = 0;
+		bpf_lookup_elem(fd, &key, &value);
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = full_range ? 0 : 29; key < SLOTS; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);
+
+		if (text_only)
+			printf("%s", sym[c]);
+		else
+			printf("%s %s", color[c], nocolor);
+	}
+	printf(" # %lld\n", total_events);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 1; i < ac; i++) {
+		if (strcmp(argv[i], "-a") == 0) {
+			full_range = true;
+		} else if (strcmp(argv[i], "-t") == 0) {
+			text_only = true;
+		} else if (strcmp(argv[i], "-h") == 0) {
+			printf("Usage:\n"
+			       "  -a display wider latency range\n"
+			       "  -t text only\n");
+			return 1;
+		}
+	}
+
+	printf("  heatmap of IO latency\n");
+	if (text_only)
+		printf("  %s", sym[num_colors - 1]);
+	else
+		printf("  %s %s", color[num_colors - 1], nocolor);
+	printf(" - many events with this latency\n");
+
+	if (text_only)
+		printf("  %s", sym[0]);
+	else
+		printf("  %s %s", color[0], nocolor);
+	printf(" - few events\n");
+
+	for (i = 0; ; i++) {
+		if (i % 20 == 0)
+			print_banner();
+		print_hist(map_fd[1]);
+		sleep(2);
+	}
+
+	return 0;
+}
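One detail of print_hist() that is easy to misread: the shade index is num_colors * cnt[key] / (max_cnt + 1), where the + 1 both avoids dividing by zero when an interval saw no events and guarantees the busiest slot maps to the darkest shade (index num_colors - 1) instead of running one past the array. A sketch of just that mapping (not part of the patch):

#include <stdio.h>

#define NUM_COLORS 12	/* matches ARRAY_SIZE(color) in tracex3_user.c */

static int shade(unsigned long long cnt, unsigned long long max_cnt)
{
	return NUM_COLORS * cnt / (max_cnt + 1);	/* always 0..NUM_COLORS-1 */
}

int main(void)
{
	printf("%d %d %d\n", shade(0, 99), shade(50, 99), shade(99, 99));
	return 0;	/* prints: 0 6 11 */
}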
diff --git a/kernel/samples/bpf/tracex4_kern.c b/kernel/samples/bpf/tracex4_kern.c
new file mode 100644
index 000000000..126b80512
--- /dev/null
+++ b/kernel/samples/bpf/tracex4_kern.c
@@ -0,0 +1,54 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct pair {
+	u64 val;
+	u64 ip;
+};
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(struct pair),
+	.max_entries = 1000000,
+};
+
+/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
+ * example will no longer be meaningful
+ */
+SEC("kprobe/kmem_cache_free")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	long ptr = ctx->si;
+
+	bpf_map_delete_elem(&my_map, &ptr);
+	return 0;
+}
+
+SEC("kretprobe/kmem_cache_alloc_node")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	long ptr = ctx->ax;
+	long ip = 0;
+
+	/* get ip address of kmem_cache_alloc_node() caller */
+	bpf_probe_read(&ip, sizeof(ip), (void *)(ctx->bp + sizeof(ip)));
+
+	struct pair v = {
+		.val = bpf_ktime_get_ns(),
+		.ip = ip,
+	};
+
+	bpf_map_update_elem(&my_map, &ptr, &v, BPF_ANY);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/kernel/samples/bpf/tracex4_user.c b/kernel/samples/bpf/tracex4_user.c
new file mode 100644
index 000000000..bc4a3bdea
--- /dev/null
+++ b/kernel/samples/bpf/tracex4_user.c
@@ -0,0 +1,69 @@
+/* Copyright (c) 2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+struct pair {
+	long long val;
+	__u64 ip;
+};
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void print_old_objects(int fd)
+{
+	long long val = time_get_ns();
+	__u64 key, next_key;
+	struct pair v;
+
+	key = write(1, "\e[1;1H\e[2J", 10); /* clear screen */
+
+	key = -1;
+	while (bpf_get_next_key(fd, &key, &next_key) == 0) {
+		bpf_lookup_elem(fd, &next_key, &v);
+		key = next_key;
+		if (val - v.val < 1000000000ll)
+			/* skip objects allocated less than 1 sec ago */
+			continue;
+		printf("obj 0x%llx is %2lld sec old, was allocated at ip %llx\n",
+		       next_key, (val - v.val) / 1000000000ll, v.ip);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; ; i++) {
+		print_old_objects(map_fd[0]);
+		sleep(1);
+	}
+
+	return 0;
+}
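tracex4 closes the set with a simple leak detector: the kretprobe on kmem_cache_alloc_node() records each returned pointer with a timestamp and the caller's ip, the kprobe on kmem_cache_free() erases the record, and print_old_objects() reports whatever has survived in the map for a second or more. The aging test itself is plain nanosecond arithmetic; a standalone sketch with made-up timestamps (not part of the patch):

#include <stdio.h>

int main(void)
{
	/* mirrors print_old_objects(): report once the allocation is
	 * at least one second in the past
	 */
	long long now = 5000000000ll;	/* pretend current time, ns */
	long long alloc = 2500000000ll;	/* pretend allocation time, ns */

	if (now - alloc >= 1000000000ll)
		printf("obj is %2lld sec old\n", (now - alloc) / 1000000000ll);
	return 0;	/* prints: obj is  2 sec old */
}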