From 2906c56252687c0fc7585b05f250e70f23b6f748 Mon Sep 17 00:00:00 2001 From: David Su Date: Fri, 20 Nov 2015 16:34:57 -0800 Subject: Add VM preemption latency measuring tool vm-trace This tool is to be used to measure VM preemption latencies defined by kvmfornfv requirements. See README.txt for usage of the tool. Change-Id: I78aed07df0af02f2268e36cfeb81923ba2bdb52a Signed-off-by: David Su --- tests/vm-trace/Makefile | 23 ++ tests/vm-trace/README.txt | 51 ++++ tests/vm-trace/vm-trace.c | 632 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 706 insertions(+) create mode 100644 tests/vm-trace/Makefile create mode 100644 tests/vm-trace/README.txt create mode 100644 tests/vm-trace/vm-trace.c diff --git a/tests/vm-trace/Makefile b/tests/vm-trace/Makefile new file mode 100644 index 000000000..e367739d4 --- /dev/null +++ b/tests/vm-trace/Makefile @@ -0,0 +1,23 @@ +DEBUG ?= n + +ifeq ($(DEBUG),y) + DBGFLAGS = -g -DDEBUG +else + DBGFLAGS = -O2 +endif + +XTRA_CFLAGS = $(DBGFLAGS) -Wall +XTRALIBS = -lrt -lpthread + +all: vm-trace cscope + +vm-trace: vm-trace.c + $(CC) $(XTRA_CFLAGS) $< -o $@ $(XTRALIBS) + +cscope.out: vm-trace.c + cscope -b + +cscope: cscope.out + +clean: + rm -f *.o core* vm-trace cscope.* diff --git a/tests/vm-trace/README.txt b/tests/vm-trace/README.txt new file mode 100644 index 000000000..815dca93a --- /dev/null +++ b/tests/vm-trace/README.txt @@ -0,0 +1,51 @@ +vm-trace is a tool utilizing the ftrace infrastructure in Linux kernel to +measure VM preemption latencies. For more info about ftrace, see +Documentation/trace/ftrace.txt. See include/linux/ring_buffer.h and +include/linux/ftrace_event.h for data structures used by ftrace. + +The tool enables 2 trace points in KVM driver: +kvm_exit defined in vmx_vcpu_run() (see arch/x86/kvm/vmx.c), and +kvm_entry defined in vcpu_enter_guest() (see arch/x86/kvm/x86.c). + +It then spawns a thread to extract trace data from the kernel ftrace ring +buffer using the splice() system call. Once the tracing duration has elapsed, +vm-trace calculates VM exit-entry latencies based on the timestamps of the +events. (A future improvement could be to spawn another thread to process the +trace on the fly to improve vm-trace's performance.) + +To take a trace, do the following: + +1. Run qemu-kvm to start guest VM +2. Bind each qemu-kvm vCPU thread to an isolated pCPU +3. Start desired workload on the guest +4. Run vm-trace on the host: + vm-trace -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds + +cpu_to_trace is one of the pCPUs from step 2 above that you want to trace. +vm-trace does not support tracing multiple pCPUs. + +cpu_to_collect_trace is the CPU used to read and save the trace data. +If the host system is NUMA, make sure to assign a CPU in the same NUMA node +as cpu_to_trace to cpu_to_collect_trace. + +A binary file named trace.bin will be saved in the current working directory. +Be aware that, depending on the tracing duration and type of workload running +on the guest, the file can become quite large. + +vm-trace requires root privileges. + +Some statistics of the events will be displayed similar to the following: + + Number of VM events = 21608832 + Average VM Exit-Entry latency = 1us + Maximum VM Exit-Entry latency = 5us + Maximum cumulative latency within 1ms = 12us + +trace.bin will be overwritten each time vm-trace is run in this mode, +so rename/copy the file if you want to keep it. + +To process a previously collected trace file, run: + vm-trace -f trace_file [-v] + +If -v is specified, all events in the trace file will be displayed. +This is helpful for identifying cause of long latency. diff --git a/tests/vm-trace/vm-trace.c b/tests/vm-trace/vm-trace.c new file mode 100644 index 000000000..32b4d8f74 --- /dev/null +++ b/tests/vm-trace/vm-trace.c @@ -0,0 +1,632 @@ +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct file_header +{ + uint32_t signature; + uint16_t vmentry_id; + uint16_t vmexit_id; +}; + +struct thread_param +{ + int raw_fd; + int out_fd; + int pipefd[2]; + int cpu; +}; + +struct event +{ + uint64_t timestamp; + uint16_t type; +}; + +enum rb_type +{ + RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, + RINGBUF_TYPE_PADDING, + RINGBUF_TYPE_TIME_EXTEND, + RINGBUF_TYPE_TIME_STAMP +}; + +struct vmexit_data +{ + uint32_t exit_reason; + uint64_t guest_rip; + uint32_t isa; + uint64_t info1; + uint64_t info2; +}; + +struct vmentry_data +{ + uint32_t vcpu_id; +}; + +#pragma pack(1) + +struct trace_data +{ + uint16_t type; + uint8_t flags; + uint8_t preempt_count; + int32_t pid; + uint16_t migrate_disable; + uint16_t padding1; + uint32_t padding2; + union + { + struct vmexit_data vmexit; + struct vmentry_data vmentry; + }; +}; + +struct event_entry +{ + uint32_t type_len: 5, time_delta: 27; + union + { + uint32_t array[1]; + struct trace_data trace; + }; +}; + +#define DATA_SIZE 4080 + +struct event_page +{ + uint64_t timestamp; + uint64_t commit; + union + { + uint8_t data[DATA_SIZE]; + struct event_entry event; + }; +}; + +#define PAGE_SIZE sizeof(struct event_page) + +#define TRACE_PATH "/sys/kernel/debug/tracing" +#define TRACE_FILE "trace.bin" +#define TRACE_SIG 0xcce96d01 + +#define VM_ENTRY "kvm/kvm_entry" +#define VM_EXIT "kvm/kvm_exit" + +#ifdef DEBUG +#define dbg_printf(_f_,...) {printf(_f_,##__VA_ARGS__);} +#else +#define dbg_printf(_f_,...) +#endif + +static uint16_t vmentry_id; +static uint16_t vmexit_id; + +static int stop_tracing; +static int verbose; + + +static struct event* read_events(int fd, size_t *n) +{ + int i, j; + ssize_t rd; + uint32_t len, offset; + uint64_t event_time, pre_event_time, pre_page_time = 0; + struct event_page page; + struct event_entry *e; + struct event *events = NULL; + + *n = 0; + for (i = 0; 1; i++) + { + if ((rd = read(fd, &page, PAGE_SIZE)) == 0) + { + if (*n == 0) + fprintf(stderr, "No events found\nMake sure a VM is running and the CPU to trace is bound to a QEMU vCPU thread\n"); + + return events; + } + else if (rd < 0) + { + fprintf(stderr, "Failed to read trace file\n"); + free(events); + return NULL; + } + else if (rd < PAGE_SIZE) + { + fprintf(stderr, "Trace file does not have enough data\n"); + free(events); + return NULL; + } + + dbg_printf("Page %d:\n", i); + dbg_printf(" timestamp = %ld\n", page.timestamp); + dbg_printf(" commit = %ld\n", page.commit); + + if (page.timestamp < pre_page_time) + fprintf(stderr, "Warning: page time going backwards\n"); + + pre_page_time = page.timestamp; + + offset = 0; + pre_event_time = 0; + for (j = 0; 1; j++) + { + e = (struct event_entry *)(page.data+offset); + + if (e->type_len == 0) + e = (struct event_entry *)&e->array[1]; + + if (pre_event_time) + event_time = pre_event_time + e->time_delta; + else + event_time = page.timestamp; + + if (e->type_len < RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + { + len = (e->type_len+1) * sizeof(uint32_t); + + if (e->trace.type == vmexit_id || e->trace.type == vmentry_id) + { + if ((events = realloc(events, (*n+1) * sizeof(struct event))) == NULL) + { + fprintf(stderr, "Failed to allocate memory\n"); + return NULL; + } + + events[*n].timestamp = event_time; + events[*n].type = e->trace.type; + + if (verbose) + { + if (e->trace.type == vmexit_id) + { + printf(" %ld: VM_EXIT reason: %08x, ", event_time, e->trace.vmexit.exit_reason); + printf("info1: %016lx, info2: %08lx\n", e->trace.vmexit.info1, e->trace.vmexit.info2); + } + else + { + printf(" %ld: VM_ENTRY dt: %d, vcpu: %d\n", event_time, e->time_delta, e->trace.vmentry.vcpu_id); + } + } + + *n += 1; + } + else if (e->trace.type == 0) + break; + else + fprintf(stderr, "UNKNOWN event %d\n", e->trace.type); + } + else if (e->type_len == RINGBUF_TYPE_TIME_EXTEND) + { + len = 8; + event_time = pre_event_time + (e->time_delta | (e->array[0]<<28)); + dbg_printf(" entry %d: TIME_EXTEND %ld\n", j, event_time); + } + else if (e->type_len == RINGBUF_TYPE_PADDING) + { + if (e->time_delta == 0) + break; + + len = e->array[0] + sizeof(uint32_t); + dbg_printf(" entry %d: PADDING, len %d @ %ld\n", j, len, event_time); + } + else if (e->type_len == RINGBUF_TYPE_TIME_STAMP) + { + len = 16; + dbg_printf(" entry %d: TIME_STAMP @ %ld\n", j, event_time); + } + + pre_event_time = event_time; + + offset += len; + if (offset >= DATA_SIZE) + break; + } + + dbg_printf(" events in page %d = %d\n", i, j); + } + + return events; +} + +static int parse_trace_file(char *file) +{ + int fd; + uint16_t pre_event = 0; + struct event *events; + size_t num_events, i, j = 0; + uint64_t d, exit_time = 0, total = 0, max_lat = 0; + uint64_t pre_time = 0, acc_1ms = 0, max_acc = 0; + struct file_header header; + + if ((fd = open(file, O_RDONLY|O_LARGEFILE)) < 0) + { + perror(file); + return -1; + } + + if (read(fd, &header, sizeof(struct file_header)) < 0) + { + perror(file); + return -1; + } + + if (header.signature != TRACE_SIG) + { + fprintf(stderr, "File %s is not a vm-trace file\n", file); + return -1; + } + + vmentry_id = header.vmentry_id; + vmexit_id = header.vmexit_id; + + if ((events = read_events(fd, &num_events)) == NULL) + return -1; + + printf("Number of VM events = %ld\n", num_events); + + for (i = 0; i < num_events; i++) + { + if (events[i].type == vmexit_id) + { + exit_time = events[i].timestamp; + } + else if (events[i].type == vmentry_id) + { + if (exit_time) + { + d = events[i].timestamp - exit_time; + if (d > max_lat) + max_lat = d; + + total += d; + acc_1ms += d; + j++; + } + } + + if (events[i].type == pre_event) + fprintf(stderr, "Warning: repeated events\n"); + pre_event = events[i].type; + + if (pre_time) + { + if (events[i].timestamp - pre_time >= 1000000) + { + if (acc_1ms > max_acc) + max_acc = acc_1ms; + + acc_1ms = 0; + pre_time = events[i].timestamp; + } + } + else + pre_time = events[i].timestamp; + } + + free(events); + + printf("Average VM Exit-Entry latency = %ldus\n", total/j/1000); + printf("Maximum VM Exit-Entry latency = %ldus\n", max_lat/1000); + printf("Maximum cumulative latency within 1ms = %ldus\n", max_acc/1000); + + close(fd); + return 0; +} + +static int get_event_id(char *event) +{ + char path[PATH_MAX+1]; + int fd; + ssize_t r; + + sprintf(path, "%s/events/%s/id", TRACE_PATH, event); + if ((fd = open(path, O_RDONLY)) < 0) + { + perror(path); + return -1; + } + + if ((r = read(fd, path, PATH_MAX)) < 0) + { + close(fd); + perror(path); + return -1; + } + + close(fd); + + path[r+1] = '\0'; + return atoi(path); +} + +static int enable_event(char *event, int en) +{ + char path[PATH_MAX+1], *s; + int fd; + + if (en) + s = "1"; + else + s = "0"; + + sprintf(path, "%s/events/%s/enable", TRACE_PATH, event); + if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0) + { + perror(path); + return -1; + } + + if (write(fd, s, 2) < 0) + { + close(fd); + perror(path); + return -1; + } + + close(fd); + + return 0; +} + +static int enable_events(int en) +{ + if (enable_event(VM_ENTRY, en) < 0) + return -1; + + if (enable_event(VM_EXIT, en) < 0) + return -1; + + return 0; +} + +static int setup_tracing(int cpu) +{ + char path[PATH_MAX+1], mask[20]; + int fd, h, l; + + if (cpu > 31) + { + l = 0; + h = 1 << (cpu-32); + } + else + { + l = 1 << cpu; + h = 0; + } + + sprintf(mask, "%X,%X", h, l); + + sprintf(path, "%s/tracing_cpumask", TRACE_PATH); + if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0) + { + perror(path); + return -1; + } + + if (write(fd, mask, strlen(mask)) < 0) + { + close(fd); + perror(path); + return -1; + } + + close(fd); + + sprintf(path, "%s/trace", TRACE_PATH); + if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0) + { + perror(path); + return -1; + } + + if (write(fd, "", 1) < 0) + { + close(fd); + perror(path); + return -1; + } + + close(fd); + + if ((vmentry_id = get_event_id(VM_ENTRY)) < 0) + return -1; + + if ((vmexit_id = get_event_id(VM_EXIT)) < 0) + return -1; + + if (enable_events(1) < 0) + return -1; + + return 0; +} + +static void disable_tracing(int fd, pthread_t thread) +{ + if (write(fd, "0", 2) < 0) + perror("disable_tracing"); + close(fd); + + enable_events(0); + + stop_tracing = 1; + pthread_join(thread, NULL); +} + +static void *tracing_thread(void *param) +{ + cpu_set_t mask; + struct thread_param *p = param; + ssize_t r; + + CPU_ZERO(&mask); + CPU_SET(p->cpu, &mask); + if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0) + fprintf(stderr, "Could not set CPU affinity to CPU #%d\n", p->cpu); + + while (!stop_tracing) + { + if ((r = splice(p->raw_fd, NULL, p->pipefd[1], NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK)) < 0) + { + if (errno == EAGAIN) + continue; + + perror("splice1"); + break; + } + else if (r == 0) + continue; + + if (splice(p->pipefd[0], NULL, p->out_fd, NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK) < 0) + { + perror("splice2"); + break; + } + } + + close(p->raw_fd); + close(p->pipefd[1]); + close(p->pipefd[0]); + close(p->out_fd); + + return NULL; +} + +static void usage(char *argv[]) +{ + fprintf(stderr, "Usage: %s -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds\n", argv[0]); + fprintf(stderr, " %s -f trace_file [-v]\n", argv[0]); + exit(-1); +} + +int main(int argc, char *argv[]) +{ + char path[PATH_MAX+1], *file = NULL; + int cpu = -1, ttime = 0; + int opt, fd; + pthread_t thread; + struct file_header header; + struct thread_param param; + struct timespec interval; + + param.cpu = -1; + + while ((opt = getopt(argc, argv, "p:c:s:f:v")) != -1) + { + switch (opt) + { + case 'p': + cpu = atoi(optarg); + break; + case 'c': + param.cpu = atoi(optarg); + break; + case 's': + ttime = atoi(optarg); + break; + case 'f': + file = optarg; + break; + case 'v': + verbose = 1; + break; + default: + usage(argv); + } + } + + if ((cpu < 0 || param.cpu < 0 || ttime <= 0) && file == NULL) + usage(argv); + + if (file != NULL) + return parse_trace_file(file); + + verbose = 0; + + if (setup_tracing(cpu) < 0) + return -1; + + if ((param.out_fd = open(TRACE_FILE, O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0644)) < 0) + { + perror(TRACE_FILE); + return -1; + } + + header.signature = TRACE_SIG; + header.vmentry_id = vmentry_id; + header.vmexit_id = vmexit_id; + + if (write(param.out_fd, &header, sizeof(struct file_header)) < 0) + { + perror(TRACE_FILE); + return -1; + } + + sprintf(path, "%s/per_cpu/cpu%d/trace_pipe_raw", TRACE_PATH, cpu); + if ((param.raw_fd = open(path, O_RDONLY)) < 0) + { + perror(path); + return -1; + } + + if (pipe(param.pipefd) < 0) + { + perror("pipe"); + return -1; + } + + sprintf(path, "%s/tracing_on", TRACE_PATH); + if ((fd = open(path, O_WRONLY)) < 0) + { + perror(path); + return -1; + } + + if (pthread_create(&thread, NULL, tracing_thread, ¶m)) + { + perror("pthread_create"); + return -1; + } + + if (write(fd, "1", 2) < 0) + { + perror(path); + disable_tracing(fd, thread); + return -1; + } + + printf("CPU to trace: %d\n", cpu); + printf("CPU to collect trace: %d\n", param.cpu); + printf("Duration: %d seconds\n", ttime); + + interval.tv_sec = ttime; + interval.tv_nsec = 0; + if (clock_nanosleep(CLOCK_MONOTONIC, 0, &interval, NULL)) + { + perror("clock_nanosleep"); + disable_tracing(fd, thread); + return -1; + } + + disable_tracing(fd, thread); + + printf("Processing event file ...\n"); + + return parse_trace_file(TRACE_FILE); +} -- cgit 1.2.3-korg