#define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include struct file_header { uint32_t signature; uint16_t vmentry_id; uint16_t vmexit_id; }; struct thread_param { int raw_fd; int out_fd; int pipefd[2]; int cpu; }; struct event { uint64_t timestamp; uint16_t type; }; enum rb_type { RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, RINGBUF_TYPE_PADDING, RINGBUF_TYPE_TIME_EXTEND, RINGBUF_TYPE_TIME_STAMP }; struct vmexit_data { uint32_t exit_reason; uint64_t guest_rip; uint32_t isa; uint64_t info1; uint64_t info2; }; struct vmentry_data { uint32_t vcpu_id; }; #pragma pack(1) struct trace_data { uint16_t type; uint8_t flags; uint8_t preempt_count; int32_t pid; uint16_t migrate_disable; uint16_t padding1; uint32_t padding2; union { struct vmexit_data vmexit; struct vmentry_data vmentry; }; }; struct event_entry { uint32_t type_len: 5, time_delta: 27; union { uint32_t array[1]; struct trace_data trace; }; }; #define DATA_SIZE 4080 struct event_page { uint64_t timestamp; uint64_t commit; union { uint8_t data[DATA_SIZE]; struct event_entry event; }; }; #define PAGE_SIZE sizeof(struct event_page) #define TRACE_PATH "/sys/kernel/debug/tracing" #define TRACE_FILE "trace.bin" #define TRACE_SIG 0xcce96d01 #define VM_ENTRY "kvm/kvm_entry" #define VM_EXIT "kvm/kvm_exit" #ifdef DEBUG #define dbg_printf(_f_,...) {printf(_f_,##__VA_ARGS__);} #else #define dbg_printf(_f_,...) #endif static uint16_t vmentry_id; static uint16_t vmexit_id; static int stop_tracing; static int verbose; static struct event* read_events(int fd, size_t *n) { int i, j; ssize_t rd; uint32_t len, offset; uint64_t event_time, pre_event_time, pre_page_time = 0; struct event_page page; struct event_entry *e; struct event *events = NULL; *n = 0; for (i = 0; 1; i++) { if ((rd = read(fd, &page, PAGE_SIZE)) == 0) { if (*n == 0) fprintf(stderr, "No events found\nMake sure a VM is running and the CPU to trace is bound to a QEMU vCPU thread\n"); return events; } else if (rd < 0) { fprintf(stderr, "Failed to read trace file\n"); free(events); return NULL; } else if (rd < PAGE_SIZE) { fprintf(stderr, "Trace file does not have enough data\n"); free(events); return NULL; } dbg_printf("Page %d:\n", i); dbg_printf(" timestamp = %ld\n", page.timestamp); dbg_printf(" commit = %ld\n", page.commit); if (page.timestamp < pre_page_time) fprintf(stderr, "Warning: page time going backwards\n"); pre_page_time = page.timestamp; offset = 0; pre_event_time = 0; for (j = 0; 1; j++) { e = (struct event_entry *)(page.data+offset); if (e->type_len == 0) e = (struct event_entry *)&e->array[1]; if (pre_event_time) event_time = pre_event_time + e->time_delta; else event_time = page.timestamp; if (e->type_len < RINGBUF_TYPE_DATA_TYPE_LEN_MAX) { len = (e->type_len+1) * sizeof(uint32_t); if (e->trace.type == vmexit_id || e->trace.type == vmentry_id) { if ((events = realloc(events, (*n+1) * sizeof(struct event))) == NULL) { fprintf(stderr, "Failed to allocate memory\n"); return NULL; } events[*n].timestamp = event_time; events[*n].type = e->trace.type; if (verbose) { if (e->trace.type == vmexit_id) { printf(" %ld: VM_EXIT reason: %08x, ", event_time, e->trace.vmexit.exit_reason); printf("info1: %016lx, info2: %08lx\n", e->trace.vmexit.info1, e->trace.vmexit.info2); } else { printf(" %ld: VM_ENTRY dt: %d, vcpu: %d\n", event_time, e->time_delta, e->trace.vmentry.vcpu_id); } } *n += 1; } else if (e->trace.type == 0) break; else fprintf(stderr, "UNKNOWN event %d\n", e->trace.type); } else if (e->type_len == RINGBUF_TYPE_TIME_EXTEND) { len = 8; event_time = pre_event_time + (e->time_delta | (e->array[0]<<28)); dbg_printf(" entry %d: TIME_EXTEND %ld\n", j, event_time); } else if (e->type_len == RINGBUF_TYPE_PADDING) { if (e->time_delta == 0) break; len = e->array[0] + sizeof(uint32_t); dbg_printf(" entry %d: PADDING, len %d @ %ld\n", j, len, event_time); } else if (e->type_len == RINGBUF_TYPE_TIME_STAMP) { len = 16; dbg_printf(" entry %d: TIME_STAMP @ %ld\n", j, event_time); } pre_event_time = event_time; offset += len; if (offset >= DATA_SIZE) break; } dbg_printf(" events in page %d = %d\n", i, j); } return events; } static int parse_trace_file(char *file) { int fd; uint16_t pre_event = 0; struct event *events; size_t num_events, i, j = 0; uint64_t d, exit_time = 0, total = 0, max_lat = 0; uint64_t pre_time = 0, acc_1ms = 0, max_acc = 0; struct file_header header; if ((fd = open(file, O_RDONLY|O_LARGEFILE)) < 0) { perror(file); return -1; } if (read(fd, &header, sizeof(struct file_header)) < 0) { perror(file); return -1; } if (header.signature != TRACE_SIG) { fprintf(stderr, "File %s is not a vm-trace file\n", file); return -1; } vmentry_id = header.vmentry_id; vmexit_id = header.vmexit_id; if ((events = read_events(fd, &num_events)) == NULL) return -1; printf("Number of VM events = %ld\n", num_events); for (i = 0; i < num_events; i++) { if (events[i].type == vmexit_id) { exit_time = events[i].timestamp; } else if (events[i].type == vmentry_id) { if (exit_time) { d = events[i].timestamp - exit_time; if (d > max_lat) max_lat = d; total += d; acc_1ms += d; j++; } } if (events[i].type == pre_event) fprintf(stderr, "Warning: repeated events\n"); pre_event = events[i].type; if (pre_time) { if (events[i].timestamp - pre_time >= 1000000) { if (acc_1ms > max_acc) max_acc = acc_1ms; acc_1ms = 0; pre_time = events[i].timestamp; } } else pre_time = events[i].timestamp; } free(events); printf("Average VM Exit-Entry latency = %ldus\n", total/j/1000); printf("Maximum VM Exit-Entry latency = %ldus\n", max_lat/1000); printf("Maximum cumulative latency within 1ms = %ldus\n", max_acc/1000); close(fd); return 0; } static int get_event_id(char *event) { char path[PATH_MAX+1]; int fd; ssize_t r; sprintf(path, "%s/events/%s/id", TRACE_PATH, event); if ((fd = open(path, O_RDONLY)) < 0) { perror(path); return -1; } if ((r = read(fd, path, PATH_MAX)) < 0) { close(fd); perror(path); return -1; } close(fd); path[r+1] = '\0'; return atoi(path); } static int enable_event(char *event, int en) { char path[PATH_MAX+1], *s; int fd; if (en) s = "1"; else s = "0"; sprintf(path, "%s/events/%s/enable", TRACE_PATH, event); if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0) { perror(path); return -1; } if (write(fd, s, 2) < 0) { close(fd); perror(path); return -1; } close(fd); return 0; } static int enable_events(int en) { if (enable_event(VM_ENTRY, en) < 0) return -1; if (enable_event(VM_EXIT, en) < 0) return -1; return 0; } static int setup_tracing(int cpu) { char path[PATH_MAX+1], mask[20]; int fd, h, l; if (cpu > 31) { l = 0; h = 1 << (cpu-32); } else { l = 1 << cpu; h = 0; } sprintf(mask, "%X,%X", h, l); sprintf(path, "%s/tracing_cpumask", TRACE_PATH); if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0) { perror(path); return -1; } if (write(fd, mask, strlen(mask)) < 0) { close(fd); perror(path); return -1; } close(fd); sprintf(path, "%s/trace", TRACE_PATH); if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0) { perror(path); return -1; } if (write(fd, "", 1) < 0) { close(fd); perror(path); return -1; } close(fd); if ((vmentry_id = get_event_id(VM_ENTRY)) < 0) return -1; if ((vmexit_id = get_event_id(VM_EXIT)) < 0) return -1; if (enable_events(1) < 0) return -1; return 0; } static void disable_tracing(int fd, pthread_t thread) { if (write(fd, "0", 2) < 0) perror("disable_tracing"); close(fd); enable_events(0); stop_tracing = 1; pthread_join(thread, NULL); } static void *tracing_thread(void *param) { cpu_set_t mask; struct thread_param *p = param; ssize_t r; CPU_ZERO(&mask); CPU_SET(p->cpu, &mask); if(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0) fprintf(stderr, "Could not set CPU affinity to CPU #%d\n", p->cpu); while (!stop_tracing) { if ((r = splice(p->raw_fd, NULL, p->pipefd[1], NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK)) < 0) { if (errno == EAGAIN) continue; perror("splice1"); break; } else if (r == 0) continue; if (splice(p->pipefd[0], NULL, p->out_fd, NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK) < 0) { perror("splice2"); break; } } close(p->raw_fd); close(p->pipefd[1]); close(p->pipefd[0]); close(p->out_fd); return NULL; } static void usage(char *argv[]) { fprintf(stderr, "Usage: %s -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds\n", argv[0]); fprintf(stderr, " %s -f trace_file [-v]\n", argv[0]); exit(-1); } int main(int argc, char *argv[]) { char path[PATH_MAX+1], *file = NULL; int cpu = -1, ttime = 0; int opt, fd; pthread_t thread; struct file_header header; struct thread_param param; struct timespec interval; param.cpu = -1; while ((opt = getopt(argc, argv, "p:c:s:f:v")) != -1) { switch (opt) { case 'p': cpu = atoi(optarg); break; case 'c': param.cpu = atoi(optarg); break; case 's': ttime = atoi(optarg); break; case 'f': file = optarg; break; case 'v': verbose = 1; break; default: usage(argv); } } if ((cpu < 0 || param.cpu < 0 || ttime <= 0) && file == NULL) usage(argv); if (file != NULL) return parse_trace_file(file); verbose = 0; if (setup_tracing(cpu) < 0) return -1; if ((param.out_fd = open(TRACE_FILE, O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0644)) < 0) { perror(TRACE_FILE); return -1; } header.signature = TRACE_SIG; header.vmentry_id = vmentry_id; header.vmexit_id = vmexit_id; if (write(param.out_fd, &header, sizeof(struct file_header)) < 0) { perror(TRACE_FILE); return -1; } sprintf(path, "%s/per_cpu/cpu%d/trace_pipe_raw", TRACE_PATH, cpu); if ((param.raw_fd = open(path, O_RDONLY)) < 0) { perror(path); return -1; } if (pipe(param.pipefd) < 0) { perror("pipe"); return -1; } sprintf(path, "%s/tracing_on", TRACE_PATH); if ((fd = open(path, O_WRONLY)) < 0) { perror(path); return -1; } if (pthread_create(&thread, NULL, tracing_thread, ¶m)) { perror("pthread_create"); return -1; } if (write(fd, "1", 2) < 0) { perror(path); disable_tracing(fd, thread); return -1; } printf("CPU to trace: %d\n", cpu); printf("CPU to collect trace: %d\n", param.cpu); printf("Duration: %d seconds\n", ttime); interval.tv_sec = ttime; interval.tv_nsec = 0; if (clock_nanosleep(CLOCK_MONOTONIC, 0, &interval, NULL)) { perror("clock_nanosleep"); disable_tracing(fd, thread); return -1; } disable_tracing(fd, thread); printf("Processing event file ...\n"); return parse_trace_file(TRACE_FILE); }