diff options
-rw-r--r-- | ci/envs/create-rt-tests-rpm.sh | 30 | ||||
-rwxr-xr-x | ci/envs/guest-cmd.sh | 10 | ||||
-rwxr-xr-x | ci/envs/guest-modify.sh | 142 | ||||
-rwxr-xr-x | ci/envs/guest-setup0.sh | 63 | ||||
-rwxr-xr-x | ci/envs/guest-setup1.sh | 26 | ||||
-rw-r--r-- | ci/envs/host-config | 19 | ||||
-rwxr-xr-x | ci/envs/host-run-qemu.sh | 45 | ||||
-rwxr-xr-x | ci/envs/host-setup0.sh | 75 | ||||
-rwxr-xr-x | ci/envs/host-setup1.sh | 76 | ||||
-rw-r--r-- | ci/envs/rt-tests.patch | 26 | ||||
-rw-r--r-- | kernel/arch/x86/configs/opnfv.config | 28 | ||||
-rw-r--r-- | kernel/drivers/vfio/pci/vfio_pci_intrs.c | 2 | ||||
-rw-r--r-- | kernel/kernel/time/hrtimer.c | 11 | ||||
-rw-r--r-- | kernel/kernel/time/tick-sched.c | 64 | ||||
-rw-r--r-- | qemu/hw/i386/kvm/clock.c | 18 | ||||
-rw-r--r-- | qemu/migration/savevm.c | 2 | ||||
-rw-r--r-- | qemu/target-i386/kvm.c | 45 | ||||
-rw-r--r-- | qemu/target-i386/kvm_i386.h | 1 | ||||
-rw-r--r-- | tests/vm-trace/Makefile | 23 | ||||
-rw-r--r-- | tests/vm-trace/README.txt | 51 | ||||
-rw-r--r-- | tests/vm-trace/vm-trace.c | 632 |
21 files changed, 1316 insertions, 73 deletions
diff --git a/ci/envs/create-rt-tests-rpm.sh b/ci/envs/create-rt-tests-rpm.sh new file mode 100644 index 000000000..96fef2bcd --- /dev/null +++ b/ci/envs/create-rt-tests-rpm.sh @@ -0,0 +1,30 @@ +#!/bin/bash +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +usage () +{ + echo "$0 rpmdir" + exit 1 +} + +rpmdir=$1 +rm -rf ${rpmdir}/rt-tests-0.96-1.el7.centos.x86_64.rpm +gitdir=`mktemp -d` +ROOTDIR=$(cd $(dirname "$0")/../.. && pwd) +VERSION=v0.96 +cd $gitdir +git clone https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git +cd rt-tests +git checkout -b ${VERSION} ${VERSION} +patch -p1 -i ${ROOTDIR}/ci/envs/rt-tests.patch +make HAVE_PARSE_CPUSTRING_ALL=1 rpm +cp ./RPMS/x86_64/rt-tests-0.96-1.el7.centos.x86_64.rpm $rpmdir +rm -rf $gitdir + diff --git a/ci/envs/guest-cmd.sh b/ci/envs/guest-cmd.sh new file mode 100755 index 000000000..abfa51a40 --- /dev/null +++ b/ci/envs/guest-cmd.sh @@ -0,0 +1,10 @@ +#!/bin/bash +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. 
This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + diff --git a/ci/envs/guest-modify.sh b/ci/envs/guest-modify.sh new file mode 100755 index 000000000..1208dd37e --- /dev/null +++ b/ci/envs/guest-modify.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +############################################################################## +# Copyright (c) 2015 Ericsson AB and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## + +# This is copy from yardstick-img-modify on yardstick project. Currently +# yardstick script only ubuntu image, and this one is more for CentOS. +# Example invocation: +# yardstick-img-modify /home/yardstick/tools/ubuntu-server-cloudimg-modify.sh +# +# Warning: the script will create files by default in: +# /tmp/workspace/yardstick +# the files will be owned by root! +# +# TODO: image resize is needed if the base image is too small +# + +set -e +set -x + +die() { + echo "error: $1" >&2 + exit 1 +} + +usage () { + echo "$0 cmd workspace" + exit 1 +} + +test $# -eq 2 || usage +test $(id -u) -eq 0 || die "should invoke using sudo" + +ROOTDIR=$(cd $(dirname "$0")/../.. 
&& pwd) +cmd=$1 +test -x $cmd +workspace=$2 +mountdir=`mktemp -d` + +image_url=${IMAGE_URL:-"http://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud-1510.qcow2"} +md5sums_url=${MD5SUMS_URL:-"http://cloud.centos.org/centos/7/images/sha256sum.txt"} + +imgfile="${workspace}/guest.img" +raw_imgfile="${workspace}/guest.raw" +filename=$(basename $image_url) +md5filename=$(basename $md5sums_url) + +# download and checksum base image, conditionally if local copy is outdated +download() { + test -d $workspace || mkdir -p $workspace + cd $workspace + rm -f $md5filename # always download the checksum file to a detect stale image + wget $md5sums_url + test -e $filename || wget -nc $image_url + grep "$filename\$" $md5filename |sha256sum -c + if [ $? -ne 0 ]; then + rm $filename + wget -nc $image_url + grep $filename $md5filename | md5sum -c + fi + rm -rf $raw_imgfile + qemu-img convert $filename $raw_imgfile + cd - +} + +# mount image +setup() { + mkdir -p $mountdir + + loopdevice=$(kpartx -l $raw_imgfile | head -1 | cut -f1 -d ' ') + + kpartx -a $raw_imgfile + # No idea why need this sleep + sleep 3 + mount /dev/mapper/$loopdevice $mountdir + + cp $cmd "$mountdir/" +} + +# modify image running a script using in a chrooted environment +modify() { + # Add the ssh key to the image + mkdir -p ${mountdir}/root/.ssh + cp ${ROOTDIR}/ci/envs/kvm4nfv_key.pub ${mountdir}/root/.ssh/authorized_keys + chmod 700 ${mountdir}/root/.ssh + chmod 600 ${mountdir}/root/.ssh/authorized_keys + + + umount $mountdir + + qemu-img convert -O qcow2 $raw_imgfile $imgfile +} + +# cleanup (umount) the image +cleanup() { + # designed to be idempotent + mount | grep $mountdir && umount $mountdir + kpartx -d $raw_imgfile || true + rm -f $raw_imgfile + rm -rf $mountdir +} + +exitcode="" +error_trap() +{ + local rc=$? 
+ + set +e + + if [ -z "$exitcode" ]; then + exitcode=$rc + fi + + cleanup + + echo "Image build failed with $exitcode" + + exit $exitcode +} + +main() { + cleanup + + trap "error_trap" EXIT SIGTERM + + download + setup + modify + + trap - EXIT SIGTERM + cleanup + + echo "the modified image is found here: $imgfile" +} + +main diff --git a/ci/envs/guest-setup0.sh b/ci/envs/guest-setup0.sh new file mode 100755 index 000000000..490bd570d --- /dev/null +++ b/ci/envs/guest-setup0.sh @@ -0,0 +1,63 @@ +#!/bin/bash +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + + +rpmdir=${1:-"/root/workspace/"} +rpmpat="kernel-4.1*.rpm" +rpm -ihv ${rpmdir}/rt-tests-0.96-1.el7.centos.x86_64.rpm +guest_isolcpus=1 + +# The script's caller should passing the rpm directory that is built out from +# build.sh. The default rpmdir is the one used by yardstick scripts. 
+install_kernel () { + # Install the kernel rpm + filenum=`ls -l ${rpmdir}/${rpmpat} |wc -l` + if [ $filenum -eq 0 ] + then + echo "No kernel rpm found in workspace/rpm" + exit 1 + elif [ $filenum -gt 1 ] + then + echo "Multiple kernel rpm found in workspace/rpm" + exit 1 + else + krpm=`find "${rpmdir}" -name "${rpmpat}"` + rpm -ihv $krpm + fi +} + +config_grub () { + key=$1 + val=$2 + + if grep '[" ]'${key} /etc/default/grub > /dev/null ; then + sed -i 's/\([" ]\)'${key}'=[^ "]*/\1'${key}'='${val}'/' /etc/default/grub + else + sed -i 's/GRUB_CMDLINE_LINUX="\(.*\)"/GRUB_CMDLINE_LINUX="\1 '${key}'='${val}'"/' /etc/default/grub + fi +} + +# Isolate CPUs from the general scheduler +config_grub 'isolcpus' ${guest_isolcpus} + +# Stop timer ticks on isolated CPUs whenever possible +config_grub 'nohz_full' ${guest_isolcpus} + +# Disable machine check +config_grub 'mce' 'off' + +# Use polling idle loop to improve performance +config_grub 'idle' 'poll' + +## Disable clocksource verification at runtime +config_grub 'tsc' 'reliable' + +grub2-mkconfig -o /boot/grub2/grub.cfg +install_kernel diff --git a/ci/envs/guest-setup1.sh b/ci/envs/guest-setup1.sh new file mode 100755 index 000000000..678baa43b --- /dev/null +++ b/ci/envs/guest-setup1.sh @@ -0,0 +1,26 @@ +#!/bin/bash +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. 
This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +set_irq_affinity () { + for irq in /proc/irq/* ; do + [ -d ${irq} ] && echo 0 > ${irq}/smp_affinity_list # ${irq} is already the full /proc/irq/<n> path; plain files such as default_smp_affinity are skipped + done +} + +# Disable watchdogs to reduce overhead +echo 0 > /proc/sys/kernel/watchdog +echo 0 > /proc/sys/kernel/nmi_watchdog + +# Route device interrupts to non-RT CPU +set_irq_affinity + +# Disable RT throttling +echo -1 > /proc/sys/kernel/sched_rt_period_us +echo -1 > /proc/sys/kernel/sched_rt_runtime_us diff --git a/ci/envs/host-config b/ci/envs/host-config new file mode 100644 index 000000000..ce6243ce0 --- /dev/null +++ b/ci/envs/host-config @@ -0,0 +1,19 @@ +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +# Isolated cpus for nfv, must be delimited with ',' +host_isolcpus=3,4 + +# Number of huge pages to create and on which NUMA node +numa_node=0 +huge_pages=2 + +# QEMU executable path and number of cpus for guest +qemu=/usr/libexec/qemu-kvm +guest_cpus=2 diff --git a/ci/envs/host-run-qemu.sh b/ci/envs/host-run-qemu.sh new file mode 100755 index 000000000..c7a2fecc6 --- /dev/null +++ b/ci/envs/host-run-qemu.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. 
This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +source host-config + +cpumask () { + m=$((1<<${1})) + printf 0x%x ${m} +} + +qmp_sock="/tmp/qmp-sock-$$" + +${qemu} -smp ${guest_cpus} -monitor unix:${qmp_sock},server,nowait -daemonize \ + -cpu host,migratable=off,+invtsc,+tsc-deadline,pmu=off \ + -realtime mlock=on -mem-prealloc -enable-kvm -m 1G \ + -mem-path /mnt/hugetlbfs-1g \ + -drive file=/root/workspace/image/guest.img,cache=none,aio=threads \ + -netdev user,id=guest0,hostfwd=tcp::5555-:22 \ + -device virtio-net-pci,netdev=guest0 \ + -nographic -serial /dev/null -parallel /dev/null + +i=0 +for c in `echo ${host_isolcpus} | sed 's/,/ /g'` ; do + cpu[$i]=${c} + i=`expr $i + 1` +done + +threads=`echo "info cpus" | nc -U ${qmp_sock} | grep thread_id | cut -d= -f3` + +# Bind QEMU processor threads to RT CPUs +i=0 +for tid in ${threads} ; do + tid=`printf %d ${tid}` # this is required to get rid of cr at end + mask=`cpumask ${cpu[$i]}` + taskset -p ${mask} ${tid} + i=`expr $i + 1` +done diff --git a/ci/envs/host-setup0.sh b/ci/envs/host-setup0.sh new file mode 100755 index 000000000..79d1f585a --- /dev/null +++ b/ci/envs/host-setup0.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. 
This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +source host-config + +rpmdir=${1:-"/root/workspace/rpm/"} +rpmpat="kernel-4.1*.rpm" + +config_grub () { + key=$1 + val=$2 + + if grep '[" ]'${key} /etc/default/grub > /dev/null ; then + sed -i 's/\([" ]\)'${key}'=[^ "]*/\1'${key}'='${val}'/' /etc/default/grub + else + sed -i 's/GRUB_CMDLINE_LINUX="\(.*\)"/GRUB_CMDLINE_LINUX="\1 '${key}'='${val}'"/' /etc/default/grub + fi +} + +# The script's caller should passing the rpm directory that is built out from +# build.sh. The default rpmdir is the one used by yardstick scripts. +install_kernel () { + # Install the kernel rpm + filenum=`ls -l ${rpmdir}/${rpmpat} |wc -l` + if [ $filenum -eq 0 ] + then + echo "No kernel rpm found in workspace/rpm" + exit 1 + elif [ $filenum -gt 1 ] + then + echo "Multiple kernel rpm found in workspace/rpm" + exit 1 + else + krpm=`find "${rpmdir}" -name "${rpmpat}"` + rpm -ihv $krpm + fi +} + +# Isolate CPUs from the general scheduler +config_grub 'isolcpus' ${host_isolcpus} + +# Stop timer ticks on isolated CPUs whenever possible +config_grub 'nohz_full' ${host_isolcpus} + +# Do not call RCU callbacks on isolated CPUs +config_grub 'rcu_nocbs' ${host_isolcpus} + +# Enable intel iommu driver and disable DMA translation for devices +config_grub 'iommu' 'pt' +config_grub 'intel_iommu' 'on' + +# Set HugeTLB pages to 1GB +config_grub 'default_hugepagesz' '1G' +config_grub 'hugepagesz' '1G' + +# Disable machine check +config_grub 'mce' 'off' + +## Use polling idle loop to improve performance +config_grub 'idle' 'poll' + +## Disable clocksource verification at runtime +config_grub 'tsc' 'reliable' + +grub2-mkconfig -o /boot/grub2/grub.cfg + +install_kernel diff --git a/ci/envs/host-setup1.sh 
b/ci/envs/host-setup1.sh new file mode 100755 index 000000000..3d2de6ddf --- /dev/null +++ b/ci/envs/host-setup1.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +############################################################################## +## Copyright (c) 2015 Intel Corp. +## +## All rights reserved. This program and the accompanying materials +## are made available under the terms of the Apache License, Version 2.0 +## which accompanies this distribution, and is available at +## http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +source host-config + + +############################## +# Create 1GB pages for guest # +############################## + +hugepage_size=`cat /proc/meminfo |grep Hugepagesize |tr -s " "| cut -f 2 -d " "` +if [[ $hugepage_size -ne 1048576 ]] +then + echo "Need 1G huge page support for performance benefit" + exit 1 +fi + +mkdir -p /mnt/hugetlbfs-1g +mount -t hugetlbfs hugetlbfs /mnt/hugetlbfs-1g -osize=1G + +hugepage_dir="/sys/devices/system/node/node${numa_node}/hugepages/hugepages-1048576kB/nr_hugepages" + +huge_pages=$((huge_pages + `cat $hugepage_dir`)) # numeric add to the pages already reserved; a bare '+=' would append the digits as a string +echo ${huge_pages} > ${hugepage_dir} + +############################ +# RT optimization # +############################ +# Disable watchdogs to reduce overhead +echo 0 > /proc/sys/kernel/watchdog +echo 0 > /proc/sys/kernel/nmi_watchdog + +# Change RT priority of ksoftirqd and rcuc kernel threads on isolated CPUs +i=0 +for c in `echo $host_isolcpus | sed 's/,/ /g'` ; do + tid=`pgrep -a ksoftirq | grep "ksoftirqd/${c}$" | cut -d ' ' -f 1` + chrt -fp 2 ${tid} + + tid=`pgrep -a rcuc | grep "rcuc/${c}$" | cut -d ' ' -f 1` + chrt -fp 3 ${tid} + + cpu[$i]=${c} + i=`expr $i + 1` +done + +# Change RT priority of rcub kernel threads +for tid in `pgrep -a rcub | cut -d ' ' -f 1` ; do + chrt -fp 3 ${tid} +done + +# Disable RT throttling +echo -1 > /proc/sys/kernel/sched_rt_period_us +echo -1 > /proc/sys/kernel/sched_rt_runtime_us + +# Reroute 
interrupts bound to isolated CPUs to CPU 0 +for irq in /proc/irq/* ; do + if [ -d ${irq} ] && ! grep - ${irq}/smp_affinity_list > /dev/null ; then + al=`cat ${irq}/smp_affinity_list` + if [[ ${cpu[*]} =~ ${al} ]] ; then + echo 0 > ${irq}/smp_affinity_list + fi + fi +done + +# Change the iptable so that we can ssh to the guest remotely +iptables -I INPUT -p tcp --dport 5555 -j ACCEPT +# TODO: download guest disk image from artifactory + diff --git a/ci/envs/rt-tests.patch b/ci/envs/rt-tests.patch new file mode 100644 index 000000000..b938e0768 --- /dev/null +++ b/ci/envs/rt-tests.patch @@ -0,0 +1,26 @@ +/******************************************************************************* + * * Copyright (c) 2015 Intel Corp. + * * + * * All rights reserved. This program and the accompanying materials + * * are made available under the terms of the Apache License, Version 2.0 + * * which accompanies this distribution, and is available at + * * http://www.apache.org/licenses/LICENSE-2.0 + * *******************************************************************************/ + +diff --git a/Makefile b/Makefile +index 1e4b7d1b0d3a..98968b94a57f 100644 +--- a/Makefile ++++ b/Makefile +@@ -198,10 +198,10 @@ release: distclean changelog + cp -r Makefile COPYING ChangeLog MAINTAINERS doc README.markdown src tmp/rt-tests + rm -f rt-tests-$(VERSION).tar rt-tests-$(VERSION).tar.asc + tar -C tmp -cf rt-tests-$(VERSION).tar rt-tests +- gpg2 --default-key clrkwllms@kernel.org --detach-sign --armor rt-tests-$(VERSION).tar ++ #gpg2 --default-key clrkwllms@kernel.org --detach-sign --armor rt-tests-$(VERSION).tar + gzip rt-tests-$(VERSION).tar + rm -f ChangeLog +- cp rt-tests-$(VERSION).tar.gz rt-tests-$(VERSION).tar.asc releases ++ cp rt-tests-$(VERSION).tar.gz releases + + .PHONY: tarball + tarball: diff --git a/kernel/arch/x86/configs/opnfv.config b/kernel/arch/x86/configs/opnfv.config index 704af497d..462858c57 100644 --- a/kernel/arch/x86/configs/opnfv.config +++ 
b/kernel/arch/x86/configs/opnfv.config @@ -74,7 +74,7 @@ CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_FHANDLE is not set +CONFIG_FHANDLE=y CONFIG_USELIB=y # CONFIG_AUDIT is not set CONFIG_HAVE_ARCH_AUDITSYSCALL=y @@ -734,15 +734,15 @@ CONFIG_IP_PNP_BOOTP=y CONFIG_IP_PNP_RARP=y # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE_DEMUX is not set -# CONFIG_NET_IP_TUNNEL is not set +CONFIG_NET_IP_TUNNEL=y CONFIG_IP_MROUTE=y # CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_SYN_COOKIES=y -# CONFIG_NET_UDP_TUNNEL is not set +CONFIG_NET_UDP_TUNNEL=y # CONFIG_NET_FOU is not set -# CONFIG_GENEVE is not set +CONFIG_GENEVE=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set @@ -972,11 +972,15 @@ CONFIG_NET_SCH_FIFO=y # CONFIG_DCB is not set CONFIG_DNS_RESOLVER=y # CONFIG_BATMAN_ADV is not set -# CONFIG_OPENVSWITCH is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m # CONFIG_VSOCKETS is not set # CONFIG_NETLINK_MMAP is not set # CONFIG_NETLINK_DIAG is not set -# CONFIG_MPLS is not set +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +# CONFIG_MPLS_ROUTING is not set # CONFIG_HSR is not set # CONFIG_NET_SWITCHDEV is not set CONFIG_RPS=y @@ -1352,7 +1356,7 @@ CONFIG_NET_CORE=y # CONFIG_NET_TEAM is not set # CONFIG_MACVLAN is not set # CONFIG_IPVLAN is not set -# CONFIG_VXLAN is not set +CONFIG_VXLAN=y CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y CONFIG_NET_POLL_CONTROLLER=y @@ -1448,10 +1452,12 @@ CONFIG_IGB=y CONFIG_IGBVF=y CONFIG_IXGB=y CONFIG_IXGBE=y +CONFIG_IXGBE_VXLAN=y # CONFIG_IXGBE_HWMON is not set -# CONFIG_IXGBEVF is not set +CONFIG_IXGBEVF=y CONFIG_I40E=y -# CONFIG_I40EVF is not set +CONFIG_I40E_VXLAN=y +CONFIG_I40EVF=y # CONFIG_FM10K is not set CONFIG_NET_VENDOR_I825XX=y # CONFIG_IP1000 is not set @@ -2373,7 +2379,7 @@ CONFIG_FB=y # CONFIG_FIRMWARE_EDID is not set CONFIG_FB_CMDLINE=y # CONFIG_FB_DDC 
is not set -# CONFIG_FB_BOOT_VESA_SUPPORT is not set +CONFIG_FB_BOOT_VESA_SUPPORT=y CONFIG_FB_CFB_FILLRECT=y CONFIG_FB_CFB_COPYAREA=y CONFIG_FB_CFB_IMAGEBLIT=y @@ -2400,7 +2406,7 @@ CONFIG_FB_TILEBLITTING=y # CONFIG_FB_IMSTT is not set # CONFIG_FB_VGA16 is not set # CONFIG_FB_UVESA is not set -# CONFIG_FB_VESA is not set +CONFIG_FB_VESA=y # CONFIG_FB_N411 is not set # CONFIG_FB_HGA is not set # CONFIG_FB_OPENCORES is not set diff --git a/kernel/drivers/vfio/pci/vfio_pci_intrs.c b/kernel/drivers/vfio/pci/vfio_pci_intrs.c index 1f577b4ac..a21d8e1e3 100644 --- a/kernel/drivers/vfio/pci/vfio_pci_intrs.c +++ b/kernel/drivers/vfio/pci/vfio_pci_intrs.c @@ -352,7 +352,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, pci_write_msi_msg(irq, &msg); } - ret = request_irq(irq, vfio_msihandler, 0, + ret = request_irq(irq, vfio_msihandler, IRQF_NO_THREAD, vdev->ctx[vector].name, trigger); if (ret) { kfree(vdev->ctx[vector].name); diff --git a/kernel/kernel/time/hrtimer.c b/kernel/kernel/time/hrtimer.c index 2c6be169b..5d193396e 100644 --- a/kernel/kernel/time/hrtimer.c +++ b/kernel/kernel/time/hrtimer.c @@ -583,6 +583,12 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (hrtimer_callback_running(timer)) return 0; + if (base->cpu_base != cpu_base) + return 0; + + if (cpu_base->in_hrtirq) + return 0; + /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Nothing wrong @@ -613,12 +619,11 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (cpu_base->hang_detected) return 0; + cpu_base->expires_next = expires; /* * Clockevents returns -ETIME, when the event was in the past. 
*/ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; + res = tick_program_event(expires, 1); return res; } diff --git a/kernel/kernel/time/tick-sched.c b/kernel/kernel/time/tick-sched.c index b3841ba00..f61dbf202 100644 --- a/kernel/kernel/time/tick-sched.c +++ b/kernel/kernel/time/tick-sched.c @@ -576,6 +576,20 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -704,22 +718,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, goto out; } - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - goto out; - } else if (!tick_program_event(expires, 0)) - goto out; - /* - * We are past the event already. So we crossed a - * jiffie boundary. Update jiffies and raise the - * softirq. - */ - tick_do_update_jiffies64(ktime_get()); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(expires, 1); + } else { + /* Tick is stopped, but required now. 
Enforce it */ + tick_nohz_restart(ts, now); + } - raise_softirq_irqoff(TIMER_SOFTIRQ); out: ts->next_jiffies = next_jiffies; ts->last_jiffies = last_jiffies; @@ -880,32 +888,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) -{ - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event( - hrtimer_get_expires(&ts->sched_timer), 0)) - break; - } - /* Reread time and update jiffies */ - now = ktime_get(); - tick_do_update_jiffies64(now); - } -} - static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ diff --git a/qemu/hw/i386/kvm/clock.c b/qemu/hw/i386/kvm/clock.c index efdf16584..0593a3f1f 100644 --- a/qemu/hw/i386/kvm/clock.c +++ b/qemu/hw/i386/kvm/clock.c @@ -17,7 +17,7 @@ #include "qemu/host-utils.h" #include "sysemu/sysemu.h" #include "sysemu/kvm.h" -#include "sysemu/cpus.h" +#include "kvm_i386.h" #include "hw/sysbus.h" #include "hw/kvm/clock.h" @@ -125,21 +125,7 @@ static void kvmclock_vm_state_change(void *opaque, int running, return; } - cpu_synchronize_all_states(); - /* In theory, the cpu_synchronize_all_states() call above wouldn't - * affect the rest of the code, as the VCPU state inside CPUState - * is supposed to always match the VCPU state on the kernel side. 
- * - * In practice, calling cpu_synchronize_state() too soon will load the - * kernel-side APIC state into X86CPU.apic_state too early, APIC state - * won't be reloaded later because CPUState.vcpu_dirty==true, and - * outdated APIC state may be migrated to another host. - * - * The real fix would be to make sure outdated APIC state is read - * from the kernel again when necessary. While this is not fixed, we - * need the cpu_clean_all_dirty() call below. - */ - cpu_clean_all_dirty(); + kvm_synchronize_all_tsc(); ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); if (ret < 0) { diff --git a/qemu/migration/savevm.c b/qemu/migration/savevm.c index 60712153f..a42874b10 100644 --- a/qemu/migration/savevm.c +++ b/qemu/migration/savevm.c @@ -945,8 +945,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) qemu_savevm_state_complete(f); ret = qemu_file_get_error(f); } + qemu_savevm_state_cancel(); if (ret != 0) { - qemu_savevm_state_cancel(); error_setg_errno(errp, -ret, "Error while writing VM state"); } return ret; diff --git a/qemu/target-i386/kvm.c b/qemu/target-i386/kvm.c index 066d03d99..721c580ed 100644 --- a/qemu/target-i386/kvm.c +++ b/qemu/target-i386/kvm.c @@ -96,6 +96,51 @@ bool kvm_allows_irq0_override(void) return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing(); } +static int kvm_get_tsc(CPUState *cs) +{ + X86CPU *cpu = X86_CPU(cs); + CPUX86State *env = &cpu->env; + struct { + struct kvm_msrs info; + struct kvm_msr_entry entries[1]; + } msr_data; + int ret; + + if (env->tsc_valid) { + return 0; + } + + msr_data.info.nmsrs = 1; + msr_data.entries[0].index = MSR_IA32_TSC; + env->tsc_valid = !runstate_is_running(); + + ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); + if (ret < 0) { + return ret; + } + + env->tsc = msr_data.entries[0].data; + return 0; +} + +static inline void do_kvm_synchronize_tsc(void *arg) +{ + CPUState *cpu = arg; + + kvm_get_tsc(cpu); +} + +void kvm_synchronize_all_tsc(void) +{ + CPUState *cpu; + + if (kvm_enabled()) { 
+ CPU_FOREACH(cpu) { + run_on_cpu(cpu, do_kvm_synchronize_tsc, cpu); + } + } +} + static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max) { struct kvm_cpuid2 *cpuid; diff --git a/qemu/target-i386/kvm_i386.h b/qemu/target-i386/kvm_i386.h index e557e94f4..c1b312ba2 100644 --- a/qemu/target-i386/kvm_i386.h +++ b/qemu/target-i386/kvm_i386.h @@ -15,6 +15,7 @@ bool kvm_allows_irq0_override(void); bool kvm_has_smm(void); +void kvm_synchronize_all_tsc(void); void kvm_arch_reset_vcpu(X86CPU *cs); void kvm_arch_do_init_vcpu(X86CPU *cs); diff --git a/tests/vm-trace/Makefile b/tests/vm-trace/Makefile new file mode 100644 index 000000000..e367739d4 --- /dev/null +++ b/tests/vm-trace/Makefile @@ -0,0 +1,23 @@ +DEBUG ?= n + +ifeq ($(DEBUG),y) + DBGFLAGS = -g -DDEBUG +else + DBGFLAGS = -O2 +endif + +XTRA_CFLAGS = $(DBGFLAGS) -Wall +XTRALIBS = -lrt -lpthread + +all: vm-trace cscope + +vm-trace: vm-trace.c + $(CC) $(XTRA_CFLAGS) $< -o $@ $(XTRALIBS) + +cscope.out: vm-trace.c + cscope -b + +cscope: cscope.out + +clean: + rm -f *.o core* vm-trace cscope.* diff --git a/tests/vm-trace/README.txt b/tests/vm-trace/README.txt new file mode 100644 index 000000000..815dca93a --- /dev/null +++ b/tests/vm-trace/README.txt @@ -0,0 +1,51 @@ +vm-trace is a tool utilizing the ftrace infrastructure in Linux kernel to +measure VM preemption latencies. For more info about ftrace, see +Documentation/trace/ftrace.txt. See include/linux/ring_buffer.h and +include/linux/ftrace_event.h for data structures used by ftrace. + +The tool enables 2 trace points in KVM driver: +kvm_exit defined in vmx_vcpu_run() (see arch/x86/kvm/vmx.c), and +kvm_entry defined in vcpu_enter_guest() (see arch/x86/kvm/x86.c). + +It then spawns a thread to extract trace data from the kernel ftrace ring +buffer using the splice() system call. Once the tracing duration has elapsed, +vm-trace calculates VM exit-entry latencies based on the timestamps of the +events. 
(A future improvement could be to spawn another thread to process the +trace on the fly to improve vm-trace's performance.) + +To take a trace, do the following: + +1. Run qemu-kvm to start the guest VM +2. Bind each qemu-kvm vCPU thread to an isolated pCPU +3. Start the desired workload on the guest +4. Run vm-trace on the host: + vm-trace -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds + +cpu_to_trace is one of the pCPUs from step 2 above that you want to trace. +vm-trace does not support tracing multiple pCPUs. + +cpu_to_collect_trace is the CPU used to read and save the trace data. +If the host system is NUMA, make sure to assign a CPU in the same NUMA node +as cpu_to_trace to cpu_to_collect_trace. + +A binary file named trace.bin will be saved in the current working directory. +Be aware that, depending on the tracing duration and the type of workload running +on the guest, the file can become quite large. + +vm-trace requires root privileges. + +Some statistics of the events will be displayed similar to the following: + + Number of VM events = 21608832 + Average VM Exit-Entry latency = 1us + Maximum VM Exit-Entry latency = 5us + Maximum cumulative latency within 1ms = 12us + +trace.bin will be overwritten each time vm-trace is run in this mode, +so rename/copy the file if you want to keep it. + +To process a previously collected trace file, run: + vm-trace -f trace_file [-v] + +If -v is specified, all events in the trace file will be displayed. +This is helpful for identifying the cause of long latencies. 
/*
 * tests/vm-trace/vm-trace.c
 *
 * vm-trace: measures the latency between a VM exit and the following VM entry
 * on one physical CPU, using the kvm/kvm_exit and kvm/kvm_entry ftrace events.
 *
 * Collection mode:
 *     vm-trace -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds
 *   Raw ring-buffer pages of cpu_to_trace are spliced into trace.bin by a
 *   helper thread pinned to cpu_to_collect_trace; afterwards the file is
 *   parsed and statistics are printed.
 *
 * Parse mode:
 *     vm-trace -f trace_file [-v]
 *   Prints the statistics of a previously collected file; -v additionally
 *   lists every event.
 */
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <fcntl.h>
#include <limits.h>
#include <time.h>
#include <pthread.h>
#include <sched.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Header written at the start of trace.bin, in front of the raw pages. */
struct file_header
{
    uint32_t signature;     /* TRACE_SIG */
    uint16_t vmentry_id;    /* ftrace id of kvm/kvm_entry at collection time */
    uint16_t vmexit_id;     /* ftrace id of kvm/kvm_exit at collection time */
};

/* Arguments handed to tracing_thread(). */
struct thread_param
{
    int raw_fd;             /* per-CPU trace_pipe_raw */
    int out_fd;             /* output file (trace.bin) */
    int pipefd[2];          /* intermediate pipe required by splice() */
    int cpu;                /* CPU the collecting thread is pinned to */
};

/* One decoded VM entry/exit event. */
struct event
{
    uint64_t timestamp;
    uint16_t type;
};

/* Values of event_entry.type_len; values up to DATA_TYPE_LEN_MAX are data. */
enum rb_type
{
    RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
    RINGBUF_TYPE_PADDING,
    RINGBUF_TYPE_TIME_EXTEND,
    RINGBUF_TYPE_TIME_STAMP
};

/*
 * The two payload structs are deliberately declared BEFORE "#pragma pack(1)"
 * so that they keep their natural member alignment.
 */
struct vmexit_data
{
    uint32_t exit_reason;
    uint64_t guest_rip;
    uint32_t isa;
    uint64_t info1;
    uint64_t info2;
};

struct vmentry_data
{
    uint32_t vcpu_id;
};

#pragma pack(1)

/*
 * Common trace record header followed by the event payload.
 * NOTE(review): the migrate_disable/padding fields presumably mirror the
 * trace_entry layout of the kernel this tool was written for - confirm
 * against the running kernel's events/kvm/kvm_exit/format.
 */
struct trace_data
{
    uint16_t type;
    uint8_t flags;
    uint8_t preempt_count;
    int32_t pid;
    uint16_t migrate_disable;
    uint16_t padding1;
    uint32_t padding2;
    union
    {
        struct vmexit_data vmexit;
        struct vmentry_data vmentry;
    };
};

/* One ring-buffer entry: 32-bit header word followed by its payload. */
struct event_entry
{
    uint32_t type_len: 5, time_delta: 27;
    union
    {
        uint32_t array[1];
        struct trace_data trace;
    };
};

#define DATA_SIZE 4080

/* One raw ring-buffer page as delivered by trace_pipe_raw. */
struct event_page
{
    uint64_t timestamp;
    uint64_t commit;
    union
    {
        uint8_t data[DATA_SIZE];
        struct event_entry event;
    };
};

#define PAGE_SIZE (sizeof(struct event_page))

/* Number of bits of an entry's time_delta; a TIME_EXTEND supplies the rest. */
#define TS_SHIFT 27

#define TRACE_PATH "/sys/kernel/debug/tracing"
#define TRACE_FILE "trace.bin"
#define TRACE_SIG 0xcce96d01

#define VM_ENTRY "kvm/kvm_entry"
#define VM_EXIT "kvm/kvm_exit"

#ifdef DEBUG
#define dbg_printf(_f_,...) do { printf(_f_,##__VA_ARGS__); } while (0)
#else
#define dbg_printf(_f_,...) do { } while (0)
#endif

static uint16_t vmentry_id;
static uint16_t vmexit_id;

/* Written by the main thread, polled by tracing_thread(); accessed atomically. */
static int stop_tracing;
static int verbose;


/*
 * Appends one event to a growable array.
 *
 * The capacity is doubled when exhausted. On allocation failure the array is
 * freed, *events is set to NULL and -1 is returned; otherwise 0.
 */
static int append_event(struct event **events, size_t *count, size_t *cap,
                        uint64_t timestamp, uint16_t type)
{
    if (*count == *cap)
    {
        size_t new_cap = *cap ? *cap * 2 : 1024;
        struct event *tmp = NULL;

        if (new_cap <= SIZE_MAX / sizeof(struct event))
            tmp = realloc(*events, new_cap * sizeof(struct event));

        if (tmp == NULL)
        {
            free(*events);
            *events = NULL;
            return -1;
        }

        *events = tmp;
        *cap = new_cap;
    }

    (*events)[*count].timestamp = timestamp;
    (*events)[*count].type = type;
    *count += 1;

    return 0;
}

/*
 * Reads raw ring-buffer pages from fd (positioned just after the file header)
 * and extracts every VM entry/exit event.
 *
 * fd: file to read pages from.
 * n:  receives the number of events returned.
 *
 * Returns a malloc'ed array of *n events that the caller must free(), or NULL
 * on error or when the file contains no events.
 */
static struct event* read_events(int fd, size_t *n)
{
    /*
     * The page is followed by a zeroed guard area so that decoding an entry
     * that starts close to the end of the page never reads outside this
     * object; zeros decode as "end of data".
     */
    struct
    {
        struct event_page page;
        uint8_t guard[sizeof(struct event_entry) + 2 * sizeof(uint32_t)];
    } buf;
    const uint8_t *data = (const uint8_t *)&buf + offsetof(struct event_page, data);
    const struct event_entry *e;
    struct event *events = NULL;
    size_t cap = 0;
    uint64_t event_time, pre_event_time, pre_page_time = 0;
    uint32_t len, offset;
    uint16_t type;
    ssize_t rd;
    int i, j;

    *n = 0;
    memset(&buf, 0, sizeof(buf));

    for (i = 0; ; i++)
    {
        rd = read(fd, &buf.page, PAGE_SIZE);
        if (rd == 0)
        {
            if (*n == 0)
                fprintf(stderr, "No events found\nMake sure a VM is running and the CPU to trace is bound to a QEMU vCPU thread\n");

            return events;
        }
        else if (rd < 0)
        {
            fprintf(stderr, "Failed to read trace file\n");
            free(events);
            return NULL;
        }
        else if ((size_t)rd < PAGE_SIZE)
        {
            fprintf(stderr, "Trace file does not have enough data\n");
            free(events);
            return NULL;
        }

        dbg_printf("Page %d:\n", i);
        dbg_printf(" timestamp = %" PRIu64 "\n", buf.page.timestamp);
        dbg_printf(" commit = %" PRIu64 "\n", buf.page.commit);

        if (buf.page.timestamp < pre_page_time)
            fprintf(stderr, "Warning: page time going backwards\n");

        pre_page_time = buf.page.timestamp;

        offset = 0;
        pre_event_time = 0;
        for (j = 0; ; j++)
        {
            e = (const struct event_entry *)(data + offset);

            /*
             * type_len == 0 means the length lives in array[0] and the data
             * starts at array[1]; the entry is re-based there. On the zero
             * filled tail of a page this ends up at a record of type 0,
             * which terminates the page below.
             */
            if (e->type_len == 0)
                e = (const struct event_entry *)&e->array[1];

            /* The first entry of a page carries the page timestamp itself. */
            if (pre_event_time)
                event_time = pre_event_time + e->time_delta;
            else
                event_time = buf.page.timestamp;

            len = 0;
            if (e->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
            {
                /* Payload is type_len 32-bit words, plus the header word. */
                len = (e->type_len + 1) * sizeof(uint32_t);
                type = e->trace.type;

                if (type == vmexit_id || type == vmentry_id)
                {
                    if (append_event(&events, n, &cap, event_time, type) < 0)
                    {
                        fprintf(stderr, "Failed to allocate memory\n");
                        *n = 0;
                        return NULL;
                    }

                    if (verbose)
                    {
                        if (type == vmexit_id)
                        {
                            printf(" %" PRIu64 ": VM_EXIT reason: %08x, ", event_time, e->trace.vmexit.exit_reason);
                            printf("info1: %016" PRIx64 ", info2: %08" PRIx64 "\n",
                                   e->trace.vmexit.info1, e->trace.vmexit.info2);
                        }
                        else
                        {
                            printf(" %" PRIu64 ": VM_ENTRY dt: %d, vcpu: %u\n", event_time, e->time_delta, e->trace.vmentry.vcpu_id);
                        }
                    }
                }
                else if (type == 0)
                    break;      /* zero record: no more data in this page */
                else
                    fprintf(stderr, "UNKNOWN event %d\n", type);
            }
            else if (e->type_len == RINGBUF_TYPE_TIME_EXTEND)
            {
                /*
                 * array[0] holds the delta bits above the 27-bit time_delta
                 * field; combine them in 64-bit arithmetic.
                 */
                len = 8;
                event_time = (pre_event_time ? pre_event_time : buf.page.timestamp)
                             + (((uint64_t)e->array[0] << TS_SHIFT) | e->time_delta);
                dbg_printf(" entry %d: TIME_EXTEND %" PRIu64 "\n", j, event_time);
            }
            else if (e->type_len == RINGBUF_TYPE_PADDING)
            {
                /* Padding with a zero delta fills the rest of the page. */
                if (e->time_delta == 0)
                    break;

                len = e->array[0] + sizeof(uint32_t);
                dbg_printf(" entry %d: PADDING, len %u @ %" PRIu64 "\n", j, len, event_time);
            }
            else if (e->type_len == RINGBUF_TYPE_TIME_STAMP)
            {
                len = 16;
                dbg_printf(" entry %d: TIME_STAMP @ %" PRIu64 "\n", j, event_time);
            }

            pre_event_time = event_time;

            /*
             * Stop at the end of the page. A length below one header word
             * can only come from corrupt data and would never advance.
             */
            if (len < sizeof(uint32_t) || len >= DATA_SIZE - offset)
                break;
            offset += len;
        }

        dbg_printf(" events in page %d = %d\n", i, j);
    }

    return events;
}

/*
 * Parses a vm-trace file and prints the event count, the average and maximum
 * VM exit -> VM entry latency, and the maximum latency accumulated within a
 * 1 ms window.
 *
 * Returns 0 on success, -1 on error (a message has been printed).
 */
static int parse_trace_file(char *file)
{
    int fd;
    ssize_t rd;
    uint16_t pre_event = 0;
    struct event *events;
    size_t num_events, i, pairs = 0;
    uint64_t d, exit_time = 0, total = 0, max_lat = 0;
    uint64_t pre_time = 0, acc_1ms = 0, max_acc = 0;
    struct file_header header;

    if ((fd = open(file, O_RDONLY|O_LARGEFILE)) < 0)
    {
        perror(file);
        return -1;
    }

    rd = read(fd, &header, sizeof(header));
    if (rd < 0)
    {
        perror(file);
        close(fd);
        return -1;
    }

    /* A file too short to hold the header cannot be a vm-trace file. */
    if ((size_t)rd != sizeof(header) || header.signature != TRACE_SIG)
    {
        fprintf(stderr, "File %s is not a vm-trace file\n", file);
        close(fd);
        return -1;
    }

    vmentry_id = header.vmentry_id;
    vmexit_id = header.vmexit_id;

    events = read_events(fd, &num_events);
    close(fd);
    if (events == NULL)
        return -1;

    printf("Number of VM events = %zu\n", num_events);

    for (i = 0; i < num_events; i++)
    {
        if (events[i].type == vmexit_id)
        {
            exit_time = events[i].timestamp;
        }
        else if (events[i].type == vmentry_id)
        {
            /* Entries seen before the first exit have no matching exit. */
            if (exit_time)
            {
                d = events[i].timestamp - exit_time;
                if (d > max_lat)
                    max_lat = d;

                total += d;
                acc_1ms += d;
                pairs++;
            }
        }

        if (events[i].type == pre_event)
            fprintf(stderr, "Warning: repeated events\n");
        pre_event = events[i].type;

        /* Close the current window once at least 1 ms (in ns) has elapsed. */
        if (pre_time)
        {
            if (events[i].timestamp - pre_time >= 1000000)
            {
                if (acc_1ms > max_acc)
                    max_acc = acc_1ms;

                acc_1ms = 0;
                pre_time = events[i].timestamp;
            }
        }
        else
            pre_time = events[i].timestamp;
    }

    free(events);

    /* Without a complete exit/entry pair there is nothing to average. */
    printf("Average VM Exit-Entry latency = %" PRIu64 "us\n",
           pairs ? total / pairs / 1000 : (uint64_t)0);
    printf("Maximum VM Exit-Entry latency = %" PRIu64 "us\n", max_lat/1000);
    printf("Maximum cumulative latency within 1ms = %" PRIu64 "us\n", max_acc/1000);

    return 0;
}

/*
 * Reads the numeric ftrace id of an event ("subsystem/name") from
 * TRACE_PATH/events/<event>/id.
 *
 * Returns the id (0..UINT16_MAX), or -1 on error.
 */
static int get_event_id(char *event)
{
    char path[PATH_MAX+1];
    char buf[32];
    char *end;
    long id;
    int fd;
    ssize_t r;

    snprintf(path, sizeof(path), "%s/events/%s/id", TRACE_PATH, event);
    if ((fd = open(path, O_RDONLY)) < 0)
    {
        perror(path);
        return -1;
    }

    if ((r = read(fd, buf, sizeof(buf) - 1)) < 0)
    {
        perror(path);
        close(fd);
        return -1;
    }

    close(fd);

    buf[r] = '\0';

    errno = 0;
    id = strtol(buf, &end, 10);
    /* The id is later stored in a uint16_t, so reject anything wider. */
    if (end == buf || errno == ERANGE || id < 0 || id > UINT16_MAX)
    {
        fprintf(stderr, "%s: invalid event id\n", path);
        return -1;
    }

    return (int)id;
}

/*
 * Enables (en != 0) or disables (en == 0) one ftrace event by writing to
 * TRACE_PATH/events/<event>/enable.
 *
 * Returns 0 on success, -1 on error.
 */
static int enable_event(char *event, int en)
{
    char path[PATH_MAX+1];
    const char *s = en ? "1" : "0";
    int fd;

    snprintf(path, sizeof(path), "%s/events/%s/enable", TRACE_PATH, event);
    if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0)
    {
        perror(path);
        return -1;
    }

    if (write(fd, s, 1) < 0)
    {
        perror(path);
        close(fd);
        return -1;
    }

    close(fd);

    return 0;
}

/* Enables or disables both the VM entry and the VM exit event. */
static int enable_events(int en)
{
    if (enable_event(VM_ENTRY, en) < 0)
        return -1;

    if (enable_event(VM_EXIT, en) < 0)
        return -1;

    return 0;
}

/*
 * Restricts tracing to the given CPU, clears the trace buffer, looks up the
 * ids of the VM entry/exit events and enables them.
 *
 * cpu: CPU to trace; the mask is written as two 32-bit words, so only
 *      CPUs 0..63 are supported.
 *
 * Returns 0 on success, -1 on error.
 */
static int setup_tracing(int cpu)
{
    char path[PATH_MAX+1], mask[20];
    unsigned int h = 0, l = 0;
    int fd, id;

    if (cpu < 0 || cpu > 63)
    {
        fprintf(stderr, "CPU %d is out of range (0-63)\n", cpu);
        return -1;
    }

    if (cpu > 31)
        h = 1u << (cpu-32);
    else
        l = 1u << cpu;

    snprintf(mask, sizeof(mask), "%X,%X", h, l);

    snprintf(path, sizeof(path), "%s/tracing_cpumask", TRACE_PATH);
    if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0)
    {
        perror(path);
        return -1;
    }

    if (write(fd, mask, strlen(mask)) < 0)
    {
        perror(path);
        close(fd);
        return -1;
    }

    close(fd);

    /* Opening "trace" for writing with O_TRUNC discards old trace data. */
    snprintf(path, sizeof(path), "%s/trace", TRACE_PATH);
    if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0)
    {
        perror(path);
        return -1;
    }

    if (write(fd, "", 1) < 0)
    {
        perror(path);
        close(fd);
        return -1;
    }

    close(fd);

    /*
     * Check the result as an int first: the ids are stored in uint16_t
     * globals, where a -1 error return would no longer compare below zero.
     */
    if ((id = get_event_id(VM_ENTRY)) < 0)
        return -1;
    vmentry_id = (uint16_t)id;

    if ((id = get_event_id(VM_EXIT)) < 0)
        return -1;
    vmexit_id = (uint16_t)id;

    if (enable_events(1) < 0)
        return -1;

    return 0;
}

/*
 * Turns tracing off, disables the events and waits for the collecting thread.
 *
 * fd:     open descriptor of TRACE_PATH/tracing_on; closed here.
 * thread: the thread started with tracing_thread().
 */
static void disable_tracing(int fd, pthread_t thread)
{
    if (write(fd, "0", 1) < 0)
        perror("disable_tracing");
    close(fd);

    enable_events(0);

    __atomic_store_n(&stop_tracing, 1, __ATOMIC_RELEASE);
    pthread_join(thread, NULL);
}

/*
 * Thread body: pins itself to param->cpu and moves raw trace pages from
 * raw_fd to out_fd through the pipe until stop_tracing is set.
 * All four descriptors in the parameter block are closed on exit.
 */
static void *tracing_thread(void *param)
{
    cpu_set_t mask;
    struct thread_param *p = param;
    ssize_t r;

    CPU_ZERO(&mask);
    CPU_SET(p->cpu, &mask);
    if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0)
        fprintf(stderr, "Could not set CPU affinity to CPU #%d\n", p->cpu);

    while (!__atomic_load_n(&stop_tracing, __ATOMIC_ACQUIRE))
    {
        /* Non-blocking: EAGAIN or 0 means no complete page is available yet. */
        if ((r = splice(p->raw_fd, NULL, p->pipefd[1], NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK)) < 0)
        {
            if (errno == EAGAIN)
                continue;

            perror("splice1");
            break;
        }
        else if (r == 0)
            continue;

        if (splice(p->pipefd[0], NULL, p->out_fd, NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK) < 0)
        {
            perror("splice2");
            break;
        }
    }

    close(p->raw_fd);
    close(p->pipefd[1]);
    close(p->pipefd[0]);
    close(p->out_fd);

    return NULL;
}

/* Prints the command line synopsis and terminates the program. */
static void usage(char *argv[])
{
    fprintf(stderr, "Usage: %s -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds\n", argv[0]);
    fprintf(stderr, " %s -f trace_file [-v]\n", argv[0]);
    exit(-1);
}

/*
 * Parses a non-negative decimal command line argument.
 * Returns the value, or -1 if s is not a valid number in the range 0..INT_MAX.
 */
static int parse_int_arg(const char *s)
{
    char *end;
    long v;

    errno = 0;
    v = strtol(s, &end, 10);
    if (end == s || *end != '\0' || errno == ERANGE || v < 0 || v > INT_MAX)
        return -1;

    return (int)v;
}

int main(int argc, char *argv[])
{
    char path[PATH_MAX+1], *file = NULL;
    int cpu = -1, ttime = 0;
    int opt, fd, rc;
    pthread_t thread;
    struct file_header header;
    struct thread_param param;
    struct timespec interval, remain;

    param.cpu = -1;

    while ((opt = getopt(argc, argv, "p:c:s:f:v")) != -1)
    {
        switch (opt)
        {
        case 'p':
            cpu = parse_int_arg(optarg);
            break;
        case 'c':
            param.cpu = parse_int_arg(optarg);
            break;
        case 's':
            ttime = parse_int_arg(optarg);
            break;
        case 'f':
            file = optarg;
            break;
        case 'v':
            verbose = 1;
            break;
        default:
            usage(argv);
        }
    }

    if ((cpu < 0 || param.cpu < 0 || ttime <= 0) && file == NULL)
        usage(argv);

    if (file != NULL)
        return parse_trace_file(file);

    /* -v only applies when parsing an existing file. */
    verbose = 0;

    if (setup_tracing(cpu) < 0)
        return -1;

    if ((param.out_fd = open(TRACE_FILE, O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0644)) < 0)
    {
        perror(TRACE_FILE);
        return -1;
    }

    header.signature = TRACE_SIG;
    header.vmentry_id = vmentry_id;
    header.vmexit_id = vmexit_id;

    if (write(param.out_fd, &header, sizeof(struct file_header)) != (ssize_t)sizeof(struct file_header))
    {
        perror(TRACE_FILE);
        return -1;
    }

    snprintf(path, sizeof(path), "%s/per_cpu/cpu%d/trace_pipe_raw", TRACE_PATH, cpu);
    if ((param.raw_fd = open(path, O_RDONLY)) < 0)
    {
        perror(path);
        return -1;
    }

    if (pipe(param.pipefd) < 0)
    {
        perror("pipe");
        return -1;
    }

    snprintf(path, sizeof(path), "%s/tracing_on", TRACE_PATH);
    if ((fd = open(path, O_WRONLY)) < 0)
    {
        perror(path);
        return -1;
    }

    /* pthread_create() returns the error number; it does not set errno. */
    if ((rc = pthread_create(&thread, NULL, tracing_thread, &param)) != 0)
    {
        fprintf(stderr, "pthread_create: %s\n", strerror(rc));
        return -1;
    }

    if (write(fd, "1", 1) < 0)
    {
        perror(path);
        disable_tracing(fd, thread);
        return -1;
    }

    printf("CPU to trace: %d\n", cpu);
    printf("CPU to collect trace: %d\n", param.cpu);
    printf("Duration: %d seconds\n", ttime);

    interval.tv_sec = ttime;
    interval.tv_nsec = 0;
    /* Like pthread_create(), clock_nanosleep() returns the error number. */
    while ((rc = clock_nanosleep(CLOCK_MONOTONIC, 0, &interval, &remain)) == EINTR)
        interval = remain;

    if (rc != 0)
    {
        fprintf(stderr, "clock_nanosleep: %s\n", strerror(rc));
        disable_tracing(fd, thread);
        return -1;
    }

    disable_tracing(fd, thread);

    printf("Processing event file ...\n");

    return parse_trace_file(TRACE_FILE);
}