summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ci/envs/create-rt-tests-rpm.sh30
-rwxr-xr-xci/envs/guest-cmd.sh10
-rwxr-xr-xci/envs/guest-modify.sh142
-rwxr-xr-xci/envs/guest-setup0.sh63
-rwxr-xr-xci/envs/guest-setup1.sh26
-rw-r--r--ci/envs/host-config19
-rwxr-xr-xci/envs/host-run-qemu.sh45
-rwxr-xr-xci/envs/host-setup0.sh75
-rwxr-xr-xci/envs/host-setup1.sh76
-rw-r--r--ci/envs/rt-tests.patch26
-rw-r--r--kernel/arch/x86/configs/opnfv.config28
-rw-r--r--kernel/drivers/vfio/pci/vfio_pci_intrs.c2
-rw-r--r--kernel/kernel/time/hrtimer.c11
-rw-r--r--kernel/kernel/time/tick-sched.c64
-rw-r--r--qemu/hw/i386/kvm/clock.c18
-rw-r--r--qemu/migration/savevm.c2
-rw-r--r--qemu/target-i386/kvm.c45
-rw-r--r--qemu/target-i386/kvm_i386.h1
-rw-r--r--tests/vm-trace/Makefile23
-rw-r--r--tests/vm-trace/README.txt51
-rw-r--r--tests/vm-trace/vm-trace.c632
21 files changed, 1316 insertions, 73 deletions
diff --git a/ci/envs/create-rt-tests-rpm.sh b/ci/envs/create-rt-tests-rpm.sh
new file mode 100644
index 000000000..96fef2bcd
--- /dev/null
+++ b/ci/envs/create-rt-tests-rpm.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+usage ()
+{
+ echo "$0 rpmdir"
+ exit 1
+}
+
+# rpmdir must be given and exist; make it absolute because we cd into a temp dir below
+test $# -eq 1 && rpmdir=$(cd "$1" && pwd) || usage
+rm -rf ${rpmdir}/rt-tests-0.96-1.el7.centos.x86_64.rpm
+gitdir=`mktemp -d`
+ROOTDIR=$(cd $(dirname "$0")/../.. && pwd)
+VERSION=v0.96
+cd $gitdir
+git clone https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git
+cd rt-tests
+git checkout -b ${VERSION} ${VERSION}
+patch -p1 -i ${ROOTDIR}/ci/envs/rt-tests.patch
+make HAVE_PARSE_CPUSTRING_ALL=1 rpm
+cp ./RPMS/x86_64/rt-tests-0.96-1.el7.centos.x86_64.rpm $rpmdir
+rm -rf $gitdir
diff --git a/ci/envs/guest-cmd.sh b/ci/envs/guest-cmd.sh
new file mode 100755
index 000000000..abfa51a40
--- /dev/null
+++ b/ci/envs/guest-cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
diff --git a/ci/envs/guest-modify.sh b/ci/envs/guest-modify.sh
new file mode 100755
index 000000000..1208dd37e
--- /dev/null
+++ b/ci/envs/guest-modify.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+
+##############################################################################
+# Copyright (c) 2015 Ericsson AB and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+# This is a copy of yardstick-img-modify from the yardstick project. Currently
+# the yardstick script only supports Ubuntu images; this one targets CentOS.
+# Example invocation:
+# yardstick-img-modify /home/yardstick/tools/ubuntu-server-cloudimg-modify.sh
+#
+# Warning: the script will create files by default in:
+# /tmp/workspace/yardstick
+# the files will be owned by root!
+#
+# TODO: image resize is needed if the base image is too small
+#
+
+set -e
+set -x
+
+die() {
+ echo "error: $1" >&2
+ exit 1
+}
+
+usage () {
+ echo "$0 cmd workspace"
+ exit 1
+}
+
+test $# -eq 2 || usage
+test $(id -u) -eq 0 || die "should invoke using sudo"
+
+ROOTDIR=$(cd $(dirname "$0")/../.. && pwd)
+cmd=$1
+test -x $cmd
+workspace=$2
+mountdir=`mktemp -d`
+
+image_url=${IMAGE_URL:-"http://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud-1510.qcow2"}
+md5sums_url=${MD5SUMS_URL:-"http://cloud.centos.org/centos/7/images/sha256sum.txt"}
+
+imgfile="${workspace}/guest.img"
+raw_imgfile="${workspace}/guest.raw"
+filename=$(basename $image_url)
+md5filename=$(basename $md5sums_url)
+
+# download and checksum base image, conditionally if local copy is outdated
+download() {
+ test -d $workspace || mkdir -p $workspace
+ cd $workspace
+ rm -f $md5filename # always download the checksum file to detect a stale image
+ wget $md5sums_url
+ test -e $filename || wget -nc $image_url
+ # Check inside "if": under "set -e" a bare failing pipeline would abort before the retry
+ if ! grep "$filename\$" $md5filename | sha256sum -c; then
+ rm $filename
+ wget -nc $image_url
+ grep "$filename\$" $md5filename | sha256sum -c
+ fi
+ rm -rf $raw_imgfile
+ qemu-img convert $filename $raw_imgfile
+ cd -
+}
+
+# mount image
+setup() {
+ mkdir -p $mountdir
+
+ loopdevice=$(kpartx -l $raw_imgfile | head -1 | cut -f1 -d ' ')
+
+ kpartx -a $raw_imgfile
+ # No idea why need this sleep
+ sleep 3
+ mount /dev/mapper/$loopdevice $mountdir
+
+ cp $cmd "$mountdir/"
+}
+
+# modify image: install the ssh public key, unmount it and convert it back to qcow2
+modify() {
+ # Add the ssh key to the image
+ mkdir -p ${mountdir}/root/.ssh
+ cp ${ROOTDIR}/ci/envs/kvm4nfv_key.pub ${mountdir}/root/.ssh/authorized_keys
+ chmod 700 ${mountdir}/root/.ssh
+ chmod 600 ${mountdir}/root/.ssh/authorized_keys
+
+
+ umount $mountdir
+
+ qemu-img convert -O qcow2 $raw_imgfile $imgfile
+}
+
+# cleanup (umount) the image
+cleanup() {
+ # designed to be idempotent
+ mount | grep $mountdir && umount $mountdir
+ kpartx -d $raw_imgfile || true
+ rm -f $raw_imgfile
+ rm -rf $mountdir
+}
+
+exitcode=""
+error_trap()
+{
+ local rc=$?
+
+ set +e
+
+ if [ -z "$exitcode" ]; then
+ exitcode=$rc
+ fi
+
+ cleanup
+
+ echo "Image build failed with $exitcode"
+
+ exit $exitcode
+}
+
+main() {
+ cleanup
+
+ trap "error_trap" EXIT SIGTERM
+
+ download
+ setup
+ modify
+
+ trap - EXIT SIGTERM
+ cleanup
+
+ echo "the modified image is found here: $imgfile"
+}
+
+main
diff --git a/ci/envs/guest-setup0.sh b/ci/envs/guest-setup0.sh
new file mode 100755
index 000000000..490bd570d
--- /dev/null
+++ b/ci/envs/guest-setup0.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+
+rpmdir=${1:-"/root/workspace/"}
+rpmpat="kernel-4.1*.rpm"
+rpm -ihv ${rpmdir}/rt-tests-0.96-1.el7.centos.x86_64.rpm
+guest_isolcpus=1
+
+# The script's caller should pass the rpm directory that is built by
+# build.sh. The default rpmdir is the one used by the yardstick scripts.
+install_kernel () {
+ # Install the kernel rpm
+ filenum=`ls -l ${rpmdir}/${rpmpat} |wc -l`
+ if [ $filenum -eq 0 ]
+ then
+ echo "No kernel rpm found in workspace/rpm"
+ exit 1
+ elif [ $filenum -gt 1 ]
+ then
+ echo "Multiple kernel rpm found in workspace/rpm"
+ exit 1
+ else
+ krpm=`find "${rpmdir}" -name "${rpmpat}"`
+ rpm -ihv $krpm
+ fi
+}
+
+config_grub () {
+ key=$1
+ val=$2
+ # Set key=val in /etc/default/grub: replace an existing "key=" or append to GRUB_CMDLINE_LINUX
+ if grep '[" ]'${key}= /etc/default/grub > /dev/null ; then
+ sed -i 's/\([" ]\)'${key}'=[^ "]*/\1'${key}'='${val}'/' /etc/default/grub
+ else
+ sed -i 's/GRUB_CMDLINE_LINUX="\(.*\)"/GRUB_CMDLINE_LINUX="\1 '${key}'='${val}'"/' /etc/default/grub
+ fi
+}
+
+# Isolate CPUs from the general scheduler
+config_grub 'isolcpus' ${guest_isolcpus}
+
+# Stop timer ticks on isolated CPUs whenever possible
+config_grub 'nohz_full' ${guest_isolcpus}
+
+# Disable machine check
+config_grub 'mce' 'off'
+
+# Use polling idle loop to improve performance
+config_grub 'idle' 'poll'
+
+## Disable clocksource verification at runtime
+config_grub 'tsc' 'reliable'
+
+grub2-mkconfig -o /boot/grub2/grub.cfg
+install_kernel
diff --git a/ci/envs/guest-setup1.sh b/ci/envs/guest-setup1.sh
new file mode 100755
index 000000000..678baa43b
--- /dev/null
+++ b/ci/envs/guest-setup1.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+set_irq_affinity () {
+ for irq in /proc/irq/* ; do # only per-IRQ directories have smp_affinity_list
+ [ -d ${irq} ] && echo 0 > ${irq}/smp_affinity_list
+ done
+}
+
+# Disable watchdogs to reduce overhead
+echo 0 > /proc/sys/kernel/watchdog
+echo 0 > /proc/sys/kernel/nmi_watchdog
+
+# Route device interrupts to non-RT CPU
+set_irq_affinity
+
+# Disable RT throttling
+echo -1 > /proc/sys/kernel/sched_rt_period_us
+echo -1 > /proc/sys/kernel/sched_rt_runtime_us
diff --git a/ci/envs/host-config b/ci/envs/host-config
new file mode 100644
index 000000000..ce6243ce0
--- /dev/null
+++ b/ci/envs/host-config
@@ -0,0 +1,19 @@
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+# Isolated cpus for nfv, must be delimited with ','
+host_isolcpus=3,4
+
+# Number of huge pages to create and on which NUMA node
+numa_node=0
+huge_pages=2
+
+# QEMU executable path and number of cpus for guest
+qemu=/usr/libexec/qemu-kvm
+guest_cpus=2
diff --git a/ci/envs/host-run-qemu.sh b/ci/envs/host-run-qemu.sh
new file mode 100755
index 000000000..c7a2fecc6
--- /dev/null
+++ b/ci/envs/host-run-qemu.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+source host-config
+
+cpumask () {
+ m=$((1<<${1}))
+ printf 0x%x ${m}
+}
+
+qmp_sock="/tmp/qmp-sock-$$"
+
+${qemu} -smp ${guest_cpus} -monitor unix:${qmp_sock},server,nowait -daemonize \
+ -cpu host,migratable=off,+invtsc,+tsc-deadline,pmu=off \
+ -realtime mlock=on -mem-prealloc -enable-kvm -m 1G \
+ -mem-path /mnt/hugetlbfs-1g \
+ -drive file=/root/workspace/image/guest.img,cache=none,aio=threads \
+ -netdev user,id=guest0,hostfwd=tcp::5555-:22 \
+ -device virtio-net-pci,netdev=guest0 \
+ -nographic -serial /dev/null -parallel /dev/null
+
+i=0
+for c in `echo ${host_isolcpus} | sed 's/,/ /g'` ; do
+ cpu[$i]=${c}
+ i=`expr $i + 1`
+done
+
+threads=`echo "info cpus" | nc -U ${qmp_sock} | grep thread_id | cut -d= -f3`
+
+# Bind QEMU processor threads to RT CPUs
+i=0
+for tid in ${threads} ; do
+ tid=`printf %d ${tid}` # this is required to get rid of cr at end
+ mask=`cpumask ${cpu[$i]}`
+ taskset -p ${mask} ${tid}
+ i=`expr $i + 1`
+done
diff --git a/ci/envs/host-setup0.sh b/ci/envs/host-setup0.sh
new file mode 100755
index 000000000..79d1f585a
--- /dev/null
+++ b/ci/envs/host-setup0.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+source host-config
+
+rpmdir=${1:-"/root/workspace/rpm/"}
+rpmpat="kernel-4.1*.rpm"
+
+config_grub () {
+ key=$1
+ val=$2
+ # Set key=val in /etc/default/grub: replace an existing "key=" or append to GRUB_CMDLINE_LINUX
+ if grep '[" ]'${key}= /etc/default/grub > /dev/null ; then
+ sed -i 's/\([" ]\)'${key}'=[^ "]*/\1'${key}'='${val}'/' /etc/default/grub
+ else
+ sed -i 's/GRUB_CMDLINE_LINUX="\(.*\)"/GRUB_CMDLINE_LINUX="\1 '${key}'='${val}'"/' /etc/default/grub
+ fi
+}
+
+# The script's caller should pass the rpm directory that is built by
+# build.sh. The default rpmdir is the one used by the yardstick scripts.
+install_kernel () {
+ # Install the kernel rpm
+ filenum=`ls -l ${rpmdir}/${rpmpat} |wc -l`
+ if [ $filenum -eq 0 ]
+ then
+ echo "No kernel rpm found in workspace/rpm"
+ exit 1
+ elif [ $filenum -gt 1 ]
+ then
+ echo "Multiple kernel rpm found in workspace/rpm"
+ exit 1
+ else
+ krpm=`find "${rpmdir}" -name "${rpmpat}"`
+ rpm -ihv $krpm
+ fi
+}
+
+# Isolate CPUs from the general scheduler
+config_grub 'isolcpus' ${host_isolcpus}
+
+# Stop timer ticks on isolated CPUs whenever possible
+config_grub 'nohz_full' ${host_isolcpus}
+
+# Do not call RCU callbacks on isolated CPUs
+config_grub 'rcu_nocbs' ${host_isolcpus}
+
+# Enable intel iommu driver and disable DMA translation for devices
+config_grub 'iommu' 'pt'
+config_grub 'intel_iommu' 'on'
+
+# Set HugeTLB pages to 1GB
+config_grub 'default_hugepagesz' '1G'
+config_grub 'hugepagesz' '1G'
+
+# Disable machine check
+config_grub 'mce' 'off'
+
+## Use polling idle loop to improve performance
+config_grub 'idle' 'poll'
+
+## Disable clocksource verification at runtime
+config_grub 'tsc' 'reliable'
+
+grub2-mkconfig -o /boot/grub2/grub.cfg
+
+install_kernel
diff --git a/ci/envs/host-setup1.sh b/ci/envs/host-setup1.sh
new file mode 100755
index 000000000..3d2de6ddf
--- /dev/null
+++ b/ci/envs/host-setup1.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+##############################################################################
+## Copyright (c) 2015 Intel Corp.
+##
+## All rights reserved. This program and the accompanying materials
+## are made available under the terms of the Apache License, Version 2.0
+## which accompanies this distribution, and is available at
+## http://www.apache.org/licenses/LICENSE-2.0
+###############################################################################
+
+source host-config
+
+
+##############################
+# Create 1GB pages for guest #
+##############################
+
+hugepage_size=`cat /proc/meminfo |grep Hugepagesize |tr -s " "| cut -f 2 -d " "`
+if [[ $hugepage_size -ne 1048576 ]]
+then
+ echo "Need 1G huge page support for performance benefit"
+ exit 1
+fi
+
+mkdir -p /mnt/hugetlbfs-1g
+mount -t hugetlbfs hugetlbfs /mnt/hugetlbfs-1g -osize=1G
+
+hugepage_dir="/sys/devices/system/node/node${numa_node}/hugepages/hugepages-1048576kB/nr_hugepages"
+# Add the requested pages to those already reserved (arithmetic; "+=" would append the digits)
+huge_pages=$((huge_pages + `cat $hugepage_dir`))
+echo ${huge_pages} > ${hugepage_dir}
+
+############################
+# RT optimization #
+############################
+# Disable watchdogs to reduce overhead
+echo 0 > /proc/sys/kernel/watchdog
+echo 0 > /proc/sys/kernel/nmi_watchdog
+
+# Change RT priority of ksoftirqd and rcuc kernel threads on isolated CPUs
+i=0
+for c in `echo $host_isolcpus | sed 's/,/ /g'` ; do
+ tid=`pgrep -a ksoftirq | grep "ksoftirqd/${c}$" | cut -d ' ' -f 1`
+ chrt -fp 2 ${tid}
+
+ tid=`pgrep -a rcuc | grep "rcuc/${c}$" | cut -d ' ' -f 1`
+ chrt -fp 3 ${tid}
+
+ cpu[$i]=${c}
+ i=`expr $i + 1`
+done
+
+# Change RT priority of rcub kernel threads
+for tid in `pgrep -a rcub | cut -d ' ' -f 1` ; do
+ chrt -fp 3 ${tid}
+done
+
+# Disable RT throttling
+echo -1 > /proc/sys/kernel/sched_rt_period_us
+echo -1 > /proc/sys/kernel/sched_rt_runtime_us
+
+# Reroute interrupts bound to isolated CPUs to CPU 0 (whole-number match: CPU 1 is not CPU 13)
+for irq in /proc/irq/* ; do
+ if [ -d ${irq} ] && ! grep - ${irq}/smp_affinity_list > /dev/null ; then
+ al=`cat ${irq}/smp_affinity_list`
+ if [[ " ${cpu[*]} " == *" ${al} "* ]] ; then
+ echo 0 > ${irq}/smp_affinity_list
+ fi
+ fi
+done
+
+# Change the iptable so that we can ssh to the guest remotely
+iptables -I INPUT -p tcp --dport 5555 -j ACCEPT
+# TODO: download guest disk image from artifactory
+
diff --git a/ci/envs/rt-tests.patch b/ci/envs/rt-tests.patch
new file mode 100644
index 000000000..b938e0768
--- /dev/null
+++ b/ci/envs/rt-tests.patch
@@ -0,0 +1,26 @@
+/*******************************************************************************
+ * * Copyright (c) 2015 Intel Corp.
+ * *
+ * * All rights reserved. This program and the accompanying materials
+ * * are made available under the terms of the Apache License, Version 2.0
+ * * which accompanies this distribution, and is available at
+ * * http://www.apache.org/licenses/LICENSE-2.0
+ * *******************************************************************************/
+
+diff --git a/Makefile b/Makefile
+index 1e4b7d1b0d3a..98968b94a57f 100644
+--- a/Makefile
++++ b/Makefile
+@@ -198,10 +198,10 @@ release: distclean changelog
+ cp -r Makefile COPYING ChangeLog MAINTAINERS doc README.markdown src tmp/rt-tests
+ rm -f rt-tests-$(VERSION).tar rt-tests-$(VERSION).tar.asc
+ tar -C tmp -cf rt-tests-$(VERSION).tar rt-tests
+- gpg2 --default-key clrkwllms@kernel.org --detach-sign --armor rt-tests-$(VERSION).tar
++ #gpg2 --default-key clrkwllms@kernel.org --detach-sign --armor rt-tests-$(VERSION).tar
+ gzip rt-tests-$(VERSION).tar
+ rm -f ChangeLog
+- cp rt-tests-$(VERSION).tar.gz rt-tests-$(VERSION).tar.asc releases
++ cp rt-tests-$(VERSION).tar.gz releases
+
+ .PHONY: tarball
+ tarball:
diff --git a/kernel/arch/x86/configs/opnfv.config b/kernel/arch/x86/configs/opnfv.config
index 704af497d..462858c57 100644
--- a/kernel/arch/x86/configs/opnfv.config
+++ b/kernel/arch/x86/configs/opnfv.config
@@ -74,7 +74,7 @@ CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_CROSS_MEMORY_ATTACH=y
-# CONFIG_FHANDLE is not set
+CONFIG_FHANDLE=y
CONFIG_USELIB=y
# CONFIG_AUDIT is not set
CONFIG_HAVE_ARCH_AUDITSYSCALL=y
@@ -734,15 +734,15 @@ CONFIG_IP_PNP_BOOTP=y
CONFIG_IP_PNP_RARP=y
# CONFIG_NET_IPIP is not set
# CONFIG_NET_IPGRE_DEMUX is not set
-# CONFIG_NET_IP_TUNNEL is not set
+CONFIG_NET_IP_TUNNEL=y
CONFIG_IP_MROUTE=y
# CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
CONFIG_SYN_COOKIES=y
-# CONFIG_NET_UDP_TUNNEL is not set
+CONFIG_NET_UDP_TUNNEL=y
# CONFIG_NET_FOU is not set
-# CONFIG_GENEVE is not set
+CONFIG_GENEVE=y
# CONFIG_INET_AH is not set
# CONFIG_INET_ESP is not set
# CONFIG_INET_IPCOMP is not set
@@ -972,11 +972,15 @@ CONFIG_NET_SCH_FIFO=y
# CONFIG_DCB is not set
CONFIG_DNS_RESOLVER=y
# CONFIG_BATMAN_ADV is not set
-# CONFIG_OPENVSWITCH is not set
+CONFIG_OPENVSWITCH=m
+CONFIG_OPENVSWITCH_VXLAN=m
+CONFIG_OPENVSWITCH_GENEVE=m
# CONFIG_VSOCKETS is not set
# CONFIG_NETLINK_MMAP is not set
# CONFIG_NETLINK_DIAG is not set
-# CONFIG_MPLS is not set
+CONFIG_MPLS=y
+CONFIG_NET_MPLS_GSO=m
+# CONFIG_MPLS_ROUTING is not set
# CONFIG_HSR is not set
# CONFIG_NET_SWITCHDEV is not set
CONFIG_RPS=y
@@ -1352,7 +1356,7 @@ CONFIG_NET_CORE=y
# CONFIG_NET_TEAM is not set
# CONFIG_MACVLAN is not set
# CONFIG_IPVLAN is not set
-# CONFIG_VXLAN is not set
+CONFIG_VXLAN=y
CONFIG_NETCONSOLE=y
CONFIG_NETPOLL=y
CONFIG_NET_POLL_CONTROLLER=y
@@ -1448,10 +1452,12 @@ CONFIG_IGB=y
CONFIG_IGBVF=y
CONFIG_IXGB=y
CONFIG_IXGBE=y
+CONFIG_IXGBE_VXLAN=y
# CONFIG_IXGBE_HWMON is not set
-# CONFIG_IXGBEVF is not set
+CONFIG_IXGBEVF=y
CONFIG_I40E=y
-# CONFIG_I40EVF is not set
+CONFIG_I40E_VXLAN=y
+CONFIG_I40EVF=y
# CONFIG_FM10K is not set
CONFIG_NET_VENDOR_I825XX=y
# CONFIG_IP1000 is not set
@@ -2373,7 +2379,7 @@ CONFIG_FB=y
# CONFIG_FIRMWARE_EDID is not set
CONFIG_FB_CMDLINE=y
# CONFIG_FB_DDC is not set
-# CONFIG_FB_BOOT_VESA_SUPPORT is not set
+CONFIG_FB_BOOT_VESA_SUPPORT=y
CONFIG_FB_CFB_FILLRECT=y
CONFIG_FB_CFB_COPYAREA=y
CONFIG_FB_CFB_IMAGEBLIT=y
@@ -2400,7 +2406,7 @@ CONFIG_FB_TILEBLITTING=y
# CONFIG_FB_IMSTT is not set
# CONFIG_FB_VGA16 is not set
# CONFIG_FB_UVESA is not set
-# CONFIG_FB_VESA is not set
+CONFIG_FB_VESA=y
# CONFIG_FB_N411 is not set
# CONFIG_FB_HGA is not set
# CONFIG_FB_OPENCORES is not set
diff --git a/kernel/drivers/vfio/pci/vfio_pci_intrs.c b/kernel/drivers/vfio/pci/vfio_pci_intrs.c
index 1f577b4ac..a21d8e1e3 100644
--- a/kernel/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/kernel/drivers/vfio/pci/vfio_pci_intrs.c
@@ -352,7 +352,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
pci_write_msi_msg(irq, &msg);
}
- ret = request_irq(irq, vfio_msihandler, 0,
+ ret = request_irq(irq, vfio_msihandler, IRQF_NO_THREAD,
vdev->ctx[vector].name, trigger);
if (ret) {
kfree(vdev->ctx[vector].name);
diff --git a/kernel/kernel/time/hrtimer.c b/kernel/kernel/time/hrtimer.c
index 2c6be169b..5d193396e 100644
--- a/kernel/kernel/time/hrtimer.c
+++ b/kernel/kernel/time/hrtimer.c
@@ -583,6 +583,12 @@ static int hrtimer_reprogram(struct hrtimer *timer,
if (hrtimer_callback_running(timer))
return 0;
+ if (base->cpu_base != cpu_base)
+ return 0;
+
+ if (cpu_base->in_hrtirq)
+ return 0;
+
/*
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Nothing wrong
@@ -613,12 +619,11 @@ static int hrtimer_reprogram(struct hrtimer *timer,
if (cpu_base->hang_detected)
return 0;
+ cpu_base->expires_next = expires;
/*
* Clockevents returns -ETIME, when the event was in the past.
*/
- res = tick_program_event(expires, 0);
- if (!IS_ERR_VALUE(res))
- cpu_base->expires_next = expires;
+ res = tick_program_event(expires, 1);
return res;
}
diff --git a/kernel/kernel/time/tick-sched.c b/kernel/kernel/time/tick-sched.c
index b3841ba00..f61dbf202 100644
--- a/kernel/kernel/time/tick-sched.c
+++ b/kernel/kernel/time/tick-sched.c
@@ -576,6 +576,20 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+ hrtimer_cancel(&ts->sched_timer);
+ hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+ /* Forward the time to expire in the future */
+ hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -704,22 +718,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
goto out;
}
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start(&ts->sched_timer, expires,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- goto out;
- } else if (!tick_program_event(expires, 0))
- goto out;
- /*
- * We are past the event already. So we crossed a
- * jiffie boundary. Update jiffies and raise the
- * softirq.
- */
- tick_do_update_jiffies64(ktime_get());
+ if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+ hrtimer_start(&ts->sched_timer, expires,
+ HRTIMER_MODE_ABS_PINNED);
+ else
+ tick_program_event(expires, 1);
+ } else {
+ /* Tick is stopped, but required now. Enforce it */
+ tick_nohz_restart(ts, now);
+
}
- raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
ts->next_jiffies = next_jiffies;
ts->last_jiffies = last_jiffies;
@@ -880,32 +888,6 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
-static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
-{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
-
- while (1) {
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
-
- if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED);
- /* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
- break;
- } else {
- if (!tick_program_event(
- hrtimer_get_expires(&ts->sched_timer), 0))
- break;
- }
- /* Reread time and update jiffies */
- now = ktime_get();
- tick_do_update_jiffies64(now);
- }
-}
-
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
diff --git a/qemu/hw/i386/kvm/clock.c b/qemu/hw/i386/kvm/clock.c
index efdf16584..0593a3f1f 100644
--- a/qemu/hw/i386/kvm/clock.c
+++ b/qemu/hw/i386/kvm/clock.c
@@ -17,7 +17,7 @@
#include "qemu/host-utils.h"
#include "sysemu/sysemu.h"
#include "sysemu/kvm.h"
-#include "sysemu/cpus.h"
+#include "kvm_i386.h"
#include "hw/sysbus.h"
#include "hw/kvm/clock.h"
@@ -125,21 +125,7 @@ static void kvmclock_vm_state_change(void *opaque, int running,
return;
}
- cpu_synchronize_all_states();
- /* In theory, the cpu_synchronize_all_states() call above wouldn't
- * affect the rest of the code, as the VCPU state inside CPUState
- * is supposed to always match the VCPU state on the kernel side.
- *
- * In practice, calling cpu_synchronize_state() too soon will load the
- * kernel-side APIC state into X86CPU.apic_state too early, APIC state
- * won't be reloaded later because CPUState.vcpu_dirty==true, and
- * outdated APIC state may be migrated to another host.
- *
- * The real fix would be to make sure outdated APIC state is read
- * from the kernel again when necessary. While this is not fixed, we
- * need the cpu_clean_all_dirty() call below.
- */
- cpu_clean_all_dirty();
+ kvm_synchronize_all_tsc();
ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
if (ret < 0) {
diff --git a/qemu/migration/savevm.c b/qemu/migration/savevm.c
index 60712153f..a42874b10 100644
--- a/qemu/migration/savevm.c
+++ b/qemu/migration/savevm.c
@@ -945,8 +945,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
qemu_savevm_state_complete(f);
ret = qemu_file_get_error(f);
}
+ qemu_savevm_state_cancel();
if (ret != 0) {
- qemu_savevm_state_cancel();
error_setg_errno(errp, -ret, "Error while writing VM state");
}
return ret;
diff --git a/qemu/target-i386/kvm.c b/qemu/target-i386/kvm.c
index 066d03d99..721c580ed 100644
--- a/qemu/target-i386/kvm.c
+++ b/qemu/target-i386/kvm.c
@@ -96,6 +96,51 @@ bool kvm_allows_irq0_override(void)
return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
}
+static int kvm_get_tsc(CPUState *cs)
+{
+ X86CPU *cpu = X86_CPU(cs);
+ CPUX86State *env = &cpu->env;
+ struct {
+ struct kvm_msrs info;
+ struct kvm_msr_entry entries[1];
+ } msr_data;
+ int ret;
+
+ if (env->tsc_valid) {
+ return 0;
+ }
+
+ msr_data.info.nmsrs = 1;
+ msr_data.entries[0].index = MSR_IA32_TSC;
+ env->tsc_valid = !runstate_is_running();
+
+ ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
+ if (ret < 0) {
+ return ret;
+ }
+
+ env->tsc = msr_data.entries[0].data;
+ return 0;
+}
+
+static inline void do_kvm_synchronize_tsc(void *arg)
+{
+ CPUState *cpu = arg;
+
+ kvm_get_tsc(cpu);
+}
+
+void kvm_synchronize_all_tsc(void)
+{
+ CPUState *cpu;
+
+ if (kvm_enabled()) {
+ CPU_FOREACH(cpu) {
+ run_on_cpu(cpu, do_kvm_synchronize_tsc, cpu);
+ }
+ }
+}
+
static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
struct kvm_cpuid2 *cpuid;
diff --git a/qemu/target-i386/kvm_i386.h b/qemu/target-i386/kvm_i386.h
index e557e94f4..c1b312ba2 100644
--- a/qemu/target-i386/kvm_i386.h
+++ b/qemu/target-i386/kvm_i386.h
@@ -15,6 +15,7 @@
bool kvm_allows_irq0_override(void);
bool kvm_has_smm(void);
+void kvm_synchronize_all_tsc(void);
void kvm_arch_reset_vcpu(X86CPU *cs);
void kvm_arch_do_init_vcpu(X86CPU *cs);
diff --git a/tests/vm-trace/Makefile b/tests/vm-trace/Makefile
new file mode 100644
index 000000000..e367739d4
--- /dev/null
+++ b/tests/vm-trace/Makefile
@@ -0,0 +1,23 @@
+DEBUG ?= n
+
+ifeq ($(DEBUG),y)
+ DBGFLAGS = -g -DDEBUG
+else
+ DBGFLAGS = -O2
+endif
+
+XTRA_CFLAGS = $(DBGFLAGS) -Wall
+XTRALIBS = -lrt -lpthread
+
+all: vm-trace cscope
+
+vm-trace: vm-trace.c
+ $(CC) $(XTRA_CFLAGS) $< -o $@ $(XTRALIBS)
+
+cscope.out: vm-trace.c
+ cscope -b
+
+cscope: cscope.out
+
+clean:
+ rm -f *.o core* vm-trace cscope.*
diff --git a/tests/vm-trace/README.txt b/tests/vm-trace/README.txt
new file mode 100644
index 000000000..815dca93a
--- /dev/null
+++ b/tests/vm-trace/README.txt
@@ -0,0 +1,51 @@
+vm-trace is a tool utilizing the ftrace infrastructure in Linux kernel to
+measure VM preemption latencies. For more info about ftrace, see
+Documentation/trace/ftrace.txt. See include/linux/ring_buffer.h and
+include/linux/ftrace_event.h for data structures used by ftrace.
+
+The tool enables 2 trace points in KVM driver:
+kvm_exit defined in vmx_vcpu_run() (see arch/x86/kvm/vmx.c), and
+kvm_entry defined in vcpu_enter_guest() (see arch/x86/kvm/x86.c).
+
+It then spawns a thread to extract trace data from the kernel ftrace ring
+buffer using the splice() system call. Once the tracing duration has elapsed,
+vm-trace calculates VM exit-entry latencies based on the timestamps of the
+events. (A future improvement could be to spawn another thread to process the
+trace on the fly to improve vm-trace's performance.)
+
+To take a trace, do the following:
+
+1. Run qemu-kvm to start guest VM
+2. Bind each qemu-kvm vCPU thread to an isolated pCPU
+3. Start desired workload on the guest
+4. Run vm-trace on the host:
+ vm-trace -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds
+
+cpu_to_trace is one of the pCPUs from step 2 above that you want to trace.
+vm-trace does not support tracing multiple pCPUs.
+
+cpu_to_collect_trace is the CPU used to read and save the trace data.
+If the host system is NUMA, make sure to assign a CPU in the same NUMA node
+as cpu_to_trace to cpu_to_collect_trace.
+
+A binary file named trace.bin will be saved in the current working directory.
+Be aware that, depending on the tracing duration and type of workload running
+on the guest, the file can become quite large.
+
+vm-trace requires root privileges.
+
+Some statistics of the events will be displayed similar to the following:
+
+ Number of VM events = 21608832
+ Average VM Exit-Entry latency = 1us
+ Maximum VM Exit-Entry latency = 5us
+ Maximum cumulative latency within 1ms = 12us
+
+trace.bin will be overwritten each time vm-trace is run in this mode,
+so rename/copy the file if you want to keep it.
+
+To process a previously collected trace file, run:
+ vm-trace -f trace_file [-v]
+
+If -v is specified, all events in the trace file will be displayed.
+This is helpful for identifying the cause of long latencies.
diff --git a/tests/vm-trace/vm-trace.c b/tests/vm-trace/vm-trace.c
new file mode 100644
index 000000000..32b4d8f74
--- /dev/null
+++ b/tests/vm-trace/vm-trace.c
@@ -0,0 +1,632 @@
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <time.h>
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/*
+ * Header stored at the beginning of a vm-trace output file.
+ * main() writes it before any trace data; parse_trace_file() reads it back
+ * to validate the file and to recover the event IDs used during capture.
+ */
+struct file_header
+{
+ uint32_t signature; /* TRACE_SIG for a valid vm-trace file */
+ uint16_t vmentry_id; /* ftrace event ID of kvm/kvm_entry at capture time */
+ uint16_t vmexit_id; /* ftrace event ID of kvm/kvm_exit at capture time */
+};
+
+/*
+ * Arguments handed to tracing_thread(). The thread closes all of the file
+ * descriptors below when it exits.
+ */
+struct thread_param
+{
+ int raw_fd; /* per-CPU trace_pipe_raw opened by main() (splice source) */
+ int out_fd; /* output trace file (splice destination) */
+ int pipefd[2]; /* pipe used as the intermediate buffer between the two splice() calls */
+ int cpu; /* CPU the collector thread pins itself to */
+};
+
+/*
+ * One VM entry/exit event extracted from the trace by read_events().
+ */
+struct event
+{
+ uint64_t timestamp; /* event time; presumably nanoseconds (parse_trace_file divides by 1000 to print us) */
+ uint16_t type; /* ftrace event ID: either vmentry_id or vmexit_id */
+};
+
+/*
+ * Values of the type_len field of a ring-buffer entry as interpreted by
+ * read_events(). Values below RINGBUF_TYPE_PADDING describe data records;
+ * the remaining three (29, 30, 31) are special entries.
+ */
+enum rb_type
+{
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
+ RINGBUF_TYPE_PADDING,
+ RINGBUF_TYPE_TIME_EXTEND,
+ RINGBUF_TYPE_TIME_STAMP
+};
+
+/*
+ * Payload of a kvm_exit trace record as printed by read_events() in verbose
+ * mode. Declared before "#pragma pack(1)", so it uses natural alignment.
+ * NOTE(review): the layout presumably mirrors the kernel's kvm_exit event
+ * format -- confirm against events/kvm/kvm_exit/format on the target kernel.
+ */
+struct vmexit_data
+{
+ uint32_t exit_reason;
+ uint64_t guest_rip;
+ uint32_t isa;
+ uint64_t info1;
+ uint64_t info2;
+};
+
+/*
+ * Payload of a kvm_entry trace record; read_events() prints vcpu_id in
+ * verbose mode.
+ */
+struct vmentry_data
+{
+ uint32_t vcpu_id;
+};
+
+/*
+ * From here on structures are byte-packed so that they can be overlaid on
+ * the raw ring-buffer pages. The packing is not reset later in this file.
+ */
+#pragma pack(1)
+
+/*
+ * A trace record as overlaid on the ring-buffer data: a fixed set of leading
+ * fields followed by the event-specific payload. read_events() uses "type"
+ * to tell kvm_exit from kvm_entry records.
+ * NOTE(review): the leading fields (including migrate_disable and the
+ * padding) presumably match the common trace entry header of the target
+ * kernel -- confirm against the event "format" files.
+ */
+struct trace_data
+{
+ uint16_t type; /* ftrace event ID */
+ uint8_t flags;
+ uint8_t preempt_count;
+ int32_t pid;
+ uint16_t migrate_disable;
+ uint16_t padding1;
+ uint32_t padding2;
+ union
+ {
+ struct vmexit_data vmexit;
+ struct vmentry_data vmentry;
+ };
+};
+
+/*
+ * One ring-buffer entry: a 32-bit header (5-bit type_len, 27-bit time_delta)
+ * followed by the entry body. read_events() accesses the body either as raw
+ * 32-bit words (array[0], and array[1] when type_len is 0) or as a trace
+ * record.
+ */
+struct event_entry
+{
+ uint32_t type_len: 5, time_delta: 27;
+ union
+ {
+ uint32_t array[1]; /* indexed past [0] by read_events(); acts as a view onto the body */
+ struct trace_data trace;
+ };
+};
+
+/* Number of entry bytes in one page: 4096 minus the two 8-byte header fields. */
+#define DATA_SIZE 4080
+
+/*
+ * One raw ring-buffer page as stored in the trace file. read_events() reads
+ * the file in units of this structure and walks the entries in "data".
+ */
+struct event_page
+{
+ uint64_t timestamp; /* time base of the page; used as the time of its first entry */
+ uint64_t commit; /* only printed in debug builds; not used to bound the entry walk */
+ union
+ {
+ uint8_t data[DATA_SIZE];
+ struct event_entry event;
+ };
+};
+
+/* Size of one on-disk/ring-buffer page; also the chunk size passed to splice(). */
+#define PAGE_SIZE sizeof(struct event_page)
+
+/* Root of the ftrace control files, default output file name and its magic number. */
+#define TRACE_PATH "/sys/kernel/debug/tracing"
+#define TRACE_FILE "trace.bin"
+#define TRACE_SIG 0xcce96d01
+
+/* Trace events (relative to TRACE_PATH/events) that vm-trace enables and parses. */
+#define VM_ENTRY "kvm/kvm_entry"
+#define VM_EXIT "kvm/kvm_exit"
+
+/*
+ * Debug-only printf. Expands to a braced block when DEBUG is defined and to
+ * nothing otherwise, so it must only be used as a stand-alone statement.
+ */
+#ifdef DEBUG
+#define dbg_printf(_f_,...) {printf(_f_,##__VA_ARGS__);}
+#else
+#define dbg_printf(_f_,...)
+#endif
+
+/* Event IDs in use: looked up by setup_tracing() or loaded from the file header. */
+static uint16_t vmentry_id;
+static uint16_t vmexit_id;
+
+/* Set by disable_tracing() to make tracing_thread() leave its loop. */
+static int stop_tracing;
+/* Non-zero (-v) makes read_events() print every event it finds. */
+static int verbose;
+
+
+/*
+ * Read all ring-buffer pages from fd (positioned just past the file header)
+ * and collect the kvm_exit/kvm_entry events they contain.
+ *
+ * fd: open trace file
+ * n:  receives the number of events in the returned array
+ *
+ * Returns a malloc'ed array of *n events that the caller must free(), or
+ * NULL on read error, truncated file, allocation failure, or when the file
+ * contains no events (in which case *n is 0).
+ */
+static struct event* read_events(int fd, size_t *n)
+{
+    int i, j;
+    ssize_t rd;
+    uint32_t len, offset;
+    uint64_t event_time, pre_event_time, pre_page_time = 0;
+    struct event_page page;
+    struct event_entry *e;
+    struct event *events = NULL, *grown;
+
+    *n = 0;
+    for (i = 0; 1; i++)
+    {
+        if ((rd = read(fd, &page, PAGE_SIZE)) == 0)
+        {
+            if (*n == 0)
+                fprintf(stderr, "No events found\nMake sure a VM is running and the CPU to trace is bound to a QEMU vCPU thread\n");
+
+            return events;
+        }
+        else if (rd < 0)
+        {
+            fprintf(stderr, "Failed to read trace file\n");
+            free(events);
+            return NULL;
+        }
+        else if ((size_t)rd < PAGE_SIZE)
+        {
+            fprintf(stderr, "Trace file does not have enough data\n");
+            free(events);
+            return NULL;
+        }
+
+        dbg_printf("Page %d:\n", i);
+        dbg_printf(" timestamp = %ld\n", page.timestamp);
+        dbg_printf(" commit = %ld\n", page.commit);
+
+        if (page.timestamp < pre_page_time)
+            fprintf(stderr, "Warning: page time going backwards\n");
+
+        pre_page_time = page.timestamp;
+
+        offset = 0;
+        pre_event_time = 0;
+        for (j = 0; 1; j++)
+        {
+            e = (struct event_entry *)(page.data+offset);
+
+            /* type_len 0: skip the header word and the following word */
+            if (e->type_len == 0)
+                e = (struct event_entry *)&e->array[1];
+
+            /* The first entry of a page takes the page's own timestamp */
+            if (pre_event_time)
+                event_time = pre_event_time + e->time_delta;
+            else
+                event_time = page.timestamp;
+
+            /*
+             * Data record. RINGBUF_TYPE_DATA_TYPE_LEN_MAX itself is a valid
+             * data length, so the comparison is inclusive; otherwise that
+             * value would fall through every branch and leave len unset.
+             */
+            if (e->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+            {
+                len = (e->type_len+1) * sizeof(uint32_t);
+
+                if (e->trace.type == vmexit_id || e->trace.type == vmentry_id)
+                {
+                    /* Grow through a temporary so the array is not leaked on failure */
+                    grown = realloc(events, (*n+1) * sizeof(struct event));
+                    if (grown == NULL)
+                    {
+                        fprintf(stderr, "Failed to allocate memory\n");
+                        free(events);
+                        return NULL;
+                    }
+                    events = grown;
+
+                    events[*n].timestamp = event_time;
+                    events[*n].type = e->trace.type;
+
+                    if (verbose)
+                    {
+                        if (e->trace.type == vmexit_id)
+                        {
+                            printf(" %ld: VM_EXIT reason: %08x, ", event_time, e->trace.vmexit.exit_reason);
+                            printf("info1: %016lx, info2: %08lx\n", e->trace.vmexit.info1, e->trace.vmexit.info2);
+                        }
+                        else
+                        {
+                            printf(" %ld: VM_ENTRY dt: %d, vcpu: %d\n", event_time, e->time_delta, e->trace.vmentry.vcpu_id);
+                        }
+                    }
+
+                    *n += 1;
+                }
+                else if (e->trace.type == 0)
+                    break;    /* zeroed record: nothing more on this page */
+                else
+                    fprintf(stderr, "UNKNOWN event %d\n", e->trace.type);
+            }
+            else if (e->type_len == RINGBUF_TYPE_TIME_EXTEND)
+            {
+                len = 8;
+                /*
+                 * The extended delta is array[0] placed above the 27-bit
+                 * time_delta field. Widen to 64 bits before shifting so the
+                 * upper bits are not lost.
+                 */
+                event_time = pre_event_time + (((uint64_t)e->array[0] << 27) | e->time_delta);
+                dbg_printf(" entry %d: TIME_EXTEND %ld\n", j, event_time);
+            }
+            else if (e->type_len == RINGBUF_TYPE_PADDING)
+            {
+                /* Padding with a zero delta marks the end of the page */
+                if (e->time_delta == 0)
+                    break;
+
+                len = e->array[0] + sizeof(uint32_t);
+                dbg_printf(" entry %d: PADDING, len %d @ %ld\n", j, len, event_time);
+            }
+            else /* RINGBUF_TYPE_TIME_STAMP: the only remaining 5-bit value */
+            {
+                len = 16;
+                dbg_printf(" entry %d: TIME_STAMP @ %ld\n", j, event_time);
+            }
+
+            pre_event_time = event_time;
+
+            offset += len;
+            if (offset >= DATA_SIZE)
+                break;
+        }
+
+        dbg_printf(" events in page %d = %d\n", i, j);
+    }
+
+    return events;
+}
+
+/*
+ * Parse a vm-trace file and print VM exit-to-entry latency statistics.
+ *
+ * file: path of a file previously produced by vm-trace
+ *
+ * Loads vmentry_id/vmexit_id from the file header, reads all events and
+ * prints the event count, the average and maximum exit-entry latency and the
+ * largest latency accumulated within a 1ms window.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or read, is not a
+ * vm-trace file, or contains no events.
+ */
+static int parse_trace_file(char *file)
+{
+    int fd, ret = -1;
+    ssize_t rd;
+    uint16_t pre_event = 0;
+    struct event *events;
+    size_t num_events, i, j = 0;
+    uint64_t d, exit_time = 0, total = 0, max_lat = 0;
+    uint64_t pre_time = 0, acc_1ms = 0, max_acc = 0;
+    struct file_header header;
+
+    if ((fd = open(file, O_RDONLY|O_LARGEFILE)) < 0)
+    {
+        perror(file);
+        return -1;
+    }
+
+    rd = read(fd, &header, sizeof(struct file_header));
+    if (rd < 0)
+    {
+        perror(file);
+        goto out;
+    }
+
+    /* A short read leaves the header incomplete, so treat it as a bad file */
+    if ((size_t)rd < sizeof(struct file_header) || header.signature != TRACE_SIG)
+    {
+        fprintf(stderr, "File %s is not a vm-trace file\n", file);
+        goto out;
+    }
+
+    vmentry_id = header.vmentry_id;
+    vmexit_id = header.vmexit_id;
+
+    if ((events = read_events(fd, &num_events)) == NULL)
+        goto out;
+
+    printf("Number of VM events = %zu\n", num_events);
+
+    for (i = 0; i < num_events; i++)
+    {
+        if (events[i].type == vmexit_id)
+        {
+            exit_time = events[i].timestamp;
+        }
+        else if (events[i].type == vmentry_id)
+        {
+            /* Only count an entry once a preceding exit has been seen */
+            if (exit_time)
+            {
+                d = events[i].timestamp - exit_time;
+                if (d > max_lat)
+                    max_lat = d;
+
+                total += d;
+                acc_1ms += d;
+                j++;
+            }
+        }
+
+        if (events[i].type == pre_event)
+            fprintf(stderr, "Warning: repeated events\n");
+        pre_event = events[i].type;
+
+        if (pre_time)
+        {
+            /* Close the current window once at least 1ms has elapsed */
+            if (events[i].timestamp - pre_time >= 1000000)
+            {
+                if (acc_1ms > max_acc)
+                    max_acc = acc_1ms;
+
+                acc_1ms = 0;
+                pre_time = events[i].timestamp;
+            }
+        }
+        else
+            pre_time = events[i].timestamp;
+    }
+
+    free(events);
+
+    /* Account for the last window, which may not have reached 1ms */
+    if (acc_1ms > max_acc)
+        max_acc = acc_1ms;
+
+    /* j is 0 when no exit/entry pair was found; avoid dividing by zero */
+    printf("Average VM Exit-Entry latency = %lluus\n",
+           (unsigned long long)(j ? total/j/1000 : 0));
+    printf("Maximum VM Exit-Entry latency = %lluus\n", (unsigned long long)(max_lat/1000));
+    printf("Maximum cumulative latency within 1ms = %lluus\n", (unsigned long long)(max_acc/1000));
+
+    ret = 0;
+
+out:
+    close(fd);
+    return ret;
+}
+
+/*
+ * Look up the numeric ID of a trace event.
+ *
+ * event: event name relative to TRACE_PATH/events, e.g. "kvm/kvm_exit"
+ *
+ * Returns the value read from the event's "id" file (as parsed by atoi), or
+ * -1 if the file cannot be opened or read.
+ */
+static int get_event_id(char *event)
+{
+    char path[PATH_MAX+1];
+    char buf[32];
+    int fd, saved_errno;
+    ssize_t r;
+
+    snprintf(path, sizeof(path), "%s/events/%s/id", TRACE_PATH, event);
+    if ((fd = open(path, O_RDONLY)) < 0)
+    {
+        perror(path);
+        return -1;
+    }
+
+    /* Read into a separate buffer so path is still intact for error messages */
+    r = read(fd, buf, sizeof(buf) - 1);
+    saved_errno = errno;    /* close() may overwrite errno */
+    close(fd);
+
+    if (r < 0)
+    {
+        errno = saved_errno;
+        perror(path);
+        return -1;
+    }
+
+    /* Terminate directly after the bytes that were read */
+    buf[r] = '\0';
+    return atoi(buf);
+}
+
+/*
+ * Enable or disable a single trace event.
+ *
+ * event: event name relative to TRACE_PATH/events, e.g. "kvm/kvm_exit"
+ * en:    non-zero writes "1" to the event's "enable" file, zero writes "0"
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or written.
+ */
+static int enable_event(char *event, int en)
+{
+    char path[PATH_MAX+1];
+    const char *s = en ? "1" : "0";
+    int fd, saved_errno;
+    ssize_t w;
+
+    snprintf(path, sizeof(path), "%s/events/%s/enable", TRACE_PATH, event);
+    if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0)
+    {
+        perror(path);
+        return -1;
+    }
+
+    /* Write only the digit, not the string's terminating NUL */
+    w = write(fd, s, strlen(s));
+    saved_errno = errno;    /* close() may overwrite errno */
+    close(fd);
+
+    if (w < 0)
+    {
+        errno = saved_errno;
+        perror(path);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Switch both VM events (entry first, then exit) on or off.
+ * Stops at the first failure; returns 0 if both succeeded, -1 otherwise.
+ */
+static int enable_events(int en)
+{
+    /* || short-circuits, so VM_EXIT is left untouched if VM_ENTRY fails */
+    if (enable_event(VM_ENTRY, en) < 0 || enable_event(VM_EXIT, en) < 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Helper for setup_tracing(): write len bytes from buf to TRACE_PATH/name,
+ * opening the file write-only with truncation.
+ * Returns 0 on success, -1 (after reporting the error) on failure.
+ */
+static int write_tracing_file(const char *name, const void *buf, size_t len)
+{
+    char path[PATH_MAX+1];
+    int fd, saved_errno;
+    ssize_t w;
+
+    snprintf(path, sizeof(path), "%s/%s", TRACE_PATH, name);
+    if ((fd = open(path, O_WRONLY | O_TRUNC)) < 0)
+    {
+        perror(path);
+        return -1;
+    }
+
+    w = write(fd, buf, len);
+    saved_errno = errno;    /* close() may overwrite errno */
+    close(fd);
+
+    if (w < 0)
+    {
+        errno = saved_errno;
+        perror(path);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Prepare ftrace for capturing VM events on one CPU.
+ *
+ * cpu: physical CPU to trace (0-63)
+ *
+ * Restricts tracing to the given CPU through tracing_cpumask, clears the
+ * existing trace, looks up the kvm_entry/kvm_exit event IDs (stored in
+ * vmentry_id/vmexit_id) and enables both events.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int setup_tracing(int cpu)
+{
+    char mask[20];
+    unsigned int h = 0, l = 0;
+    int id;
+
+    /* The mask is built from two 32-bit words, so only CPUs 0-63 fit */
+    if (cpu < 0 || cpu > 63)
+    {
+        fprintf(stderr, "CPU %d is out of range (0-63)\n", cpu);
+        return -1;
+    }
+
+    /* Unsigned shifts: 1 << 31 on a signed int would be undefined */
+    if (cpu > 31)
+        h = 1U << (cpu-32);
+    else
+        l = 1U << cpu;
+
+    snprintf(mask, sizeof(mask), "%X,%X", h, l);
+
+    if (write_tracing_file("tracing_cpumask", mask, strlen(mask)) < 0)
+        return -1;
+
+    /* Opening "trace" with O_TRUNC and writing to it discards old trace data */
+    if (write_tracing_file("trace", "", 1) < 0)
+        return -1;
+
+    /*
+     * Check the lookup result in an int first: the globals are uint16_t,
+     * so a -1 stored in them could never compare less than zero.
+     */
+    if ((id = get_event_id(VM_ENTRY)) < 0)
+        return -1;
+    vmentry_id = id;
+
+    if ((id = get_event_id(VM_EXIT)) < 0)
+        return -1;
+    vmexit_id = id;
+
+    if (enable_events(1) < 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Stop a capture: write "0" to the tracing_on descriptor and close it,
+ * disable the VM events, then tell the collector thread to stop and wait
+ * for it to finish.
+ *
+ * fd:     open descriptor of TRACE_PATH/tracing_on (closed here)
+ * thread: collector thread started by main()
+ */
+static void disable_tracing(int fd, pthread_t thread)
+{
+    ssize_t written = write(fd, "0", 2);
+
+    if (written < 0)
+        perror("disable_tracing");
+    close(fd);
+
+    /* Best effort: a failure here has already been reported by enable_event() */
+    (void)enable_events(0);
+
+    stop_tracing = 1;
+    pthread_join(thread, NULL);
+}
+
+/*
+ * Collector thread. Pins itself to the requested CPU, then repeatedly moves
+ * trace data from the raw trace pipe into the output file with two splice()
+ * calls through an intermediate pipe, until stop_tracing is set or a splice
+ * fails. All descriptors in the parameter block are closed before returning.
+ *
+ * param: pointer to a struct thread_param
+ * Always returns NULL.
+ */
+static void *tracing_thread(void *param)
+{
+    struct thread_param *tp = param;
+    cpu_set_t cpus;
+    ssize_t moved;
+
+    CPU_ZERO(&cpus);
+    CPU_SET(tp->cpu, &cpus);
+    if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpus) != 0)
+        fprintf(stderr, "Could not set CPU affinity to CPU #%d\n", tp->cpu);
+
+    for (;;)
+    {
+        if (stop_tracing)
+            break;
+
+        /* Stage 1: trace pipe -> intermediate pipe */
+        moved = splice(tp->raw_fd, NULL, tp->pipefd[1], NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK);
+        if (moved < 0)
+        {
+            if (errno != EAGAIN)
+            {
+                perror("splice1");
+                break;
+            }
+
+            /* Nothing available yet: poll again */
+            continue;
+        }
+
+        if (moved == 0)
+            continue;
+
+        /* Stage 2: intermediate pipe -> output file */
+        if (splice(tp->pipefd[0], NULL, tp->out_fd, NULL, PAGE_SIZE, SPLICE_F_MOVE|SPLICE_F_NONBLOCK) < 0)
+        {
+            perror("splice2");
+            break;
+        }
+    }
+
+    close(tp->raw_fd);
+    close(tp->pipefd[1]);
+    close(tp->pipefd[0]);
+    close(tp->out_fd);
+
+    return NULL;
+}
+
+/*
+ * Print the two supported invocation forms to stderr and terminate the
+ * program with exit status -1. Does not return.
+ */
+static void usage(char *argv[])
+{
+    const char *prog = argv[0];
+
+    fprintf(stderr, "Usage: %s -p cpu_to_trace -c cpu_to_collect_trace -s duration_in_seconds\n", prog);
+    fprintf(stderr, " %s -f trace_file [-v]\n", prog);
+    exit(-1);
+}
+
+/*
+ * Entry point.
+ *
+ * With -f trace_file [-v]: parse an existing trace file and print statistics.
+ * With -p cpu -c cpu -s seconds: set up tracing on the -p CPU, start a
+ * collector thread pinned to the -c CPU, trace for the given duration, then
+ * parse the resulting TRACE_FILE.
+ *
+ * Returns the result of parse_trace_file(), or -1 on any setup failure.
+ */
+int main(int argc, char *argv[])
+{
+    char path[PATH_MAX+1], *file = NULL;
+    int cpu = -1, ttime = 0;
+    int opt, fd, rc;
+    ssize_t w;
+    pthread_t thread;
+    struct file_header header;
+    struct thread_param param;
+    struct timespec interval;
+
+    param.cpu = -1;
+
+    while ((opt = getopt(argc, argv, "p:c:s:f:v")) != -1)
+    {
+        switch (opt)
+        {
+        case 'p':
+            cpu = atoi(optarg);
+            break;
+        case 'c':
+            param.cpu = atoi(optarg);
+            break;
+        case 's':
+            ttime = atoi(optarg);
+            break;
+        case 'f':
+            file = optarg;
+            break;
+        case 'v':
+            verbose = 1;
+            break;
+        default:
+            usage(argv);
+        }
+    }
+
+    /* Capture mode needs all of -p, -c and -s unless a file is given */
+    if ((cpu < 0 || param.cpu < 0 || ttime <= 0) && file == NULL)
+        usage(argv);
+
+    if (file != NULL)
+        return parse_trace_file(file);
+
+    /* -v only applies to -f mode */
+    verbose = 0;
+
+    if (setup_tracing(cpu) < 0)
+        return -1;
+
+    if ((param.out_fd = open(TRACE_FILE, O_WRONLY|O_CREAT|O_TRUNC|O_LARGEFILE, 0644)) < 0)
+    {
+        perror(TRACE_FILE);
+        return -1;
+    }
+
+    header.signature = TRACE_SIG;
+    header.vmentry_id = vmentry_id;
+    header.vmexit_id = vmexit_id;
+
+    w = write(param.out_fd, &header, sizeof(struct file_header));
+    if (w < 0)
+    {
+        perror(TRACE_FILE);
+        return -1;
+    }
+    else if ((size_t)w != sizeof(struct file_header))
+    {
+        /* A partial header would make the file unreadable later */
+        fprintf(stderr, "%s: short write of file header\n", TRACE_FILE);
+        return -1;
+    }
+
+    snprintf(path, sizeof(path), "%s/per_cpu/cpu%d/trace_pipe_raw", TRACE_PATH, cpu);
+    if ((param.raw_fd = open(path, O_RDONLY)) < 0)
+    {
+        perror(path);
+        return -1;
+    }
+
+    if (pipe(param.pipefd) < 0)
+    {
+        perror("pipe");
+        return -1;
+    }
+
+    snprintf(path, sizeof(path), "%s/tracing_on", TRACE_PATH);
+    if ((fd = open(path, O_WRONLY)) < 0)
+    {
+        perror(path);
+        return -1;
+    }
+
+    /* pthread_create() returns the error number instead of setting errno */
+    if ((rc = pthread_create(&thread, NULL, tracing_thread, &param)) != 0)
+    {
+        fprintf(stderr, "pthread_create: %s\n", strerror(rc));
+        return -1;
+    }
+
+    if (write(fd, "1", 2) < 0)
+    {
+        perror(path);
+        disable_tracing(fd, thread);
+        return -1;
+    }
+
+    printf("CPU to trace: %d\n", cpu);
+    printf("CPU to collect trace: %d\n", param.cpu);
+    printf("Duration: %d seconds\n", ttime);
+
+    interval.tv_sec = ttime;
+    interval.tv_nsec = 0;
+    /* clock_nanosleep() also returns the error number directly */
+    if ((rc = clock_nanosleep(CLOCK_MONOTONIC, 0, &interval, NULL)) != 0)
+    {
+        fprintf(stderr, "clock_nanosleep: %s\n", strerror(rc));
+        disable_tracing(fd, thread);
+        return -1;
+    }
+
+    disable_tracing(fd, thread);
+
+    printf("Processing event file ...\n");
+
+    return parse_trace_file(TRACE_FILE);
+}