diff options
-rwxr-xr-x | ci/build_qemu_rpm_deb/mkcontrol.sh | 8 | ||||
-rwxr-xr-x | ci/build_qemu_rpm_deb/mkspec | 43 | ||||
-rwxr-xr-x | ci/build_qemu_rpm_deb/mkversion | 10 | ||||
-rw-r--r-- | ci/build_qemu_rpm_deb/qemu_build.sh | 33 | ||||
-rwxr-xr-x | ci/build_qemu_rpm_deb/qemu_deb_build.sh | 30 | ||||
-rwxr-xr-x | ci/build_qemu_rpm_deb/qemu_rpm_build.sh | 32 | ||||
-rw-r--r-- | ci/envs/cyclictest.sh | 23 | ||||
-rw-r--r-- | ci/test_kvmfornfv.sh | 44 | ||||
-rwxr-xr-x | fuel-plugin/vagrant/build_fuel_plugin.sh | 12 | ||||
-rw-r--r-- | kernel/arch/x86/configs/opnfv.config | 7 | ||||
-rw-r--r-- | kernel/arch/x86/include/asm/kvm_host.h | 5 | ||||
-rw-r--r-- | kernel/arch/x86/kvm/lapic.c | 130 | ||||
-rw-r--r-- | kernel/arch/x86/kvm/lapic.h | 5 | ||||
-rw-r--r-- | kernel/arch/x86/kvm/trace.h | 15 | ||||
-rw-r--r-- | kernel/arch/x86/kvm/vmx.c | 214 | ||||
-rw-r--r-- | kernel/arch/x86/kvm/x86.c | 8 | ||||
-rw-r--r-- | kernel/kernel/locking/rtmutex.c | 7 | ||||
-rw-r--r-- | kernel/kernel/time/timer.c | 40 | ||||
-rw-r--r-- | kernel/scripts/package/Makefile | 4 | ||||
-rw-r--r-- | tests/pod.yaml | 18 |
20 files changed, 632 insertions, 56 deletions
diff --git a/ci/build_qemu_rpm_deb/mkcontrol.sh b/ci/build_qemu_rpm_deb/mkcontrol.sh new file mode 100755 index 000000000..7eb504a0e --- /dev/null +++ b/ci/build_qemu_rpm_deb/mkcontrol.sh @@ -0,0 +1,8 @@ +#!/bin/bash +echo "Package: qemu" +echo "Version: $1" +echo "Section: base" +echo "Priority: optional" +echo "Architecture: all" +echo "Maintainer: Intel" +echo "Description: control file for qemu debian build on centos" diff --git a/ci/build_qemu_rpm_deb/mkspec b/ci/build_qemu_rpm_deb/mkspec new file mode 100755 index 000000000..0b75a181b --- /dev/null +++ b/ci/build_qemu_rpm_deb/mkspec @@ -0,0 +1,43 @@ +#!/bin/sh +# +# Output a simple RPM spec file. +# +# starting to output the spec + +QEMURELEASE=$1 + +__QEMURELEASE=`echo $QEMURELEASE | sed -e "s/-/_/g"` + +echo $srctree +echo "Name: qemu" +echo "Summary: The Linux qemu" +echo "Version: $__QEMURELEASE" +# we need to determine the NEXT version number +# rpm -q will agree +echo "Release: `sudo sh mkversion`" +echo "License: GPLv2+ and LGPLv2+ and BSD" +echo "Group: Development/Tools" +echo "Vendor: The Linux Community" +echo "URL: http://www.qemu.org" +echo "Source: qemu-$__QEMURELEASE.tar.gz" +echo "%description" +echo "%prep" +echo " " +echo "%setup -q" +echo " " +echo "%build" +echo "%_configure" +echo "make" +echo "%install" +echo "rm -rf %{buildroot}" +echo "make install DESTDIR=%{buildroot}" +echo "%clean" +echo "rm -rf %{buildroot}" +echo "%files" +echo "%dir" +echo "/usr/local/share/qemu" +echo "%doc" +echo "/usr/local/bin/ivshmem*" +echo "/usr/local/bin/qemu*" +echo "/usr/local/libexec/qemu-bridge-helper" +echo "%changelog" diff --git a/ci/build_qemu_rpm_deb/mkversion b/ci/build_qemu_rpm_deb/mkversion new file mode 100755 index 000000000..fa4e585b9 --- /dev/null +++ b/ci/build_qemu_rpm_deb/mkversion @@ -0,0 +1,10 @@ +if [ ! -f .version ] +then + touch .version + sudo chmod 777 .version + echo 1 > .version + echo 1 +else + expr 0`cat .version` + 1 + expr 0`cat .version` + 1 > .version +fi diff --git a/ci/build_qemu_rpm_deb/qemu_build.sh b/ci/build_qemu_rpm_deb/qemu_build.sh new file mode 100644 index 000000000..a8863c3ca --- /dev/null +++ b/ci/build_qemu_rpm_deb/qemu_build.sh @@ -0,0 +1,33 @@ +#!/bin/bash +qemu_src_dir=qemu +workspace=/root +debbuild_dir=$workspace/debbuild +rpmbuild_dir=$workspace/rpmbuild +artifact_rpms=$rpmbuild_dir/RPMS +artifact_dir=$artifact_rpms/x86_64 +scripts_dir=ci/build_qemu_rpm_deb +output_dir="$1" +VERSION=`grep -m 1 "VERSION" ${qemu_src_dir}/config-host.mak | cut -d= -f2-` + +usage () { + echo "usage: ${0} output_dir" + exit 1 +} + +if [[ -z "$@" ]]; then + usage +fi + +if [ ! -d ${output_dir} -o ! -w ${output_dir} ] ; then + echo "${0}: Output directory '${output_dir}' does not exist or cannot be written" + exit 1 +fi + +if [ ! -d ${qemu_src_dir} ] ; then + echo "${0}: Directory '${qemu_src_dir}' does not exist, run this script from the root of kvmfornfv source tree" + exit 1 +fi + +echo +echo "Build" +echo diff --git a/ci/build_qemu_rpm_deb/qemu_deb_build.sh b/ci/build_qemu_rpm_deb/qemu_deb_build.sh new file mode 100755 index 000000000..7a830183d --- /dev/null +++ b/ci/build_qemu_rpm_deb/qemu_deb_build.sh @@ -0,0 +1,30 @@ +#!/bin/bash +#Build process for generating qemu debain file. + +source ci/build_qemu_rpm_deb/qemu_build.sh +qemu_deb_build() { + sudo mkdir -p $debbuild_dir/qemu-$VERSION + sudo cp -r $qemu_src_dir $debbuild_dir/qemu-$VERSION + sudo mkdir -p $debbuild_dir/qemu-$VERSION/DEBIAN + sudo touch control + +#creating control file for debian build. + (cd ${scripts_dir}; sudo ./mkcontrol.sh $VERSION > control) + sudo mv $scripts_dir/control $debbuild_dir/qemu-$VERSION/DEBIAN/control + +#building the qemu debian with control file developed. + sudo dpkg-deb --build $debbuild_dir/qemu-$VERSION + if [ ${?} -ne 0 ] ; then + echo "${0}: qemu build failed" + exit 1 + fi +} + +if [ ! -d ${debbuild_dir} ] ; then + echo "creating debbuild directory" + sudo mkdir -p $debbuild_dir +fi + +qemu_deb_build +latest_qemu_build=`sudo ls -rt $debbuild_dir | tail -1` +sudo cp $debbuild_dir/$latest_qemu_build build_output diff --git a/ci/build_qemu_rpm_deb/qemu_rpm_build.sh b/ci/build_qemu_rpm_deb/qemu_rpm_build.sh new file mode 100755 index 000000000..a52ee0f4a --- /dev/null +++ b/ci/build_qemu_rpm_deb/qemu_rpm_build.sh @@ -0,0 +1,32 @@ +#!/bin/bash +#Build process for Generating qemu rpm. + +source ci/build_qemu_rpm_deb/qemu_build.sh +qemu_rpm_build() { + sudo cp -r ${qemu_src_dir} ${qemu_src_dir}-$VERSION + sudo tar -zcvf ${qemu_src_dir}-$VERSION.tar.gz ${qemu_src_dir}-$VERSION + sudo mv ${qemu_src_dir}-$VERSION.tar.gz ${rpmbuild_dir}/SOURCES/ + +#create a spec file for rpm creation. + (cd ${scripts_dir}; ./mkspec $VERSION > qemu.spec) + sudo cp ${scripts_dir}/qemu.spec ${rpmbuild_dir}/SPECS/ + +#build the qemu rpm with spec file developed + sudo rpmbuild -ba ${rpmbuild_dir}/SPECS/qemu.spec + if [ ${?} -ne 0 ] ; then + echo "${0}: qemu build failed" + exit 1 + fi + sudo rm -rf ${qemu_src_dir}-$VERSION + sudo rm -rf ${rpmbuild_dir}/SOURCES/${qemu_src_dir}-$VERSION.tar.gz +} + +if [ ! -d ${rpmbuild_dir} ] ; then + sudo yum install rpm-build -y + mkdir -p ~/rpmbuild/{BUILD,RPMS,SOURCES,SPECS,SRPMS} + sudo mv rpmbuild $workspace +fi + +qemu_rpm_build +latest_qemu_build=`ls -rt $artifact_dir | grep qemu-2.6* | tail -1` +sudo cp $artifact_dir/$latest_qemu_build build_output diff --git a/ci/envs/cyclictest.sh b/ci/envs/cyclictest.sh new file mode 100644 index 000000000..e4dd9931f --- /dev/null +++ b/ci/envs/cyclictest.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +########################################################### +## Invoking this script from ubuntu docker container runs +## cyclictest through yardstick +########################################################### + +pod_config='/opt/pod.yaml' +cyclictest_context_file='/opt/cyclictest-node-context.yaml' + +if [ ! -f ${pod_config} ] ; then + echo "file ${pod_config} not found" + exit 1 +fi + +if [ ! -f ${cyclictest_context_file} ] ; then + echo "file ${cyclictest_context_file} not found" + exit 1 +fi + +#Running cyclictest through yardstick +yardstick task start ${cyclictest_context_file} +mv /tmp/yardstick.out /opt/ diff --git a/ci/test_kvmfornfv.sh b/ci/test_kvmfornfv.sh new file mode 100644 index 000000000..a33814907 --- /dev/null +++ b/ci/test_kvmfornfv.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +############################################################# +## This script will launch ubuntu docker container +## runs cyclictest through yardstick +## and verifies the test results. +############################################################ + +volume=/tmp/kvmtest-* +if [ -d ${volume} ] ; then + echo "Directory '${volume}' exists" + sudo rm -rf ${volume} +fi + +time_stamp=$(date +%Y%m%d%H%M%S) +mkdir -p /tmp/kvmtest-${time_stamp}/{image,rpm,scripts} + +#copying required files to run yardstick cyclic testcase +mv ${WORKSPACE}/build_output/* $volume/rpm +cp ${WORKSPACE}/ci/envs/* $volume/scripts +cp ${WORKSPACE}/tests/cyclictest-node-context.yaml $volume +cp ${WORKSPACE}/tests/pod.yaml $volume + +#Launching ubuntu docker container to run yardstick +sudo docker run -t -i -v $volume:/opt --net=host --name kvmfornfv \ +kvmfornfv:latest /bin/bash /opt/scripts/cyclictest.sh +container_id=`sudo docker ps -a | grep kvmfornfv |awk '{print $1}'` +sudo docker rm $container_id + +#Verifying the results of cyclictest +result=`grep -o '"errors":[^,]*' $volume/yardstick.out | awk -F '"' \ +'{print $4}'| awk '{if (NF=0) print "SUCCESS" }'` +if [ "$result" = "SUCCESS" ]; then + echo "####################################################" + echo "" + echo `grep -o '"data":[^}]*' $volume/yardstick.out | awk -F '{' '{print $2}'` + echo "" + echo "####################################################" + exit 0 +else + echo "Testcase failed" + echo `grep -o '"errors":[^,]*' $volume/yardstick.out | awk -F '"' '{print $4}'` + exit 1 +fi diff --git a/fuel-plugin/vagrant/build_fuel_plugin.sh b/fuel-plugin/vagrant/build_fuel_plugin.sh index b18ae173b..e315b111f 100755 --- a/fuel-plugin/vagrant/build_fuel_plugin.sh +++ b/fuel-plugin/vagrant/build_fuel_plugin.sh @@ -1,13 +1,8 @@ #!/bin/bash sudo apt-get update -y -sudo apt-get install createrepo rpm dpkg-dev -y -sudo apt-get install python-setuptools -y -sudo apt-get install python-pip -y -sudo easy_install pip -sudo pip install fuel-plugin-builder -sudo apt-get install ruby -y -sudo gem install rubygems-update +sudo apt-get install -y ruby-dev rubygems-integration python-pip rpm createrepo dpkg-dev sudo gem install fpm +sudo pip install fuel-plugin-builder sudo apt-get install docker.io -y cd /home/vagrant # Will build fuel-plugin-kvm in guest VM local directory, not change host @@ -15,5 +10,4 @@ cp -r /kvmfornfv . cd kvmfornfv/fuel-plugin fpb --debug --build . # Copy the built fuel-plugin-kvm back to the host -rm /kvmfornfv/fuel-plugin/fuel-plugin-kvm*.rpm -cp fuel-plugin-kvm*.rpm /kvmfornfv/fuel-plugin/. +cp *.rpm /vagrant diff --git a/kernel/arch/x86/configs/opnfv.config b/kernel/arch/x86/configs/opnfv.config index 2d6d1cc4e..b623b0cd4 100644 --- a/kernel/arch/x86/configs/opnfv.config +++ b/kernel/arch/x86/configs/opnfv.config @@ -885,7 +885,7 @@ CONFIG_NETFILTER_XTABLES=m # Xtables combined modules # CONFIG_NETFILTER_XT_MARK=m -# CONFIG_NETFILTER_XT_CONNMARK is not set +CONFIG_NETFILTER_XT_CONNMARK=m CONFIG_NETFILTER_XT_SET=m # @@ -894,7 +894,7 @@ CONFIG_NETFILTER_XT_SET=m CONFIG_NETFILTER_XT_TARGET_AUDIT=m CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m # CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set -# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m CONFIG_NETFILTER_XT_TARGET_CT=m # CONFIG_NETFILTER_XT_TARGET_DSCP is not set @@ -929,7 +929,7 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m # CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set # CONFIG_NETFILTER_XT_MATCH_CONNLABEL is not set # CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set -# CONFIG_NETFILTER_XT_MATCH_CONNMARK is not set +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m # CONFIG_NETFILTER_XT_MATCH_CPU is not set # CONFIG_NETFILTER_XT_MATCH_DCCP is not set @@ -1227,6 +1227,7 @@ CONFIG_NET_CLS_ACT=y # CONFIG_NET_ACT_CSUM is not set # CONFIG_NET_ACT_VLAN is not set # CONFIG_NET_ACT_BPF is not set +# CONFIG_NET_ACT_CONNMARK is not set CONFIG_NET_SCH_FIFO=y # CONFIG_DCB is not set CONFIG_DNS_RESOLVER=y diff --git a/kernel/arch/x86/include/asm/kvm_host.h b/kernel/arch/x86/include/asm/kvm_host.h index 30cfd6429..fe68e836e 100644 --- a/kernel/arch/x86/include/asm/kvm_host.h +++ b/kernel/arch/x86/include/asm/kvm_host.h @@ -911,6 +911,9 @@ struct kvm_x86_ops { void (*post_block)(struct kvm_vcpu *vcpu); int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + + int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); + void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -980,6 +983,8 @@ extern u32 kvm_max_guest_tsc_khz; extern u8 kvm_tsc_scaling_ratio_frac_bits; /* maximum allowed value of TSC scaling ratio */ extern u64 kvm_max_tsc_scaling_ratio; +/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ +extern u64 kvm_default_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c index 20d9e9fb3..3d1b17068 100644 --- a/kernel/arch/x86/kvm/lapic.c +++ b/kernel/arch/x86/kvm/lapic.c @@ -1258,6 +1258,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) __delay(tsc_deadline - guest_tsc); } +static void start_sw_tscdeadline(struct kvm_lapic *apic) +{ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + ktime_t expire; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + unsigned long flags; + ktime_t now; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); + hrtimer_start(&apic->lapic_timer.timer, + expire, HRTIMER_MODE_ABS_PINNED); + } else + apic_timer_expired(apic); + + local_irq_restore(flags); +} + +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic->lapic_timer.hv_timer_in_use; +} +EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); + +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_tscdeadline(apic); + apic_timer_expired(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(apic->lapic_timer.hv_timer_in_use); + + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); + +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + /* Possibly the TSC deadline timer is not enabled yet */ + if (!apic->lapic_timer.hv_timer_in_use) + return; + + cancel_hv_tscdeadline(apic); + + if (atomic_read(&apic->lapic_timer.pending)) + return; + + start_sw_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1304,32 +1406,8 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - ktime_t expire; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS); - } else - apic_timer_expired(apic); - - local_irq_restore(flags); + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + start_sw_tscdeadline(apic); } } diff --git a/kernel/arch/x86/kvm/lapic.h b/kernel/arch/x86/kvm/lapic.h index fde8e35d5..640ad275b 100644 --- a/kernel/arch/x86/kvm/lapic.h +++ b/kernel/arch/x86/kvm/lapic.h @@ -16,6 +16,7 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ + bool hv_timer_in_use; }; struct kvm_lapic { @@ -170,4 +171,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); #endif diff --git a/kernel/arch/x86/kvm/trace.h b/kernel/arch/x86/kvm/trace.h index ab9ae67a8..b41f7a013 100644 --- a/kernel/arch/x86/kvm/trace.h +++ b/kernel/arch/x86/kvm/trace.h @@ -1025,6 +1025,21 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->pi_desc_addr) ); +TRACE_EVENT(kvm_hv_timer_state, + TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), + TP_ARGS(vcpu_id, hv_timer_in_use), + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(unsigned int, hv_timer_in_use) + ), + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->hv_timer_in_use = hv_timer_in_use; + ), + TP_printk("vcpu_id %x hv_timer %x\n", + __entry->vcpu_id, + __entry->hv_timer_in_use) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c index 0958fa2b7..a722f724c 100644 --- a/kernel/arch/x86/kvm/vmx.c +++ b/kernel/arch/x86/kvm/vmx.c @@ -109,6 +109,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -596,6 +603,9 @@ struct vcpu_vmx { #define PML_ENTITY_NUM 512 struct page *pml_pg; + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + u64 current_tsc_ratio; }; @@ -1043,6 +1053,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline bool cpu_has_vmx_posted_intr(void) { return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && @@ -3209,11 +3271,15 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) @@ -4683,6 +4749,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + /* Enable the preemption timer dynamically */ + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -4781,6 +4849,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmx->hv_deadline_tsc = -1; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -6292,6 +6361,17 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); return alloc_kvm_area(); @@ -7460,6 +7540,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu) return 1; } +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + kvm_lapic_expired_hv_timer(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7511,6 +7597,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_PCOMMIT] = handle_pcommit, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; static const int kvm_vmx_max_exit_handlers = @@ -7814,6 +7901,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PCOMMIT: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } @@ -8510,6 +8599,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } +void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + + if (vmx->hv_deadline_tsc == -1) + return; + + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* sure to be 32 bit only because checked on set_hv_timer */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8556,6 +8665,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); + vmx_arm_hv_timer(vcpu); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -9520,9 +9631,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; + + /* Preemption timer setting is only taken from vmcs01. */ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -10451,8 +10567,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; @@ -10532,6 +10654,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high >= divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + u64_shl_div_u64(delta_tsc, + kvm_tsc_scaling_ratio_frac_bits, + vcpu->arch.tsc_scaling_ratio, + &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it fits on later vmentries, but checking + * on every vmentry is costly so we just use an hrtimer. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vmx->hv_deadline_tsc = tscl + delta_tsc; + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + return 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->hv_deadline_tsc = -1; + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); +} +#endif + static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (ple_gap) @@ -10576,7 +10756,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, * this case, return 1, otherwise, return 0. * */ -static int vmx_pre_block(struct kvm_vcpu *vcpu) +static int pi_pre_block(struct kvm_vcpu *vcpu) { unsigned long flags; unsigned int dest; @@ -10642,7 +10822,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void vmx_post_block(struct kvm_vcpu *vcpu) +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + if (pi_pre_block(vcpu)) + return 1; + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_switch_to_sw_timer(vcpu); + + return 0; +} + +static void pi_post_block(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; @@ -10683,6 +10874,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) } } +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->set_hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + + pi_post_block(vcpu); +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10878,6 +11077,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif }; static int __init vmx_init(void) diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c index 27419ba5a..c7695cea4 100644 --- a/kernel/arch/x86/kvm/x86.c +++ b/kernel/arch/x86/kvm/x86.c @@ -113,7 +113,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -static u64 __read_mostly kvm_default_tsc_scaling_ratio; +u64 __read_mostly kvm_default_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; @@ -2718,6 +2719,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); + + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); diff --git a/kernel/kernel/locking/rtmutex.c b/kernel/kernel/locking/rtmutex.c index 66971005c..30777e813 100644 --- a/kernel/kernel/locking/rtmutex.c +++ b/kernel/kernel/locking/rtmutex.c @@ -2058,13 +2058,6 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { -#ifdef CONFIG_PREEMPT_RT_FULL - if (WARN_ON(in_irq() || in_nmi())) -#else - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) -#endif - return 0; - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); } EXPORT_SYMBOL_GPL(rt_mutex_trylock); diff --git a/kernel/kernel/time/timer.c b/kernel/kernel/time/timer.c index fee8682c2..016a0bf52 100644 --- a/kernel/kernel/time/timer.c +++ b/kernel/kernel/time/timer.c @@ -1453,9 +1453,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) * the base lock to check when the next timer is pending and so * we assume the next jiffy. */ - return basem + TICK_NSEC; -#endif + if (!spin_do_trylock(&base->lock)) + return basem + TICK_NSEC; +#else spin_lock(&base->lock); +#endif if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1465,7 +1467,11 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) else expires = basem + (nextevt - basej) * TICK_NSEC; } +#ifdef CONFIG_PREEMPT_RT_FULL + rt_spin_unlock(&base->lock); +#else spin_unlock(&base->lock); +#endif return cmp_next_hrtimer_event(basem, expires); } @@ -1509,8 +1515,36 @@ static void run_timer_softirq(struct softirq_action *h) */ void run_local_timers(void) { + struct tvec_base *base = this_cpu_ptr(&tvec_bases); + hrtimer_run_queues(); - raise_softirq(TIMER_SOFTIRQ); + /* + * We can access this lockless as we are in the timer + * interrupt. If there are no timers queued, nothing to do in + * the timer softirq. + */ +#ifdef CONFIG_PREEMPT_RT_FULL + if (irq_work_needs_cpu()) { + raise_softirq(TIMER_SOFTIRQ); + return; + } + if (!spin_do_trylock(&base->lock)) { + raise_softirq(TIMER_SOFTIRQ); + return; + } +#endif + if (!base->active_timers) + goto out; + + /* Check whether the next pending timer has expired */ + if (time_before_eq(base->next_timer, jiffies)) + raise_softirq(TIMER_SOFTIRQ); +out: +#ifdef CONFIG_PREEMPT_RT_FULL + rt_spin_unlock(&base->lock); +#endif + /* The ; ensures that gcc won't complain in the !RT case */ + ; } #ifdef __ARCH_WANT_SYS_ALARM diff --git a/kernel/scripts/package/Makefile b/kernel/scripts/package/Makefile index 1aca224e8..493e22635 100644 --- a/kernel/scripts/package/Makefile +++ b/kernel/scripts/package/Makefile @@ -52,7 +52,7 @@ rpm-pkg rpm: FORCE $(call cmd,src_tar,$(KERNELPATH),kernel.spec) $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version mv -f $(objtree)/.tmp_version $(objtree)/.version - rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz + rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz rm $(KERNELPATH).tar.gz kernel.spec # binrpm-pkg @@ -63,7 +63,7 @@ binrpm-pkg: FORCE $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version mv -f $(objtree)/.tmp_version $(objtree)/.version - rpmbuild --define "_builddir $(objtree)" --target \ + rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \ $(UTS_MACHINE) -bb $(objtree)/binkernel.spec rm binkernel.spec diff --git a/tests/pod.yaml b/tests/pod.yaml new file mode 100644 index 000000000..fa66ebf7f --- /dev/null +++ b/tests/pod.yaml @@ -0,0 +1,18 @@ +--- +# sample config file about the POD information, including the +# name/IP/user/ssh key +# +# The options of this config file include: +# name: the name of this node +# role: node's role, support role: Master/Controller/Comupte/BareMetal +# ip: the node's IP address +# user: the username for login +# key_filename:the path of the private key file for login + +nodes: +- + name: kvm + role: Controller + ip: 10.2.117.23 + user: root + key_filename: /root/.ssh/id_rsa |