These changes are the raw update to linux-4.4.6-rt14. Kernel sources

are taken from kernel.org, and rt patch from the rt wiki download page. During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
author: José Pekkarinen <jose.pekkarinen@nokia.com> 2016-04-11 10:41:07 +0300
committer: José Pekkarinen <jose.pekkarinen@nokia.com> 2016-04-13 08:17:18 +0300
commit: e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree: d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/drivers/hv
parent: f93b97fd65072de626c074dbe099a1fff05ce060 (diff)
14 files changed, 1316 insertions, 519 deletions
diff --git a/kernel/drivers/hv/Makefile b/kernel/drivers/hv/Makefile
index 5e4dfa4cf..39c9b2c08 100644
--- a/kernel/drivers/hv/Makefile
+++ b/kernel/drivers/hv/Makefile
@@ -5,4 +5,4 @@ obj-$(CONFIG_HYPERV_BALLOON)	+= hv_balloon.o
 hv_vmbus-y := vmbus_drv.o \
 		 hv.o connection.o channel.o \
 		 channel_mgmt.o ring_buffer.o
-hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o
+hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o
diff --git a/kernel/drivers/hv/channel.c b/kernel/drivers/hv/channel.c
index 54da66dc7..9098f13f2 100644
--- a/kernel/drivers/hv/channel.c
+++ b/kernel/drivers/hv/channel.c
@@ -73,6 +73,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
 	unsigned long flags;
 	int ret, err = 0;
 	unsigned long t;
+	struct page *page;
 
 	spin_lock_irqsave(&newchannel->lock, flags);
 	if (newchannel->state == CHANNEL_OPEN_STATE) {
@@ -87,8 +88,17 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
 	newchannel->channel_callback_context = context;
 
 	/* Allocate the ring buffer */
-	out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-		get_order(send_ringbuffer_size + recv_ringbuffer_size));
+	page = alloc_pages_node(cpu_to_node(newchannel->target_cpu),
+				GFP_KERNEL|__GFP_ZERO,
+				get_order(send_ringbuffer_size +
+				recv_ringbuffer_size));
+
+	if (!page)
+		out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
+					       get_order(send_ringbuffer_size +
+					       recv_ringbuffer_size));
+	else
+		out = (void *)page_address(page);
 
 	if (!out) {
 		err = -ENOMEM;
@@ -178,19 +188,18 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size,
 		goto error1;
 	}
 
-
-	if (open_info->response.open_result.status)
-		err = open_info->response.open_result.status;
-
 	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
 	list_del(&open_info->msglistentry);
 	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
 
-	if (err == 0)
-		newchannel->state = CHANNEL_OPENED_STATE;
+	if (open_info->response.open_result.status) {
+		err = -EAGAIN;
+		goto error_gpadl;
+	}
 
+	newchannel->state = CHANNEL_OPENED_STATE;
 	kfree(open_info);
-	return err;
+	return 0;
 
 error1:
 	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
@@ -592,6 +601,7 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
 	u64 aligned_data = 0;
 	int ret;
 	bool signal = false;
+	int num_vecs = ((bufferlen != 0) ? 3 : 1);
 
 
 	/* Setup the descriptor */
@@ -609,7 +619,8 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
 	bufferlist[2].iov_base = &aligned_data;
 	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
 
-	ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal);
+	ret = hv_ringbuffer_write(&channel->outbound, bufferlist, num_vecs,
+				  &signal);
 
 	/*
 	 * Signalling the host is conditional on many factors:
@@ -619,10 +630,19 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
 	 *    on the ring. We will not signal if more data is
 	 *    to be placed.
 	 *
+	 * Based on the channel signal state, we will decide
+	 * which signaling policy will be applied.
+	 *
 	 * If we cannot write to the ring-buffer; signal the host
 	 * even if we may not have written anything. This is a rare
 	 * enough condition that it should not matter.
 	 */
+
+	if (channel->signal_policy)
+		signal = true;
+	else
+		kick_q = true;
+
 	if (((ret == 0) && kick_q && signal) || (ret))
 		vmbus_setevent(channel);
 
@@ -722,10 +742,19 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
 	 *    on the ring. We will not signal if more data is
 	 *    to be placed.
 	 *
+	 * Based on the channel signal state, we will decide
+	 * which signaling policy will be applied.
+	 *
 	 * If we cannot write to the ring-buffer; signal the host
 	 * even if we may not have written anything. This is a rare
 	 * enough condition that it should not matter.
 	 */
+
+	if (channel->signal_policy)
+		signal = true;
+	else
+		kick_q = true;
+
 	if (((ret == 0) && kick_q && signal) || (ret))
 		vmbus_setevent(channel);
 
diff --git a/kernel/drivers/hv/channel_mgmt.c b/kernel/drivers/hv/channel_mgmt.c
index 0eeb1b3bc..652afd11a 100644
--- a/kernel/drivers/hv/channel_mgmt.c
+++ b/kernel/drivers/hv/channel_mgmt.c
@@ -32,6 +32,9 @@
 
 #include "hyperv_vmbus.h"
 
+static void init_vp_index(struct vmbus_channel *channel,
+			  const uuid_le *type_guid);
+
 /**
  * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message
  * @icmsghdrp: Pointer to msg header structure
@@ -201,22 +204,38 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
 		spin_lock_irqsave(&vmbus_connection.channel_lock, flags);
 		list_del(&channel->listentry);
 		spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
+
+		primary_channel = channel;
 	} else {
 		primary_channel = channel->primary_channel;
 		spin_lock_irqsave(&primary_channel->lock, flags);
 		list_del(&channel->sc_list);
+		primary_channel->num_sc--;
 		spin_unlock_irqrestore(&primary_channel->lock, flags);
 	}
+
+	/*
+	 * We need to free the bit for init_vp_index() to work in the case
+	 * of sub-channel, when we reload drivers like hv_netvsc.
+	 */
+	cpumask_clear_cpu(channel->target_cpu,
+			  &primary_channel->alloced_cpus_in_node);
+
 	free_channel(channel);
 }
 
 void vmbus_free_channels(void)
 {
-	struct vmbus_channel *channel;
+	struct vmbus_channel *channel, *tmp;
+
+	list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list,
+		listentry) {
+		/* if we don't set rescind to true, vmbus_close_internal()
+		 * won't invoke hv_process_channel_removal().
+		 */
+		channel->rescind = true;
 
-	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
 		vmbus_device_unregister(channel->device_obj);
-		free_channel(channel);
 	}
 }
 
@@ -228,7 +247,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 {
 	struct vmbus_channel *channel;
 	bool fnew = true;
-	bool enq = false;
 	unsigned long flags;
 
 	/* Make sure this is a new offer */
@@ -244,25 +262,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 		}
 	}
 
-	if (fnew) {
+	if (fnew)
 		list_add_tail(&newchannel->listentry,
 			      &vmbus_connection.chn_list);
-		enq = true;
-	}
 
 	spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags);
 
-	if (enq) {
-		if (newchannel->target_cpu != get_cpu()) {
-			put_cpu();
-			smp_call_function_single(newchannel->target_cpu,
-						 percpu_channel_enq,
-						 newchannel, true);
-		} else {
-			percpu_channel_enq(newchannel);
-			put_cpu();
-		}
-	}
 	if (!fnew) {
 		/*
 		 * Check to see if this is a sub-channel.
@@ -274,27 +279,22 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 			newchannel->primary_channel = channel;
 			spin_lock_irqsave(&channel->lock, flags);
 			list_add_tail(&newchannel->sc_list, &channel->sc_list);
-			spin_unlock_irqrestore(&channel->lock, flags);
-
-			if (newchannel->target_cpu != get_cpu()) {
-				put_cpu();
-				smp_call_function_single(newchannel->target_cpu,
-							 percpu_channel_enq,
-							 newchannel, true);
-			} else {
-				percpu_channel_enq(newchannel);
-				put_cpu();
-			}
-
-			newchannel->state = CHANNEL_OPEN_STATE;
 			channel->num_sc++;
-			if (channel->sc_creation_callback != NULL)
-				channel->sc_creation_callback(newchannel);
+			spin_unlock_irqrestore(&channel->lock, flags);
+		} else
+			goto err_free_chan;
+	}
 
-			return;
-		}
+	init_vp_index(newchannel, &newchannel->offermsg.offer.if_type);
 
-		goto err_free_chan;
+	if (newchannel->target_cpu != get_cpu()) {
+		put_cpu();
+		smp_call_function_single(newchannel->target_cpu,
+					 percpu_channel_enq,
+					 newchannel, true);
+	} else {
+		percpu_channel_enq(newchannel);
+		put_cpu();
 	}
 
 	/*
@@ -304,6 +304,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	 */
 	newchannel->state = CHANNEL_OPEN_STATE;
 
+	if (!fnew) {
+		if (channel->sc_creation_callback != NULL)
+			channel->sc_creation_callback(newchannel);
+		return;
+	}
+
 	/*
 	 * Start the process of binding this offer to the driver
 	 * We need to set the DeviceObject field before calling
@@ -351,6 +357,7 @@ enum {
 	IDE = 0,
 	SCSI,
 	NIC,
+	ND_NIC,
 	MAX_PERF_CHN,
 };
 
@@ -374,23 +381,28 @@ static const struct hv_vmbus_device_id hp_devs[] = {
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
-static u32  next_vp;
+static int next_numa_node_id;
 
 /*
  * Starting with Win8, we can statically distribute the incoming
- * channel interrupt load by binding a channel to VCPU. We
- * implement here a simple round robin scheme for distributing
- * the interrupt load.
- * We will bind channels that are not performance critical to cpu 0 and
- * performance critical channels (IDE, SCSI and Network) will be uniformly
- * distributed across all available CPUs.
+ * channel interrupt load by binding a channel to VCPU.
+ * We do this in a hierarchical fashion:
+ * First distribute the primary channels across available NUMA nodes
+ * and then distribute the subchannels amongst the CPUs in the NUMA
+ * node assigned to the primary channel.
+ *
+ * For pre-win8 hosts or non-performance critical channels we assign the
+ * first CPU in the first NUMA node.
  */
 static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid)
 {
 	u32 cur_cpu;
 	int i;
 	bool perf_chn = false;
-	u32 max_cpus = num_online_cpus();
+	struct vmbus_channel *primary = channel->primary_channel;
+	int next_node;
+	struct cpumask available_mask;
+	struct cpumask *alloced_mask;
 
 	for (i = IDE; i < MAX_PERF_CHN; i++) {
 		if (!memcmp(type_guid->b, hp_devs[i].guid,
@@ -407,16 +419,104 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 		 * Also if the channel is not a performance critical
 		 * channel, bind it to cpu 0.
 		 */
+		channel->numa_node = 0;
 		channel->target_cpu = 0;
-		channel->target_vp = 0;
+		channel->target_vp = hv_context.vp_index[0];
 		return;
 	}
-	cur_cpu = (++next_vp % max_cpus);
+
+	/*
+	 * We distribute primary channels evenly across all the available
+	 * NUMA nodes and within the assigned NUMA node we will assign the
+	 * first available CPU to the primary channel.
+	 * The sub-channels will be assigned to the CPUs available in the
+	 * NUMA node evenly.
+	 */
+	if (!primary) {
+		while (true) {
+			next_node = next_numa_node_id++;
+			if (next_node == nr_node_ids)
+				next_node = next_numa_node_id = 0;
+			if (cpumask_empty(cpumask_of_node(next_node)))
+				continue;
+			break;
+		}
+		channel->numa_node = next_node;
+		primary = channel;
+	}
+	alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+
+	if (cpumask_weight(alloced_mask) ==
+	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
+		/*
+		 * We have cycled through all the CPUs in the node;
+		 * reset the alloced map.
+		 */
+		cpumask_clear(alloced_mask);
+	}
+
+	cpumask_xor(&available_mask, alloced_mask,
+		    cpumask_of_node(primary->numa_node));
+
+	cur_cpu = -1;
+	while (true) {
+		cur_cpu = cpumask_next(cur_cpu, &available_mask);
+		if (cur_cpu >= nr_cpu_ids) {
+			cur_cpu = -1;
+			cpumask_copy(&available_mask,
+				     cpumask_of_node(primary->numa_node));
+			continue;
+		}
+
+		/*
+		 * NOTE: in the case of sub-channel, we clear the sub-channel
+		 * related bit(s) in primary->alloced_cpus_in_node in
+		 * hv_process_channel_removal(), so when we reload drivers
+		 * like hv_netvsc in SMP guest, here we're able to re-allocate
+		 * bit from primary->alloced_cpus_in_node.
+		 */
+		if (!cpumask_test_cpu(cur_cpu,
+				&primary->alloced_cpus_in_node)) {
+			cpumask_set_cpu(cur_cpu,
+					&primary->alloced_cpus_in_node);
+			cpumask_set_cpu(cur_cpu, alloced_mask);
+			break;
+		}
+	}
+
 	channel->target_cpu = cur_cpu;
 	channel->target_vp = hv_context.vp_index[cur_cpu];
 }
 
 /*
+ * vmbus_unload_response - Handler for the unload response.
+ */
+static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
+{
+	/*
+	 * This is a global event; just wakeup the waiting thread.
+	 * Once we successfully unload, we can cleanup the monitor state.
+	 */
+	complete(&vmbus_connection.unload_event);
+}
+
+void vmbus_initiate_unload(void)
+{
+	struct vmbus_channel_message_header hdr;
+
+	/* Pre-Win2012R2 hosts don't support reconnect */
+	if (vmbus_proto_version < VERSION_WIN8_1)
+		return;
+
+	init_completion(&vmbus_connection.unload_event);
+	memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
+	hdr.msgtype = CHANNELMSG_UNLOAD;
+	vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header));
+
+	wait_for_completion(&vmbus_connection.unload_event);
+}
+
+/*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
  */
@@ -461,8 +561,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
 				offer->connection_id;
 	}
 
-	init_vp_index(newchannel, &offer->offer.if_type);
-
 	memcpy(&newchannel->offermsg, offer,
 	       sizeof(struct vmbus_channel_offer_channel));
 	newchannel->monitor_grp = (u8)offer->monitorid / 32;
@@ -712,6 +810,7 @@ struct vmbus_channel_message_table_entry
 	{CHANNELMSG_INITIATE_CONTACT,		0, NULL},
 	{CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response},
 	{CHANNELMSG_UNLOAD,			0, NULL},
+	{CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response},
 };
 
 /*
diff --git a/kernel/drivers/hv/connection.c b/kernel/drivers/hv/connection.c
index b27220a42..4fc2e8836 100644
--- a/kernel/drivers/hv/connection.c
+++ b/kernel/drivers/hv/connection.c
@@ -58,6 +58,9 @@ static __u32 vmbus_get_next_version(__u32 current_version)
 	case (VERSION_WIN8_1):
 		return VERSION_WIN8;
 
+	case (VERSION_WIN10):
+		return VERSION_WIN8_1;
+
 	case (VERSION_WS2008):
 	default:
 		return VERSION_INVAL;
@@ -80,7 +83,7 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
 	msg->interrupt_page = virt_to_phys(vmbus_connection.int_page);
 	msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]);
 	msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]);
-	if (version == VERSION_WIN8_1) {
+	if (version >= VERSION_WIN8_1) {
 		msg->target_vcpu = hv_context.vp_index[get_cpu()];
 		put_cpu();
 	}
@@ -227,6 +230,11 @@ cleanup:
 
 void vmbus_disconnect(void)
 {
+	/*
+	 * First send the unload request to the host.
+	 */
+	vmbus_initiate_unload();
+
 	if (vmbus_connection.work_queue) {
 		drain_workqueue(vmbus_connection.work_queue);
 		destroy_workqueue(vmbus_connection.work_queue);
@@ -371,8 +379,7 @@ void vmbus_on_event(unsigned long data)
 	int cpu = smp_processor_id();
 	union hv_synic_event_flags *event;
 
-	if ((vmbus_proto_version == VERSION_WS2008) ||
-		(vmbus_proto_version == VERSION_WIN7)) {
+	if (vmbus_proto_version < VERSION_WIN8) {
 		maxdword = MAX_NUM_CHANNELS_SUPPORTED >> 5;
 		recv_int_page = vmbus_connection.recv_int_page;
 	} else {
diff --git a/kernel/drivers/hv/hv.c b/kernel/drivers/hv/hv.c
index d3943bcee..6341be873 100644
--- a/kernel/drivers/hv/hv.c
+++ b/kernel/drivers/hv/hv.c
@@ -93,11 +93,14 @@ static int query_hypervisor_info(void)
  */
 static u64 do_hypercall(u64 control, void *input, void *output)
 {
-#ifdef CONFIG_X86_64
-	u64 hv_status = 0;
 	u64 input_address = (input) ? virt_to_phys(input) : 0;
 	u64 output_address = (output) ? virt_to_phys(output) : 0;
 	void *hypercall_page = hv_context.hypercall_page;
+#ifdef CONFIG_X86_64
+	u64 hv_status = 0;
+
+	if (!hypercall_page)
+		return (u64)ULLONG_MAX;
 
 	__asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8");
 	__asm__ __volatile__("call *%3" : "=a" (hv_status) :
@@ -112,13 +115,13 @@ static u64 do_hypercall(u64 control, void *input, void *output)
 	u32 control_lo = control & 0xFFFFFFFF;
 	u32 hv_status_hi = 1;
 	u32 hv_status_lo = 1;
-	u64 input_address = (input) ? virt_to_phys(input) : 0;
 	u32 input_address_hi = input_address >> 32;
 	u32 input_address_lo = input_address & 0xFFFFFFFF;
-	u64 output_address = (output) ? virt_to_phys(output) : 0;
 	u32 output_address_hi = output_address >> 32;
 	u32 output_address_lo = output_address & 0xFFFFFFFF;
-	void *hypercall_page = hv_context.hypercall_page;
+
+	if (!hypercall_page)
+		return (u64)ULLONG_MAX;
 
 	__asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi),
 			      "=a"(hv_status_lo) : "d" (control_hi),
@@ -130,6 +133,56 @@ static u64 do_hypercall(u64 control, void *input, void *output)
 #endif /* !x86_64 */
 }
 
+#ifdef CONFIG_X86_64
+static cycle_t read_hv_clock_tsc(struct clocksource *arg)
+{
+	cycle_t current_tick;
+	struct ms_hyperv_tsc_page *tsc_pg = hv_context.tsc_page;
+
+	if (tsc_pg->tsc_sequence != -1) {
+		/*
+		 * Use the tsc page to compute the value.
+		 */
+
+		while (1) {
+			cycle_t tmp;
+			u32 sequence = tsc_pg->tsc_sequence;
+			u64 cur_tsc;
+			u64 scale = tsc_pg->tsc_scale;
+			s64 offset = tsc_pg->tsc_offset;
+
+			rdtscll(cur_tsc);
+			/* current_tick = ((cur_tsc *scale) >> 64) + offset */
+			asm("mulq %3"
+				: "=d" (current_tick), "=a" (tmp)
+				: "a" (cur_tsc), "r" (scale));
+
+			current_tick += offset;
+			if (tsc_pg->tsc_sequence == sequence)
+				return current_tick;
+
+			if (tsc_pg->tsc_sequence != -1)
+				continue;
+			/*
+			 * Fallback using MSR method.
+			 */
+			break;
+		}
+	}
+	rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
+	return current_tick;
+}
+
+static struct clocksource hyperv_cs_tsc = {
+		.name           = "hyperv_clocksource_tsc_page",
+		.rating         = 425,
+		.read           = read_hv_clock_tsc,
+		.mask           = CLOCKSOURCE_MASK(64),
+		.flags          = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+#endif
+
+
 /*
  * hv_init - Main initialization routine.
  *
@@ -139,7 +192,9 @@ int hv_init(void)
 {
 	int max_leaf;
 	union hv_x64_msr_hypercall_contents hypercall_msr;
+	union hv_x64_msr_hypercall_contents tsc_msr;
 	void *virtaddr = NULL;
+	void *va_tsc = NULL;
 
 	memset(hv_context.synic_event_page, 0, sizeof(void *) * NR_CPUS);
 	memset(hv_context.synic_message_page, 0,
@@ -183,6 +238,22 @@ int hv_init(void)
 
 	hv_context.hypercall_page = virtaddr;
 
+#ifdef CONFIG_X86_64
+	if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) {
+		va_tsc = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+		if (!va_tsc)
+			goto cleanup;
+		hv_context.tsc_page = va_tsc;
+
+		rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+
+		tsc_msr.enable = 1;
+		tsc_msr.guest_physical_address = vmalloc_to_pfn(va_tsc);
+
+		wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+		clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
+	}
+#endif
 	return 0;
 
 cleanup:
@@ -216,6 +287,21 @@ void hv_cleanup(void)
 		vfree(hv_context.hypercall_page);
 		hv_context.hypercall_page = NULL;
 	}
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Cleanup the TSC page based CS.
+	 */
+	if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) {
+		clocksource_change_rating(&hyperv_cs_tsc, 10);
+		clocksource_unregister(&hyperv_cs_tsc);
+
+		hypercall_msr.as_uint64 = 0;
+		wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
+		vfree(hv_context.tsc_page);
+		hv_context.tsc_page = NULL;
+	}
+#endif
 }
 
 /*
@@ -271,7 +357,7 @@ static int hv_ce_set_next_event(unsigned long delta,
 {
 	cycle_t current_tick;
 
-	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+	WARN_ON(!clockevent_state_oneshot(evt));
 
 	rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
 	current_tick += delta;
@@ -279,31 +365,24 @@ static int hv_ce_set_next_event(unsigned long delta,
 	return 0;
 }
 
-static void hv_ce_setmode(enum clock_event_mode mode,
-			  struct clock_event_device *evt)
+static int hv_ce_shutdown(struct clock_event_device *evt)
+{
+	wrmsrl(HV_X64_MSR_STIMER0_COUNT, 0);
+	wrmsrl(HV_X64_MSR_STIMER0_CONFIG, 0);
+
+	return 0;
+}
+
+static int hv_ce_set_oneshot(struct clock_event_device *evt)
 {
 	union hv_timer_config timer_cfg;
 
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		/* unsupported */
-		break;
-
-	case CLOCK_EVT_MODE_ONESHOT:
-		timer_cfg.enable = 1;
-		timer_cfg.auto_enable = 1;
-		timer_cfg.sintx = VMBUS_MESSAGE_SINT;
-		wrmsrl(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64);
-		break;
-
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		wrmsrl(HV_X64_MSR_STIMER0_COUNT, 0);
-		wrmsrl(HV_X64_MSR_STIMER0_CONFIG, 0);
-		break;
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-	}
+	timer_cfg.enable = 1;
+	timer_cfg.auto_enable = 1;
+	timer_cfg.sintx = VMBUS_MESSAGE_SINT;
+	wrmsrl(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64);
+
+	return 0;
 }
 
 static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu)
@@ -318,7 +397,8 @@ static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu)
 	 * references to the hv_vmbus module making it impossible to unload.
 	 */
 
-	dev->set_mode = hv_ce_setmode;
+	dev->set_state_shutdown = hv_ce_shutdown;
+	dev->set_state_oneshot = hv_ce_set_oneshot;
 	dev->set_next_event = hv_ce_set_next_event;
 }
 
@@ -329,6 +409,13 @@ int hv_synic_alloc(void)
 	size_t ced_size = sizeof(struct clock_event_device);
 	int cpu;
 
+	hv_context.hv_numa_map = kzalloc(sizeof(struct cpumask) * nr_node_ids,
+					 GFP_ATOMIC);
+	if (hv_context.hv_numa_map == NULL) {
+		pr_err("Unable to allocate NUMA map\n");
+		goto err;
+	}
+
 	for_each_online_cpu(cpu) {
 		hv_context.event_dpc[cpu] = kmalloc(size, GFP_ATOMIC);
 		if (hv_context.event_dpc[cpu] == NULL) {
@@ -342,6 +429,7 @@ int hv_synic_alloc(void)
 			pr_err("Unable to allocate clock event device\n");
 			goto err;
 		}
+
 		hv_init_clockevent_device(hv_context.clk_evt[cpu], cpu);
 
 		hv_context.synic_message_page[cpu] =
@@ -390,6 +478,7 @@ void hv_synic_free(void)
 {
 	int cpu;
 
+	kfree(hv_context.hv_numa_map);
 	for_each_online_cpu(cpu)
 		hv_synic_free_cpu(cpu);
 }
@@ -503,8 +592,7 @@ void hv_synic_cleanup(void *arg)
 
 	/* Turn off clockevent device */
 	if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE)
-		hv_ce_setmode(CLOCK_EVT_MODE_SHUTDOWN,
-			      hv_context.clk_evt[cpu]);
+		hv_ce_shutdown(hv_context.clk_evt[cpu]);
 
 	rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
 
@@ -530,6 +618,4 @@ void hv_synic_cleanup(void *arg)
 	rdmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64);
 	sctrl.enable = 0;
 	wrmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64);
-
-	hv_synic_free_cpu(cpu);
 }
diff --git a/kernel/drivers/hv/hv_balloon.c b/kernel/drivers/hv/hv_balloon.c
index cb5b7dc97..b853b4b08 100644
--- a/kernel/drivers/hv/hv_balloon.c
+++ b/kernel/drivers/hv/hv_balloon.c
@@ -62,11 +62,13 @@
 enum {
 	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
 	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
+	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),
 
 	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
 	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
+	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,
 
-	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN8
+	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
 };
 
 
@@ -567,7 +569,9 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
 	case MEM_ONLINE:
 		dm_device.num_pages_onlined += mem->nr_pages;
 	case MEM_CANCEL_ONLINE:
-		mutex_unlock(&dm_device.ha_region_mutex);
+		if (val == MEM_ONLINE ||
+		    mutex_is_locked(&dm_device.ha_region_mutex))
+			mutex_unlock(&dm_device.ha_region_mutex);
 		if (dm_device.ha_waiting) {
 			dm_device.ha_waiting = false;
 			complete(&dm_device.ol_waitevent);
@@ -1294,13 +1298,25 @@ static void version_resp(struct hv_dynmem_device *dm,
 	if (dm->next_version == 0)
 		goto version_error;
 
-	dm->next_version = 0;
 	memset(&version_req, 0, sizeof(struct dm_version_request));
 	version_req.hdr.type = DM_VERSION_REQUEST;
 	version_req.hdr.size = sizeof(struct dm_version_request);
 	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
-	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN7;
-	version_req.is_last_attempt = 1;
+	version_req.version.version = dm->next_version;
+
+	/*
+	 * Set the next version to try in case current version fails.
+	 * Win7 protocol ought to be the last one to try.
+	 */
+	switch (version_req.version.version) {
+	case DYNMEM_PROTOCOL_VERSION_WIN8:
+		dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
+		version_req.is_last_attempt = 0;
+		break;
+	default:
+		dm->next_version = 0;
+		version_req.is_last_attempt = 1;
+	}
 
 	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
 				sizeof(struct dm_version_request),
@@ -1440,7 +1456,7 @@ static int balloon_probe(struct hv_device *dev,
 
 	dm_device.dev = dev;
 	dm_device.state = DM_INITIALIZING;
-	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
+	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
 	init_completion(&dm_device.host_event);
 	init_completion(&dm_device.config_event);
 	INIT_LIST_HEAD(&dm_device.ha_region_list);
@@ -1472,7 +1488,7 @@ static int balloon_probe(struct hv_device *dev,
 	version_req.hdr.type = DM_VERSION_REQUEST;
 	version_req.hdr.size = sizeof(struct dm_version_request);
 	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
-	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN8;
+	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
 	version_req.is_last_attempt = 0;
 
 	ret = vmbus_sendpacket(dev->channel, &version_req,
diff --git a/kernel/drivers/hv/hv_fcopy.c b/kernel/drivers/hv/hv_fcopy.c
index cd453e4b2..db4b887b8 100644
--- a/kernel/drivers/hv/hv_fcopy.c
+++ b/kernel/drivers/hv/hv_fcopy.c
@@ -19,17 +19,13 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/semaphore.h>
-#include <linux/fs.h>
 #include <linux/nls.h>
 #include <linux/workqueue.h>
-#include <linux/cdev.h>
 #include <linux/hyperv.h>
 #include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/miscdevice.h>
 
 #include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
 
 #define WIN8_SRV_MAJOR		1
 #define WIN8_SRV_MINOR		1
@@ -47,39 +43,31 @@
  * ensure this by serializing packet processing in this driver - we do not
  * read additional packets from the VMBUs until the current packet is fully
  * handled.
- *
- * The transaction "active" state is set when we receive a request from the
- * host and we cleanup this state when the transaction is completed - when we
- * respond to the host with our response. When the transaction active state is
- * set, we defer handling incoming packets.
  */
 
 static struct {
-	bool active; /* transaction status - active or not */
+	int state;   /* hvutil_device_state */
 	int recv_len; /* number of bytes received. */
 	struct hv_fcopy_hdr  *fcopy_msg; /* current message */
-	struct hv_start_fcopy  message; /*  sent to daemon */
 	struct vmbus_channel *recv_channel; /* chn we got the request */
 	u64 recv_req_id; /* request ID. */
 	void *fcopy_context; /* for the channel callback */
-	struct semaphore read_sema;
 } fcopy_transaction;
 
-static bool opened; /* currently device opened */
-
-/*
- * Before we can accept copy messages from the host, we need
- * to handshake with the user level daemon. This state tracks
- * if we are in the handshake phase.
- */
-static bool in_hand_shake = true;
-static void fcopy_send_data(void);
 static void fcopy_respond_to_host(int error);
-static void fcopy_work_func(struct work_struct *dummy);
-static DECLARE_DELAYED_WORK(fcopy_work, fcopy_work_func);
+static void fcopy_send_data(struct work_struct *dummy);
+static void fcopy_timeout_func(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func);
+static DECLARE_WORK(fcopy_send_work, fcopy_send_data);
+static const char fcopy_devname[] = "vmbus/hv_fcopy";
 static u8 *recv_buffer;
+static struct hvutil_transport *hvt;
+/*
+ * This state maintains the version number registered by the daemon.
+ */
+static int dm_reg_value;
 
-static void fcopy_work_func(struct work_struct *dummy)
+static void fcopy_timeout_func(struct work_struct *dummy)
 {
 	/*
 	 * If the timer fires, the user-mode component has not responded;
@@ -87,23 +75,28 @@ static void fcopy_work_func(struct work_struct *dummy)
 	 */
 	fcopy_respond_to_host(HV_E_FAIL);
 
-	/* In the case the user-space daemon crashes, hangs or is killed, we
-	 * need to down the semaphore, otherwise, after the daemon starts next
-	 * time, the obsolete data in fcopy_transaction.message or
-	 * fcopy_transaction.fcopy_msg will be used immediately.
-	 *
-	 * NOTE: fcopy_read() happens to get the semaphore (very rare)? We're
-	 * still OK, because we've reported the failure to the host.
-	 */
-	if (down_trylock(&fcopy_transaction.read_sema))
-		;
+	/* Transaction is finished, reset the state. */
+	if (fcopy_transaction.state > HVUTIL_READY)
+		fcopy_transaction.state = HVUTIL_READY;
 
+	hv_poll_channel(fcopy_transaction.fcopy_context,
+			hv_fcopy_onchannelcallback);
 }
 
 static int fcopy_handle_handshake(u32 version)
 {
+	u32 our_ver = FCOPY_CURRENT_VERSION;
+
 	switch (version) {
-	case FCOPY_CURRENT_VERSION:
+	case FCOPY_VERSION_0:
+		/* Daemon doesn't expect us to reply */
+		dm_reg_value = version;
+		break;
+	case FCOPY_VERSION_1:
+		/* Daemon expects us to reply with our own version */
+		if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver)))
+			return -EFAULT;
+		dm_reg_value = version;
 		break;
 	default:
 		/*
@@ -114,20 +107,20 @@ static int fcopy_handle_handshake(u32 version)
 		 */
 		return -EINVAL;
 	}
-	pr_info("FCP: user-mode registering done. Daemon version: %d\n",
-		version);
-	fcopy_transaction.active = false;
-	if (fcopy_transaction.fcopy_context)
-		hv_fcopy_onchannelcallback(fcopy_transaction.fcopy_context);
-	in_hand_shake = false;
+	pr_debug("FCP: userspace daemon ver. %d registered\n", version);
+	fcopy_transaction.state = HVUTIL_READY;
+	hv_poll_channel(fcopy_transaction.fcopy_context,
+			hv_fcopy_onchannelcallback);
 	return 0;
 }
 
-static void fcopy_send_data(void)
+static void fcopy_send_data(struct work_struct *dummy)
 {
-	struct hv_start_fcopy *smsg_out = &fcopy_transaction.message;
+	struct hv_start_fcopy *smsg_out = NULL;
 	int operation = fcopy_transaction.fcopy_msg->operation;
 	struct hv_start_fcopy *smsg_in;
+	void *out_src;
+	int rc, out_len;
 
 	/*
 	 * The  strings sent from the host are encoded in
@@ -142,26 +135,44 @@ static void fcopy_send_data(void)
 
 	switch (operation) {
 	case START_FILE_COPY:
-		memset(smsg_out, 0, sizeof(struct hv_start_fcopy));
+		out_len = sizeof(struct hv_start_fcopy);
+		smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL);
+		if (!smsg_out)
+			return;
+
 		smsg_out->hdr.operation = operation;
 		smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg;
 
 		utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH,
 				UTF16_LITTLE_ENDIAN,
-				(__u8 *)smsg_out->file_name, W_MAX_PATH - 1);
+				(__u8 *)&smsg_out->file_name, W_MAX_PATH - 1);
 
 		utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH,
 				UTF16_LITTLE_ENDIAN,
-				(__u8 *)smsg_out->path_name, W_MAX_PATH - 1);
+				(__u8 *)&smsg_out->path_name, W_MAX_PATH - 1);
 
 		smsg_out->copy_flags = smsg_in->copy_flags;
 		smsg_out->file_size = smsg_in->file_size;
+		out_src = smsg_out;
 		break;
 
 	default:
+		out_src = fcopy_transaction.fcopy_msg;
+		out_len = fcopy_transaction.recv_len;
 		break;
 	}
-	up(&fcopy_transaction.read_sema);
+
+	fcopy_transaction.state = HVUTIL_USERSPACE_REQ;
+	rc = hvutil_transport_send(hvt, out_src, out_len);
+	if (rc) {
+		pr_debug("FCP: failed to communicate to the daemon: %d\n", rc);
+		if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
+			fcopy_respond_to_host(HV_E_FAIL);
+			fcopy_transaction.state = HVUTIL_READY;
+		}
+	}
+	kfree(smsg_out);
+
 	return;
 }
 
@@ -189,8 +200,6 @@ fcopy_respond_to_host(int error)
 	channel = fcopy_transaction.recv_channel;
 	req_id = fcopy_transaction.recv_req_id;
 
-	fcopy_transaction.active = false;
-
 	icmsghdr = (struct icmsg_hdr *)
 			&recv_buffer[sizeof(struct vmbuspipe_hdr)];
 
@@ -218,7 +227,7 @@ void hv_fcopy_onchannelcallback(void *context)
 	int util_fw_version;
 	int fcopy_srv_version;
 
-	if (fcopy_transaction.active) {
+	if (fcopy_transaction.state > HVUTIL_READY) {
 		/*
 		 * We will defer processing this callback once
 		 * the current transaction is complete.
@@ -226,6 +235,7 @@ void hv_fcopy_onchannelcallback(void *context)
 		fcopy_transaction.fcopy_context = context;
 		return;
 	}
+	fcopy_transaction.fcopy_context = NULL;
 
 	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
 			 &requestid);
@@ -249,17 +259,23 @@ void hv_fcopy_onchannelcallback(void *context)
 		 * transaction; note transactions are serialized.
 		 */
 
-		fcopy_transaction.active = true;
 		fcopy_transaction.recv_len = recvlen;
 		fcopy_transaction.recv_channel = channel;
 		fcopy_transaction.recv_req_id = requestid;
 		fcopy_transaction.fcopy_msg = fcopy_msg;
 
+		if (fcopy_transaction.state < HVUTIL_READY) {
+			/* Userspace is not registered yet */
+			fcopy_respond_to_host(HV_E_FAIL);
+			return;
+		}
+		fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
+
 		/*
 		 * Send the information to the user-level daemon.
 		 */
-		schedule_delayed_work(&fcopy_work, 5*HZ);
-		fcopy_send_data();
+		schedule_work(&fcopy_send_work);
+		schedule_delayed_work(&fcopy_timeout_work, 5*HZ);
 		return;
 	}
 	icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
@@ -267,155 +283,44 @@ void hv_fcopy_onchannelcallback(void *context)
 			VM_PKT_DATA_INBAND, 0);
 }
 
-/*
- * Create a char device that can support read/write for passing
- * the payload.
- */
-
-static ssize_t fcopy_read(struct file *file, char __user *buf,
-		size_t count, loff_t *ppos)
-{
-	void *src;
-	size_t copy_size;
-	int operation;
-
-	/*
-	 * Wait until there is something to be read.
-	 */
-	if (down_interruptible(&fcopy_transaction.read_sema))
-		return -EINTR;
-
-	/*
-	 * The channel may be rescinded and in this case, we will wakeup the
-	 * the thread blocked on the semaphore and we will use the opened
-	 * state to correctly handle this case.
-	 */
-	if (!opened)
-		return -ENODEV;
-
-	operation = fcopy_transaction.fcopy_msg->operation;
-
-	if (operation == START_FILE_COPY) {
-		src = &fcopy_transaction.message;
-		copy_size = sizeof(struct hv_start_fcopy);
-		if (count < copy_size)
-			return 0;
-	} else {
-		src = fcopy_transaction.fcopy_msg;
-		copy_size = sizeof(struct hv_do_fcopy);
-		if (count < copy_size)
-			return 0;
-	}
-	if (copy_to_user(buf, src, copy_size))
-		return -EFAULT;
-
-	return copy_size;
-}
-
-static ssize_t fcopy_write(struct file *file, const char __user *buf,
-			size_t count, loff_t *ppos)
+/* Callback when data is received from userspace */
+static int fcopy_on_msg(void *msg, int len)
 {
-	int response = 0;
+	int *val = (int *)msg;
 
-	if (count != sizeof(int))
+	if (len != sizeof(int))
 		return -EINVAL;
 
-	if (copy_from_user(&response, buf, sizeof(int)))
-		return -EFAULT;
+	if (fcopy_transaction.state == HVUTIL_DEVICE_INIT)
+		return fcopy_handle_handshake(*val);
 
-	if (in_hand_shake) {
-		if (fcopy_handle_handshake(response))
-			return -EINVAL;
-		return sizeof(int);
-	}
+	if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ)
+		return -EINVAL;
 
 	/*
 	 * Complete the transaction by forwarding the result
 	 * to the host. But first, cancel the timeout.
 	 */
-	if (cancel_delayed_work_sync(&fcopy_work))
-		fcopy_respond_to_host(response);
-
-	return sizeof(int);
-}
-
-static int fcopy_open(struct inode *inode, struct file *f)
-{
-	/*
-	 * The user level daemon that will open this device is
-	 * really an extension of this driver. We can have only
-	 * active open at a time.
-	 */
-	if (opened)
-		return -EBUSY;
+	if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
+		fcopy_transaction.state = HVUTIL_USERSPACE_RECV;
+		fcopy_respond_to_host(*val);
+		fcopy_transaction.state = HVUTIL_READY;
+		hv_poll_channel(fcopy_transaction.fcopy_context,
+				hv_fcopy_onchannelcallback);
+	}
 
-	/*
-	 * The daemon is alive; setup the state.
-	 */
-	opened = true;
 	return 0;
 }
 
-/* XXX: there are still some tricky corner cases, e.g.,
- * 1) In a SMP guest, when fcopy_release() runs between
- * schedule_delayed_work() and fcopy_send_data(), there is
- * still a chance an obsolete message will be queued.
- *
- * 2) When the fcopy daemon is running, if we unload the driver,
- * we'll notice a kernel oops when we kill the daemon later.
- */
-static int fcopy_release(struct inode *inode, struct file *f)
+static void fcopy_on_reset(void)
 {
 	/*
 	 * The daemon has exited; reset the state.
 	 */
-	in_hand_shake = true;
-	opened = false;
+	fcopy_transaction.state = HVUTIL_DEVICE_INIT;
 
-	if (cancel_delayed_work_sync(&fcopy_work)) {
-		/* We haven't up()-ed the semaphore(very rare)? */
-		if (down_trylock(&fcopy_transaction.read_sema))
-			;
+	if (cancel_delayed_work_sync(&fcopy_timeout_work))
 		fcopy_respond_to_host(HV_E_FAIL);
-	}
-	return 0;
-}
-
-
-static const struct file_operations fcopy_fops = {
-	.read           = fcopy_read,
-	.write          = fcopy_write,
-	.release	= fcopy_release,
-	.open		= fcopy_open,
-};
-
-static struct miscdevice fcopy_misc = {
-	.minor          = MISC_DYNAMIC_MINOR,
-	.name           = "vmbus/hv_fcopy",
-	.fops           = &fcopy_fops,
-};
-
-static int fcopy_dev_init(void)
-{
-	return misc_register(&fcopy_misc);
-}
-
-static void fcopy_dev_deinit(void)
-{
-
-	/*
-	 * The device is going away - perhaps because the
-	 * host has rescinded the channel. Setup state so that
-	 * user level daemon can gracefully exit if it is blocked
-	 * on the read semaphore.
-	 */
-	opened = false;
-	/*
-	 * Signal the semaphore as the device is
-	 * going away.
-	 */
-	up(&fcopy_transaction.read_sema);
-	misc_deregister(&fcopy_misc);
 }
 
 int hv_fcopy_init(struct hv_util_service *srv)
@@ -428,14 +333,19 @@ int hv_fcopy_init(struct hv_util_service *srv)
 	 * Defer processing channel callbacks until the daemon
 	 * has registered.
 	 */
-	fcopy_transaction.active = true;
-	sema_init(&fcopy_transaction.read_sema, 0);
+	fcopy_transaction.state = HVUTIL_DEVICE_INIT;
+
+	hvt = hvutil_transport_init(fcopy_devname, 0, 0,
+				    fcopy_on_msg, fcopy_on_reset);
+	if (!hvt)
+		return -EFAULT;
 
-	return fcopy_dev_init();
+	return 0;
 }
 
 void hv_fcopy_deinit(void)
 {
-	cancel_delayed_work_sync(&fcopy_work);
-	fcopy_dev_deinit();
+	fcopy_transaction.state = HVUTIL_DEVICE_DYING;
+	cancel_delayed_work_sync(&fcopy_timeout_work);
+	hvutil_transport_destroy(hvt);
 }
diff --git a/kernel/drivers/hv/hv_kvp.c b/kernel/drivers/hv/hv_kvp.c
index beb8105c0..74c38a9f3 100644
--- a/kernel/drivers/hv/hv_kvp.c
+++ b/kernel/drivers/hv/hv_kvp.c
@@ -28,6 +28,8 @@
 #include <linux/workqueue.h>
 #include <linux/hyperv.h>
 
+#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
 
 /*
  * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7)
@@ -45,16 +47,21 @@
 #define WIN8_SRV_VERSION     (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
 
 /*
- * Global state maintained for transaction that is being processed.
- * Note that only one transaction can be active at any point in time.
+ * Global state maintained for transaction that is being processed. For a class
+ * of integration services, including the "KVP service", the specified protocol
+ * is a "request/response" protocol which means that there can only be single
+ * outstanding transaction from the host at any given point in time. We use
+ * this to simplify memory management in this driver - we cache and process
+ * only one message at a time.
  *
- * This state is set when we receive a request from the host; we
- * cleanup this state when the transaction is completed - when we respond
- * to the host with the key value.
+ * While the request/response protocol is guaranteed by the host, we further
+ * ensure this by serializing packet processing in this driver - we do not
+ * read additional packets from the VMBUs until the current packet is fully
+ * handled.
  */
 
 static struct {
-	bool active; /* transaction status - active or not */
+	int state;   /* hvutil_device_state */
 	int recv_len; /* number of bytes received. */
 	struct hv_kvp_msg  *kvp_msg; /* current message */
 	struct vmbus_channel *recv_channel; /* chn we got the request */
@@ -63,13 +70,6 @@ static struct {
 } kvp_transaction;
 
 /*
- * Before we can accept KVP messages from the host, we need
- * to handshake with the user level daemon. This state tracks
- * if we are in the handshake phase.
- */
-static bool in_hand_shake = true;
-
-/*
  * This state maintains the version number registered by the daemon.
  */
 static int dm_reg_value;
@@ -78,15 +78,15 @@ static void kvp_send_key(struct work_struct *dummy);
 
 
 static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error);
-static void kvp_work_func(struct work_struct *dummy);
+static void kvp_timeout_func(struct work_struct *dummy);
 static void kvp_register(int);
 
-static DECLARE_DELAYED_WORK(kvp_work, kvp_work_func);
+static DECLARE_DELAYED_WORK(kvp_timeout_work, kvp_timeout_func);
 static DECLARE_WORK(kvp_sendkey_work, kvp_send_key);
 
-static struct cb_id kvp_id = { CN_KVP_IDX, CN_KVP_VAL };
-static const char kvp_name[] = "kvp_kernel_module";
+static const char kvp_devname[] = "vmbus/hv_kvp";
 static u8 *recv_buffer;
+static struct hvutil_transport *hvt;
 /*
  * Register the kernel component with the user-level daemon.
  * As part of this registration, pass the LIC version number.
@@ -98,50 +98,39 @@ static void
 kvp_register(int reg_value)
 {
 
-	struct cn_msg *msg;
 	struct hv_kvp_msg *kvp_msg;
 	char *version;
 
-	msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg), GFP_ATOMIC);
+	kvp_msg = kzalloc(sizeof(*kvp_msg), GFP_KERNEL);
 
-	if (msg) {
-		kvp_msg = (struct hv_kvp_msg *)msg->data;
+	if (kvp_msg) {
 		version = kvp_msg->body.kvp_register.version;
-		msg->id.idx =  CN_KVP_IDX;
-		msg->id.val = CN_KVP_VAL;
-
 		kvp_msg->kvp_hdr.operation = reg_value;
 		strcpy(version, HV_DRV_VERSION);
-		msg->len = sizeof(struct hv_kvp_msg);
-		cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
-		kfree(msg);
+
+		hvutil_transport_send(hvt, kvp_msg, sizeof(*kvp_msg));
+		kfree(kvp_msg);
 	}
 }
-static void
-kvp_work_func(struct work_struct *dummy)
+
+static void kvp_timeout_func(struct work_struct *dummy)
 {
 	/*
 	 * If the timer fires, the user-mode component has not responded;
 	 * process the pending transaction.
 	 */
 	kvp_respond_to_host(NULL, HV_E_FAIL);
-}
 
-static void poll_channel(struct vmbus_channel *channel)
-{
-	if (channel->target_cpu != smp_processor_id())
-		smp_call_function_single(channel->target_cpu,
-					 hv_kvp_onchannelcallback,
-					 channel, true);
-	else
-		hv_kvp_onchannelcallback(channel);
-}
+	/* Transaction is finished, reset the state. */
+	if (kvp_transaction.state > HVUTIL_READY)
+		kvp_transaction.state = HVUTIL_READY;
 
+	hv_poll_channel(kvp_transaction.kvp_context,
+			hv_kvp_onchannelcallback);
+}
 
 static int kvp_handle_handshake(struct hv_kvp_msg *msg)
 {
-	int ret = 1;
-
 	switch (msg->kvp_hdr.operation) {
 	case KVP_OP_REGISTER:
 		dm_reg_value = KVP_OP_REGISTER;
@@ -155,20 +144,18 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg)
 		pr_info("KVP: incompatible daemon\n");
 		pr_info("KVP: KVP version: %d, Daemon version: %d\n",
 			KVP_OP_REGISTER1, msg->kvp_hdr.operation);
-		ret = 0;
+		return -EINVAL;
 	}
 
-	if (ret) {
-		/*
-		 * We have a compatible daemon; complete the handshake.
-		 */
-		pr_info("KVP: user-mode registering done.\n");
-		kvp_register(dm_reg_value);
-		kvp_transaction.active = false;
-		if (kvp_transaction.kvp_context)
-			poll_channel(kvp_transaction.kvp_context);
-	}
-	return ret;
+	/*
+	 * We have a compatible daemon; complete the handshake.
+	 */
+	pr_debug("KVP: userspace daemon ver. %d registered\n",
+		 KVP_OP_REGISTER);
+	kvp_register(dm_reg_value);
+	kvp_transaction.state = HVUTIL_READY;
+
+	return 0;
 }
 
 
@@ -176,26 +163,30 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg)
  * Callback when data is received from user mode.
  */
 
-static void
-kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+static int kvp_on_msg(void *msg, int len)
 {
-	struct hv_kvp_msg *message;
+	struct hv_kvp_msg *message = (struct hv_kvp_msg *)msg;
 	struct hv_kvp_msg_enumerate *data;
 	int	error = 0;
 
-	message = (struct hv_kvp_msg *)msg->data;
+	if (len < sizeof(*message))
+		return -EINVAL;
 
 	/*
 	 * If we are negotiating the version information
 	 * with the daemon; handle that first.
 	 */
 
-	if (in_hand_shake) {
-		if (kvp_handle_handshake(message))
-			in_hand_shake = false;
-		return;
+	if (kvp_transaction.state < HVUTIL_READY) {
+		return kvp_handle_handshake(message);
 	}
 
+	/* We didn't send anything to userspace so the reply is spurious */
+	if (kvp_transaction.state < HVUTIL_USERSPACE_REQ)
+		return -EINVAL;
+
+	kvp_transaction.state = HVUTIL_USERSPACE_RECV;
+
 	/*
 	 * Based on the version of the daemon, we propagate errors from the
 	 * daemon differently.
@@ -225,8 +216,14 @@ kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 	 * Complete the transaction by forwarding the key value
 	 * to the host. But first, cancel the timeout.
 	 */
-	if (cancel_delayed_work_sync(&kvp_work))
+	if (cancel_delayed_work_sync(&kvp_timeout_work)) {
 		kvp_respond_to_host(message, error);
+		kvp_transaction.state = HVUTIL_READY;
+		hv_poll_channel(kvp_transaction.kvp_context,
+				hv_kvp_onchannelcallback);
+	}
+
+	return 0;
 }
 
 
@@ -343,7 +340,6 @@ static void process_ib_ipinfo(void *in_msg, void *out_msg, int op)
 static void
 kvp_send_key(struct work_struct *dummy)
 {
-	struct cn_msg *msg;
 	struct hv_kvp_msg *message;
 	struct hv_kvp_msg *in_msg;
 	__u8 operation = kvp_transaction.kvp_msg->kvp_hdr.operation;
@@ -352,14 +348,14 @@ kvp_send_key(struct work_struct *dummy)
 	__u64 val64;
 	int rc;
 
-	msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg) , GFP_ATOMIC);
-	if (!msg)
+	/* The transaction state is wrong. */
+	if (kvp_transaction.state != HVUTIL_HOSTMSG_RECEIVED)
 		return;
 
-	msg->id.idx =  CN_KVP_IDX;
-	msg->id.val = CN_KVP_VAL;
+	message = kzalloc(sizeof(*message), GFP_KERNEL);
+	if (!message)
+		return;
 
-	message = (struct hv_kvp_msg *)msg->data;
 	message->kvp_hdr.operation = operation;
 	message->kvp_hdr.pool = pool;
 	in_msg = kvp_transaction.kvp_msg;
@@ -446,15 +442,17 @@ kvp_send_key(struct work_struct *dummy)
 			break;
 	}
 
-	msg->len = sizeof(struct hv_kvp_msg);
-	rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
+	kvp_transaction.state = HVUTIL_USERSPACE_REQ;
+	rc = hvutil_transport_send(hvt, message, sizeof(*message));
 	if (rc) {
 		pr_debug("KVP: failed to communicate to the daemon: %d\n", rc);
-		if (cancel_delayed_work_sync(&kvp_work))
+		if (cancel_delayed_work_sync(&kvp_timeout_work)) {
 			kvp_respond_to_host(message, HV_E_FAIL);
+			kvp_transaction.state = HVUTIL_READY;
+		}
 	}
 
-	kfree(msg);
+	kfree(message);
 
 	return;
 }
@@ -479,17 +477,6 @@ kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error)
 	int ret;
 
 	/*
-	 * If a transaction is not active; log and return.
-	 */
-
-	if (!kvp_transaction.active) {
-		/*
-		 * This is a spurious call!
-		 */
-		pr_warn("KVP: Transaction not active\n");
-		return;
-	}
-	/*
 	 * Copy the global state for completing the transaction. Note that
 	 * only one transaction can be active at a time.
 	 */
@@ -498,8 +485,6 @@ kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error)
 	channel = kvp_transaction.recv_channel;
 	req_id = kvp_transaction.recv_req_id;
 
-	kvp_transaction.active = false;
-
 	icmsghdrp = (struct icmsg_hdr *)
 			&recv_buffer[sizeof(struct vmbuspipe_hdr)];
 
@@ -586,7 +571,6 @@ response_done:
 
 	vmbus_sendpacket(channel, recv_buffer, buf_len, req_id,
 				VM_PKT_DATA_INBAND, 0);
-	poll_channel(channel);
 }
 
 /*
@@ -612,7 +596,7 @@ void hv_kvp_onchannelcallback(void *context)
 	int util_fw_version;
 	int kvp_srv_version;
 
-	if (kvp_transaction.active) {
+	if (kvp_transaction.state > HVUTIL_READY) {
 		/*
 		 * We will defer processing this callback once
 		 * the current transaction is complete.
@@ -620,6 +604,7 @@ void hv_kvp_onchannelcallback(void *context)
 		kvp_transaction.kvp_context = context;
 		return;
 	}
+	kvp_transaction.kvp_context = NULL;
 
 	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen,
 			 &requestid);
@@ -664,9 +649,15 @@ void hv_kvp_onchannelcallback(void *context)
 			kvp_transaction.recv_len = recvlen;
 			kvp_transaction.recv_channel = channel;
 			kvp_transaction.recv_req_id = requestid;
-			kvp_transaction.active = true;
 			kvp_transaction.kvp_msg = kvp_msg;
 
+			if (kvp_transaction.state < HVUTIL_READY) {
+				/* Userspace is not registered yet */
+				kvp_respond_to_host(NULL, HV_E_FAIL);
+				return;
+			}
+			kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
+
 			/*
 			 * Get the information from the
 			 * user-mode component.
@@ -677,7 +668,7 @@ void hv_kvp_onchannelcallback(void *context)
 			 * user-mode not responding.
 			 */
 			schedule_work(&kvp_sendkey_work);
-			schedule_delayed_work(&kvp_work, 5*HZ);
+			schedule_delayed_work(&kvp_timeout_work, 5*HZ);
 
 			return;
 
@@ -693,14 +684,16 @@ void hv_kvp_onchannelcallback(void *context)
 
 }
 
+static void kvp_on_reset(void)
+{
+	if (cancel_delayed_work_sync(&kvp_timeout_work))
+		kvp_respond_to_host(NULL, HV_E_FAIL);
+	kvp_transaction.state = HVUTIL_DEVICE_INIT;
+}
+
 int
 hv_kvp_init(struct hv_util_service *srv)
 {
-	int err;
-
-	err = cn_add_callback(&kvp_id, kvp_name, kvp_cn_callback);
-	if (err)
-		return err;
 	recv_buffer = srv->recv_buffer;
 
 	/*
@@ -709,14 +702,20 @@ hv_kvp_init(struct hv_util_service *srv)
 	 * Defer processing channel callbacks until the daemon
 	 * has registered.
 	 */
-	kvp_transaction.active = true;
+	kvp_transaction.state = HVUTIL_DEVICE_INIT;
+
+	hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL,
+				    kvp_on_msg, kvp_on_reset);
+	if (!hvt)
+		return -EFAULT;
 
 	return 0;
 }
 
 void hv_kvp_deinit(void)
 {
-	cn_del_callback(&kvp_id);
-	cancel_delayed_work_sync(&kvp_work);
+	kvp_transaction.state = HVUTIL_DEVICE_DYING;
+	cancel_delayed_work_sync(&kvp_timeout_work);
 	cancel_work_sync(&kvp_sendkey_work);
+	hvutil_transport_destroy(hvt);
 }
diff --git a/kernel/drivers/hv/hv_snapshot.c b/kernel/drivers/hv/hv_snapshot.c
index 9d5e0d1ef..815405f2e 100644
--- a/kernel/drivers/hv/hv_snapshot.c
+++ b/kernel/drivers/hv/hv_snapshot.c
@@ -24,6 +24,9 @@
 #include <linux/workqueue.h>
 #include <linux/hyperv.h>
 
+#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
+
 #define VSS_MAJOR  5
 #define VSS_MINOR  0
 #define VSS_VERSION    (VSS_MAJOR << 16 | VSS_MINOR)
@@ -31,28 +34,39 @@
 #define VSS_USERSPACE_TIMEOUT (msecs_to_jiffies(10 * 1000))
 
 /*
- * Global state maintained for transaction that is being processed.
- * Note that only one transaction can be active at any point in time.
+ * Global state maintained for transaction that is being processed. For a class
+ * of integration services, including the "VSS service", the specified protocol
+ * is a "request/response" protocol which means that there can only be single
+ * outstanding transaction from the host at any given point in time. We use
+ * this to simplify memory management in this driver - we cache and process
+ * only one message at a time.
  *
- * This state is set when we receive a request from the host; we
- * cleanup this state when the transaction is completed - when we respond
- * to the host with the key value.
+ * While the request/response protocol is guaranteed by the host, we further
+ * ensure this by serializing packet processing in this driver - we do not
+ * read additional packets from the VMBUs until the current packet is fully
+ * handled.
  */
 
 static struct {
-	bool active; /* transaction status - active or not */
+	int state;   /* hvutil_device_state */
 	int recv_len; /* number of bytes received. */
 	struct vmbus_channel *recv_channel; /* chn we got the request */
 	u64 recv_req_id; /* request ID. */
 	struct hv_vss_msg  *msg; /* current message */
+	void *vss_context; /* for the channel callback */
 } vss_transaction;
 
 
 static void vss_respond_to_host(int error);
 
-static struct cb_id vss_id = { CN_VSS_IDX, CN_VSS_VAL };
-static const char vss_name[] = "vss_kernel_module";
+/*
+ * This state maintains the version number registered by the daemon.
+ */
+static int dm_reg_value;
+
+static const char vss_devname[] = "vmbus/hv_vss";
 static __u8 *recv_buffer;
+static struct hvutil_transport *hvt;
 
 static void vss_send_op(struct work_struct *dummy);
 static void vss_timeout_func(struct work_struct *dummy);
@@ -71,25 +85,69 @@ static void vss_timeout_func(struct work_struct *dummy)
 	 */
 	pr_warn("VSS: timeout waiting for daemon to reply\n");
 	vss_respond_to_host(HV_E_FAIL);
+
+	/* Transaction is finished, reset the state. */
+	if (vss_transaction.state > HVUTIL_READY)
+		vss_transaction.state = HVUTIL_READY;
+
+	hv_poll_channel(vss_transaction.vss_context,
+			hv_vss_onchannelcallback);
 }
 
-static void
-vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+static int vss_handle_handshake(struct hv_vss_msg *vss_msg)
 {
-	struct hv_vss_msg *vss_msg;
+	u32 our_ver = VSS_OP_REGISTER1;
+
+	switch (vss_msg->vss_hdr.operation) {
+	case VSS_OP_REGISTER:
+		/* Daemon doesn't expect us to reply */
+		dm_reg_value = VSS_OP_REGISTER;
+		break;
+	case VSS_OP_REGISTER1:
+		/* Daemon expects us to reply with our own version*/
+		if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver)))
+			return -EFAULT;
+		dm_reg_value = VSS_OP_REGISTER1;
+		break;
+	default:
+		return -EINVAL;
+	}
+	vss_transaction.state = HVUTIL_READY;
+	pr_debug("VSS: userspace daemon ver. %d registered\n", dm_reg_value);
+	return 0;
+}
 
-	vss_msg = (struct hv_vss_msg *)msg->data;
+static int vss_on_msg(void *msg, int len)
+{
+	struct hv_vss_msg *vss_msg = (struct hv_vss_msg *)msg;
 
-	if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) {
-		pr_info("VSS daemon registered\n");
-		vss_transaction.active = false;
-		if (vss_transaction.recv_channel != NULL)
-			hv_vss_onchannelcallback(vss_transaction.recv_channel);
-		return;
+	if (len != sizeof(*vss_msg))
+		return -EINVAL;
 
+	if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER ||
+	    vss_msg->vss_hdr.operation == VSS_OP_REGISTER1) {
+		/*
+		 * Don't process registration messages if we're in the middle
+		 * of a transaction processing.
+		 */
+		if (vss_transaction.state > HVUTIL_READY)
+			return -EINVAL;
+		return vss_handle_handshake(vss_msg);
+	} else if (vss_transaction.state == HVUTIL_USERSPACE_REQ) {
+		vss_transaction.state = HVUTIL_USERSPACE_RECV;
+		if (cancel_delayed_work_sync(&vss_timeout_work)) {
+			vss_respond_to_host(vss_msg->error);
+			/* Transaction is finished, reset the state. */
+			vss_transaction.state = HVUTIL_READY;
+			hv_poll_channel(vss_transaction.vss_context,
+					hv_vss_onchannelcallback);
+		}
+	} else {
+		/* This is a spurious call! */
+		pr_warn("VSS: Transaction not active\n");
+		return -EINVAL;
 	}
-	if (cancel_delayed_work_sync(&vss_timeout_work))
-		vss_respond_to_host(vss_msg->error);
+	return 0;
 }
 
 
@@ -97,28 +155,29 @@ static void vss_send_op(struct work_struct *dummy)
 {
 	int op = vss_transaction.msg->vss_hdr.operation;
 	int rc;
-	struct cn_msg *msg;
 	struct hv_vss_msg *vss_msg;
 
-	msg = kzalloc(sizeof(*msg) + sizeof(*vss_msg), GFP_ATOMIC);
-	if (!msg)
+	/* The transaction state is wrong. */
+	if (vss_transaction.state != HVUTIL_HOSTMSG_RECEIVED)
 		return;
 
-	vss_msg = (struct hv_vss_msg *)msg->data;
-
-	msg->id.idx =  CN_VSS_IDX;
-	msg->id.val = CN_VSS_VAL;
+	vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL);
+	if (!vss_msg)
+		return;
 
 	vss_msg->vss_hdr.operation = op;
-	msg->len = sizeof(struct hv_vss_msg);
 
-	rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC);
+	vss_transaction.state = HVUTIL_USERSPACE_REQ;
+	rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg));
 	if (rc) {
 		pr_warn("VSS: failed to communicate to the daemon: %d\n", rc);
-		if (cancel_delayed_work_sync(&vss_timeout_work))
+		if (cancel_delayed_work_sync(&vss_timeout_work)) {
 			vss_respond_to_host(HV_E_FAIL);
+			vss_transaction.state = HVUTIL_READY;
+		}
 	}
-	kfree(msg);
+
+	kfree(vss_msg);
 
 	return;
 }
@@ -136,17 +195,6 @@ vss_respond_to_host(int error)
 	u64	req_id;
 
 	/*
-	 * If a transaction is not active; log and return.
-	 */
-
-	if (!vss_transaction.active) {
-		/*
-		 * This is a spurious call!
-		 */
-		pr_warn("VSS: Transaction not active\n");
-		return;
-	}
-	/*
 	 * Copy the global state for completing the transaction. Note that
 	 * only one transaction can be active at a time.
 	 */
@@ -154,7 +202,6 @@ vss_respond_to_host(int error)
 	buf_len = vss_transaction.recv_len;
 	channel = vss_transaction.recv_channel;
 	req_id = vss_transaction.recv_req_id;
-	vss_transaction.active = false;
 
 	icmsghdrp = (struct icmsg_hdr *)
 			&recv_buffer[sizeof(struct vmbuspipe_hdr)];
@@ -191,14 +238,15 @@ void hv_vss_onchannelcallback(void *context)
 	struct icmsg_hdr *icmsghdrp;
 	struct icmsg_negotiate *negop = NULL;
 
-	if (vss_transaction.active) {
+	if (vss_transaction.state > HVUTIL_READY) {
 		/*
 		 * We will defer processing this callback once
 		 * the current transaction is complete.
 		 */
-		vss_transaction.recv_channel = channel;
+		vss_transaction.vss_context = context;
 		return;
 	}
+	vss_transaction.vss_context = NULL;
 
 	vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
 			 &requestid);
@@ -224,7 +272,6 @@ void hv_vss_onchannelcallback(void *context)
 			vss_transaction.recv_len = recvlen;
 			vss_transaction.recv_channel = channel;
 			vss_transaction.recv_req_id = requestid;
-			vss_transaction.active = true;
 			vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
 
 			switch (vss_msg->vss_hdr.operation) {
@@ -241,6 +288,12 @@ void hv_vss_onchannelcallback(void *context)
 				 */
 			case VSS_OP_FREEZE:
 			case VSS_OP_THAW:
+				if (vss_transaction.state < HVUTIL_READY) {
+					/* Userspace is not registered yet */
+					vss_respond_to_host(HV_E_FAIL);
+					return;
+				}
+				vss_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
 				schedule_work(&vss_send_op_work);
 				schedule_delayed_work(&vss_timeout_work,
 						      VSS_USERSPACE_TIMEOUT);
@@ -275,14 +328,16 @@ void hv_vss_onchannelcallback(void *context)
 
 }
 
+static void vss_on_reset(void)
+{
+	if (cancel_delayed_work_sync(&vss_timeout_work))
+		vss_respond_to_host(HV_E_FAIL);
+	vss_transaction.state = HVUTIL_DEVICE_INIT;
+}
+
 int
 hv_vss_init(struct hv_util_service *srv)
 {
-	int err;
-
-	err = cn_add_callback(&vss_id, vss_name, vss_cn_callback);
-	if (err)
-		return err;
 	recv_buffer = srv->recv_buffer;
 
 	/*
@@ -291,13 +346,20 @@ hv_vss_init(struct hv_util_service *srv)
 	 * Defer processing channel callbacks until the daemon
 	 * has registered.
 	 */
-	vss_transaction.active = true;
+	vss_transaction.state = HVUTIL_DEVICE_INIT;
+
+	hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL,
+				    vss_on_msg, vss_on_reset);
+	if (!hvt)
+		return -EFAULT;
+
 	return 0;
 }
 
 void hv_vss_deinit(void)
 {
-	cn_del_callback(&vss_id);
+	vss_transaction.state = HVUTIL_DEVICE_DYING;
 	cancel_delayed_work_sync(&vss_timeout_work);
 	cancel_work_sync(&vss_send_op_work);
+	hvutil_transport_destroy(hvt);
 }
diff --git a/kernel/drivers/hv/hv_utils_transport.c b/kernel/drivers/hv/hv_utils_transport.c
new file mode 100644
index 000000000..6a9d80a53
--- /dev/null
+++ b/kernel/drivers/hv/hv_utils_transport.c
@@ -0,0 +1,276 @@
+/*
+ * Kernel/userspace transport abstraction for Hyper-V util driver.
+ *
+ * Copyright (C) 2015, Vitaly Kuznetsov <vkuznets@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+
+#include "hyperv_vmbus.h"
+#include "hv_utils_transport.h"
+
+static DEFINE_SPINLOCK(hvt_list_lock);
+static struct list_head hvt_list = LIST_HEAD_INIT(hvt_list);
+
+static void hvt_reset(struct hvutil_transport *hvt)
+{
+	mutex_lock(&hvt->outmsg_lock);
+	kfree(hvt->outmsg);
+	hvt->outmsg = NULL;
+	hvt->outmsg_len = 0;
+	mutex_unlock(&hvt->outmsg_lock);
+	if (hvt->on_reset)
+		hvt->on_reset();
+}
+
+static ssize_t hvt_op_read(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	struct hvutil_transport *hvt;
+	int ret;
+
+	hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+	if (wait_event_interruptible(hvt->outmsg_q, hvt->outmsg_len > 0))
+		return -EINTR;
+
+	mutex_lock(&hvt->outmsg_lock);
+	if (!hvt->outmsg) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
+	if (count < hvt->outmsg_len) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (!copy_to_user(buf, hvt->outmsg, hvt->outmsg_len))
+		ret = hvt->outmsg_len;
+	else
+		ret = -EFAULT;
+
+	kfree(hvt->outmsg);
+	hvt->outmsg = NULL;
+	hvt->outmsg_len = 0;
+
+out_unlock:
+	mutex_unlock(&hvt->outmsg_lock);
+	return ret;
+}
+
+static ssize_t hvt_op_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct hvutil_transport *hvt;
+	u8 *inmsg;
+
+	hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+	inmsg = kzalloc(count, GFP_KERNEL);
+	if (copy_from_user(inmsg, buf, count)) {
+		kfree(inmsg);
+		return -EFAULT;
+	}
+	if (hvt->on_msg(inmsg, count))
+		return -EFAULT;
+	kfree(inmsg);
+
+	return count;
+}
+
+static unsigned int hvt_op_poll(struct file *file, poll_table *wait)
+{
+	struct hvutil_transport *hvt;
+
+	hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+	poll_wait(file, &hvt->outmsg_q, wait);
+	if (hvt->outmsg_len > 0)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static int hvt_op_open(struct inode *inode, struct file *file)
+{
+	struct hvutil_transport *hvt;
+
+	hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+	/*
+	 * Switching to CHARDEV mode. We switch bach to INIT when device
+	 * gets released.
+	 */
+	if (hvt->mode == HVUTIL_TRANSPORT_INIT)
+		hvt->mode = HVUTIL_TRANSPORT_CHARDEV;
+	else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) {
+		/*
+		 * We're switching from netlink communication to using char
+		 * device. Issue the reset first.
+		 */
+		hvt_reset(hvt);
+		hvt->mode = HVUTIL_TRANSPORT_CHARDEV;
+	} else
+		return -EBUSY;
+
+	return 0;
+}
+
+static int hvt_op_release(struct inode *inode, struct file *file)
+{
+	struct hvutil_transport *hvt;
+
+	hvt = container_of(file->f_op, struct hvutil_transport, fops);
+
+	hvt->mode = HVUTIL_TRANSPORT_INIT;
+	/*
+	 * Cleanup message buffers to avoid spurious messages when the daemon
+	 * connects back.
+	 */
+	hvt_reset(hvt);
+
+	return 0;
+}
+
+static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
+{
+	struct hvutil_transport *hvt, *hvt_found = NULL;
+
+	spin_lock(&hvt_list_lock);
+	list_for_each_entry(hvt, &hvt_list, list) {
+		if (hvt->cn_id.idx == msg->id.idx &&
+		    hvt->cn_id.val == msg->id.val) {
+			hvt_found = hvt;
+			break;
+		}
+	}
+	spin_unlock(&hvt_list_lock);
+	if (!hvt_found) {
+		pr_warn("hvt_cn_callback: spurious message received!\n");
+		return;
+	}
+
+	/*
+	 * Switching to NETLINK mode. Switching to CHARDEV happens when someone
+	 * opens the device.
+	 */
+	if (hvt->mode == HVUTIL_TRANSPORT_INIT)
+		hvt->mode = HVUTIL_TRANSPORT_NETLINK;
+
+	if (hvt->mode == HVUTIL_TRANSPORT_NETLINK)
+		hvt_found->on_msg(msg->data, msg->len);
+	else
+		pr_warn("hvt_cn_callback: unexpected netlink message!\n");
+}
+
+int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len)
+{
+	struct cn_msg *cn_msg;
+	int ret = 0;
+
+	if (hvt->mode == HVUTIL_TRANSPORT_INIT) {
+		return -EINVAL;
+	} else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) {
+		cn_msg = kzalloc(sizeof(*cn_msg) + len, GFP_ATOMIC);
+		if (!cn_msg)
+			return -ENOMEM;
+		cn_msg->id.idx = hvt->cn_id.idx;
+		cn_msg->id.val = hvt->cn_id.val;
+		cn_msg->len = len;
+		memcpy(cn_msg->data, msg, len);
+		ret = cn_netlink_send(cn_msg, 0, 0, GFP_ATOMIC);
+		kfree(cn_msg);
+		return ret;
+	}
+	/* HVUTIL_TRANSPORT_CHARDEV */
+	mutex_lock(&hvt->outmsg_lock);
+	if (hvt->outmsg) {
+		/* Previous message wasn't received */
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+	hvt->outmsg = kzalloc(len, GFP_KERNEL);
+	memcpy(hvt->outmsg, msg, len);
+	hvt->outmsg_len = len;
+	wake_up_interruptible(&hvt->outmsg_q);
+out_unlock:
+	mutex_unlock(&hvt->outmsg_lock);
+	return ret;
+}
+
+struct hvutil_transport *hvutil_transport_init(const char *name,
+					       u32 cn_idx, u32 cn_val,
+					       int (*on_msg)(void *, int),
+					       void (*on_reset)(void))
+{
+	struct hvutil_transport *hvt;
+
+	hvt = kzalloc(sizeof(*hvt), GFP_KERNEL);
+	if (!hvt)
+		return NULL;
+
+	hvt->cn_id.idx = cn_idx;
+	hvt->cn_id.val = cn_val;
+
+	hvt->mdev.minor = MISC_DYNAMIC_MINOR;
+	hvt->mdev.name = name;
+
+	hvt->fops.owner = THIS_MODULE;
+	hvt->fops.read = hvt_op_read;
+	hvt->fops.write = hvt_op_write;
+	hvt->fops.poll = hvt_op_poll;
+	hvt->fops.open = hvt_op_open;
+	hvt->fops.release = hvt_op_release;
+
+	hvt->mdev.fops = &hvt->fops;
+
+	init_waitqueue_head(&hvt->outmsg_q);
+	mutex_init(&hvt->outmsg_lock);
+
+	spin_lock(&hvt_list_lock);
+	list_add(&hvt->list, &hvt_list);
+	spin_unlock(&hvt_list_lock);
+
+	hvt->on_msg = on_msg;
+	hvt->on_reset = on_reset;
+
+	if (misc_register(&hvt->mdev))
+		goto err_free_hvt;
+
+	/* Use cn_id.idx/cn_id.val to determine if we need to setup netlink */
+	if (hvt->cn_id.idx > 0 && hvt->cn_id.val > 0 &&
+	    cn_add_callback(&hvt->cn_id, name, hvt_cn_callback))
+		goto err_free_hvt;
+
+	return hvt;
+
+err_free_hvt:
+	kfree(hvt);
+	return NULL;
+}
+
+void hvutil_transport_destroy(struct hvutil_transport *hvt)
+{
+	spin_lock(&hvt_list_lock);
+	list_del(&hvt->list);
+	spin_unlock(&hvt_list_lock);
+	if (hvt->cn_id.idx > 0 && hvt->cn_id.val > 0)
+		cn_del_callback(&hvt->cn_id);
+	misc_deregister(&hvt->mdev);
+	kfree(hvt->outmsg);
+	kfree(hvt);
+}
diff --git a/kernel/drivers/hv/hv_utils_transport.h b/kernel/drivers/hv/hv_utils_transport.h
new file mode 100644
index 000000000..314c76ce1
--- /dev/null
+++ b/kernel/drivers/hv/hv_utils_transport.h
@@ -0,0 +1,51 @@
+/*
+ * Kernel/userspace transport abstraction for Hyper-V util driver.
+ *
+ * Copyright (C) 2015, Vitaly Kuznetsov <vkuznets@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#ifndef _HV_UTILS_TRANSPORT_H
+#define _HV_UTILS_TRANSPORT_H
+
+#include <linux/connector.h>
+#include <linux/miscdevice.h>
+
+enum hvutil_transport_mode {
+	HVUTIL_TRANSPORT_INIT = 0,
+	HVUTIL_TRANSPORT_NETLINK,
+	HVUTIL_TRANSPORT_CHARDEV,
+};
+
+struct hvutil_transport {
+	int mode;                           /* hvutil_transport_mode */
+	struct file_operations fops;        /* file operations */
+	struct miscdevice mdev;             /* misc device */
+	struct cb_id cn_id;                 /* CN_*_IDX/CN_*_VAL */
+	struct list_head list;              /* hvt_list */
+	int (*on_msg)(void *, int);         /* callback on new user message */
+	void (*on_reset)(void);             /* callback when userspace drops */
+	u8 *outmsg;                         /* message to the userspace */
+	int outmsg_len;                     /* its length */
+	wait_queue_head_t outmsg_q;         /* poll/read wait queue */
+	struct mutex outmsg_lock;           /* protects outmsg */
+};
+
+struct hvutil_transport *hvutil_transport_init(const char *name,
+					       u32 cn_idx, u32 cn_val,
+					       int (*on_msg)(void *, int),
+					       void (*on_reset)(void));
+int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len);
+void hvutil_transport_destroy(struct hvutil_transport *hvt);
+
+#endif /* _HV_UTILS_TRANSPORT_H */
diff --git a/kernel/drivers/hv/hyperv_vmbus.h b/kernel/drivers/hv/hyperv_vmbus.h
index 887287ad4..378263656 100644
--- a/kernel/drivers/hv/hyperv_vmbus.h
+++ b/kernel/drivers/hv/hyperv_vmbus.h
@@ -63,9 +63,6 @@ enum hv_cpuid_function {
 /* Define version of the synthetic interrupt controller. */
 #define HV_SYNIC_VERSION		(1)
 
-/* Define the expected SynIC version. */
-#define HV_SYNIC_VERSION_1		(0x1)
-
 /* Define synthetic interrupt controller message constants. */
 #define HV_MESSAGE_SIZE			(256)
 #define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
@@ -105,8 +102,6 @@ enum hv_message_type {
 	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
 };
 
-/* Define the number of synthetic interrupt sources. */
-#define HV_SYNIC_SINT_COUNT		(16)
 #define HV_SYNIC_STIMER_COUNT		(4)
 
 /* Define invalid partition identifier. */
@@ -141,7 +136,7 @@ struct hv_port_info {
 		struct {
 			u32 target_sint;
 			u32 target_vp;
-			u16 base_flag_bumber;
+			u16 base_flag_number;
 			u16 flag_count;
 			u32 rsvdz;
 		} event_port_info;
@@ -517,6 +512,7 @@ struct hv_context {
 	u64 guestid;
 
 	void *hypercall_page;
+	void *tsc_page;
 
 	bool synic_initialized;
 
@@ -551,10 +547,23 @@ struct hv_context {
 	 * Support PV clockevent device.
 	 */
 	struct clock_event_device *clk_evt[NR_CPUS];
+	/*
+	 * To manage allocations in a NUMA node.
+	 * Array indexed by numa node ID.
+	 */
+	struct cpumask *hv_numa_map;
 };
 
 extern struct hv_context hv_context;
 
+struct ms_hyperv_tsc_page {
+	volatile u32 tsc_sequence;
+	u32 reserved1;
+	volatile u64 tsc_scale;
+	volatile s64 tsc_offset;
+	u64 reserved2[509];
+};
+
 struct hv_ring_buffer_debug_info {
 	u32 current_interrupt_mask;
 	u32 current_read_index;
@@ -647,6 +656,7 @@ struct vmbus_connection {
 
 	atomic_t next_gpadl_handle;
 
+	struct completion  unload_event;
 	/*
 	 * Represents channel interrupts. Each bit position represents a
 	 * channel.  When a channel sends an interrupt via VMBUS, it finds its
@@ -730,9 +740,39 @@ int vmbus_set_event(struct vmbus_channel *channel);
 
 void vmbus_on_event(unsigned long data);
 
+int hv_kvp_init(struct hv_util_service *);
+void hv_kvp_deinit(void);
+void hv_kvp_onchannelcallback(void *);
+
+int hv_vss_init(struct hv_util_service *);
+void hv_vss_deinit(void);
+void hv_vss_onchannelcallback(void *);
+
 int hv_fcopy_init(struct hv_util_service *);
 void hv_fcopy_deinit(void);
 void hv_fcopy_onchannelcallback(void *);
+void vmbus_initiate_unload(void);
 
+static inline void hv_poll_channel(struct vmbus_channel *channel,
+				   void (*cb)(void *))
+{
+	if (!channel)
+		return;
+
+	if (channel->target_cpu != smp_processor_id())
+		smp_call_function_single(channel->target_cpu,
+					 cb, channel, true);
+	else
+		cb(channel);
+}
+
+enum hvutil_device_state {
+	HVUTIL_DEVICE_INIT = 0,  /* driver is loaded, waiting for userspace */
+	HVUTIL_READY,            /* userspace is registered */
+	HVUTIL_HOSTMSG_RECEIVED, /* message from the host was received */
+	HVUTIL_USERSPACE_REQ,    /* request to userspace was sent */
+	HVUTIL_USERSPACE_RECV,   /* reply from userspace was received */
+	HVUTIL_DEVICE_DYING,     /* driver unload is in progress */
+};
 
 #endif /* _HYPERV_VMBUS_H */
diff --git a/kernel/drivers/hv/ring_buffer.c b/kernel/drivers/hv/ring_buffer.c
index 6361d124f..70a1a9a22 100644
--- a/kernel/drivers/hv/ring_buffer.c
+++ b/kernel/drivers/hv/ring_buffer.c
@@ -103,10 +103,9 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi)
  *    there is room for the producer to send the pending packet.
  */
 
-static bool hv_need_to_signal_on_read(u32 old_rd,
-					 struct hv_ring_buffer_info *rbi)
+static bool hv_need_to_signal_on_read(u32 prev_write_sz,
+				      struct hv_ring_buffer_info *rbi)
 {
-	u32 prev_write_sz;
 	u32 cur_write_sz;
 	u32 r_size;
 	u32 write_loc = rbi->ring_buffer->write_index;
@@ -123,10 +122,6 @@ static bool hv_need_to_signal_on_read(u32 old_rd,
 	cur_write_sz = write_loc >= read_loc ? r_size - (write_loc - read_loc) :
 			read_loc - write_loc;
 
-	prev_write_sz = write_loc >= old_rd ? r_size - (write_loc - old_rd) :
-			old_rd - write_loc;
-
-
 	if ((prev_write_sz < pending_sz) && (cur_write_sz >= pending_sz))
 		return true;
 
@@ -517,7 +512,6 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer,
 	u32 next_read_location = 0;
 	u64 prev_indices = 0;
 	unsigned long flags;
-	u32 old_read;
 
 	if (buflen <= 0)
 		return -EINVAL;
@@ -528,8 +522,6 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer,
 				&bytes_avail_toread,
 				&bytes_avail_towrite);
 
-	old_read = bytes_avail_toread;
-
 	/* Make sure there is something to read */
 	if (bytes_avail_toread < buflen) {
 		spin_unlock_irqrestore(&inring_info->ring_lock, flags);
@@ -560,7 +552,7 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer,
 
 	spin_unlock_irqrestore(&inring_info->ring_lock, flags);
 
-	*signal = hv_need_to_signal_on_read(old_read, inring_info);
+	*signal = hv_need_to_signal_on_read(bytes_avail_towrite, inring_info);
 
 	return 0;
 }
diff --git a/kernel/drivers/hv/vmbus_drv.c b/kernel/drivers/hv/vmbus_drv.c
index c85235e9f..f19b6f7a4 100644
--- a/kernel/drivers/hv/vmbus_drv.c
+++ b/kernel/drivers/hv/vmbus_drv.c
@@ -39,6 +39,8 @@
 #include <asm/mshyperv.h>
 #include <linux/notifier.h>
 #include <linux/ptrace.h>
+#include <linux/screen_info.h>
+#include <linux/kdebug.h>
 #include "hyperv_vmbus.h"
 
 static struct acpi_device  *hv_acpi_dev;
@@ -48,12 +50,18 @@ static struct completion probe_event;
 static int irq;
 
 
-static int hyperv_panic_event(struct notifier_block *nb,
-			unsigned long event, void *ptr)
+static void hyperv_report_panic(struct pt_regs *regs)
 {
-	struct pt_regs *regs;
+	static bool panic_reported;
 
-	regs = current_pt_regs();
+	/*
+	 * We prefer to report panic on 'die' chain as we have proper
+	 * registers to report, but if we miss it (e.g. on BUG()) we need
+	 * to report it on 'panic'.
+	 */
+	if (panic_reported)
+		return;
+	panic_reported = true;
 
 	wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip);
 	wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax);
@@ -65,18 +73,37 @@ static int hyperv_panic_event(struct notifier_block *nb,
 	 * Let Hyper-V know there is crash data available
 	 */
 	wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY);
+}
+
+static int hyperv_panic_event(struct notifier_block *nb, unsigned long val,
+			      void *args)
+{
+	struct pt_regs *regs;
+
+	regs = current_pt_regs();
+
+	hyperv_report_panic(regs);
+	return NOTIFY_DONE;
+}
+
+static int hyperv_die_event(struct notifier_block *nb, unsigned long val,
+			    void *args)
+{
+	struct die_args *die = (struct die_args *)args;
+	struct pt_regs *regs = die->regs;
+
+	hyperv_report_panic(regs);
 	return NOTIFY_DONE;
 }
 
+static struct notifier_block hyperv_die_block = {
+	.notifier_call = hyperv_die_event,
+};
 static struct notifier_block hyperv_panic_block = {
 	.notifier_call = hyperv_panic_event,
 };
 
-struct resource hyperv_mmio = {
-	.name  = "hyperv mmio",
-	.flags = IORESOURCE_MEM,
-};
-EXPORT_SYMBOL_GPL(hyperv_mmio);
+struct resource *hyperv_mmio;
 
 static int vmbus_exists(void)
 {
@@ -414,6 +441,43 @@ static ssize_t in_write_bytes_avail_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(in_write_bytes_avail);
 
+static ssize_t channel_vp_mapping_show(struct device *dev,
+				       struct device_attribute *dev_attr,
+				       char *buf)
+{
+	struct hv_device *hv_dev = device_to_hv_device(dev);
+	struct vmbus_channel *channel = hv_dev->channel, *cur_sc;
+	unsigned long flags;
+	int buf_size = PAGE_SIZE, n_written, tot_written;
+	struct list_head *cur;
+
+	if (!channel)
+		return -ENODEV;
+
+	tot_written = snprintf(buf, buf_size, "%u:%u\n",
+		channel->offermsg.child_relid, channel->target_cpu);
+
+	spin_lock_irqsave(&channel->lock, flags);
+
+	list_for_each(cur, &channel->sc_list) {
+		if (tot_written >= buf_size - 1)
+			break;
+
+		cur_sc = list_entry(cur, struct vmbus_channel, sc_list);
+		n_written = scnprintf(buf + tot_written,
+				     buf_size - tot_written,
+				     "%u:%u\n",
+				     cur_sc->offermsg.child_relid,
+				     cur_sc->target_cpu);
+		tot_written += n_written;
+	}
+
+	spin_unlock_irqrestore(&channel->lock, flags);
+
+	return tot_written;
+}
+static DEVICE_ATTR_RO(channel_vp_mapping);
+
 /* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */
 static struct attribute *vmbus_attrs[] = {
 	&dev_attr_id.attr,
@@ -438,6 +502,7 @@ static struct attribute *vmbus_attrs[] = {
 	&dev_attr_in_write_index.attr,
 	&dev_attr_in_read_bytes_avail.attr,
 	&dev_attr_in_write_bytes_avail.attr,
+	&dev_attr_channel_vp_mapping.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(vmbus);
@@ -763,38 +828,6 @@ static void vmbus_isr(void)
 	}
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int hyperv_cpu_disable(void)
-{
-	return -ENOSYS;
-}
-
-static void hv_cpu_hotplug_quirk(bool vmbus_loaded)
-{
-	static void *previous_cpu_disable;
-
-	/*
-	 * Offlining a CPU when running on newer hypervisors (WS2012R2, Win8,
-	 * ...) is not supported at this moment as channel interrupts are
-	 * distributed across all of them.
-	 */
-
-	if ((vmbus_proto_version == VERSION_WS2008) ||
-	    (vmbus_proto_version == VERSION_WIN7))
-		return;
-
-	if (vmbus_loaded) {
-		previous_cpu_disable = smp_ops.cpu_disable;
-		smp_ops.cpu_disable = hyperv_cpu_disable;
-		pr_notice("CPU offlining is not supported by hypervisor\n");
-	} else if (previous_cpu_disable)
-		smp_ops.cpu_disable = previous_cpu_disable;
-}
-#else
-static void hv_cpu_hotplug_quirk(bool vmbus_loaded)
-{
-}
-#endif
 
 /*
  * vmbus_bus_init -Main vmbus driver initialization routine.
@@ -836,12 +869,14 @@ static int vmbus_bus_init(int irq)
 	if (ret)
 		goto err_alloc;
 
-	hv_cpu_hotplug_quirk(true);
+	if (vmbus_proto_version > VERSION_WIN7)
+		cpu_hotplug_disable();
 
 	/*
 	 * Only register if the crash MSRs are available
 	 */
-	if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+		register_die_notifier(&hyperv_die_block);
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &hyperv_panic_block);
 	}
@@ -863,8 +898,8 @@ err_cleanup:
 }
 
 /**
- * __vmbus_child_driver_register - Register a vmbus's driver
- * @drv: Pointer to driver structure you want to register
+ * __vmbus_child_driver_register() - Register a vmbus's driver
+ * @hv_driver: Pointer to driver structure you want to register
  * @owner: owner module of the drv
  * @mod_name: module name string
  *
@@ -896,7 +931,8 @@ EXPORT_SYMBOL_GPL(__vmbus_driver_register);
 
 /**
  * vmbus_driver_unregister() - Unregister a vmbus's driver
- * @drv: Pointer to driver structure you want to un-register
+ * @hv_driver: Pointer to driver structure you want to
+ *             un-register
  *
  * Un-register the given driver that was previous registered with a call to
  * vmbus_driver_register()
@@ -982,30 +1018,184 @@ void vmbus_device_unregister(struct hv_device *device_obj)
 
 
 /*
- * VMBUS is an acpi enumerated device. Get the the information we
+ * VMBUS is an acpi enumerated device. Get the information we
  * need from DSDT.
  */
-
+#define VTPM_BASE_ADDRESS 0xfed40000
 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
 {
+	resource_size_t start = 0;
+	resource_size_t end = 0;
+	struct resource *new_res;
+	struct resource **old_res = &hyperv_mmio;
+	struct resource **prev_res = NULL;
+
 	switch (res->type) {
 	case ACPI_RESOURCE_TYPE_IRQ:
 		irq = res->data.irq.interrupts[0];
+		return AE_OK;
+
+	/*
+	 * "Address" descriptors are for bus windows. Ignore
+	 * "memory" descriptors, which are for registers on
+	 * devices.
+	 */
+	case ACPI_RESOURCE_TYPE_ADDRESS32:
+		start = res->data.address32.address.minimum;
+		end = res->data.address32.address.maximum;
 		break;
 
 	case ACPI_RESOURCE_TYPE_ADDRESS64:
-		hyperv_mmio.start = res->data.address64.address.minimum;
-		hyperv_mmio.end = res->data.address64.address.maximum;
+		start = res->data.address64.address.minimum;
+		end = res->data.address64.address.maximum;
 		break;
+
+	default:
+		/* Unused resource type */
+		return AE_OK;
+
 	}
+	/*
+	 * Ignore ranges that are below 1MB, as they're not
+	 * necessary or useful here.
+	 */
+	if (end < 0x100000)
+		return AE_OK;
+
+	new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC);
+	if (!new_res)
+		return AE_NO_MEMORY;
+
+	/* If this range overlaps the virtual TPM, truncate it. */
+	if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
+		end = VTPM_BASE_ADDRESS;
+
+	new_res->name = "hyperv mmio";
+	new_res->flags = IORESOURCE_MEM;
+	new_res->start = start;
+	new_res->end = end;
+
+	do {
+		if (!*old_res) {
+			*old_res = new_res;
+			break;
+		}
+
+		if ((*old_res)->end < new_res->start) {
+			new_res->sibling = *old_res;
+			if (prev_res)
+				(*prev_res)->sibling = new_res;
+			*old_res = new_res;
+			break;
+		}
+
+		prev_res = old_res;
+		old_res = &(*old_res)->sibling;
+
+	} while (1);
 
 	return AE_OK;
 }
 
+static int vmbus_acpi_remove(struct acpi_device *device)
+{
+	struct resource *cur_res;
+	struct resource *next_res;
+
+	if (hyperv_mmio) {
+		for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) {
+			next_res = cur_res->sibling;
+			kfree(cur_res);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * vmbus_allocate_mmio() - Pick a memory-mapped I/O range.
+ * @new:		If successful, supplied a pointer to the
+ *			allocated MMIO space.
+ * @device_obj:		Identifies the caller
+ * @min:		Minimum guest physical address of the
+ *			allocation
+ * @max:		Maximum guest physical address
+ * @size:		Size of the range to be allocated
+ * @align:		Alignment of the range to be allocated
+ * @fb_overlap_ok:	Whether this allocation can be allowed
+ *			to overlap the video frame buffer.
+ *
+ * This function walks the resources granted to VMBus by the
+ * _CRS object in the ACPI namespace underneath the parent
+ * "bridge" whether that's a root PCI bus in the Generation 1
+ * case or a Module Device in the Generation 2 case.  It then
+ * attempts to allocate from the global MMIO pool in a way that
+ * matches the constraints supplied in these parameters and by
+ * that _CRS.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
+			resource_size_t min, resource_size_t max,
+			resource_size_t size, resource_size_t align,
+			bool fb_overlap_ok)
+{
+	struct resource *iter;
+	resource_size_t range_min, range_max, start, local_min, local_max;
+	const char *dev_n = dev_name(&device_obj->device);
+	u32 fb_end = screen_info.lfb_base + (screen_info.lfb_size << 1);
+	int i;
+
+	for (iter = hyperv_mmio; iter; iter = iter->sibling) {
+		if ((iter->start >= max) || (iter->end <= min))
+			continue;
+
+		range_min = iter->start;
+		range_max = iter->end;
+
+		/* If this range overlaps the frame buffer, split it into
+		   two tries. */
+		for (i = 0; i < 2; i++) {
+			local_min = range_min;
+			local_max = range_max;
+			if (fb_overlap_ok || (range_min >= fb_end) ||
+			    (range_max <= screen_info.lfb_base)) {
+				i++;
+			} else {
+				if ((range_min <= screen_info.lfb_base) &&
+				    (range_max >= screen_info.lfb_base)) {
+					/*
+					 * The frame buffer is in this window,
+					 * so trim this into the part that
+					 * preceeds the frame buffer.
+					 */
+					local_max = screen_info.lfb_base - 1;
+					range_min = fb_end;
+				} else {
+					range_min = fb_end;
+					continue;
+				}
+			}
+
+			start = (local_min + align - 1) & ~(align - 1);
+			for (; start + size - 1 <= local_max; start += align) {
+				*new = request_mem_region_exclusive(start, size,
+								    dev_n);
+				if (*new)
+					return 0;
+			}
+		}
+	}
+
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
+
 static int vmbus_acpi_add(struct acpi_device *device)
 {
 	acpi_status result;
 	int ret_val = -ENODEV;
+	struct acpi_device *ancestor;
 
 	hv_acpi_dev = device;
 
@@ -1015,23 +1205,24 @@ static int vmbus_acpi_add(struct acpi_device *device)
 	if (ACPI_FAILURE(result))
 		goto acpi_walk_err;
 	/*
-	 * The parent of the vmbus acpi device (Gen2 firmware) is the VMOD that
-	 * has the mmio ranges. Get that.
+	 * Some ancestor of the vmbus acpi device (Gen1 or Gen2
+	 * firmware) is the VMOD that has the mmio ranges. Get that.
 	 */
-	if (device->parent) {
-		result = acpi_walk_resources(device->parent->handle,
-					METHOD_NAME__CRS,
-					vmbus_walk_resources, NULL);
+	for (ancestor = device->parent; ancestor; ancestor = ancestor->parent) {
+		result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
+					     vmbus_walk_resources, NULL);
 
 		if (ACPI_FAILURE(result))
-			goto acpi_walk_err;
-		if (hyperv_mmio.start && hyperv_mmio.end)
-			request_resource(&iomem_resource, &hyperv_mmio);
+			continue;
+		if (hyperv_mmio)
+			break;
 	}
 	ret_val = 0;
 
 acpi_walk_err:
 	complete(&probe_event);
+	if (ret_val)
+		vmbus_acpi_remove(device);
 	return ret_val;
 }
 
@@ -1047,9 +1238,33 @@ static struct acpi_driver vmbus_acpi_driver = {
 	.ids = vmbus_acpi_device_ids,
 	.ops = {
 		.add = vmbus_acpi_add,
+		.remove = vmbus_acpi_remove,
 	},
 };
 
+static void hv_kexec_handler(void)
+{
+	int cpu;
+
+	hv_synic_clockevents_cleanup();
+	vmbus_initiate_unload();
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1);
+	hv_cleanup();
+};
+
+static void hv_crash_handler(struct pt_regs *regs)
+{
+	vmbus_initiate_unload();
+	/*
+	 * In crash handler we can't schedule synic cleanup for all CPUs,
+	 * doing the cleanup for current CPU only. This should be sufficient
+	 * for kdump.
+	 */
+	hv_synic_cleanup(NULL);
+	hv_cleanup();
+};
+
 static int __init hv_acpi_init(void)
 {
 	int ret, t;
@@ -1082,6 +1297,9 @@ static int __init hv_acpi_init(void)
 	if (ret)
 		goto cleanup;
 
+	hv_setup_kexec_handler(hv_kexec_handler);
+	hv_setup_crash_handler(hv_crash_handler);
+
 	return 0;
 
 cleanup:
@@ -1094,17 +1312,29 @@ static void __exit vmbus_exit(void)
 {
 	int cpu;
 
+	hv_remove_kexec_handler();
+	hv_remove_crash_handler();
 	vmbus_connection.conn_state = DISCONNECTED;
 	hv_synic_clockevents_cleanup();
+	vmbus_disconnect();
 	hv_remove_vmbus_irq();
+	tasklet_kill(&msg_dpc);
 	vmbus_free_channels();
+	if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
+		unregister_die_notifier(&hyperv_die_block);
+		atomic_notifier_chain_unregister(&panic_notifier_list,
+						 &hyperv_panic_block);
+	}
 	bus_unregister(&hv_bus);
 	hv_cleanup();
-	for_each_online_cpu(cpu)
+	for_each_online_cpu(cpu) {
+		tasklet_kill(hv_context.event_dpc[cpu]);
 		smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1);
+	}
+	hv_synic_free();
 	acpi_bus_unregister_driver(&vmbus_acpi_driver);
-	hv_cpu_hotplug_quirk(false);
-	vmbus_disconnect();
+	if (vmbus_proto_version > VERSION_WIN7)
+		cpu_hotplug_enable();
 }
author	José Pekkarinen <jose.pekkarinen@nokia.com>	2016-04-11 10:41:07 +0300
committer	José Pekkarinen <jose.pekkarinen@nokia.com>	2016-04-13 08:17:18 +0300
commit	e09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
tree	d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/drivers/hv
parent	f93b97fd65072de626c074dbe099a1fff05ce060 (diff)