diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/drivers/hv | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
Collisions have been removed because its logic was found on the
source already.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/drivers/hv')
-rw-r--r-- | kernel/drivers/hv/Makefile | 2 | ||||
-rw-r--r-- | kernel/drivers/hv/channel.c | 49 | ||||
-rw-r--r-- | kernel/drivers/hv/channel_mgmt.c | 195 | ||||
-rw-r--r-- | kernel/drivers/hv/connection.c | 13 | ||||
-rw-r--r-- | kernel/drivers/hv/hv.c | 152 | ||||
-rw-r--r-- | kernel/drivers/hv/hv_balloon.c | 30 | ||||
-rw-r--r-- | kernel/drivers/hv/hv_fcopy.c | 286 | ||||
-rw-r--r-- | kernel/drivers/hv/hv_kvp.c | 193 | ||||
-rw-r--r-- | kernel/drivers/hv/hv_snapshot.c | 168 | ||||
-rw-r--r-- | kernel/drivers/hv/hv_utils_transport.c | 276 | ||||
-rw-r--r-- | kernel/drivers/hv/hv_utils_transport.h | 51 | ||||
-rw-r--r-- | kernel/drivers/hv/hyperv_vmbus.h | 52 | ||||
-rw-r--r-- | kernel/drivers/hv/ring_buffer.c | 14 | ||||
-rw-r--r-- | kernel/drivers/hv/vmbus_drv.c | 354 |
14 files changed, 1316 insertions, 519 deletions
diff --git a/kernel/drivers/hv/Makefile b/kernel/drivers/hv/Makefile index 5e4dfa4cf..39c9b2c08 100644 --- a/kernel/drivers/hv/Makefile +++ b/kernel/drivers/hv/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o -hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o diff --git a/kernel/drivers/hv/channel.c b/kernel/drivers/hv/channel.c index 54da66dc7..9098f13f2 100644 --- a/kernel/drivers/hv/channel.c +++ b/kernel/drivers/hv/channel.c @@ -73,6 +73,7 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, unsigned long flags; int ret, err = 0; unsigned long t; + struct page *page; spin_lock_irqsave(&newchannel->lock, flags); if (newchannel->state == CHANNEL_OPEN_STATE) { @@ -87,8 +88,17 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, newchannel->channel_callback_context = context; /* Allocate the ring buffer */ - out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(send_ringbuffer_size + recv_ringbuffer_size)); + page = alloc_pages_node(cpu_to_node(newchannel->target_cpu), + GFP_KERNEL|__GFP_ZERO, + get_order(send_ringbuffer_size + + recv_ringbuffer_size)); + + if (!page) + out = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, + get_order(send_ringbuffer_size + + recv_ringbuffer_size)); + else + out = (void *)page_address(page); if (!out) { err = -ENOMEM; @@ -178,19 +188,18 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, goto error1; } - - if (open_info->response.open_result.status) - err = open_info->response.open_result.status; - spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&open_info->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - if (err == 0) - newchannel->state = CHANNEL_OPENED_STATE; + if (open_info->response.open_result.status) { + err = -EAGAIN; + goto error_gpadl; + } + newchannel->state = CHANNEL_OPENED_STATE; kfree(open_info); - return err; + return 0; error1: spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); @@ -592,6 +601,7 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, u64 aligned_data = 0; int ret; bool signal = false; + int num_vecs = ((bufferlen != 0) ? 3 : 1); /* Setup the descriptor */ @@ -609,7 +619,8 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - ret = hv_ringbuffer_write(&channel->outbound, bufferlist, 3, &signal); + ret = hv_ringbuffer_write(&channel->outbound, bufferlist, num_vecs, + &signal); /* * Signalling the host is conditional on many factors: @@ -619,10 +630,19 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, * on the ring. We will not signal if more data is * to be placed. * + * Based on the channel signal state, we will decide + * which signaling policy will be applied. + * * If we cannot write to the ring-buffer; signal the host * even if we may not have written anything. This is a rare * enough condition that it should not matter. */ + + if (channel->signal_policy) + signal = true; + else + kick_q = true; + if (((ret == 0) && kick_q && signal) || (ret)) vmbus_setevent(channel); @@ -722,10 +742,19 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, * on the ring. We will not signal if more data is * to be placed. * + * Based on the channel signal state, we will decide + * which signaling policy will be applied. + * * If we cannot write to the ring-buffer; signal the host * even if we may not have written anything. This is a rare * enough condition that it should not matter. */ + + if (channel->signal_policy) + signal = true; + else + kick_q = true; + if (((ret == 0) && kick_q && signal) || (ret)) vmbus_setevent(channel); diff --git a/kernel/drivers/hv/channel_mgmt.c b/kernel/drivers/hv/channel_mgmt.c index 0eeb1b3bc..652afd11a 100644 --- a/kernel/drivers/hv/channel_mgmt.c +++ b/kernel/drivers/hv/channel_mgmt.c @@ -32,6 +32,9 @@ #include "hyperv_vmbus.h" +static void init_vp_index(struct vmbus_channel *channel, + const uuid_le *type_guid); + /** * vmbus_prep_negotiate_resp() - Create default response for Hyper-V Negotiate message * @icmsghdrp: Pointer to msg header structure @@ -201,22 +204,38 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) spin_lock_irqsave(&vmbus_connection.channel_lock, flags); list_del(&channel->listentry); spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); + + primary_channel = channel; } else { primary_channel = channel->primary_channel; spin_lock_irqsave(&primary_channel->lock, flags); list_del(&channel->sc_list); + primary_channel->num_sc--; spin_unlock_irqrestore(&primary_channel->lock, flags); } + + /* + * We need to free the bit for init_vp_index() to work in the case + * of sub-channel, when we reload drivers like hv_netvsc. + */ + cpumask_clear_cpu(channel->target_cpu, + &primary_channel->alloced_cpus_in_node); + free_channel(channel); } void vmbus_free_channels(void) { - struct vmbus_channel *channel; + struct vmbus_channel *channel, *tmp; + + list_for_each_entry_safe(channel, tmp, &vmbus_connection.chn_list, + listentry) { + /* if we don't set rescind to true, vmbus_close_internal() + * won't invoke hv_process_channel_removal(). + */ + channel->rescind = true; - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { vmbus_device_unregister(channel->device_obj); - free_channel(channel); } } @@ -228,7 +247,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) { struct vmbus_channel *channel; bool fnew = true; - bool enq = false; unsigned long flags; /* Make sure this is a new offer */ @@ -244,25 +262,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) } } - if (fnew) { + if (fnew) list_add_tail(&newchannel->listentry, &vmbus_connection.chn_list); - enq = true; - } spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); - if (enq) { - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_enq, - newchannel, true); - } else { - percpu_channel_enq(newchannel); - put_cpu(); - } - } if (!fnew) { /* * Check to see if this is a sub-channel. @@ -274,27 +279,22 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) newchannel->primary_channel = channel; spin_lock_irqsave(&channel->lock, flags); list_add_tail(&newchannel->sc_list, &channel->sc_list); - spin_unlock_irqrestore(&channel->lock, flags); - - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_enq, - newchannel, true); - } else { - percpu_channel_enq(newchannel); - put_cpu(); - } - - newchannel->state = CHANNEL_OPEN_STATE; channel->num_sc++; - if (channel->sc_creation_callback != NULL) - channel->sc_creation_callback(newchannel); + spin_unlock_irqrestore(&channel->lock, flags); + } else + goto err_free_chan; + } - return; - } + init_vp_index(newchannel, &newchannel->offermsg.offer.if_type); - goto err_free_chan; + if (newchannel->target_cpu != get_cpu()) { + put_cpu(); + smp_call_function_single(newchannel->target_cpu, + percpu_channel_enq, + newchannel, true); + } else { + percpu_channel_enq(newchannel); + put_cpu(); } /* @@ -304,6 +304,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) */ newchannel->state = CHANNEL_OPEN_STATE; + if (!fnew) { + if (channel->sc_creation_callback != NULL) + channel->sc_creation_callback(newchannel); + return; + } + /* * Start the process of binding this offer to the driver * We need to set the DeviceObject field before calling @@ -351,6 +357,7 @@ enum { IDE = 0, SCSI, NIC, + ND_NIC, MAX_PERF_CHN, }; @@ -374,23 +381,28 @@ static const struct hv_vmbus_device_id hp_devs[] = { /* * We use this state to statically distribute the channel interrupt load. */ -static u32 next_vp; +static int next_numa_node_id; /* * Starting with Win8, we can statically distribute the incoming - * channel interrupt load by binding a channel to VCPU. We - * implement here a simple round robin scheme for distributing - * the interrupt load. - * We will bind channels that are not performance critical to cpu 0 and - * performance critical channels (IDE, SCSI and Network) will be uniformly - * distributed across all available CPUs. + * channel interrupt load by binding a channel to VCPU. + * We do this in a hierarchical fashion: + * First distribute the primary channels across available NUMA nodes + * and then distribute the subchannels amongst the CPUs in the NUMA + * node assigned to the primary channel. + * + * For pre-win8 hosts or non-performance critical channels we assign the + * first CPU in the first NUMA node. */ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid) { u32 cur_cpu; int i; bool perf_chn = false; - u32 max_cpus = num_online_cpus(); + struct vmbus_channel *primary = channel->primary_channel; + int next_node; + struct cpumask available_mask; + struct cpumask *alloced_mask; for (i = IDE; i < MAX_PERF_CHN; i++) { if (!memcmp(type_guid->b, hp_devs[i].guid, @@ -407,16 +419,104 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui * Also if the channel is not a performance critical * channel, bind it to cpu 0. */ + channel->numa_node = 0; channel->target_cpu = 0; - channel->target_vp = 0; + channel->target_vp = hv_context.vp_index[0]; return; } - cur_cpu = (++next_vp % max_cpus); + + /* + * We distribute primary channels evenly across all the available + * NUMA nodes and within the assigned NUMA node we will assign the + * first available CPU to the primary channel. + * The sub-channels will be assigned to the CPUs available in the + * NUMA node evenly. + */ + if (!primary) { + while (true) { + next_node = next_numa_node_id++; + if (next_node == nr_node_ids) + next_node = next_numa_node_id = 0; + if (cpumask_empty(cpumask_of_node(next_node))) + continue; + break; + } + channel->numa_node = next_node; + primary = channel; + } + alloced_mask = &hv_context.hv_numa_map[primary->numa_node]; + + if (cpumask_weight(alloced_mask) == + cpumask_weight(cpumask_of_node(primary->numa_node))) { + /* + * We have cycled through all the CPUs in the node; + * reset the alloced map. + */ + cpumask_clear(alloced_mask); + } + + cpumask_xor(&available_mask, alloced_mask, + cpumask_of_node(primary->numa_node)); + + cur_cpu = -1; + while (true) { + cur_cpu = cpumask_next(cur_cpu, &available_mask); + if (cur_cpu >= nr_cpu_ids) { + cur_cpu = -1; + cpumask_copy(&available_mask, + cpumask_of_node(primary->numa_node)); + continue; + } + + /* + * NOTE: in the case of sub-channel, we clear the sub-channel + * related bit(s) in primary->alloced_cpus_in_node in + * hv_process_channel_removal(), so when we reload drivers + * like hv_netvsc in SMP guest, here we're able to re-allocate + * bit from primary->alloced_cpus_in_node. + */ + if (!cpumask_test_cpu(cur_cpu, + &primary->alloced_cpus_in_node)) { + cpumask_set_cpu(cur_cpu, + &primary->alloced_cpus_in_node); + cpumask_set_cpu(cur_cpu, alloced_mask); + break; + } + } + channel->target_cpu = cur_cpu; channel->target_vp = hv_context.vp_index[cur_cpu]; } /* + * vmbus_unload_response - Handler for the unload response. + */ +static void vmbus_unload_response(struct vmbus_channel_message_header *hdr) +{ + /* + * This is a global event; just wakeup the waiting thread. + * Once we successfully unload, we can cleanup the monitor state. + */ + complete(&vmbus_connection.unload_event); +} + +void vmbus_initiate_unload(void) +{ + struct vmbus_channel_message_header hdr; + + /* Pre-Win2012R2 hosts don't support reconnect */ + if (vmbus_proto_version < VERSION_WIN8_1) + return; + + init_completion(&vmbus_connection.unload_event); + memset(&hdr, 0, sizeof(struct vmbus_channel_message_header)); + hdr.msgtype = CHANNELMSG_UNLOAD; + vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header)); + + wait_for_completion(&vmbus_connection.unload_event); +} + +/* * vmbus_onoffer - Handler for channel offers from vmbus in parent partition. * */ @@ -461,8 +561,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) offer->connection_id; } - init_vp_index(newchannel, &offer->offer.if_type); - memcpy(&newchannel->offermsg, offer, sizeof(struct vmbus_channel_offer_channel)); newchannel->monitor_grp = (u8)offer->monitorid / 32; @@ -712,6 +810,7 @@ struct vmbus_channel_message_table_entry {CHANNELMSG_INITIATE_CONTACT, 0, NULL}, {CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response}, {CHANNELMSG_UNLOAD, 0, NULL}, + {CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response}, }; /* diff --git a/kernel/drivers/hv/connection.c b/kernel/drivers/hv/connection.c index b27220a42..4fc2e8836 100644 --- a/kernel/drivers/hv/connection.c +++ b/kernel/drivers/hv/connection.c @@ -58,6 +58,9 @@ static __u32 vmbus_get_next_version(__u32 current_version) case (VERSION_WIN8_1): return VERSION_WIN8; + case (VERSION_WIN10): + return VERSION_WIN8_1; + case (VERSION_WS2008): default: return VERSION_INVAL; @@ -80,7 +83,7 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); - if (version == VERSION_WIN8_1) { + if (version >= VERSION_WIN8_1) { msg->target_vcpu = hv_context.vp_index[get_cpu()]; put_cpu(); } @@ -227,6 +230,11 @@ cleanup: void vmbus_disconnect(void) { + /* + * First send the unload request to the host. + */ + vmbus_initiate_unload(); + if (vmbus_connection.work_queue) { drain_workqueue(vmbus_connection.work_queue); destroy_workqueue(vmbus_connection.work_queue); @@ -371,8 +379,7 @@ void vmbus_on_event(unsigned long data) int cpu = smp_processor_id(); union hv_synic_event_flags *event; - if ((vmbus_proto_version == VERSION_WS2008) || - (vmbus_proto_version == VERSION_WIN7)) { + if (vmbus_proto_version < VERSION_WIN8) { maxdword = MAX_NUM_CHANNELS_SUPPORTED >> 5; recv_int_page = vmbus_connection.recv_int_page; } else { diff --git a/kernel/drivers/hv/hv.c b/kernel/drivers/hv/hv.c index d3943bcee..6341be873 100644 --- a/kernel/drivers/hv/hv.c +++ b/kernel/drivers/hv/hv.c @@ -93,11 +93,14 @@ static int query_hypervisor_info(void) */ static u64 do_hypercall(u64 control, void *input, void *output) { -#ifdef CONFIG_X86_64 - u64 hv_status = 0; u64 input_address = (input) ? virt_to_phys(input) : 0; u64 output_address = (output) ? virt_to_phys(output) : 0; void *hypercall_page = hv_context.hypercall_page; +#ifdef CONFIG_X86_64 + u64 hv_status = 0; + + if (!hypercall_page) + return (u64)ULLONG_MAX; __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); __asm__ __volatile__("call *%3" : "=a" (hv_status) : @@ -112,13 +115,13 @@ static u64 do_hypercall(u64 control, void *input, void *output) u32 control_lo = control & 0xFFFFFFFF; u32 hv_status_hi = 1; u32 hv_status_lo = 1; - u64 input_address = (input) ? virt_to_phys(input) : 0; u32 input_address_hi = input_address >> 32; u32 input_address_lo = input_address & 0xFFFFFFFF; - u64 output_address = (output) ? virt_to_phys(output) : 0; u32 output_address_hi = output_address >> 32; u32 output_address_lo = output_address & 0xFFFFFFFF; - void *hypercall_page = hv_context.hypercall_page; + + if (!hypercall_page) + return (u64)ULLONG_MAX; __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), "=a"(hv_status_lo) : "d" (control_hi), @@ -130,6 +133,56 @@ static u64 do_hypercall(u64 control, void *input, void *output) #endif /* !x86_64 */ } +#ifdef CONFIG_X86_64 +static cycle_t read_hv_clock_tsc(struct clocksource *arg) +{ + cycle_t current_tick; + struct ms_hyperv_tsc_page *tsc_pg = hv_context.tsc_page; + + if (tsc_pg->tsc_sequence != -1) { + /* + * Use the tsc page to compute the value. + */ + + while (1) { + cycle_t tmp; + u32 sequence = tsc_pg->tsc_sequence; + u64 cur_tsc; + u64 scale = tsc_pg->tsc_scale; + s64 offset = tsc_pg->tsc_offset; + + rdtscll(cur_tsc); + /* current_tick = ((cur_tsc *scale) >> 64) + offset */ + asm("mulq %3" + : "=d" (current_tick), "=a" (tmp) + : "a" (cur_tsc), "r" (scale)); + + current_tick += offset; + if (tsc_pg->tsc_sequence == sequence) + return current_tick; + + if (tsc_pg->tsc_sequence != -1) + continue; + /* + * Fallback using MSR method. + */ + break; + } + } + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs_tsc = { + .name = "hyperv_clocksource_tsc_page", + .rating = 425, + .read = read_hv_clock_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; +#endif + + /* * hv_init - Main initialization routine. * @@ -139,7 +192,9 @@ int hv_init(void) { int max_leaf; union hv_x64_msr_hypercall_contents hypercall_msr; + union hv_x64_msr_hypercall_contents tsc_msr; void *virtaddr = NULL; + void *va_tsc = NULL; memset(hv_context.synic_event_page, 0, sizeof(void *) * NR_CPUS); memset(hv_context.synic_message_page, 0, @@ -183,6 +238,22 @@ int hv_init(void) hv_context.hypercall_page = virtaddr; +#ifdef CONFIG_X86_64 + if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { + va_tsc = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va_tsc) + goto cleanup; + hv_context.tsc_page = va_tsc; + + rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + tsc_msr.enable = 1; + tsc_msr.guest_physical_address = vmalloc_to_pfn(va_tsc); + + wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); + } +#endif return 0; cleanup: @@ -216,6 +287,21 @@ void hv_cleanup(void) vfree(hv_context.hypercall_page); hv_context.hypercall_page = NULL; } + +#ifdef CONFIG_X86_64 + /* + * Cleanup the TSC page based CS. + */ + if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { + clocksource_change_rating(&hyperv_cs_tsc, 10); + clocksource_unregister(&hyperv_cs_tsc); + + hypercall_msr.as_uint64 = 0; + wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); + vfree(hv_context.tsc_page); + hv_context.tsc_page = NULL; + } +#endif } /* @@ -271,7 +357,7 @@ static int hv_ce_set_next_event(unsigned long delta, { cycle_t current_tick; - WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + WARN_ON(!clockevent_state_oneshot(evt)); rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); current_tick += delta; @@ -279,31 +365,24 @@ static int hv_ce_set_next_event(unsigned long delta, return 0; } -static void hv_ce_setmode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hv_ce_shutdown(struct clock_event_device *evt) +{ + wrmsrl(HV_X64_MSR_STIMER0_COUNT, 0); + wrmsrl(HV_X64_MSR_STIMER0_CONFIG, 0); + + return 0; +} + +static int hv_ce_set_oneshot(struct clock_event_device *evt) { union hv_timer_config timer_cfg; - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - /* unsupported */ - break; - - case CLOCK_EVT_MODE_ONESHOT: - timer_cfg.enable = 1; - timer_cfg.auto_enable = 1; - timer_cfg.sintx = VMBUS_MESSAGE_SINT; - wrmsrl(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - wrmsrl(HV_X64_MSR_STIMER0_COUNT, 0); - wrmsrl(HV_X64_MSR_STIMER0_CONFIG, 0); - break; - case CLOCK_EVT_MODE_RESUME: - break; - } + timer_cfg.enable = 1; + timer_cfg.auto_enable = 1; + timer_cfg.sintx = VMBUS_MESSAGE_SINT; + wrmsrl(HV_X64_MSR_STIMER0_CONFIG, timer_cfg.as_uint64); + + return 0; } static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu) @@ -318,7 +397,8 @@ static void hv_init_clockevent_device(struct clock_event_device *dev, int cpu) * references to the hv_vmbus module making it impossible to unload. */ - dev->set_mode = hv_ce_setmode; + dev->set_state_shutdown = hv_ce_shutdown; + dev->set_state_oneshot = hv_ce_set_oneshot; dev->set_next_event = hv_ce_set_next_event; } @@ -329,6 +409,13 @@ int hv_synic_alloc(void) size_t ced_size = sizeof(struct clock_event_device); int cpu; + hv_context.hv_numa_map = kzalloc(sizeof(struct cpumask) * nr_node_ids, + GFP_ATOMIC); + if (hv_context.hv_numa_map == NULL) { + pr_err("Unable to allocate NUMA map\n"); + goto err; + } + for_each_online_cpu(cpu) { hv_context.event_dpc[cpu] = kmalloc(size, GFP_ATOMIC); if (hv_context.event_dpc[cpu] == NULL) { @@ -342,6 +429,7 @@ int hv_synic_alloc(void) pr_err("Unable to allocate clock event device\n"); goto err; } + hv_init_clockevent_device(hv_context.clk_evt[cpu], cpu); hv_context.synic_message_page[cpu] = @@ -390,6 +478,7 @@ void hv_synic_free(void) { int cpu; + kfree(hv_context.hv_numa_map); for_each_online_cpu(cpu) hv_synic_free_cpu(cpu); } @@ -503,8 +592,7 @@ void hv_synic_cleanup(void *arg) /* Turn off clockevent device */ if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE) - hv_ce_setmode(CLOCK_EVT_MODE_SHUTDOWN, - hv_context.clk_evt[cpu]); + hv_ce_shutdown(hv_context.clk_evt[cpu]); rdmsrl(HV_X64_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); @@ -530,6 +618,4 @@ void hv_synic_cleanup(void *arg) rdmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64); sctrl.enable = 0; wrmsrl(HV_X64_MSR_SCONTROL, sctrl.as_uint64); - - hv_synic_free_cpu(cpu); } diff --git a/kernel/drivers/hv/hv_balloon.c b/kernel/drivers/hv/hv_balloon.c index cb5b7dc97..b853b4b08 100644 --- a/kernel/drivers/hv/hv_balloon.c +++ b/kernel/drivers/hv/hv_balloon.c @@ -62,11 +62,13 @@ enum { DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3), DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0), + DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0), DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1, DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2, + DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3, - DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN8 + DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 }; @@ -567,7 +569,9 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, case MEM_ONLINE: dm_device.num_pages_onlined += mem->nr_pages; case MEM_CANCEL_ONLINE: - mutex_unlock(&dm_device.ha_region_mutex); + if (val == MEM_ONLINE || + mutex_is_locked(&dm_device.ha_region_mutex)) + mutex_unlock(&dm_device.ha_region_mutex); if (dm_device.ha_waiting) { dm_device.ha_waiting = false; complete(&dm_device.ol_waitevent); @@ -1294,13 +1298,25 @@ static void version_resp(struct hv_dynmem_device *dm, if (dm->next_version == 0) goto version_error; - dm->next_version = 0; memset(&version_req, 0, sizeof(struct dm_version_request)); version_req.hdr.type = DM_VERSION_REQUEST; version_req.hdr.size = sizeof(struct dm_version_request); version_req.hdr.trans_id = atomic_inc_return(&trans_id); - version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN7; - version_req.is_last_attempt = 1; + version_req.version.version = dm->next_version; + + /* + * Set the next version to try in case current version fails. + * Win7 protocol ought to be the last one to try. + */ + switch (version_req.version.version) { + case DYNMEM_PROTOCOL_VERSION_WIN8: + dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7; + version_req.is_last_attempt = 0; + break; + default: + dm->next_version = 0; + version_req.is_last_attempt = 1; + } ret = vmbus_sendpacket(dm->dev->channel, &version_req, sizeof(struct dm_version_request), @@ -1440,7 +1456,7 @@ static int balloon_probe(struct hv_device *dev, dm_device.dev = dev; dm_device.state = DM_INITIALIZING; - dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7; + dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; init_completion(&dm_device.host_event); init_completion(&dm_device.config_event); INIT_LIST_HEAD(&dm_device.ha_region_list); @@ -1472,7 +1488,7 @@ static int balloon_probe(struct hv_device *dev, version_req.hdr.type = DM_VERSION_REQUEST; version_req.hdr.size = sizeof(struct dm_version_request); version_req.hdr.trans_id = atomic_inc_return(&trans_id); - version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN8; + version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10; version_req.is_last_attempt = 0; ret = vmbus_sendpacket(dev->channel, &version_req, diff --git a/kernel/drivers/hv/hv_fcopy.c b/kernel/drivers/hv/hv_fcopy.c index cd453e4b2..db4b887b8 100644 --- a/kernel/drivers/hv/hv_fcopy.c +++ b/kernel/drivers/hv/hv_fcopy.c @@ -19,17 +19,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/semaphore.h> -#include <linux/fs.h> #include <linux/nls.h> #include <linux/workqueue.h> -#include <linux/cdev.h> #include <linux/hyperv.h> #include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/miscdevice.h> #include "hyperv_vmbus.h" +#include "hv_utils_transport.h" #define WIN8_SRV_MAJOR 1 #define WIN8_SRV_MINOR 1 @@ -47,39 +43,31 @@ * ensure this by serializing packet processing in this driver - we do not * read additional packets from the VMBUs until the current packet is fully * handled. - * - * The transaction "active" state is set when we receive a request from the - * host and we cleanup this state when the transaction is completed - when we - * respond to the host with our response. When the transaction active state is - * set, we defer handling incoming packets. */ static struct { - bool active; /* transaction status - active or not */ + int state; /* hvutil_device_state */ int recv_len; /* number of bytes received. */ struct hv_fcopy_hdr *fcopy_msg; /* current message */ - struct hv_start_fcopy message; /* sent to daemon */ struct vmbus_channel *recv_channel; /* chn we got the request */ u64 recv_req_id; /* request ID. */ void *fcopy_context; /* for the channel callback */ - struct semaphore read_sema; } fcopy_transaction; -static bool opened; /* currently device opened */ - -/* - * Before we can accept copy messages from the host, we need - * to handshake with the user level daemon. This state tracks - * if we are in the handshake phase. - */ -static bool in_hand_shake = true; -static void fcopy_send_data(void); static void fcopy_respond_to_host(int error); -static void fcopy_work_func(struct work_struct *dummy); -static DECLARE_DELAYED_WORK(fcopy_work, fcopy_work_func); +static void fcopy_send_data(struct work_struct *dummy); +static void fcopy_timeout_func(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func); +static DECLARE_WORK(fcopy_send_work, fcopy_send_data); +static const char fcopy_devname[] = "vmbus/hv_fcopy"; static u8 *recv_buffer; +static struct hvutil_transport *hvt; +/* + * This state maintains the version number registered by the daemon. + */ +static int dm_reg_value; -static void fcopy_work_func(struct work_struct *dummy) +static void fcopy_timeout_func(struct work_struct *dummy) { /* * If the timer fires, the user-mode component has not responded; @@ -87,23 +75,28 @@ static void fcopy_work_func(struct work_struct *dummy) */ fcopy_respond_to_host(HV_E_FAIL); - /* In the case the user-space daemon crashes, hangs or is killed, we - * need to down the semaphore, otherwise, after the daemon starts next - * time, the obsolete data in fcopy_transaction.message or - * fcopy_transaction.fcopy_msg will be used immediately. - * - * NOTE: fcopy_read() happens to get the semaphore (very rare)? We're - * still OK, because we've reported the failure to the host. - */ - if (down_trylock(&fcopy_transaction.read_sema)) - ; + /* Transaction is finished, reset the state. */ + if (fcopy_transaction.state > HVUTIL_READY) + fcopy_transaction.state = HVUTIL_READY; + hv_poll_channel(fcopy_transaction.fcopy_context, + hv_fcopy_onchannelcallback); } static int fcopy_handle_handshake(u32 version) { + u32 our_ver = FCOPY_CURRENT_VERSION; + switch (version) { - case FCOPY_CURRENT_VERSION: + case FCOPY_VERSION_0: + /* Daemon doesn't expect us to reply */ + dm_reg_value = version; + break; + case FCOPY_VERSION_1: + /* Daemon expects us to reply with our own version */ + if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver))) + return -EFAULT; + dm_reg_value = version; break; default: /* @@ -114,20 +107,20 @@ static int fcopy_handle_handshake(u32 version) */ return -EINVAL; } - pr_info("FCP: user-mode registering done. Daemon version: %d\n", - version); - fcopy_transaction.active = false; - if (fcopy_transaction.fcopy_context) - hv_fcopy_onchannelcallback(fcopy_transaction.fcopy_context); - in_hand_shake = false; + pr_debug("FCP: userspace daemon ver. %d registered\n", version); + fcopy_transaction.state = HVUTIL_READY; + hv_poll_channel(fcopy_transaction.fcopy_context, + hv_fcopy_onchannelcallback); return 0; } -static void fcopy_send_data(void) +static void fcopy_send_data(struct work_struct *dummy) { - struct hv_start_fcopy *smsg_out = &fcopy_transaction.message; + struct hv_start_fcopy *smsg_out = NULL; int operation = fcopy_transaction.fcopy_msg->operation; struct hv_start_fcopy *smsg_in; + void *out_src; + int rc, out_len; /* * The strings sent from the host are encoded in @@ -142,26 +135,44 @@ static void fcopy_send_data(void) switch (operation) { case START_FILE_COPY: - memset(smsg_out, 0, sizeof(struct hv_start_fcopy)); + out_len = sizeof(struct hv_start_fcopy); + smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL); + if (!smsg_out) + return; + smsg_out->hdr.operation = operation; smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg; utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH, UTF16_LITTLE_ENDIAN, - (__u8 *)smsg_out->file_name, W_MAX_PATH - 1); + (__u8 *)&smsg_out->file_name, W_MAX_PATH - 1); utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH, UTF16_LITTLE_ENDIAN, - (__u8 *)smsg_out->path_name, W_MAX_PATH - 1); + (__u8 *)&smsg_out->path_name, W_MAX_PATH - 1); smsg_out->copy_flags = smsg_in->copy_flags; smsg_out->file_size = smsg_in->file_size; + out_src = smsg_out; break; default: + out_src = fcopy_transaction.fcopy_msg; + out_len = fcopy_transaction.recv_len; break; } - up(&fcopy_transaction.read_sema); + + fcopy_transaction.state = HVUTIL_USERSPACE_REQ; + rc = hvutil_transport_send(hvt, out_src, out_len); + if (rc) { + pr_debug("FCP: failed to communicate to the daemon: %d\n", rc); + if (cancel_delayed_work_sync(&fcopy_timeout_work)) { + fcopy_respond_to_host(HV_E_FAIL); + fcopy_transaction.state = HVUTIL_READY; + } + } + kfree(smsg_out); + return; } @@ -189,8 +200,6 @@ fcopy_respond_to_host(int error) channel = fcopy_transaction.recv_channel; req_id = fcopy_transaction.recv_req_id; - fcopy_transaction.active = false; - icmsghdr = (struct icmsg_hdr *) &recv_buffer[sizeof(struct vmbuspipe_hdr)]; @@ -218,7 +227,7 @@ void hv_fcopy_onchannelcallback(void *context) int util_fw_version; int fcopy_srv_version; - if (fcopy_transaction.active) { + if (fcopy_transaction.state > HVUTIL_READY) { /* * We will defer processing this callback once * the current transaction is complete. @@ -226,6 +235,7 @@ void hv_fcopy_onchannelcallback(void *context) fcopy_transaction.fcopy_context = context; return; } + fcopy_transaction.fcopy_context = NULL; vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, &requestid); @@ -249,17 +259,23 @@ void hv_fcopy_onchannelcallback(void *context) * transaction; note transactions are serialized. */ - fcopy_transaction.active = true; fcopy_transaction.recv_len = recvlen; fcopy_transaction.recv_channel = channel; fcopy_transaction.recv_req_id = requestid; fcopy_transaction.fcopy_msg = fcopy_msg; + if (fcopy_transaction.state < HVUTIL_READY) { + /* Userspace is not registered yet */ + fcopy_respond_to_host(HV_E_FAIL); + return; + } + fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED; + /* * Send the information to the user-level daemon. */ - schedule_delayed_work(&fcopy_work, 5*HZ); - fcopy_send_data(); + schedule_work(&fcopy_send_work); + schedule_delayed_work(&fcopy_timeout_work, 5*HZ); return; } icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; @@ -267,155 +283,44 @@ void hv_fcopy_onchannelcallback(void *context) VM_PKT_DATA_INBAND, 0); } -/* - * Create a char device that can support read/write for passing - * the payload. - */ - -static ssize_t fcopy_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - void *src; - size_t copy_size; - int operation; - - /* - * Wait until there is something to be read. - */ - if (down_interruptible(&fcopy_transaction.read_sema)) - return -EINTR; - - /* - * The channel may be rescinded and in this case, we will wakeup the - * the thread blocked on the semaphore and we will use the opened - * state to correctly handle this case. - */ - if (!opened) - return -ENODEV; - - operation = fcopy_transaction.fcopy_msg->operation; - - if (operation == START_FILE_COPY) { - src = &fcopy_transaction.message; - copy_size = sizeof(struct hv_start_fcopy); - if (count < copy_size) - return 0; - } else { - src = fcopy_transaction.fcopy_msg; - copy_size = sizeof(struct hv_do_fcopy); - if (count < copy_size) - return 0; - } - if (copy_to_user(buf, src, copy_size)) - return -EFAULT; - - return copy_size; -} - -static ssize_t fcopy_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +/* Callback when data is received from userspace */ +static int fcopy_on_msg(void *msg, int len) { - int response = 0; + int *val = (int *)msg; - if (count != sizeof(int)) + if (len != sizeof(int)) return -EINVAL; - if (copy_from_user(&response, buf, sizeof(int))) - return -EFAULT; + if (fcopy_transaction.state == HVUTIL_DEVICE_INIT) + return fcopy_handle_handshake(*val); - if (in_hand_shake) { - if (fcopy_handle_handshake(response)) - return -EINVAL; - return sizeof(int); - } + if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ) + return -EINVAL; /* * Complete the transaction by forwarding the result * to the host. But first, cancel the timeout. */ - if (cancel_delayed_work_sync(&fcopy_work)) - fcopy_respond_to_host(response); - - return sizeof(int); -} - -static int fcopy_open(struct inode *inode, struct file *f) -{ - /* - * The user level daemon that will open this device is - * really an extension of this driver. We can have only - * active open at a time. - */ - if (opened) - return -EBUSY; + if (cancel_delayed_work_sync(&fcopy_timeout_work)) { + fcopy_transaction.state = HVUTIL_USERSPACE_RECV; + fcopy_respond_to_host(*val); + fcopy_transaction.state = HVUTIL_READY; + hv_poll_channel(fcopy_transaction.fcopy_context, + hv_fcopy_onchannelcallback); + } - /* - * The daemon is alive; setup the state. - */ - opened = true; return 0; } -/* XXX: there are still some tricky corner cases, e.g., - * 1) In a SMP guest, when fcopy_release() runs between - * schedule_delayed_work() and fcopy_send_data(), there is - * still a chance an obsolete message will be queued. - * - * 2) When the fcopy daemon is running, if we unload the driver, - * we'll notice a kernel oops when we kill the daemon later. - */ -static int fcopy_release(struct inode *inode, struct file *f) +static void fcopy_on_reset(void) { /* * The daemon has exited; reset the state. */ - in_hand_shake = true; - opened = false; + fcopy_transaction.state = HVUTIL_DEVICE_INIT; - if (cancel_delayed_work_sync(&fcopy_work)) { - /* We haven't up()-ed the semaphore(very rare)? */ - if (down_trylock(&fcopy_transaction.read_sema)) - ; + if (cancel_delayed_work_sync(&fcopy_timeout_work)) fcopy_respond_to_host(HV_E_FAIL); - } - return 0; -} - - -static const struct file_operations fcopy_fops = { - .read = fcopy_read, - .write = fcopy_write, - .release = fcopy_release, - .open = fcopy_open, -}; - -static struct miscdevice fcopy_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "vmbus/hv_fcopy", - .fops = &fcopy_fops, -}; - -static int fcopy_dev_init(void) -{ - return misc_register(&fcopy_misc); -} - -static void fcopy_dev_deinit(void) -{ - - /* - * The device is going away - perhaps because the - * host has rescinded the channel. Setup state so that - * user level daemon can gracefully exit if it is blocked - * on the read semaphore. - */ - opened = false; - /* - * Signal the semaphore as the device is - * going away. - */ - up(&fcopy_transaction.read_sema); - misc_deregister(&fcopy_misc); } int hv_fcopy_init(struct hv_util_service *srv) @@ -428,14 +333,19 @@ int hv_fcopy_init(struct hv_util_service *srv) * Defer processing channel callbacks until the daemon * has registered. */ - fcopy_transaction.active = true; - sema_init(&fcopy_transaction.read_sema, 0); + fcopy_transaction.state = HVUTIL_DEVICE_INIT; + + hvt = hvutil_transport_init(fcopy_devname, 0, 0, + fcopy_on_msg, fcopy_on_reset); + if (!hvt) + return -EFAULT; - return fcopy_dev_init(); + return 0; } void hv_fcopy_deinit(void) { - cancel_delayed_work_sync(&fcopy_work); - fcopy_dev_deinit(); + fcopy_transaction.state = HVUTIL_DEVICE_DYING; + cancel_delayed_work_sync(&fcopy_timeout_work); + hvutil_transport_destroy(hvt); } diff --git a/kernel/drivers/hv/hv_kvp.c b/kernel/drivers/hv/hv_kvp.c index beb8105c0..74c38a9f3 100644 --- a/kernel/drivers/hv/hv_kvp.c +++ b/kernel/drivers/hv/hv_kvp.c @@ -28,6 +28,8 @@ #include <linux/workqueue.h> #include <linux/hyperv.h> +#include "hyperv_vmbus.h" +#include "hv_utils_transport.h" /* * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7) @@ -45,16 +47,21 @@ #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) /* - * Global state maintained for transaction that is being processed. - * Note that only one transaction can be active at any point in time. + * Global state maintained for transaction that is being processed. For a class + * of integration services, including the "KVP service", the specified protocol + * is a "request/response" protocol which means that there can only be single + * outstanding transaction from the host at any given point in time. We use + * this to simplify memory management in this driver - we cache and process + * only one message at a time. * - * This state is set when we receive a request from the host; we - * cleanup this state when the transaction is completed - when we respond - * to the host with the key value. + * While the request/response protocol is guaranteed by the host, we further + * ensure this by serializing packet processing in this driver - we do not + * read additional packets from the VMBUs until the current packet is fully + * handled. */ static struct { - bool active; /* transaction status - active or not */ + int state; /* hvutil_device_state */ int recv_len; /* number of bytes received. */ struct hv_kvp_msg *kvp_msg; /* current message */ struct vmbus_channel *recv_channel; /* chn we got the request */ @@ -63,13 +70,6 @@ static struct { } kvp_transaction; /* - * Before we can accept KVP messages from the host, we need - * to handshake with the user level daemon. This state tracks - * if we are in the handshake phase. - */ -static bool in_hand_shake = true; - -/* * This state maintains the version number registered by the daemon. */ static int dm_reg_value; @@ -78,15 +78,15 @@ static void kvp_send_key(struct work_struct *dummy); static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error); -static void kvp_work_func(struct work_struct *dummy); +static void kvp_timeout_func(struct work_struct *dummy); static void kvp_register(int); -static DECLARE_DELAYED_WORK(kvp_work, kvp_work_func); +static DECLARE_DELAYED_WORK(kvp_timeout_work, kvp_timeout_func); static DECLARE_WORK(kvp_sendkey_work, kvp_send_key); -static struct cb_id kvp_id = { CN_KVP_IDX, CN_KVP_VAL }; -static const char kvp_name[] = "kvp_kernel_module"; +static const char kvp_devname[] = "vmbus/hv_kvp"; static u8 *recv_buffer; +static struct hvutil_transport *hvt; /* * Register the kernel component with the user-level daemon. * As part of this registration, pass the LIC version number. @@ -98,50 +98,39 @@ static void kvp_register(int reg_value) { - struct cn_msg *msg; struct hv_kvp_msg *kvp_msg; char *version; - msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg), GFP_ATOMIC); + kvp_msg = kzalloc(sizeof(*kvp_msg), GFP_KERNEL); - if (msg) { - kvp_msg = (struct hv_kvp_msg *)msg->data; + if (kvp_msg) { version = kvp_msg->body.kvp_register.version; - msg->id.idx = CN_KVP_IDX; - msg->id.val = CN_KVP_VAL; - kvp_msg->kvp_hdr.operation = reg_value; strcpy(version, HV_DRV_VERSION); - msg->len = sizeof(struct hv_kvp_msg); - cn_netlink_send(msg, 0, 0, GFP_ATOMIC); - kfree(msg); + + hvutil_transport_send(hvt, kvp_msg, sizeof(*kvp_msg)); + kfree(kvp_msg); } } -static void -kvp_work_func(struct work_struct *dummy) + +static void kvp_timeout_func(struct work_struct *dummy) { /* * If the timer fires, the user-mode component has not responded; * process the pending transaction. */ kvp_respond_to_host(NULL, HV_E_FAIL); -} -static void poll_channel(struct vmbus_channel *channel) -{ - if (channel->target_cpu != smp_processor_id()) - smp_call_function_single(channel->target_cpu, - hv_kvp_onchannelcallback, - channel, true); - else - hv_kvp_onchannelcallback(channel); -} + /* Transaction is finished, reset the state. */ + if (kvp_transaction.state > HVUTIL_READY) + kvp_transaction.state = HVUTIL_READY; + hv_poll_channel(kvp_transaction.kvp_context, + hv_kvp_onchannelcallback); +} static int kvp_handle_handshake(struct hv_kvp_msg *msg) { - int ret = 1; - switch (msg->kvp_hdr.operation) { case KVP_OP_REGISTER: dm_reg_value = KVP_OP_REGISTER; @@ -155,20 +144,18 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg) pr_info("KVP: incompatible daemon\n"); pr_info("KVP: KVP version: %d, Daemon version: %d\n", KVP_OP_REGISTER1, msg->kvp_hdr.operation); - ret = 0; + return -EINVAL; } - if (ret) { - /* - * We have a compatible daemon; complete the handshake. - */ - pr_info("KVP: user-mode registering done.\n"); - kvp_register(dm_reg_value); - kvp_transaction.active = false; - if (kvp_transaction.kvp_context) - poll_channel(kvp_transaction.kvp_context); - } - return ret; + /* + * We have a compatible daemon; complete the handshake. + */ + pr_debug("KVP: userspace daemon ver. %d registered\n", + KVP_OP_REGISTER); + kvp_register(dm_reg_value); + kvp_transaction.state = HVUTIL_READY; + + return 0; } @@ -176,26 +163,30 @@ static int kvp_handle_handshake(struct hv_kvp_msg *msg) * Callback when data is received from user mode. */ -static void -kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +static int kvp_on_msg(void *msg, int len) { - struct hv_kvp_msg *message; + struct hv_kvp_msg *message = (struct hv_kvp_msg *)msg; struct hv_kvp_msg_enumerate *data; int error = 0; - message = (struct hv_kvp_msg *)msg->data; + if (len < sizeof(*message)) + return -EINVAL; /* * If we are negotiating the version information * with the daemon; handle that first. */ - if (in_hand_shake) { - if (kvp_handle_handshake(message)) - in_hand_shake = false; - return; + if (kvp_transaction.state < HVUTIL_READY) { + return kvp_handle_handshake(message); } + /* We didn't send anything to userspace so the reply is spurious */ + if (kvp_transaction.state < HVUTIL_USERSPACE_REQ) + return -EINVAL; + + kvp_transaction.state = HVUTIL_USERSPACE_RECV; + /* * Based on the version of the daemon, we propagate errors from the * daemon differently. @@ -225,8 +216,14 @@ kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) * Complete the transaction by forwarding the key value * to the host. But first, cancel the timeout. */ - if (cancel_delayed_work_sync(&kvp_work)) + if (cancel_delayed_work_sync(&kvp_timeout_work)) { kvp_respond_to_host(message, error); + kvp_transaction.state = HVUTIL_READY; + hv_poll_channel(kvp_transaction.kvp_context, + hv_kvp_onchannelcallback); + } + + return 0; } @@ -343,7 +340,6 @@ static void process_ib_ipinfo(void *in_msg, void *out_msg, int op) static void kvp_send_key(struct work_struct *dummy) { - struct cn_msg *msg; struct hv_kvp_msg *message; struct hv_kvp_msg *in_msg; __u8 operation = kvp_transaction.kvp_msg->kvp_hdr.operation; @@ -352,14 +348,14 @@ kvp_send_key(struct work_struct *dummy) __u64 val64; int rc; - msg = kzalloc(sizeof(*msg) + sizeof(struct hv_kvp_msg) , GFP_ATOMIC); - if (!msg) + /* The transaction state is wrong. */ + if (kvp_transaction.state != HVUTIL_HOSTMSG_RECEIVED) return; - msg->id.idx = CN_KVP_IDX; - msg->id.val = CN_KVP_VAL; + message = kzalloc(sizeof(*message), GFP_KERNEL); + if (!message) + return; - message = (struct hv_kvp_msg *)msg->data; message->kvp_hdr.operation = operation; message->kvp_hdr.pool = pool; in_msg = kvp_transaction.kvp_msg; @@ -446,15 +442,17 @@ kvp_send_key(struct work_struct *dummy) break; } - msg->len = sizeof(struct hv_kvp_msg); - rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC); + kvp_transaction.state = HVUTIL_USERSPACE_REQ; + rc = hvutil_transport_send(hvt, message, sizeof(*message)); if (rc) { pr_debug("KVP: failed to communicate to the daemon: %d\n", rc); - if (cancel_delayed_work_sync(&kvp_work)) + if (cancel_delayed_work_sync(&kvp_timeout_work)) { kvp_respond_to_host(message, HV_E_FAIL); + kvp_transaction.state = HVUTIL_READY; + } } - kfree(msg); + kfree(message); return; } @@ -479,17 +477,6 @@ kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error) int ret; /* - * If a transaction is not active; log and return. - */ - - if (!kvp_transaction.active) { - /* - * This is a spurious call! - */ - pr_warn("KVP: Transaction not active\n"); - return; - } - /* * Copy the global state for completing the transaction. Note that * only one transaction can be active at a time. */ @@ -498,8 +485,6 @@ kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error) channel = kvp_transaction.recv_channel; req_id = kvp_transaction.recv_req_id; - kvp_transaction.active = false; - icmsghdrp = (struct icmsg_hdr *) &recv_buffer[sizeof(struct vmbuspipe_hdr)]; @@ -586,7 +571,6 @@ response_done: vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, VM_PKT_DATA_INBAND, 0); - poll_channel(channel); } /* @@ -612,7 +596,7 @@ void hv_kvp_onchannelcallback(void *context) int util_fw_version; int kvp_srv_version; - if (kvp_transaction.active) { + if (kvp_transaction.state > HVUTIL_READY) { /* * We will defer processing this callback once * the current transaction is complete. @@ -620,6 +604,7 @@ void hv_kvp_onchannelcallback(void *context) kvp_transaction.kvp_context = context; return; } + kvp_transaction.kvp_context = NULL; vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen, &requestid); @@ -664,9 +649,15 @@ void hv_kvp_onchannelcallback(void *context) kvp_transaction.recv_len = recvlen; kvp_transaction.recv_channel = channel; kvp_transaction.recv_req_id = requestid; - kvp_transaction.active = true; kvp_transaction.kvp_msg = kvp_msg; + if (kvp_transaction.state < HVUTIL_READY) { + /* Userspace is not registered yet */ + kvp_respond_to_host(NULL, HV_E_FAIL); + return; + } + kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED; + /* * Get the information from the * user-mode component. @@ -677,7 +668,7 @@ void hv_kvp_onchannelcallback(void *context) * user-mode not responding. */ schedule_work(&kvp_sendkey_work); - schedule_delayed_work(&kvp_work, 5*HZ); + schedule_delayed_work(&kvp_timeout_work, 5*HZ); return; @@ -693,14 +684,16 @@ void hv_kvp_onchannelcallback(void *context) } +static void kvp_on_reset(void) +{ + if (cancel_delayed_work_sync(&kvp_timeout_work)) + kvp_respond_to_host(NULL, HV_E_FAIL); + kvp_transaction.state = HVUTIL_DEVICE_INIT; +} + int hv_kvp_init(struct hv_util_service *srv) { - int err; - - err = cn_add_callback(&kvp_id, kvp_name, kvp_cn_callback); - if (err) - return err; recv_buffer = srv->recv_buffer; /* @@ -709,14 +702,20 @@ hv_kvp_init(struct hv_util_service *srv) * Defer processing channel callbacks until the daemon * has registered. */ - kvp_transaction.active = true; + kvp_transaction.state = HVUTIL_DEVICE_INIT; + + hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, + kvp_on_msg, kvp_on_reset); + if (!hvt) + return -EFAULT; return 0; } void hv_kvp_deinit(void) { - cn_del_callback(&kvp_id); - cancel_delayed_work_sync(&kvp_work); + kvp_transaction.state = HVUTIL_DEVICE_DYING; + cancel_delayed_work_sync(&kvp_timeout_work); cancel_work_sync(&kvp_sendkey_work); + hvutil_transport_destroy(hvt); } diff --git a/kernel/drivers/hv/hv_snapshot.c b/kernel/drivers/hv/hv_snapshot.c index 9d5e0d1ef..815405f2e 100644 --- a/kernel/drivers/hv/hv_snapshot.c +++ b/kernel/drivers/hv/hv_snapshot.c @@ -24,6 +24,9 @@ #include <linux/workqueue.h> #include <linux/hyperv.h> +#include "hyperv_vmbus.h" +#include "hv_utils_transport.h" + #define VSS_MAJOR 5 #define VSS_MINOR 0 #define VSS_VERSION (VSS_MAJOR << 16 | VSS_MINOR) @@ -31,28 +34,39 @@ #define VSS_USERSPACE_TIMEOUT (msecs_to_jiffies(10 * 1000)) /* - * Global state maintained for transaction that is being processed. - * Note that only one transaction can be active at any point in time. + * Global state maintained for transaction that is being processed. For a class + * of integration services, including the "VSS service", the specified protocol + * is a "request/response" protocol which means that there can only be single + * outstanding transaction from the host at any given point in time. We use + * this to simplify memory management in this driver - we cache and process + * only one message at a time. * - * This state is set when we receive a request from the host; we - * cleanup this state when the transaction is completed - when we respond - * to the host with the key value. + * While the request/response protocol is guaranteed by the host, we further + * ensure this by serializing packet processing in this driver - we do not + * read additional packets from the VMBUs until the current packet is fully + * handled. */ static struct { - bool active; /* transaction status - active or not */ + int state; /* hvutil_device_state */ int recv_len; /* number of bytes received. */ struct vmbus_channel *recv_channel; /* chn we got the request */ u64 recv_req_id; /* request ID. */ struct hv_vss_msg *msg; /* current message */ + void *vss_context; /* for the channel callback */ } vss_transaction; static void vss_respond_to_host(int error); -static struct cb_id vss_id = { CN_VSS_IDX, CN_VSS_VAL }; -static const char vss_name[] = "vss_kernel_module"; +/* + * This state maintains the version number registered by the daemon. + */ +static int dm_reg_value; + +static const char vss_devname[] = "vmbus/hv_vss"; static __u8 *recv_buffer; +static struct hvutil_transport *hvt; static void vss_send_op(struct work_struct *dummy); static void vss_timeout_func(struct work_struct *dummy); @@ -71,25 +85,69 @@ static void vss_timeout_func(struct work_struct *dummy) */ pr_warn("VSS: timeout waiting for daemon to reply\n"); vss_respond_to_host(HV_E_FAIL); + + /* Transaction is finished, reset the state. */ + if (vss_transaction.state > HVUTIL_READY) + vss_transaction.state = HVUTIL_READY; + + hv_poll_channel(vss_transaction.vss_context, + hv_vss_onchannelcallback); } -static void -vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +static int vss_handle_handshake(struct hv_vss_msg *vss_msg) { - struct hv_vss_msg *vss_msg; + u32 our_ver = VSS_OP_REGISTER1; + + switch (vss_msg->vss_hdr.operation) { + case VSS_OP_REGISTER: + /* Daemon doesn't expect us to reply */ + dm_reg_value = VSS_OP_REGISTER; + break; + case VSS_OP_REGISTER1: + /* Daemon expects us to reply with our own version*/ + if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver))) + return -EFAULT; + dm_reg_value = VSS_OP_REGISTER1; + break; + default: + return -EINVAL; + } + vss_transaction.state = HVUTIL_READY; + pr_debug("VSS: userspace daemon ver. %d registered\n", dm_reg_value); + return 0; +} - vss_msg = (struct hv_vss_msg *)msg->data; +static int vss_on_msg(void *msg, int len) +{ + struct hv_vss_msg *vss_msg = (struct hv_vss_msg *)msg; - if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) { - pr_info("VSS daemon registered\n"); - vss_transaction.active = false; - if (vss_transaction.recv_channel != NULL) - hv_vss_onchannelcallback(vss_transaction.recv_channel); - return; + if (len != sizeof(*vss_msg)) + return -EINVAL; + if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER || + vss_msg->vss_hdr.operation == VSS_OP_REGISTER1) { + /* + * Don't process registration messages if we're in the middle + * of a transaction processing. + */ + if (vss_transaction.state > HVUTIL_READY) + return -EINVAL; + return vss_handle_handshake(vss_msg); + } else if (vss_transaction.state == HVUTIL_USERSPACE_REQ) { + vss_transaction.state = HVUTIL_USERSPACE_RECV; + if (cancel_delayed_work_sync(&vss_timeout_work)) { + vss_respond_to_host(vss_msg->error); + /* Transaction is finished, reset the state. */ + vss_transaction.state = HVUTIL_READY; + hv_poll_channel(vss_transaction.vss_context, + hv_vss_onchannelcallback); + } + } else { + /* This is a spurious call! */ + pr_warn("VSS: Transaction not active\n"); + return -EINVAL; } - if (cancel_delayed_work_sync(&vss_timeout_work)) - vss_respond_to_host(vss_msg->error); + return 0; } @@ -97,28 +155,29 @@ static void vss_send_op(struct work_struct *dummy) { int op = vss_transaction.msg->vss_hdr.operation; int rc; - struct cn_msg *msg; struct hv_vss_msg *vss_msg; - msg = kzalloc(sizeof(*msg) + sizeof(*vss_msg), GFP_ATOMIC); - if (!msg) + /* The transaction state is wrong. */ + if (vss_transaction.state != HVUTIL_HOSTMSG_RECEIVED) return; - vss_msg = (struct hv_vss_msg *)msg->data; - - msg->id.idx = CN_VSS_IDX; - msg->id.val = CN_VSS_VAL; + vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL); + if (!vss_msg) + return; vss_msg->vss_hdr.operation = op; - msg->len = sizeof(struct hv_vss_msg); - rc = cn_netlink_send(msg, 0, 0, GFP_ATOMIC); + vss_transaction.state = HVUTIL_USERSPACE_REQ; + rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg)); if (rc) { pr_warn("VSS: failed to communicate to the daemon: %d\n", rc); - if (cancel_delayed_work_sync(&vss_timeout_work)) + if (cancel_delayed_work_sync(&vss_timeout_work)) { vss_respond_to_host(HV_E_FAIL); + vss_transaction.state = HVUTIL_READY; + } } - kfree(msg); + + kfree(vss_msg); return; } @@ -136,17 +195,6 @@ vss_respond_to_host(int error) u64 req_id; /* - * If a transaction is not active; log and return. - */ - - if (!vss_transaction.active) { - /* - * This is a spurious call! - */ - pr_warn("VSS: Transaction not active\n"); - return; - } - /* * Copy the global state for completing the transaction. Note that * only one transaction can be active at a time. */ @@ -154,7 +202,6 @@ vss_respond_to_host(int error) buf_len = vss_transaction.recv_len; channel = vss_transaction.recv_channel; req_id = vss_transaction.recv_req_id; - vss_transaction.active = false; icmsghdrp = (struct icmsg_hdr *) &recv_buffer[sizeof(struct vmbuspipe_hdr)]; @@ -191,14 +238,15 @@ void hv_vss_onchannelcallback(void *context) struct icmsg_hdr *icmsghdrp; struct icmsg_negotiate *negop = NULL; - if (vss_transaction.active) { + if (vss_transaction.state > HVUTIL_READY) { /* * We will defer processing this callback once * the current transaction is complete. */ - vss_transaction.recv_channel = channel; + vss_transaction.vss_context = context; return; } + vss_transaction.vss_context = NULL; vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, &requestid); @@ -224,7 +272,6 @@ void hv_vss_onchannelcallback(void *context) vss_transaction.recv_len = recvlen; vss_transaction.recv_channel = channel; vss_transaction.recv_req_id = requestid; - vss_transaction.active = true; vss_transaction.msg = (struct hv_vss_msg *)vss_msg; switch (vss_msg->vss_hdr.operation) { @@ -241,6 +288,12 @@ void hv_vss_onchannelcallback(void *context) */ case VSS_OP_FREEZE: case VSS_OP_THAW: + if (vss_transaction.state < HVUTIL_READY) { + /* Userspace is not registered yet */ + vss_respond_to_host(HV_E_FAIL); + return; + } + vss_transaction.state = HVUTIL_HOSTMSG_RECEIVED; schedule_work(&vss_send_op_work); schedule_delayed_work(&vss_timeout_work, VSS_USERSPACE_TIMEOUT); @@ -275,14 +328,16 @@ void hv_vss_onchannelcallback(void *context) } +static void vss_on_reset(void) +{ + if (cancel_delayed_work_sync(&vss_timeout_work)) + vss_respond_to_host(HV_E_FAIL); + vss_transaction.state = HVUTIL_DEVICE_INIT; +} + int hv_vss_init(struct hv_util_service *srv) { - int err; - - err = cn_add_callback(&vss_id, vss_name, vss_cn_callback); - if (err) - return err; recv_buffer = srv->recv_buffer; /* @@ -291,13 +346,20 @@ hv_vss_init(struct hv_util_service *srv) * Defer processing channel callbacks until the daemon * has registered. */ - vss_transaction.active = true; + vss_transaction.state = HVUTIL_DEVICE_INIT; + + hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, + vss_on_msg, vss_on_reset); + if (!hvt) + return -EFAULT; + return 0; } void hv_vss_deinit(void) { - cn_del_callback(&vss_id); + vss_transaction.state = HVUTIL_DEVICE_DYING; cancel_delayed_work_sync(&vss_timeout_work); cancel_work_sync(&vss_send_op_work); + hvutil_transport_destroy(hvt); } diff --git a/kernel/drivers/hv/hv_utils_transport.c b/kernel/drivers/hv/hv_utils_transport.c new file mode 100644 index 000000000..6a9d80a53 --- /dev/null +++ b/kernel/drivers/hv/hv_utils_transport.c @@ -0,0 +1,276 @@ +/* + * Kernel/userspace transport abstraction for Hyper-V util driver. + * + * Copyright (C) 2015, Vitaly Kuznetsov <vkuznets@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/poll.h> + +#include "hyperv_vmbus.h" +#include "hv_utils_transport.h" + +static DEFINE_SPINLOCK(hvt_list_lock); +static struct list_head hvt_list = LIST_HEAD_INIT(hvt_list); + +static void hvt_reset(struct hvutil_transport *hvt) +{ + mutex_lock(&hvt->outmsg_lock); + kfree(hvt->outmsg); + hvt->outmsg = NULL; + hvt->outmsg_len = 0; + mutex_unlock(&hvt->outmsg_lock); + if (hvt->on_reset) + hvt->on_reset(); +} + +static ssize_t hvt_op_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hvutil_transport *hvt; + int ret; + + hvt = container_of(file->f_op, struct hvutil_transport, fops); + + if (wait_event_interruptible(hvt->outmsg_q, hvt->outmsg_len > 0)) + return -EINTR; + + mutex_lock(&hvt->outmsg_lock); + if (!hvt->outmsg) { + ret = -EAGAIN; + goto out_unlock; + } + + if (count < hvt->outmsg_len) { + ret = -EINVAL; + goto out_unlock; + } + + if (!copy_to_user(buf, hvt->outmsg, hvt->outmsg_len)) + ret = hvt->outmsg_len; + else + ret = -EFAULT; + + kfree(hvt->outmsg); + hvt->outmsg = NULL; + hvt->outmsg_len = 0; + +out_unlock: + mutex_unlock(&hvt->outmsg_lock); + return ret; +} + +static ssize_t hvt_op_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hvutil_transport *hvt; + u8 *inmsg; + + hvt = container_of(file->f_op, struct hvutil_transport, fops); + + inmsg = kzalloc(count, GFP_KERNEL); + if (copy_from_user(inmsg, buf, count)) { + kfree(inmsg); + return -EFAULT; + } + if (hvt->on_msg(inmsg, count)) + return -EFAULT; + kfree(inmsg); + + return count; +} + +static unsigned int hvt_op_poll(struct file *file, poll_table *wait) +{ + struct hvutil_transport *hvt; + + hvt = container_of(file->f_op, struct hvutil_transport, fops); + + poll_wait(file, &hvt->outmsg_q, wait); + if (hvt->outmsg_len > 0) + return POLLIN | POLLRDNORM; + + return 0; +} + +static int hvt_op_open(struct inode *inode, struct file *file) +{ + struct hvutil_transport *hvt; + + hvt = container_of(file->f_op, struct hvutil_transport, fops); + + /* + * Switching to CHARDEV mode. We switch bach to INIT when device + * gets released. + */ + if (hvt->mode == HVUTIL_TRANSPORT_INIT) + hvt->mode = HVUTIL_TRANSPORT_CHARDEV; + else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { + /* + * We're switching from netlink communication to using char + * device. Issue the reset first. + */ + hvt_reset(hvt); + hvt->mode = HVUTIL_TRANSPORT_CHARDEV; + } else + return -EBUSY; + + return 0; +} + +static int hvt_op_release(struct inode *inode, struct file *file) +{ + struct hvutil_transport *hvt; + + hvt = container_of(file->f_op, struct hvutil_transport, fops); + + hvt->mode = HVUTIL_TRANSPORT_INIT; + /* + * Cleanup message buffers to avoid spurious messages when the daemon + * connects back. + */ + hvt_reset(hvt); + + return 0; +} + +static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + struct hvutil_transport *hvt, *hvt_found = NULL; + + spin_lock(&hvt_list_lock); + list_for_each_entry(hvt, &hvt_list, list) { + if (hvt->cn_id.idx == msg->id.idx && + hvt->cn_id.val == msg->id.val) { + hvt_found = hvt; + break; + } + } + spin_unlock(&hvt_list_lock); + if (!hvt_found) { + pr_warn("hvt_cn_callback: spurious message received!\n"); + return; + } + + /* + * Switching to NETLINK mode. Switching to CHARDEV happens when someone + * opens the device. + */ + if (hvt->mode == HVUTIL_TRANSPORT_INIT) + hvt->mode = HVUTIL_TRANSPORT_NETLINK; + + if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) + hvt_found->on_msg(msg->data, msg->len); + else + pr_warn("hvt_cn_callback: unexpected netlink message!\n"); +} + +int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len) +{ + struct cn_msg *cn_msg; + int ret = 0; + + if (hvt->mode == HVUTIL_TRANSPORT_INIT) { + return -EINVAL; + } else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { + cn_msg = kzalloc(sizeof(*cn_msg) + len, GFP_ATOMIC); + if (!cn_msg) + return -ENOMEM; + cn_msg->id.idx = hvt->cn_id.idx; + cn_msg->id.val = hvt->cn_id.val; + cn_msg->len = len; + memcpy(cn_msg->data, msg, len); + ret = cn_netlink_send(cn_msg, 0, 0, GFP_ATOMIC); + kfree(cn_msg); + return ret; + } + /* HVUTIL_TRANSPORT_CHARDEV */ + mutex_lock(&hvt->outmsg_lock); + if (hvt->outmsg) { + /* Previous message wasn't received */ + ret = -EFAULT; + goto out_unlock; + } + hvt->outmsg = kzalloc(len, GFP_KERNEL); + memcpy(hvt->outmsg, msg, len); + hvt->outmsg_len = len; + wake_up_interruptible(&hvt->outmsg_q); +out_unlock: + mutex_unlock(&hvt->outmsg_lock); + return ret; +} + +struct hvutil_transport *hvutil_transport_init(const char *name, + u32 cn_idx, u32 cn_val, + int (*on_msg)(void *, int), + void (*on_reset)(void)) +{ + struct hvutil_transport *hvt; + + hvt = kzalloc(sizeof(*hvt), GFP_KERNEL); + if (!hvt) + return NULL; + + hvt->cn_id.idx = cn_idx; + hvt->cn_id.val = cn_val; + + hvt->mdev.minor = MISC_DYNAMIC_MINOR; + hvt->mdev.name = name; + + hvt->fops.owner = THIS_MODULE; + hvt->fops.read = hvt_op_read; + hvt->fops.write = hvt_op_write; + hvt->fops.poll = hvt_op_poll; + hvt->fops.open = hvt_op_open; + hvt->fops.release = hvt_op_release; + + hvt->mdev.fops = &hvt->fops; + + init_waitqueue_head(&hvt->outmsg_q); + mutex_init(&hvt->outmsg_lock); + + spin_lock(&hvt_list_lock); + list_add(&hvt->list, &hvt_list); + spin_unlock(&hvt_list_lock); + + hvt->on_msg = on_msg; + hvt->on_reset = on_reset; + + if (misc_register(&hvt->mdev)) + goto err_free_hvt; + + /* Use cn_id.idx/cn_id.val to determine if we need to setup netlink */ + if (hvt->cn_id.idx > 0 && hvt->cn_id.val > 0 && + cn_add_callback(&hvt->cn_id, name, hvt_cn_callback)) + goto err_free_hvt; + + return hvt; + +err_free_hvt: + kfree(hvt); + return NULL; +} + +void hvutil_transport_destroy(struct hvutil_transport *hvt) +{ + spin_lock(&hvt_list_lock); + list_del(&hvt->list); + spin_unlock(&hvt_list_lock); + if (hvt->cn_id.idx > 0 && hvt->cn_id.val > 0) + cn_del_callback(&hvt->cn_id); + misc_deregister(&hvt->mdev); + kfree(hvt->outmsg); + kfree(hvt); +} diff --git a/kernel/drivers/hv/hv_utils_transport.h b/kernel/drivers/hv/hv_utils_transport.h new file mode 100644 index 000000000..314c76ce1 --- /dev/null +++ b/kernel/drivers/hv/hv_utils_transport.h @@ -0,0 +1,51 @@ +/* + * Kernel/userspace transport abstraction for Hyper-V util driver. + * + * Copyright (C) 2015, Vitaly Kuznetsov <vkuznets@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ + +#ifndef _HV_UTILS_TRANSPORT_H +#define _HV_UTILS_TRANSPORT_H + +#include <linux/connector.h> +#include <linux/miscdevice.h> + +enum hvutil_transport_mode { + HVUTIL_TRANSPORT_INIT = 0, + HVUTIL_TRANSPORT_NETLINK, + HVUTIL_TRANSPORT_CHARDEV, +}; + +struct hvutil_transport { + int mode; /* hvutil_transport_mode */ + struct file_operations fops; /* file operations */ + struct miscdevice mdev; /* misc device */ + struct cb_id cn_id; /* CN_*_IDX/CN_*_VAL */ + struct list_head list; /* hvt_list */ + int (*on_msg)(void *, int); /* callback on new user message */ + void (*on_reset)(void); /* callback when userspace drops */ + u8 *outmsg; /* message to the userspace */ + int outmsg_len; /* its length */ + wait_queue_head_t outmsg_q; /* poll/read wait queue */ + struct mutex outmsg_lock; /* protects outmsg */ +}; + +struct hvutil_transport *hvutil_transport_init(const char *name, + u32 cn_idx, u32 cn_val, + int (*on_msg)(void *, int), + void (*on_reset)(void)); +int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len); +void hvutil_transport_destroy(struct hvutil_transport *hvt); + +#endif /* _HV_UTILS_TRANSPORT_H */ diff --git a/kernel/drivers/hv/hyperv_vmbus.h b/kernel/drivers/hv/hyperv_vmbus.h index 887287ad4..378263656 100644 --- a/kernel/drivers/hv/hyperv_vmbus.h +++ b/kernel/drivers/hv/hyperv_vmbus.h @@ -63,9 +63,6 @@ enum hv_cpuid_function { /* Define version of the synthetic interrupt controller. */ #define HV_SYNIC_VERSION (1) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) - /* Define synthetic interrupt controller message constants. */ #define HV_MESSAGE_SIZE (256) #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) @@ -105,8 +102,6 @@ enum hv_message_type { HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) #define HV_SYNIC_STIMER_COUNT (4) /* Define invalid partition identifier. */ @@ -141,7 +136,7 @@ struct hv_port_info { struct { u32 target_sint; u32 target_vp; - u16 base_flag_bumber; + u16 base_flag_number; u16 flag_count; u32 rsvdz; } event_port_info; @@ -517,6 +512,7 @@ struct hv_context { u64 guestid; void *hypercall_page; + void *tsc_page; bool synic_initialized; @@ -551,10 +547,23 @@ struct hv_context { * Support PV clockevent device. */ struct clock_event_device *clk_evt[NR_CPUS]; + /* + * To manage allocations in a NUMA node. + * Array indexed by numa node ID. + */ + struct cpumask *hv_numa_map; }; extern struct hv_context hv_context; +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; + u64 reserved2[509]; +}; + struct hv_ring_buffer_debug_info { u32 current_interrupt_mask; u32 current_read_index; @@ -647,6 +656,7 @@ struct vmbus_connection { atomic_t next_gpadl_handle; + struct completion unload_event; /* * Represents channel interrupts. Each bit position represents a * channel. When a channel sends an interrupt via VMBUS, it finds its @@ -730,9 +740,39 @@ int vmbus_set_event(struct vmbus_channel *channel); void vmbus_on_event(unsigned long data); +int hv_kvp_init(struct hv_util_service *); +void hv_kvp_deinit(void); +void hv_kvp_onchannelcallback(void *); + +int hv_vss_init(struct hv_util_service *); +void hv_vss_deinit(void); +void hv_vss_onchannelcallback(void *); + int hv_fcopy_init(struct hv_util_service *); void hv_fcopy_deinit(void); void hv_fcopy_onchannelcallback(void *); +void vmbus_initiate_unload(void); +static inline void hv_poll_channel(struct vmbus_channel *channel, + void (*cb)(void *)) +{ + if (!channel) + return; + + if (channel->target_cpu != smp_processor_id()) + smp_call_function_single(channel->target_cpu, + cb, channel, true); + else + cb(channel); +} + +enum hvutil_device_state { + HVUTIL_DEVICE_INIT = 0, /* driver is loaded, waiting for userspace */ + HVUTIL_READY, /* userspace is registered */ + HVUTIL_HOSTMSG_RECEIVED, /* message from the host was received */ + HVUTIL_USERSPACE_REQ, /* request to userspace was sent */ + HVUTIL_USERSPACE_RECV, /* reply from userspace was received */ + HVUTIL_DEVICE_DYING, /* driver unload is in progress */ +}; #endif /* _HYPERV_VMBUS_H */ diff --git a/kernel/drivers/hv/ring_buffer.c b/kernel/drivers/hv/ring_buffer.c index 6361d124f..70a1a9a22 100644 --- a/kernel/drivers/hv/ring_buffer.c +++ b/kernel/drivers/hv/ring_buffer.c @@ -103,10 +103,9 @@ static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi) * there is room for the producer to send the pending packet. */ -static bool hv_need_to_signal_on_read(u32 old_rd, - struct hv_ring_buffer_info *rbi) +static bool hv_need_to_signal_on_read(u32 prev_write_sz, + struct hv_ring_buffer_info *rbi) { - u32 prev_write_sz; u32 cur_write_sz; u32 r_size; u32 write_loc = rbi->ring_buffer->write_index; @@ -123,10 +122,6 @@ static bool hv_need_to_signal_on_read(u32 old_rd, cur_write_sz = write_loc >= read_loc ? r_size - (write_loc - read_loc) : read_loc - write_loc; - prev_write_sz = write_loc >= old_rd ? r_size - (write_loc - old_rd) : - old_rd - write_loc; - - if ((prev_write_sz < pending_sz) && (cur_write_sz >= pending_sz)) return true; @@ -517,7 +512,6 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer, u32 next_read_location = 0; u64 prev_indices = 0; unsigned long flags; - u32 old_read; if (buflen <= 0) return -EINVAL; @@ -528,8 +522,6 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer, &bytes_avail_toread, &bytes_avail_towrite); - old_read = bytes_avail_toread; - /* Make sure there is something to read */ if (bytes_avail_toread < buflen) { spin_unlock_irqrestore(&inring_info->ring_lock, flags); @@ -560,7 +552,7 @@ int hv_ringbuffer_read(struct hv_ring_buffer_info *inring_info, void *buffer, spin_unlock_irqrestore(&inring_info->ring_lock, flags); - *signal = hv_need_to_signal_on_read(old_read, inring_info); + *signal = hv_need_to_signal_on_read(bytes_avail_towrite, inring_info); return 0; } diff --git a/kernel/drivers/hv/vmbus_drv.c b/kernel/drivers/hv/vmbus_drv.c index c85235e9f..f19b6f7a4 100644 --- a/kernel/drivers/hv/vmbus_drv.c +++ b/kernel/drivers/hv/vmbus_drv.c @@ -39,6 +39,8 @@ #include <asm/mshyperv.h> #include <linux/notifier.h> #include <linux/ptrace.h> +#include <linux/screen_info.h> +#include <linux/kdebug.h> #include "hyperv_vmbus.h" static struct acpi_device *hv_acpi_dev; @@ -48,12 +50,18 @@ static struct completion probe_event; static int irq; -static int hyperv_panic_event(struct notifier_block *nb, - unsigned long event, void *ptr) +static void hyperv_report_panic(struct pt_regs *regs) { - struct pt_regs *regs; + static bool panic_reported; - regs = current_pt_regs(); + /* + * We prefer to report panic on 'die' chain as we have proper + * registers to report, but if we miss it (e.g. on BUG()) we need + * to report it on 'panic'. + */ + if (panic_reported) + return; + panic_reported = true; wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip); wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax); @@ -65,18 +73,37 @@ static int hyperv_panic_event(struct notifier_block *nb, * Let Hyper-V know there is crash data available */ wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); +} + +static int hyperv_panic_event(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct pt_regs *regs; + + regs = current_pt_regs(); + + hyperv_report_panic(regs); + return NOTIFY_DONE; +} + +static int hyperv_die_event(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *die = (struct die_args *)args; + struct pt_regs *regs = die->regs; + + hyperv_report_panic(regs); return NOTIFY_DONE; } +static struct notifier_block hyperv_die_block = { + .notifier_call = hyperv_die_event, +}; static struct notifier_block hyperv_panic_block = { .notifier_call = hyperv_panic_event, }; -struct resource hyperv_mmio = { - .name = "hyperv mmio", - .flags = IORESOURCE_MEM, -}; -EXPORT_SYMBOL_GPL(hyperv_mmio); +struct resource *hyperv_mmio; static int vmbus_exists(void) { @@ -414,6 +441,43 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, } static DEVICE_ATTR_RO(in_write_bytes_avail); +static ssize_t channel_vp_mapping_show(struct device *dev, + struct device_attribute *dev_attr, + char *buf) +{ + struct hv_device *hv_dev = device_to_hv_device(dev); + struct vmbus_channel *channel = hv_dev->channel, *cur_sc; + unsigned long flags; + int buf_size = PAGE_SIZE, n_written, tot_written; + struct list_head *cur; + + if (!channel) + return -ENODEV; + + tot_written = snprintf(buf, buf_size, "%u:%u\n", + channel->offermsg.child_relid, channel->target_cpu); + + spin_lock_irqsave(&channel->lock, flags); + + list_for_each(cur, &channel->sc_list) { + if (tot_written >= buf_size - 1) + break; + + cur_sc = list_entry(cur, struct vmbus_channel, sc_list); + n_written = scnprintf(buf + tot_written, + buf_size - tot_written, + "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); + tot_written += n_written; + } + + spin_unlock_irqrestore(&channel->lock, flags); + + return tot_written; +} +static DEVICE_ATTR_RO(channel_vp_mapping); + /* Set up per device attributes in /sys/bus/vmbus/devices/<bus device> */ static struct attribute *vmbus_attrs[] = { &dev_attr_id.attr, @@ -438,6 +502,7 @@ static struct attribute *vmbus_attrs[] = { &dev_attr_in_write_index.attr, &dev_attr_in_read_bytes_avail.attr, &dev_attr_in_write_bytes_avail.attr, + &dev_attr_channel_vp_mapping.attr, NULL, }; ATTRIBUTE_GROUPS(vmbus); @@ -763,38 +828,6 @@ static void vmbus_isr(void) } } -#ifdef CONFIG_HOTPLUG_CPU -static int hyperv_cpu_disable(void) -{ - return -ENOSYS; -} - -static void hv_cpu_hotplug_quirk(bool vmbus_loaded) -{ - static void *previous_cpu_disable; - - /* - * Offlining a CPU when running on newer hypervisors (WS2012R2, Win8, - * ...) is not supported at this moment as channel interrupts are - * distributed across all of them. - */ - - if ((vmbus_proto_version == VERSION_WS2008) || - (vmbus_proto_version == VERSION_WIN7)) - return; - - if (vmbus_loaded) { - previous_cpu_disable = smp_ops.cpu_disable; - smp_ops.cpu_disable = hyperv_cpu_disable; - pr_notice("CPU offlining is not supported by hypervisor\n"); - } else if (previous_cpu_disable) - smp_ops.cpu_disable = previous_cpu_disable; -} -#else -static void hv_cpu_hotplug_quirk(bool vmbus_loaded) -{ -} -#endif /* * vmbus_bus_init -Main vmbus driver initialization routine. @@ -836,12 +869,14 @@ static int vmbus_bus_init(int irq) if (ret) goto err_alloc; - hv_cpu_hotplug_quirk(true); + if (vmbus_proto_version > VERSION_WIN7) + cpu_hotplug_disable(); /* * Only register if the crash MSRs are available */ - if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { + register_die_notifier(&hyperv_die_block); atomic_notifier_chain_register(&panic_notifier_list, &hyperv_panic_block); } @@ -863,8 +898,8 @@ err_cleanup: } /** - * __vmbus_child_driver_register - Register a vmbus's driver - * @drv: Pointer to driver structure you want to register + * __vmbus_child_driver_register() - Register a vmbus's driver + * @hv_driver: Pointer to driver structure you want to register * @owner: owner module of the drv * @mod_name: module name string * @@ -896,7 +931,8 @@ EXPORT_SYMBOL_GPL(__vmbus_driver_register); /** * vmbus_driver_unregister() - Unregister a vmbus's driver - * @drv: Pointer to driver structure you want to un-register + * @hv_driver: Pointer to driver structure you want to + * un-register * * Un-register the given driver that was previous registered with a call to * vmbus_driver_register() @@ -982,30 +1018,184 @@ void vmbus_device_unregister(struct hv_device *device_obj) /* - * VMBUS is an acpi enumerated device. Get the the information we + * VMBUS is an acpi enumerated device. Get the information we * need from DSDT. */ - +#define VTPM_BASE_ADDRESS 0xfed40000 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) { + resource_size_t start = 0; + resource_size_t end = 0; + struct resource *new_res; + struct resource **old_res = &hyperv_mmio; + struct resource **prev_res = NULL; + switch (res->type) { case ACPI_RESOURCE_TYPE_IRQ: irq = res->data.irq.interrupts[0]; + return AE_OK; + + /* + * "Address" descriptors are for bus windows. Ignore + * "memory" descriptors, which are for registers on + * devices. + */ + case ACPI_RESOURCE_TYPE_ADDRESS32: + start = res->data.address32.address.minimum; + end = res->data.address32.address.maximum; break; case ACPI_RESOURCE_TYPE_ADDRESS64: - hyperv_mmio.start = res->data.address64.address.minimum; - hyperv_mmio.end = res->data.address64.address.maximum; + start = res->data.address64.address.minimum; + end = res->data.address64.address.maximum; break; + + default: + /* Unused resource type */ + return AE_OK; + } + /* + * Ignore ranges that are below 1MB, as they're not + * necessary or useful here. + */ + if (end < 0x100000) + return AE_OK; + + new_res = kzalloc(sizeof(*new_res), GFP_ATOMIC); + if (!new_res) + return AE_NO_MEMORY; + + /* If this range overlaps the virtual TPM, truncate it. */ + if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS) + end = VTPM_BASE_ADDRESS; + + new_res->name = "hyperv mmio"; + new_res->flags = IORESOURCE_MEM; + new_res->start = start; + new_res->end = end; + + do { + if (!*old_res) { + *old_res = new_res; + break; + } + + if ((*old_res)->end < new_res->start) { + new_res->sibling = *old_res; + if (prev_res) + (*prev_res)->sibling = new_res; + *old_res = new_res; + break; + } + + prev_res = old_res; + old_res = &(*old_res)->sibling; + + } while (1); return AE_OK; } +static int vmbus_acpi_remove(struct acpi_device *device) +{ + struct resource *cur_res; + struct resource *next_res; + + if (hyperv_mmio) { + for (cur_res = hyperv_mmio; cur_res; cur_res = next_res) { + next_res = cur_res->sibling; + kfree(cur_res); + } + } + + return 0; +} + +/** + * vmbus_allocate_mmio() - Pick a memory-mapped I/O range. + * @new: If successful, supplied a pointer to the + * allocated MMIO space. + * @device_obj: Identifies the caller + * @min: Minimum guest physical address of the + * allocation + * @max: Maximum guest physical address + * @size: Size of the range to be allocated + * @align: Alignment of the range to be allocated + * @fb_overlap_ok: Whether this allocation can be allowed + * to overlap the video frame buffer. + * + * This function walks the resources granted to VMBus by the + * _CRS object in the ACPI namespace underneath the parent + * "bridge" whether that's a root PCI bus in the Generation 1 + * case or a Module Device in the Generation 2 case. It then + * attempts to allocate from the global MMIO pool in a way that + * matches the constraints supplied in these parameters and by + * that _CRS. + * + * Return: 0 on success, -errno on failure + */ +int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, + resource_size_t min, resource_size_t max, + resource_size_t size, resource_size_t align, + bool fb_overlap_ok) +{ + struct resource *iter; + resource_size_t range_min, range_max, start, local_min, local_max; + const char *dev_n = dev_name(&device_obj->device); + u32 fb_end = screen_info.lfb_base + (screen_info.lfb_size << 1); + int i; + + for (iter = hyperv_mmio; iter; iter = iter->sibling) { + if ((iter->start >= max) || (iter->end <= min)) + continue; + + range_min = iter->start; + range_max = iter->end; + + /* If this range overlaps the frame buffer, split it into + two tries. */ + for (i = 0; i < 2; i++) { + local_min = range_min; + local_max = range_max; + if (fb_overlap_ok || (range_min >= fb_end) || + (range_max <= screen_info.lfb_base)) { + i++; + } else { + if ((range_min <= screen_info.lfb_base) && + (range_max >= screen_info.lfb_base)) { + /* + * The frame buffer is in this window, + * so trim this into the part that + * preceeds the frame buffer. + */ + local_max = screen_info.lfb_base - 1; + range_min = fb_end; + } else { + range_min = fb_end; + continue; + } + } + + start = (local_min + align - 1) & ~(align - 1); + for (; start + size - 1 <= local_max; start += align) { + *new = request_mem_region_exclusive(start, size, + dev_n); + if (*new) + return 0; + } + } + } + + return -ENXIO; +} +EXPORT_SYMBOL_GPL(vmbus_allocate_mmio); + static int vmbus_acpi_add(struct acpi_device *device) { acpi_status result; int ret_val = -ENODEV; + struct acpi_device *ancestor; hv_acpi_dev = device; @@ -1015,23 +1205,24 @@ static int vmbus_acpi_add(struct acpi_device *device) if (ACPI_FAILURE(result)) goto acpi_walk_err; /* - * The parent of the vmbus acpi device (Gen2 firmware) is the VMOD that - * has the mmio ranges. Get that. + * Some ancestor of the vmbus acpi device (Gen1 or Gen2 + * firmware) is the VMOD that has the mmio ranges. Get that. */ - if (device->parent) { - result = acpi_walk_resources(device->parent->handle, - METHOD_NAME__CRS, - vmbus_walk_resources, NULL); + for (ancestor = device->parent; ancestor; ancestor = ancestor->parent) { + result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS, + vmbus_walk_resources, NULL); if (ACPI_FAILURE(result)) - goto acpi_walk_err; - if (hyperv_mmio.start && hyperv_mmio.end) - request_resource(&iomem_resource, &hyperv_mmio); + continue; + if (hyperv_mmio) + break; } ret_val = 0; acpi_walk_err: complete(&probe_event); + if (ret_val) + vmbus_acpi_remove(device); return ret_val; } @@ -1047,9 +1238,33 @@ static struct acpi_driver vmbus_acpi_driver = { .ids = vmbus_acpi_device_ids, .ops = { .add = vmbus_acpi_add, + .remove = vmbus_acpi_remove, }, }; +static void hv_kexec_handler(void) +{ + int cpu; + + hv_synic_clockevents_cleanup(); + vmbus_initiate_unload(); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1); + hv_cleanup(); +}; + +static void hv_crash_handler(struct pt_regs *regs) +{ + vmbus_initiate_unload(); + /* + * In crash handler we can't schedule synic cleanup for all CPUs, + * doing the cleanup for current CPU only. This should be sufficient + * for kdump. + */ + hv_synic_cleanup(NULL); + hv_cleanup(); +}; + static int __init hv_acpi_init(void) { int ret, t; @@ -1082,6 +1297,9 @@ static int __init hv_acpi_init(void) if (ret) goto cleanup; + hv_setup_kexec_handler(hv_kexec_handler); + hv_setup_crash_handler(hv_crash_handler); + return 0; cleanup: @@ -1094,17 +1312,29 @@ static void __exit vmbus_exit(void) { int cpu; + hv_remove_kexec_handler(); + hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; hv_synic_clockevents_cleanup(); + vmbus_disconnect(); hv_remove_vmbus_irq(); + tasklet_kill(&msg_dpc); vmbus_free_channels(); + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { + unregister_die_notifier(&hyperv_die_block); + atomic_notifier_chain_unregister(&panic_notifier_list, + &hyperv_panic_block); + } bus_unregister(&hv_bus); hv_cleanup(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + tasklet_kill(hv_context.event_dpc[cpu]); smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1); + } + hv_synic_free(); acpi_bus_unregister_driver(&vmbus_acpi_driver); - hv_cpu_hotplug_quirk(false); - vmbus_disconnect(); + if (vmbus_proto_version > VERSION_WIN7) + cpu_hotplug_enable(); } |