hv.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <linux/random.h>
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
#include "hyperv_vmbus.h"

/* The one and only */
struct hv_context hv_context;

/*
 * hv_init - Main initialization routine.
 *
 * This routine must be called before any other routines in here are called
 */
int hv_init(void)
{
	hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context);
	if (!hv_context.cpu_context)
		return -ENOMEM;
	return 0;
}

/*
 * hv_post_message - Post a message using the hypervisor message IPC.
 *
 * This involves a hypercall.
 */
int hv_post_message(union hv_connection_id connection_id,
		    enum hv_message_type message_type,
		    void *payload, size_t payload_size)
{
	struct hv_input_post_message *aligned_msg;
	unsigned long flags;
	u64 status;

	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
		return -EMSGSIZE;

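	/*
	 * Disable interrupts so the per-CPU hypercall input page (or the
	 * dedicated post_msg_page) is not reused by code running in
	 * interrupt context while this message is assembled and posted.
	 */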
	local_irq_save(flags);

	/*
	 * A TDX VM with the paravisor must use the decrypted post_msg_page: see
	 * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor
	 * can use the encrypted hyperv_pcpu_input_arg because it copies the
	 * input into the GHCB page, which has been decrypted by the paravisor.
	 */
	if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present)
		aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page;
	else
		aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);

	aligned_msg->connectionid = connection_id;
	aligned_msg->reserved = 0;
	aligned_msg->message_type = message_type;
	aligned_msg->payload_size = payload_size;
	memcpy((void *)aligned_msg->payload, payload, payload_size);

	if (ms_hyperv.paravisor_present) {
		if (hv_isolation_type_tdx())
			status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
						  virt_to_phys(aligned_msg), 0);
		else if (hv_isolation_type_snp())
			status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
						   aligned_msg, NULL,
						   sizeof(*aligned_msg));
		else
			status = HV_STATUS_INVALID_PARAMETER;
	} else {
		status = hv_do_hypercall(HVCALL_POST_MESSAGE,
					 aligned_msg, NULL);
	}

	local_irq_restore(flags);

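	/* hv_result() extracts the HV_STATUS_* code from the hypercall return value. */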
	return hv_result(status);
}

int hv_synic_alloc(void)
{
	int cpu, ret = -ENOMEM;
	struct hv_per_cpu_context *hv_cpu;

	/*
	 * First, zero all per-cpu memory areas so hv_synic_free() can
	 * detect what memory has been allocated and clean up properly
	 * after any failures.
	 */
	for_each_present_cpu(cpu) {
		hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
		memset(hv_cpu, 0, sizeof(*hv_cpu));
	}

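	/*
	 * One cpumask per NUMA node; used by the VMBus channel code when
	 * choosing target CPUs so that channel interrupts are spread
	 * across nodes.
	 */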
	hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
					 GFP_KERNEL);
	if (!hv_context.hv_numa_map) {
		pr_err("Unable to allocate NUMA map\n");
		goto err;
	}

	for_each_present_cpu(cpu) {
		hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);

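		/*
		 * Defer VMBus message handling from the synic interrupt path
		 * to tasklet (softirq) context via vmbus_on_msg_dpc().
		 */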
		tasklet_init(&hv_cpu->msg_dpc,
			     vmbus_on_msg_dpc, (unsigned long)hv_cpu);

		if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
			hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
			if (!hv_cpu->post_msg_page) {
				pr_err("Unable to allocate post msg page\n");
				goto err;
			}

			ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
			if (ret) {
				pr_err("Failed to decrypt post msg page: %d\n", ret);
				/* Just leak the page, as it's unsafe to free the page. */
				hv_cpu->post_msg_page = NULL;
				goto err;
			}
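			/*
			 * The page contents are not guaranteed to be preserved
			 * across the encryption attribute change, so clear the
			 * page again before first use.
			 */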
			memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
		}

		/*
		 * Synic message and event pages are allocated by the paravisor.
		 * Skip allocating them here.
		 */
		if (!ms_hyperv.paravisor_present && !hv_root_partition) {
			hv_cpu->synic_message_page =
				(void *)get_zeroed_page(GFP_ATOMIC);
			if (!hv_cpu->synic_message_page) {
				pr_err("Unable to allocate SYNIC message page\n");
				goto err;
			}

			hv_cpu->synic_event_page =
				(void *)get_zeroed_page(GFP_ATOMIC);
			if (!hv_cpu->synic_event_page) {
				pr_err("Unable to allocate SYNIC event page\n");
				free_page((unsigned long)hv_cpu->synic_message_page);
				hv_cpu->synic_message_page = NULL;
				goto err;
			}
		}

		if (!ms_hyperv.paravisor_present &&
		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
			ret = set_memory_decrypted((unsigned long)
				hv_cpu->synic_message_page, 1);
			if (ret) {
				pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
				hv_cpu->synic_message_page = NULL;

				/*
				 * Free the event page here so that hv_synic_free()
				 * won't later try to re-encrypt it.
				 */
				free_page((unsigned long)hv_cpu->synic_event_page);
				hv_cpu->synic_event_page = NULL;
				goto err;
			}

			ret = set_memory_decrypted((unsigned long)
				hv_cpu->synic_event_page, 1);
			if (ret) {
				pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
				hv_cpu->synic_event_page = NULL;
				goto err;
			}

			memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
			memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
		}
	}

	return 0;

err:
	/*
	 * Any memory allocations that succeeded will be freed when
	 * the caller cleans up by calling hv_synic_free()
	 */
	return ret;
}
void hv_synic_free(void)
{
	int cpu, ret;

	for_each_present_cpu(cpu) {
		struct hv_per_cpu_context *hv_cpu =
			per_cpu_ptr(hv_context.cpu_context, cpu);

		/* It's better to leak the page if the encryption fails. */
		if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
			if (hv_cpu->post_msg_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->post_msg_page, 1);
				if (ret) {
					pr_err("Failed to encrypt post msg page: %d\n", ret);
					hv_cpu->post_msg_page = NULL;
				}
			}
		}

		if (!ms_hyperv.paravisor_present &&
		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
			if (hv_cpu->synic_message_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->synic_message_page, 1);
				if (ret) {
					pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
					hv_cpu->synic_message_page = NULL;
				}
			}

			if (hv_cpu->synic_event_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->synic_event_page, 1);
				if (ret) {
					pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
					hv_cpu->synic_event_page = NULL;
				}
			}
		}

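		/*
		 * free_page() ignores a zero address, so pages that were set
		 * to NULL above after a failed re-encryption are deliberately
		 * leaked rather than returned to the allocator while still
		 * decrypted.
		 */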
		free_page((unsigned long)hv_cpu->post_msg_page);
		free_page((unsigned long)hv_cpu->synic_event_page);
		free_page((unsigned long)hv_cpu->synic_message_page);
	}

	kfree(hv_context.hv_numa_map);
}
/*
 * hv_synic_init - Initialize the Synthetic Interrupt Controller.
 *
 * If it is already initialized by another entity (i.e. x2v shim), we need to
 * retrieve the initialized message and event pages. Otherwise, we create and
 * initialize the message and event pages.
 */
void hv_synic_enable_regs(unsigned int cpu)
{
	struct hv_per_cpu_context *hv_cpu =
		per_cpu_ptr(hv_context.cpu_context, cpu);
	union hv_synic_simp simp;
	union hv_synic_siefp siefp;
	union hv_synic_sint shared_sint;
	union hv_synic_scontrol sctrl;

	/* Setup the Synic's message page */
	simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
	simp.simp_enabled = 1;

	if (ms_hyperv.paravisor_present || hv_root_partition) {
		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
		u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
				~ms_hyperv.shared_gpa_boundary;
		hv_cpu->synic_message_page =
			(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
		if (!hv_cpu->synic_message_page)
			pr_err("Fail to map synic message page.\n");
	} else {
		simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
			>> HV_HYP_PAGE_SHIFT;
	}

	hv_set_msr(HV_MSR_SIMP, simp.as_uint64);

	/* Setup the Synic's event page */
	siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
	siefp.siefp_enabled = 1;

	if (ms_hyperv.paravisor_present || hv_root_partition) {
		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
		u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
				~ms_hyperv.shared_gpa_boundary;
		hv_cpu->synic_event_page =
			(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
		if (!hv_cpu->synic_event_page)
			pr_err("Fail to map synic event page.\n");
	} else {
		siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
			>> HV_HYP_PAGE_SHIFT;
	}

	hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);

	/* Setup the shared SINT. */
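	/*
	 * vmbus_irq is -1 when the VMBus interrupt arrives on a dedicated
	 * architectural vector (as on x86) rather than as a per-CPU IRQ
	 * (as on ARM64), in which case there is no IRQ to enable here.
	 */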
	if (vmbus_irq != -1)
		enable_percpu_irq(vmbus_irq, 0);

	shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);

	shared_sint.vector = vmbus_interrupt;
	shared_sint.masked = false;

	/*
	 * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
	 * it doesn't provide a recommendation flag and AEOI must be disabled.
	 */
#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
	shared_sint.auto_eoi =
			!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
#else
	shared_sint.auto_eoi = 0;
#endif

	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);

	/* Enable the global synic bit */
	sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
	sctrl.enable = 1;

	hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
}

int hv_synic_init(unsigned int cpu)
{
	hv_synic_enable_regs(cpu);

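	/*
	 * Route the legacy Hyper-V stimer, if that path is in use (i.e. the
	 * stimers are not operating in direct mode), to the same SINT that
	 * VMBus messages use.
	 */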
	hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);

	return 0;
}

void hv_synic_disable_regs(unsigned int cpu)
{
	struct hv_per_cpu_context *hv_cpu =
		per_cpu_ptr(hv_context.cpu_context, cpu);
	union hv_synic_sint shared_sint;
	union hv_synic_simp simp;
	union hv_synic_siefp siefp;
	union hv_synic_scontrol sctrl;

	shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);

	shared_sint.masked = 1;

	/* Need to correctly cleanup in the case of SMP!!! */
	/* Disable the interrupt */
	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);

	simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
	/*
	 * In an isolation VM, the SIMP and SIEFP pages are allocated by the
	 * paravisor. These pages will also be used by the kdump kernel, so
	 * just reset the enable bit here and keep the page addresses.
	 */
	simp.simp_enabled = 0;
	if (ms_hyperv.paravisor_present || hv_root_partition) {
		iounmap(hv_cpu->synic_message_page);
		hv_cpu->synic_message_page = NULL;
	} else {
		simp.base_simp_gpa = 0;
	}

	hv_set_msr(HV_MSR_SIMP, simp.as_uint64);

	siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
	siefp.siefp_enabled = 0;

	if (ms_hyperv.paravisor_present || hv_root_partition) {
		iounmap(hv_cpu->synic_event_page);
		hv_cpu->synic_event_page = NULL;
	} else {
		siefp.base_siefp_gpa = 0;
	}

	hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);

	/* Disable the global synic bit */
	sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
	sctrl.enable = 0;
	hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);

	if (vmbus_irq != -1)
		disable_percpu_irq(vmbus_irq);
}
#define HV_MAX_TRIES 3
/*
 * Scan the event flags page of 'this' CPU looking for any bit that is set.  If we find one
 * bit set, then wait for a few milliseconds.  Repeat these steps for a maximum of 3 times.
 * Return 'true', if there is still any set bit after this operation; 'false', otherwise.
 *
 * If a bit is set, that means there is a pending channel interrupt.  The expectation is
 * that the normal interrupt handling mechanism will find and process the channel interrupt
 * "very soon", and in the process clear the bit.
 */
static bool hv_synic_event_pending(void)
{
	struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
	union hv_synic_event_flags *event =
		(union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
	unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
	bool pending;
	u32 relid;
	int tries = 0;

retry:
	pending = false;
	for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
		/* Special case - VMBus channel protocol messages */
		if (relid == 0)
			continue;
		pending = true;
		break;
	}
	if (pending && tries++ < HV_MAX_TRIES) {
		usleep_range(10000, 20000);
		goto retry;
	}
	return pending;
}
/*
 * hv_synic_cleanup - Cleanup routine for hv_synic_init().
 */
int hv_synic_cleanup(unsigned int cpu)
{
	struct vmbus_channel *channel, *sc;
	bool channel_found = false;

	if (vmbus_connection.conn_state != CONNECTED)
		goto always_cleanup;

	/*
	 * Hyper-V does not provide a way to change the connect CPU once
	 * it is set; we must prevent the connect CPU from going offline
	 * while the VM is running normally. But in the panic or kexec()
	 * path where the vmbus is already disconnected, the CPU must be
	 * allowed to shut down.
	 */
	if (cpu == VMBUS_CONNECT_CPU)
		return -EBUSY;

	/*
	 * Search for channels which are bound to the CPU we're about to
	 * cleanup.  In case we find one and vmbus is still connected, we
	 * fail; this will effectively prevent CPU offlining.
	 *
	 * TODO: Re-bind the channels to different CPUs.
	 */
	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (channel->target_cpu == cpu) {
			channel_found = true;
			break;
		}
		list_for_each_entry(sc, &channel->sc_list, sc_list) {
			if (sc->target_cpu == cpu) {
				channel_found = true;
				break;
			}
		}
		if (channel_found)
			break;
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	if (channel_found)
		return -EBUSY;

	/*
	 * channel_found == false means that any channels that were previously
	 * assigned to the CPU have been reassigned elsewhere with a call of
	 * vmbus_send_modifychannel().  Scan the event flags page looking for
	 * bits that are set and waiting with a timeout for vmbus_chan_sched()
	 * to process such bits.  If bits are still set after this operation
	 * and VMBus is connected, fail the CPU offlining operation.
	 */
	if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
		return -EBUSY;

always_cleanup:
	hv_stimer_legacy_cleanup(cpu);

	hv_synic_disable_regs(cpu);

	return 0;
}