// SPDX-License-Identifier: GPL-2.0-only
/*
 * Hyper-V transport for vsock
 *
 * Hyper-V Sockets supplies a byte-stream-based communication mechanism
 * between the host and the VM. This driver implements the necessary
 * support in the VM by introducing the new vsock transport.
 *
 * Copyright (c) 2017, Microsoft Corporation.
 */
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <net/sock.h>
#include <net/af_vsock.h>
#include <asm/hyperv-tlfs.h>

/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
 * stricter requirements on the hv_sock ring buffer size of six 4K pages.
 * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this
 * limitation, but keep the defaults the same for compatibility.
 */
#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6)
#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6)
#define RINGBUFFER_HVS_MAX_SIZE (HV_HYP_PAGE_SIZE * 64)

/* The MTU is 16KB per the host side's design */
#define HVS_MTU_SIZE		(1024 * 16)

/* How long to wait for graceful shutdown of a connection */
#define HVS_CLOSE_TIMEOUT	(8 * HZ)

struct vmpipe_proto_header {
	u32 pkt_type;
	u32 data_size;
};

/* For recv, we use the VMBus in-place packet iterator APIs to directly copy
 * data from the ringbuffer into the userspace buffer.
 */
struct hvs_recv_buf {
	/* The header before the payload data */
	struct vmpipe_proto_header hdr;

	/* The payload */
	u8 data[HVS_MTU_SIZE];
};

/* We can send up to HVS_MTU_SIZE bytes of payload to the host, but let's use
 * a smaller size, i.e. HVS_SEND_BUF_SIZE, to maximize concurrency between the
 * guest and the host processing, as one VMBUS packet is the smallest
 * processing unit.
 *
 * Note: the buffer can be eliminated in the future when we add new VMBus
 * ringbuffer APIs that allow us to directly copy data from a userspace
 * buffer to the VMBus ringbuffer.
 */
#define HVS_SEND_BUF_SIZE \
		(HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header))

struct hvs_send_buf {
	/* The header before the payload data */
	struct vmpipe_proto_header hdr;

	/* The payload */
	u8 data[HVS_SEND_BUF_SIZE];
};

#define HVS_HEADER_LEN	(sizeof(struct vmpacket_descriptor) + \
			 sizeof(struct vmpipe_proto_header))

/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write(), and
 * __hv_pkt_iter_next().
 */
#define VMBUS_PKT_TRAILER_SIZE	(sizeof(u64))

#define HVS_PKT_LEN(payload_len)	(HVS_HEADER_LEN + \
					 ALIGN((payload_len), 8) + \
					 VMBUS_PKT_TRAILER_SIZE)

/* Upper bound on the size of a VMbus packet for hv_sock */
#define HVS_MAX_PKT_SIZE	HVS_PKT_LEN(HVS_MTU_SIZE)

union hvs_service_id {
	guid_t	srv_id;

	struct {
		unsigned int svm_port;
		unsigned char b[sizeof(guid_t) - sizeof(unsigned int)];
	};
};

/* Per-socket state (accessed via vsk->trans) */
struct hvsock {
	struct vsock_sock *vsk;

	guid_t vm_srv_id;
	guid_t host_srv_id;

	struct vmbus_channel *chan;
	struct vmpacket_descriptor *recv_desc;

	/* The length of the payload not delivered to userland yet */
	u32 recv_data_len;
	/* The offset of the payload */
	u32 recv_data_off;

	/* Have we sent the zero-length packet (FIN)? */
	bool fin_sent;
};

/* In the VM, we support Hyper-V Sockets with AF_VSOCK, and the endpoint is
 * <cid, port> (see struct sockaddr_vm). Note: cid is not really used here:
 * when we write apps to connect to the host, we can only use VMADDR_CID_ANY
 * or VMADDR_CID_HOST (both are equivalent) as the remote cid, and when we
 * write apps to bind() & listen() in the VM, we can only use VMADDR_CID_ANY
 * as the local cid.
 *
 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-guide/make-integration-service
 * and the endpoint is <VmID, ServiceId> with the below sockaddr:
 *
 * struct SOCKADDR_HV
 * {
 *	ADDRESS_FAMILY Family;
 *	USHORT Reserved;
 *	GUID VmId;
 *	GUID ServiceId;
 * };
 *
 * Note: VmID is not used by the Linux VM, and it actually isn't transmitted
 * via VMBus, because here it's obvious that the host and the VM can easily
 * identify each other. Though the VmID is useful on the host, especially in
 * the case of Windows containers, the Linux VM doesn't need it at all.
 *
 * To make use of the AF_VSOCK infrastructure in the Linux VM, we have to
 * limit the available GUID space of SOCKADDR_HV so that we can create a
 * mapping between the AF_VSOCK port and the SOCKADDR_HV Service GUID. The
 * rule for writing Hyper-V Sockets apps on the host and in the Linux VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * Linux VM, that can be connected by the other end, must conform to this   *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in the Linux VM to connect(), we only need to specify
 * the port, and the driver will form the GUID and use that to request the
 * host.
 */

/* 00000000-facb-11e6-bd58-64006a7986d3 */
static const guid_t srv_id_template =
	GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
		  0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
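
/* For illustration only (a hedged sketch, not part of this driver; the
 * service port 0x808 is hypothetical): a Linux VM app connecting to host
 * service port 0x808 would do, in userspace:
 *
 *	struct sockaddr_vm sa = {
 *		.svm_family = AF_VSOCK,
 *		.svm_cid    = VMADDR_CID_HOST,
 *		.svm_port   = 0x808,
 *	};
 *	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);
 *	connect(fd, (struct sockaddr *)&sa, sizeof(sa));
 *
 * and the matching host-side Winsock listener would register the Service
 * GUID 00000808-facb-11e6-bd58-64006a7986d3, i.e. the port is simply the
 * first 32-bit field of the template GUID above.
 */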

static bool hvs_check_transport(struct vsock_sock *vsk);

static bool is_valid_srv_id(const guid_t *id)
{
	return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4);
}

static unsigned int get_port_by_srv_id(const guid_t *svr_id)
{
	return *((unsigned int *)svr_id);
}

static void hvs_addr_init(struct sockaddr_vm *addr, const guid_t *svr_id)
{
	unsigned int port = get_port_by_srv_id(svr_id);

	vsock_addr_init(addr, VMADDR_CID_ANY, port);
}

static void hvs_set_channel_pending_send_size(struct vmbus_channel *chan)
{
	set_channel_pending_send_size(chan,
				      HVS_PKT_LEN(HVS_SEND_BUF_SIZE));

	virt_mb();
}

static bool hvs_channel_readable(struct vmbus_channel *chan)
{
	u32 readable = hv_get_bytes_to_read(&chan->inbound);

	/* 0-size payload means FIN */
	return readable >= HVS_PKT_LEN(0);
}

static int hvs_channel_readable_payload(struct vmbus_channel *chan)
{
	u32 readable = hv_get_bytes_to_read(&chan->inbound);

	if (readable > HVS_PKT_LEN(0)) {
		/* At least we have 1 byte to read. We don't need to return
		 * the exact readable bytes: see vsock_stream_recvmsg() ->
		 * vsock_stream_has_data().
		 */
		return 1;
	}

	if (readable == HVS_PKT_LEN(0)) {
		/* 0-size payload means FIN */
		return 0;
	}

	/* No payload or FIN */
	return -1;
}

static size_t hvs_channel_writable_bytes(struct vmbus_channel *chan)
{
	u32 writeable = hv_get_bytes_to_write(&chan->outbound);
	size_t ret;

	/* The ringbuffer mustn't be 100% full, and we should reserve a
	 * zero-length-payload packet for the FIN: see hv_ringbuffer_write()
	 * and hvs_shutdown().
	 */
	if (writeable <= HVS_PKT_LEN(1) + HVS_PKT_LEN(0))
		return 0;

	ret = writeable - HVS_PKT_LEN(1) - HVS_PKT_LEN(0);

	return round_down(ret, 8);
}
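
/* Worked example for the reservation above (same sketch assumption that
 * HVS_HEADER_LEN == 24, so HVS_PKT_LEN(1) == 40 and HVS_PKT_LEN(0) == 32):
 * with 10000 bytes free in the outbound ring, we report
 * round_down(10000 - 40 - 32, 8) == 9928 writable payload bytes, keeping
 * room for one minimal data packet plus the zero-length FIN packet.
 */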

static int __hvs_send_data(struct vmbus_channel *chan,
			   struct vmpipe_proto_header *hdr,
			   size_t to_write)
{
	hdr->pkt_type = 1;
	hdr->data_size = to_write;
	return vmbus_sendpacket(chan, hdr, sizeof(*hdr) + to_write,
				0, VM_PKT_DATA_INBAND, 0);
}

static int hvs_send_data(struct vmbus_channel *chan,
			 struct hvs_send_buf *send_buf, size_t to_write)
{
	return __hvs_send_data(chan, &send_buf->hdr, to_write);
}

static void hvs_channel_cb(void *ctx)
{
	struct sock *sk = (struct sock *)ctx;
	struct vsock_sock *vsk = vsock_sk(sk);
	struct hvsock *hvs = vsk->trans;
	struct vmbus_channel *chan = hvs->chan;

	if (hvs_channel_readable(chan))
		sk->sk_data_ready(sk);

	if (hv_get_bytes_to_write(&chan->outbound) > 0)
		sk->sk_write_space(sk);
}

static void hvs_do_close_lock_held(struct vsock_sock *vsk,
				   bool cancel_timeout)
{
	struct sock *sk = sk_vsock(vsk);

	sock_set_flag(sk, SOCK_DONE);
	vsk->peer_shutdown = SHUTDOWN_MASK;
	if (vsock_stream_has_data(vsk) <= 0)
		sk->sk_state = TCP_CLOSING;
	sk->sk_state_change(sk);
	if (vsk->close_work_scheduled &&
	    (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
		vsk->close_work_scheduled = false;
		vsock_remove_sock(vsk);

		/* Release the reference taken while scheduling the timeout */
		sock_put(sk);
	}
}

static void hvs_close_connection(struct vmbus_channel *chan)
{
	struct sock *sk = get_per_channel_state(chan);

	lock_sock(sk);
	hvs_do_close_lock_held(vsock_sk(sk), true);
	release_sock(sk);

	/* Release the refcnt for the channel that's opened in
	 * hvs_open_connection().
	 */
	sock_put(sk);
}

static void hvs_open_connection(struct vmbus_channel *chan)
{
	guid_t *if_instance, *if_type;
	unsigned char conn_from_host;
	struct sockaddr_vm addr;
	struct sock *sk, *new = NULL;
	struct vsock_sock *vnew = NULL;
	struct hvsock *hvs = NULL;
	struct hvsock *hvs_new = NULL;
	int rcvbuf;
	int ret;
	int sndbuf;

	if_type = &chan->offermsg.offer.if_type;
	if_instance = &chan->offermsg.offer.if_instance;
	conn_from_host = chan->offermsg.offer.u.pipe.user_def[0];

	if (!is_valid_srv_id(if_type))
		return;

	hvs_addr_init(&addr, conn_from_host ? if_type : if_instance);
	sk = vsock_find_bound_socket(&addr);
	if (!sk)
		return;

	lock_sock(sk);
	if ((conn_from_host && sk->sk_state != TCP_LISTEN) ||
	    (!conn_from_host && sk->sk_state != TCP_SYN_SENT))
		goto out;

	if (conn_from_host) {
		if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog)
			goto out;

		new = vsock_create_connected(sk);
		if (!new)
			goto out;

		new->sk_state = TCP_SYN_SENT;
		vnew = vsock_sk(new);

		hvs_addr_init(&vnew->local_addr, if_type);

		/* Remote peer is always the host */
		vsock_addr_init(&vnew->remote_addr,
				VMADDR_CID_HOST, VMADDR_PORT_ANY);
		vnew->remote_addr.svm_port = get_port_by_srv_id(if_instance);
		ret = vsock_assign_transport(vnew, vsock_sk(sk));
		/* The assigned transport (chosen by looking at remote_addr)
		 * must be the same one on which we received the request.
		 */
		if (ret || !hvs_check_transport(vnew)) {
			sock_put(new);
			goto out;
		}
		hvs_new = vnew->trans;
		hvs_new->chan = chan;
	} else {
		hvs = vsock_sk(sk)->trans;
		hvs->chan = chan;
	}

	set_channel_read_mode(chan, HV_CALL_DIRECT);

	/* Use the socket buffer sizes as hints for the VMBUS ring size. For
	 * server side sockets, 'sk' is the parent socket and thus, this will
	 * allow the child sockets to inherit the size from the parent. Keep
	 * the mins at the default value and align to the page size as per
	 * VMBUS requirements.
	 * For the max, the socket core library will limit the socket buffer
	 * size that can be set by the user, but, since currently the hv_sock
	 * VMBUS ring buffer is a physically contiguous allocation, restrict
	 * it further.
	 * Older versions of the hv_sock host side code cannot handle a
	 * bigger VMBUS ring buffer size. Use the version number to limit the
	 * change to newer versions.
	 */
	if (vmbus_proto_version < VERSION_WIN10_V5) {
		sndbuf = RINGBUFFER_HVS_SND_SIZE;
		rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
	} else {
		sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
		sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
		sndbuf = ALIGN(sndbuf, HV_HYP_PAGE_SIZE);
		rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
		rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
		rcvbuf = ALIGN(rcvbuf, HV_HYP_PAGE_SIZE);
	}
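
	/* Worked example of the clamp above: with e.g. sk_sndbuf == 100000,
	 * the ring size becomes min(max(100000, 24576), 262144) == 100000,
	 * then ALIGN(100000, 4096) == 102400 bytes (25 pages), assuming the
	 * usual HV_HYP_PAGE_SIZE of 4K.
	 */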

	chan->max_pkt_size = HVS_MAX_PKT_SIZE;

	ret = vmbus_open(chan, sndbuf, rcvbuf, NULL, 0, hvs_channel_cb,
			 conn_from_host ? new : sk);
	if (ret != 0) {
		if (conn_from_host) {
			hvs_new->chan = NULL;
			sock_put(new);
		} else {
			hvs->chan = NULL;
		}
		goto out;
	}

	set_per_channel_state(chan, conn_from_host ? new : sk);

	/* This reference will be dropped by hvs_close_connection(). */
	sock_hold(conn_from_host ? new : sk);
	vmbus_set_chn_rescind_callback(chan, hvs_close_connection);

	/* Set the pending send size to max packet size to always get
	 * notifications from the host when there is enough writable space.
	 * The host is optimized to send notifications only when the pending
	 * size boundary is crossed, and not always.
	 */
	hvs_set_channel_pending_send_size(chan);

	if (conn_from_host) {
		new->sk_state = TCP_ESTABLISHED;
		sk_acceptq_added(sk);

		hvs_new->vm_srv_id = *if_type;
		hvs_new->host_srv_id = *if_instance;

		vsock_insert_connected(vnew);

		vsock_enqueue_accept(sk, new);
	} else {
		sk->sk_state = TCP_ESTABLISHED;
		sk->sk_socket->state = SS_CONNECTED;

		vsock_insert_connected(vsock_sk(sk));
	}

	sk->sk_state_change(sk);

out:
	/* Release refcnt obtained when we called vsock_find_bound_socket() */
	sock_put(sk);

	release_sock(sk);
}

static u32 hvs_get_local_cid(void)
{
	return VMADDR_CID_ANY;
}

static int hvs_sock_init(struct vsock_sock *vsk, struct vsock_sock *psk)
{
	struct hvsock *hvs;
	struct sock *sk = sk_vsock(vsk);

	hvs = kzalloc(sizeof(*hvs), GFP_KERNEL);
	if (!hvs)
		return -ENOMEM;

	vsk->trans = hvs;
	hvs->vsk = vsk;
	sk->sk_sndbuf = RINGBUFFER_HVS_SND_SIZE;
	sk->sk_rcvbuf = RINGBUFFER_HVS_RCV_SIZE;
	return 0;
}

static int hvs_connect(struct vsock_sock *vsk)
{
	union hvs_service_id vm, host;
	struct hvsock *h = vsk->trans;

	vm.srv_id = srv_id_template;
	vm.svm_port = vsk->local_addr.svm_port;
	h->vm_srv_id = vm.srv_id;

	host.srv_id = srv_id_template;
	host.svm_port = vsk->remote_addr.svm_port;
	h->host_srv_id = host.srv_id;

	return vmbus_send_tl_connect_request(&h->vm_srv_id, &h->host_srv_id);
}
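
/* Example of the GUID formation above: connecting to remote port 0x808
 * overlays 0x00000808 onto the first field of srv_id_template through the
 * union, yielding host_srv_id 00000808-facb-11e6-bd58-64006a7986d3. The
 * local port is encoded into vm_srv_id the same way.
 */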

static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)
{
	struct vmpipe_proto_header hdr;

	if (hvs->fin_sent || !hvs->chan)
		return;

	/* It can't fail: see hvs_channel_writable_bytes(). */
	(void)__hvs_send_data(hvs->chan, &hdr, 0);
	hvs->fin_sent = true;
}

static int hvs_shutdown(struct vsock_sock *vsk, int mode)
{
	if (!(mode & SEND_SHUTDOWN))
		return 0;

	hvs_shutdown_lock_held(vsk->trans, mode);
	return 0;
}

static void hvs_close_timeout(struct work_struct *work)
{
	struct vsock_sock *vsk =
		container_of(work, struct vsock_sock, close_work.work);
	struct sock *sk = sk_vsock(vsk);

	sock_hold(sk);
	lock_sock(sk);
	if (!sock_flag(sk, SOCK_DONE))
		hvs_do_close_lock_held(vsk, false);

	vsk->close_work_scheduled = false;
	release_sock(sk);
	sock_put(sk);
}

/* Returns true if it is safe to remove the socket; false otherwise */
static bool hvs_close_lock_held(struct vsock_sock *vsk)
{
	struct sock *sk = sk_vsock(vsk);

	if (!(sk->sk_state == TCP_ESTABLISHED ||
	      sk->sk_state == TCP_CLOSING))
		return true;

	if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
		hvs_shutdown_lock_held(vsk->trans, SHUTDOWN_MASK);

	if (sock_flag(sk, SOCK_DONE))
		return true;

	/* This reference will be dropped by the delayed close routine */
	sock_hold(sk);

	INIT_DELAYED_WORK(&vsk->close_work, hvs_close_timeout);
	vsk->close_work_scheduled = true;
	schedule_delayed_work(&vsk->close_work, HVS_CLOSE_TIMEOUT);
	return false;
}

static void hvs_release(struct vsock_sock *vsk)
{
	bool remove_sock;

	remove_sock = hvs_close_lock_held(vsk);
	if (remove_sock)
		vsock_remove_sock(vsk);
}

static void hvs_destruct(struct vsock_sock *vsk)
{
	struct hvsock *hvs = vsk->trans;
	struct vmbus_channel *chan = hvs->chan;

	if (chan)
		vmbus_hvsock_device_unregister(chan);

	kfree(hvs);
	vsk->trans = NULL;
}

static int hvs_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr)
{
	return -EOPNOTSUPP;
}

static int hvs_dgram_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
			     size_t len, int flags)
{
	return -EOPNOTSUPP;
}

static int hvs_dgram_enqueue(struct vsock_sock *vsk,
			     struct sockaddr_vm *remote, struct msghdr *msg,
			     size_t dgram_len)
{
	return -EOPNOTSUPP;
}

static bool hvs_dgram_allow(u32 cid, u32 port)
{
	return false;
}

static int hvs_update_recv_data(struct hvsock *hvs)
{
	struct hvs_recv_buf *recv_buf;
	u32 pkt_len, payload_len;

	pkt_len = hv_pkt_len(hvs->recv_desc);

	if (pkt_len < HVS_HEADER_LEN)
		return -EIO;

	recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1);
	payload_len = recv_buf->hdr.data_size;

	if (payload_len > pkt_len - HVS_HEADER_LEN ||
	    payload_len > HVS_MTU_SIZE)
		return -EIO;

	if (payload_len == 0)
		hvs->vsk->peer_shutdown |= SEND_SHUTDOWN;

	hvs->recv_data_len = payload_len;
	hvs->recv_data_off = 0;

	return 0;
}

static ssize_t hvs_stream_dequeue(struct vsock_sock *vsk, struct msghdr *msg,
				  size_t len, int flags)
{
	struct hvsock *hvs = vsk->trans;
	bool need_refill = !hvs->recv_desc;
	struct hvs_recv_buf *recv_buf;
	u32 to_read;
	int ret;

	if (flags & MSG_PEEK)
		return -EOPNOTSUPP;

	if (need_refill) {
		hvs->recv_desc = hv_pkt_iter_first(hvs->chan);
		if (!hvs->recv_desc)
			return -ENOBUFS;
		ret = hvs_update_recv_data(hvs);
		if (ret)
			return ret;
	}

	recv_buf = (struct hvs_recv_buf *)(hvs->recv_desc + 1);
	to_read = min_t(u32, len, hvs->recv_data_len);
	ret = memcpy_to_msg(msg, recv_buf->data + hvs->recv_data_off, to_read);
	if (ret != 0)
		return ret;

	hvs->recv_data_len -= to_read;
	if (hvs->recv_data_len == 0) {
		hvs->recv_desc = hv_pkt_iter_next(hvs->chan, hvs->recv_desc);
		if (hvs->recv_desc) {
			ret = hvs_update_recv_data(hvs);
			if (ret)
				return ret;
		}
	} else {
		hvs->recv_data_off += to_read;
	}

	return to_read;
}
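
/* Dequeue example: if the current VMBus packet carries 4000 payload bytes
 * and userspace recv()s with a 1000-byte buffer, four successive calls
 * return 1000 bytes each; recv_data_off advances through 1000/2000/3000
 * while the descriptor stays pinned, and only the final call moves the
 * packet iterator forward and releases the ring space.
 */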

static ssize_t hvs_stream_enqueue(struct vsock_sock *vsk, struct msghdr *msg,
				  size_t len)
{
	struct hvsock *hvs = vsk->trans;
	struct vmbus_channel *chan = hvs->chan;
	struct hvs_send_buf *send_buf;
	ssize_t to_write, max_writable;
	ssize_t ret = 0;
	ssize_t bytes_written = 0;

	BUILD_BUG_ON(sizeof(*send_buf) != HV_HYP_PAGE_SIZE);

	send_buf = kmalloc(sizeof(*send_buf), GFP_KERNEL);
	if (!send_buf)
		return -ENOMEM;

	/* Reader(s) could be draining data from the channel as we write.
	 * Maximize bandwidth by iterating until the channel is found to be
	 * full.
	 */
	while (len) {
		max_writable = hvs_channel_writable_bytes(chan);
		if (!max_writable)
			break;
		to_write = min_t(ssize_t, len, max_writable);
		to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
		/* memcpy_from_msg is safe to call in a loop, as it advances
		 * the offsets within the message iterator.
		 */
		ret = memcpy_from_msg(send_buf->data, msg, to_write);
		if (ret < 0)
			goto out;

		ret = hvs_send_data(hvs->chan, send_buf, to_write);
		if (ret < 0)
			goto out;

		bytes_written += to_write;
		len -= to_write;
	}
out:
	/* If any data has been sent, return that */
	if (bytes_written)
		ret = bytes_written;
	kfree(send_buf);
	return ret;
}
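
/* Enqueue example: with 4K hypervisor pages, HVS_SEND_BUF_SIZE is
 * 4096 - 8 == 4088 bytes, so a 10000-byte send() that finds enough ring
 * space goes out as three VMBus packets of 4088, 4088, and 1824 payload
 * bytes; if the ring fills mid-way, the short count written so far is
 * returned instead.
 */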

static s64 hvs_stream_has_data(struct vsock_sock *vsk)
{
	struct hvsock *hvs = vsk->trans;
	s64 ret;

	if (hvs->recv_data_len > 0)
		return 1;

	switch (hvs_channel_readable_payload(hvs->chan)) {
	case 1:
		ret = 1;
		break;
	case 0:
		vsk->peer_shutdown |= SEND_SHUTDOWN;
		ret = 0;
		break;
	default: /* -1 */
		ret = 0;
		break;
	}

	return ret;
}

static s64 hvs_stream_has_space(struct vsock_sock *vsk)
{
	struct hvsock *hvs = vsk->trans;

	return hvs_channel_writable_bytes(hvs->chan);
}

static u64 hvs_stream_rcvhiwat(struct vsock_sock *vsk)
{
	return HVS_MTU_SIZE + 1;
}

static bool hvs_stream_is_active(struct vsock_sock *vsk)
{
	struct hvsock *hvs = vsk->trans;

	return hvs->chan != NULL;
}

static bool hvs_stream_allow(u32 cid, u32 port)
{
	if (cid == VMADDR_CID_HOST)
		return true;

	return false;
}

static
int hvs_notify_poll_in(struct vsock_sock *vsk, size_t target, bool *readable)
{
	struct hvsock *hvs = vsk->trans;

	*readable = hvs_channel_readable(hvs->chan);
	return 0;
}

static
int hvs_notify_poll_out(struct vsock_sock *vsk, size_t target, bool *writable)
{
	*writable = hvs_stream_has_space(vsk) > 0;

	return 0;
}

static
int hvs_notify_recv_init(struct vsock_sock *vsk, size_t target,
			 struct vsock_transport_recv_notify_data *d)
{
	return 0;
}

static
int hvs_notify_recv_pre_block(struct vsock_sock *vsk, size_t target,
			      struct vsock_transport_recv_notify_data *d)
{
	return 0;
}

static
int hvs_notify_recv_pre_dequeue(struct vsock_sock *vsk, size_t target,
				struct vsock_transport_recv_notify_data *d)
{
	return 0;
}

static
int hvs_notify_recv_post_dequeue(struct vsock_sock *vsk, size_t target,
				 ssize_t copied, bool data_read,
				 struct vsock_transport_recv_notify_data *d)
{
	return 0;
}

static
int hvs_notify_send_init(struct vsock_sock *vsk,
			 struct vsock_transport_send_notify_data *d)
{
	return 0;
}

static
int hvs_notify_send_pre_block(struct vsock_sock *vsk,
			      struct vsock_transport_send_notify_data *d)
{
	return 0;
}

static
int hvs_notify_send_pre_enqueue(struct vsock_sock *vsk,
				struct vsock_transport_send_notify_data *d)
{
	return 0;
}

static
int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written,
				 struct vsock_transport_send_notify_data *d)
{
	return 0;
}

static
int hvs_notify_set_rcvlowat(struct vsock_sock *vsk, int val)
{
	return -EOPNOTSUPP;
}

static struct vsock_transport hvs_transport = {
	.module                   = THIS_MODULE,

	.get_local_cid            = hvs_get_local_cid,

	.init                     = hvs_sock_init,
	.destruct                 = hvs_destruct,
	.release                  = hvs_release,
	.connect                  = hvs_connect,
	.shutdown                 = hvs_shutdown,

	.dgram_bind               = hvs_dgram_bind,
	.dgram_dequeue            = hvs_dgram_dequeue,
	.dgram_enqueue            = hvs_dgram_enqueue,
	.dgram_allow              = hvs_dgram_allow,

	.stream_dequeue           = hvs_stream_dequeue,
	.stream_enqueue           = hvs_stream_enqueue,
	.stream_has_data          = hvs_stream_has_data,
	.stream_has_space         = hvs_stream_has_space,
	.stream_rcvhiwat          = hvs_stream_rcvhiwat,
	.stream_is_active         = hvs_stream_is_active,
	.stream_allow             = hvs_stream_allow,

	.notify_poll_in           = hvs_notify_poll_in,
	.notify_poll_out          = hvs_notify_poll_out,
	.notify_recv_init         = hvs_notify_recv_init,
	.notify_recv_pre_block    = hvs_notify_recv_pre_block,
	.notify_recv_pre_dequeue  = hvs_notify_recv_pre_dequeue,
	.notify_recv_post_dequeue = hvs_notify_recv_post_dequeue,
	.notify_send_init         = hvs_notify_send_init,
	.notify_send_pre_block    = hvs_notify_send_pre_block,
	.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
	.notify_send_post_enqueue = hvs_notify_send_post_enqueue,

	.notify_set_rcvlowat      = hvs_notify_set_rcvlowat
};

static bool hvs_check_transport(struct vsock_sock *vsk)
{
	return vsk->transport == &hvs_transport;
}

static int hvs_probe(struct hv_device *hdev,
		     const struct hv_vmbus_device_id *dev_id)
{
	struct vmbus_channel *chan = hdev->channel;

	hvs_open_connection(chan);

	/* Always return success to suppress the unnecessary error message
	 * in vmbus_probe(): on error the host will rescind the device in
	 * 30 seconds and we can do cleanup at that time in
	 * vmbus_onoffer_rescind().
	 */
	return 0;
}

static void hvs_remove(struct hv_device *hdev)
{
	struct vmbus_channel *chan = hdev->channel;

	vmbus_close(chan);
}

/* hv_sock connections cannot persist across hibernation, and all the hv_sock
 * channels are forced to be rescinded before hibernation: see
 * vmbus_bus_suspend(). Here the dummy hvs_suspend() and hvs_resume()
 * are only needed because hibernation requires that every vmbus device's
 * driver should have a .suspend and .resume callback: see vmbus_suspend().
 */
static int hvs_suspend(struct hv_device *hv_dev)
{
	/* Dummy */
	return 0;
}

static int hvs_resume(struct hv_device *dev)
{
	/* Dummy */
	return 0;
}

/* This isn't really used. See vmbus_match() and vmbus_probe() */
static const struct hv_vmbus_device_id id_table[] = {
	{},
};

static struct hv_driver hvs_drv = {
	.name		= "hv_sock",
	.hvsock		= true,
	.id_table	= id_table,
	.probe		= hvs_probe,
	.remove		= hvs_remove,
	.suspend	= hvs_suspend,
	.resume		= hvs_resume,
};

static int __init hvs_init(void)
{
	int ret;

	if (vmbus_proto_version < VERSION_WIN10)
		return -ENODEV;

	ret = vmbus_driver_register(&hvs_drv);
	if (ret != 0)
		return ret;

	ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H);
	if (ret) {
		vmbus_driver_unregister(&hvs_drv);
		return ret;
	}

	return 0;
}

static void __exit hvs_exit(void)
{
	vsock_core_unregister(&hvs_transport);
	vmbus_driver_unregister(&hvs_drv);
}

module_init(hvs_init);
module_exit(hvs_exit);

MODULE_DESCRIPTION("Hyper-V Sockets");
MODULE_VERSION("1.0.0");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_VSOCK);