// SPDX-License-Identifier: GPL-2.0-only
/*
 * vhost transport for vsock
 *
 * Copyright (C) 2013-2015 Red Hat, Inc.
 * Author: Asias He <asias@redhat.com>
 *         Stefan Hajnoczi <stefanha@redhat.com>
 */
#include <linux/miscdevice.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <net/sock.h>
#include <linux/virtio_vsock.h>
#include <linux/vhost.h>
#include <linux/hashtable.h>

#include <net/af_vsock.h>
#include "vhost.h"

#define VHOST_VSOCK_DEFAULT_HOST_CID	2
/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others.
 */
#define VHOST_VSOCK_WEIGHT 0x80000
/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with
 * small pkts.
 */
#define VHOST_VSOCK_PKT_WEIGHT 256

enum {
	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
};

enum {
	VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

/* Used to track all the vhost_vsock instances on the system. */
static DEFINE_MUTEX(vhost_vsock_mutex);
static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
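
/* Virtqueue layout note: indices are named from the guest driver's point of
 * view, so vqs[VSOCK_VQ_RX] carries host->guest packets and vqs[VSOCK_VQ_TX]
 * carries guest->host packets. The event virtqueue defined by the virtio-vsock
 * spec is not handled here; it is typically handled by the VMM in userspace.
 */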
struct vhost_vsock {
	struct vhost_dev dev;
	struct vhost_virtqueue vqs[2];

	/* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
	struct hlist_node hash;

	struct vhost_work send_pkt_work;
	struct sk_buff_head send_pkt_queue; /* host->guest pending packets */

	atomic_t queued_replies;

	u32 guest_cid;
	bool seqpacket_allow;
};
static u32 vhost_transport_get_local_cid(void)
{
	return VHOST_VSOCK_DEFAULT_HOST_CID;
}

/* Callers that dereference the return value must hold vhost_vsock_mutex or the
 * RCU read lock.
 */
static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
{
	struct vhost_vsock *vsock;

	hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
		u32 other_cid = vsock->guest_cid;

		/* Skip instances that have no CID yet */
		if (other_cid == 0)
			continue;

		if (other_cid == guest_cid)
			return vsock;
	}

	return NULL;
}
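
/* Drain vsock->send_pkt_queue into the guest's RX virtqueue. Each skb is
 * copied into one or more guest buffers; when a packet does not fit into a
 * single buffer it is split and the remainder is requeued for the next
 * available descriptor.
 */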
static void
vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
			    struct vhost_virtqueue *vq)
{
	struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
	int pkts = 0, total_len = 0;
	bool added = false;
	bool restart_tx = false;

	mutex_lock(&vq->mutex);

	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	/* Avoid further vmexits, we're already processing the virtqueue */
	vhost_disable_notify(&vsock->dev, vq);

	do {
		struct virtio_vsock_hdr *hdr;
		size_t iov_len, payload_len;
		struct iov_iter iov_iter;
		u32 flags_to_restore = 0;
		struct sk_buff *skb;
		unsigned out, in;
		size_t nbytes;
		u32 offset;
		int head;

		skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);

		if (!skb) {
			vhost_enable_notify(&vsock->dev, vq);
			break;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0) {
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
			break;
		}

		if (head == vq->num) {
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
			/* We cannot finish yet if more buffers snuck in while
			 * re-enabling notify.
			 */
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		if (out) {
			kfree_skb(skb);
			vq_err(vq, "Expected 0 output buffers, got %u\n", out);
			break;
		}

		iov_len = iov_length(&vq->iov[out], in);
		if (iov_len < sizeof(*hdr)) {
			kfree_skb(skb);
			vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
			break;
		}

		iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
		offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
		payload_len = skb->len - offset;
		hdr = virtio_vsock_hdr(skb);

		/* If the packet is greater than the space available in the
		 * buffer, we split it using multiple buffers.
		 */
		if (payload_len > iov_len - sizeof(*hdr)) {
			payload_len = iov_len - sizeof(*hdr);
			/* As we are copying pieces of a large packet's buffer
			 * into small rx buffers, the headers of the packets in
			 * the rx queue are created dynamically and initialized
			 * from the header of the current packet (except for
			 * the length). But for SOCK_SEQPACKET we must also
			 * clear the message delimiter bit (VIRTIO_VSOCK_SEQ_EOM)
			 * and the MSG_EOR bit (VIRTIO_VSOCK_SEQ_EOR) if set;
			 * otherwise every fragment would carry these bits.
			 * After the initialized header has been copied to the
			 * rx buffer, the required bits are restored.
			 */
			if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
				hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
				flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;

				if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) {
					hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
					flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
				}
			}
		}

		/* Set the correct length in the header */
		hdr->len = cpu_to_le32(payload_len);

		nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter);
		if (nbytes != sizeof(*hdr)) {
			kfree_skb(skb);
			vq_err(vq, "Faulted on copying pkt hdr\n");
			break;
		}

		if (skb_copy_datagram_iter(skb,
					   offset,
					   &iov_iter,
					   payload_len)) {
			kfree_skb(skb);
			vq_err(vq, "Faulted on copying pkt buf\n");
			break;
		}

		/* Deliver to monitoring devices all packets that we
		 * will transmit.
		 */
		virtio_transport_deliver_tap_pkt(skb);

		vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
		added = true;

		VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
		total_len += payload_len;

		/* If we didn't send all the payload we can requeue the packet
		 * to send it with the next available buffer.
		 */
		if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
			hdr->flags |= cpu_to_le32(flags_to_restore);

			/* We are queueing the same skb to handle
			 * the remaining bytes, and we want to deliver it
			 * to monitoring devices in the next iteration.
			 */
			virtio_vsock_skb_clear_tap_delivered(skb);
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
		} else {
			if (virtio_vsock_skb_reply(skb)) {
				int val;

				val = atomic_dec_return(&vsock->queued_replies);

				/* Do we have resources to resume tx
				 * processing?
				 */
				if (val + 1 == tx_vq->num)
					restart_tx = true;
			}

			virtio_transport_consume_skb_sent(skb, true);
		}
	} while (likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);

	if (restart_tx)
		vhost_poll_queue(&tx_vq->poll);
}
static void vhost_transport_send_pkt_work(struct vhost_work *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_vsock *vsock;

	vsock = container_of(work, struct vhost_vsock, send_pkt_work);
	vq = &vsock->vqs[VSOCK_VQ_RX];

	vhost_transport_do_send_pkt(vsock, vq);
}
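
/* .send_pkt callback of the transport: called by the common virtio transport
 * code whenever a host socket has a packet destined for the guest. The packet
 * is queued on send_pkt_queue and the actual copy into the virtqueue is
 * deferred to send_pkt_work in the vhost worker.
 */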
static int
vhost_transport_send_pkt(struct sk_buff *skb)
{
	struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
	struct vhost_vsock *vsock;
	int len = skb->len;

	rcu_read_lock();

	/* Find the vhost_vsock according to guest context id */
	vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
	if (!vsock) {
		rcu_read_unlock();
		kfree_skb(skb);
		return -ENODEV;
	}

	if (virtio_vsock_skb_reply(skb))
		atomic_inc(&vsock->queued_replies);

	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

	rcu_read_unlock();
	return len;
}
static int
vhost_transport_cancel_pkt(struct vsock_sock *vsk)
{
	struct vhost_vsock *vsock;
	int cnt = 0;
	int ret = -ENODEV;

	rcu_read_lock();

	/* Find the vhost_vsock according to guest context id */
	vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
	if (!vsock)
		goto out;

	cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue);

	if (cnt) {
		struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
		int new_cnt;

		new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
		if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
			vhost_poll_queue(&tx_vq->poll);
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
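
/* Copy one guest TX descriptor chain (virtio_vsock_hdr followed by an
 * optional payload) into a freshly allocated skb. Returns NULL on malformed
 * descriptors, oversized packets or allocation failure.
 */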
static struct sk_buff *
vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
		      unsigned int out, unsigned int in)
{
	struct virtio_vsock_hdr *hdr;
	struct iov_iter iov_iter;
	struct sk_buff *skb;
	size_t payload_len;
	size_t nbytes;
	size_t len;

	if (in != 0) {
		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
		return NULL;
	}

	len = iov_length(vq->iov, out);

	if (len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
		return NULL;

	/* len contains both payload and hdr */
	skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
	if (!skb)
		return NULL;

	iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);

	hdr = virtio_vsock_hdr(skb);
	nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
	if (nbytes != sizeof(*hdr)) {
		vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
		       sizeof(*hdr), nbytes);
		kfree_skb(skb);
		return NULL;
	}

	payload_len = le32_to_cpu(hdr->len);

	/* No payload */
	if (!payload_len)
		return skb;

	/* The pkt is too big or the length in the header is invalid */
	if (payload_len + sizeof(*hdr) > len) {
		kfree_skb(skb);
		return NULL;
	}

	virtio_vsock_skb_rx_put(skb);

	nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
	if (nbytes != payload_len) {
		vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
		       payload_len, nbytes);
		kfree_skb(skb);
		return NULL;
	}

	return skb;
}
/* Is there space left for replies to rx packets? */
static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
{
	struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX];
	int val;

	smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
	val = atomic_read(&vsock->queued_replies);

	return val < vq->num;
}

static bool vhost_transport_msgzerocopy_allow(void)
{
	return true;
}
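
/* The transport ops below are mostly the generic virtio transport: only
 * .get_local_cid, .cancel_pkt, .seqpacket_allow, .msgzerocopy_allow and
 * .send_pkt are vhost-specific; everything else is shared with the guest-side
 * driver via the virtio_transport_* helpers.
 */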
static bool vhost_transport_seqpacket_allow(u32 remote_cid);

static struct virtio_transport vhost_transport = {
	.transport = {
		.module                   = THIS_MODULE,

		.get_local_cid            = vhost_transport_get_local_cid,

		.init                     = virtio_transport_do_socket_init,
		.destruct                 = virtio_transport_destruct,
		.release                  = virtio_transport_release,
		.connect                  = virtio_transport_connect,
		.shutdown                 = virtio_transport_shutdown,
		.cancel_pkt               = vhost_transport_cancel_pkt,

		.dgram_enqueue            = virtio_transport_dgram_enqueue,
		.dgram_dequeue            = virtio_transport_dgram_dequeue,
		.dgram_bind               = virtio_transport_dgram_bind,
		.dgram_allow              = virtio_transport_dgram_allow,

		.stream_enqueue           = virtio_transport_stream_enqueue,
		.stream_dequeue           = virtio_transport_stream_dequeue,
		.stream_has_data          = virtio_transport_stream_has_data,
		.stream_has_space         = virtio_transport_stream_has_space,
		.stream_rcvhiwat          = virtio_transport_stream_rcvhiwat,
		.stream_is_active         = virtio_transport_stream_is_active,
		.stream_allow             = virtio_transport_stream_allow,

		.seqpacket_dequeue        = virtio_transport_seqpacket_dequeue,
		.seqpacket_enqueue        = virtio_transport_seqpacket_enqueue,
		.seqpacket_allow          = vhost_transport_seqpacket_allow,
		.seqpacket_has_data       = virtio_transport_seqpacket_has_data,

		.msgzerocopy_allow        = vhost_transport_msgzerocopy_allow,

		.notify_poll_in           = virtio_transport_notify_poll_in,
		.notify_poll_out          = virtio_transport_notify_poll_out,
		.notify_recv_init         = virtio_transport_notify_recv_init,
		.notify_recv_pre_block    = virtio_transport_notify_recv_pre_block,
		.notify_recv_pre_dequeue  = virtio_transport_notify_recv_pre_dequeue,
		.notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
		.notify_send_init         = virtio_transport_notify_send_init,
		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
		.notify_buffer_size       = virtio_transport_notify_buffer_size,
		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,

		.unsent_bytes             = virtio_transport_unsent_bytes,

		.read_skb                 = virtio_transport_read_skb,
	},

	.send_pkt = vhost_transport_send_pkt,
};
static bool vhost_transport_seqpacket_allow(u32 remote_cid)
{
	struct vhost_vsock *vsock;
	bool seqpacket_allow = false;

	rcu_read_lock();
	vsock = vhost_vsock_get(remote_cid);

	if (vsock)
		seqpacket_allow = vsock->seqpacket_allow;

	rcu_read_unlock();

	return seqpacket_allow;
}
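
/* Worker for the guest's TX virtqueue kick: pull guest->host packets out of
 * the virtqueue, drop misaddressed ones, and hand the rest to the common
 * receive path. Processing stops early (leaving notifications disabled) when
 * the reply budget tracked in queued_replies is exhausted.
 */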
static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);
	int head, pkts = 0, total_len = 0;
	unsigned int out, in;
	struct sk_buff *skb;
	bool added = false;

	mutex_lock(&vq->mutex);

	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&vsock->dev, vq);
	do {
		struct virtio_vsock_hdr *hdr;

		if (!vhost_vsock_more_replies(vsock)) {
			/* Stop tx until the device processes already
			 * pending replies.  Leave tx virtqueue
			 * callbacks disabled.
			 */
			goto no_more_replies;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0)
			break;

		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		skb = vhost_vsock_alloc_skb(vq, out, in);
		if (!skb) {
			vq_err(vq, "Faulted on pkt\n");
			continue;
		}

		total_len += sizeof(*hdr) + skb->len;

		/* Deliver to monitoring devices all received packets */
		virtio_transport_deliver_tap_pkt(skb);

		hdr = virtio_vsock_hdr(skb);

		/* Only accept correctly addressed packets */
		if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
		    le64_to_cpu(hdr->dst_cid) ==
		    vhost_transport_get_local_cid())
			virtio_transport_recv_pkt(&vhost_transport, skb);
		else
			kfree_skb(skb);

		vhost_add_used(vq, head, 0);
		added = true;
	} while (likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));

no_more_replies:
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);
}
static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);

	vhost_transport_do_send_pkt(vsock, vq);
}
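
/* VHOST_VSOCK_SET_RUNNING(1): attach this device as the backend of both
 * virtqueues and kick the send worker so that packets queued before the
 * device was started are delivered.
 */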
static int vhost_vsock_start(struct vhost_vsock *vsock)
{
	struct vhost_virtqueue *vq;
	size_t i;
	int ret;

	mutex_lock(&vsock->dev.mutex);

	ret = vhost_dev_check_owner(&vsock->dev);
	if (ret)
		goto err;

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);

		if (!vhost_vq_access_ok(vq)) {
			ret = -EFAULT;
			goto err_vq;
		}

		if (!vhost_vq_get_backend(vq)) {
			vhost_vq_set_backend(vq, vsock);
			ret = vhost_vq_init_access(vq);
			if (ret)
				goto err_vq;
		}

		mutex_unlock(&vq->mutex);
	}

	/* Some packets may have been queued before the device was started,
	 * let's kick the send worker to send them.
	 */
	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

	mutex_unlock(&vsock->dev.mutex);
	return 0;

err_vq:
	vhost_vq_set_backend(vq, NULL);
	mutex_unlock(&vq->mutex);

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);
		vhost_vq_set_backend(vq, NULL);
		mutex_unlock(&vq->mutex);
	}
err:
	mutex_unlock(&vsock->dev.mutex);
	return ret;
}
static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
{
	size_t i;
	int ret = 0;

	mutex_lock(&vsock->dev.mutex);

	if (check_owner) {
		ret = vhost_dev_check_owner(&vsock->dev);
		if (ret)
			goto err;
	}

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		struct vhost_virtqueue *vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);
		vhost_vq_set_backend(vq, NULL);
		mutex_unlock(&vq->mutex);
	}

err:
	mutex_unlock(&vsock->dev.mutex);
	return ret;
}
static void vhost_vsock_free(struct vhost_vsock *vsock)
{
	kvfree(vsock);
}
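
/* Called when userspace opens /dev/vhost-vsock: allocate the per-device state
 * and initialize the vhost device with its two virtqueues and the byte/packet
 * weights that bound how much work a single kick may do.
 */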
static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
{
	struct vhost_virtqueue **vqs;
	struct vhost_vsock *vsock;
	int ret;

	/* This struct is large and allocation could fail, fall back to vmalloc
	 * if there is no other way.
	 */
	vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!vsock)
		return -ENOMEM;

	vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		ret = -ENOMEM;
		goto out;
	}

	vsock->guest_cid = 0; /* no CID assigned yet */
	vsock->seqpacket_allow = false;

	atomic_set(&vsock->queued_replies, 0);

	vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
	vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
	vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
	vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;

	vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
		       UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
		       VHOST_VSOCK_WEIGHT, true, NULL);

	file->private_data = vsock;
	skb_queue_head_init(&vsock->send_pkt_queue);
	vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
	return 0;

out:
	vhost_vsock_free(vsock);
	return ret;
}
static void vhost_vsock_flush(struct vhost_vsock *vsock)
{
	vhost_dev_flush(&vsock->dev);
}
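
/* Invoked for every connected vsock socket while a device is being released:
 * if the socket's peer CID no longer resolves to a vhost_vsock instance, mark
 * the connection as reset so blocked callers see ECONNRESET.
 */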
static void vhost_vsock_reset_orphans(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* vmci_transport.c doesn't take sk_lock here either.  At least we're
	 * under vsock_table_lock so the sock cannot disappear while we're
	 * executing.
	 */

	/* If the peer is still valid, no need to reset connection */
	if (vhost_vsock_get(vsk->remote_addr.svm_cid))
		return;

	/* If the close timeout is pending, let it expire.  This avoids races
	 * with the timeout callback.
	 */
	if (vsk->close_work_scheduled)
		return;

	sock_set_flag(sk, SOCK_DONE);
	vsk->peer_shutdown = SHUTDOWN_MASK;
	sk->sk_state = SS_UNCONNECTED;
	sk->sk_err = ECONNRESET;
	sk_error_report(sk);
}
static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
{
	struct vhost_vsock *vsock = file->private_data;

	mutex_lock(&vhost_vsock_mutex);
	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);
	mutex_unlock(&vhost_vsock_mutex);

	/* Wait for other CPUs to finish using vsock */
	synchronize_rcu();

	/* Iterating over all connections for all CIDs to find orphans is
	 * inefficient.  Room for improvement here. */
	vsock_for_each_connected_socket(&vhost_transport.transport,
					vhost_vsock_reset_orphans);

	/* Don't check the owner, because we are in the release path, so we
	 * need to stop the vsock device in any case.
	 * vhost_vsock_stop() can not fail in this case, so we don't need to
	 * check the return code.
	 */
	vhost_vsock_stop(vsock, false);
	vhost_vsock_flush(vsock);
	vhost_dev_stop(&vsock->dev);

	virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);

	vhost_dev_cleanup(&vsock->dev);
	kfree(vsock->dev.vqs);
	vhost_vsock_free(vsock);
	return 0;
}
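
/* Handle VHOST_VSOCK_SET_GUEST_CID: validate the requested guest CID and
 * (re)insert this instance into the global hash keyed by CID. Reserved CIDs,
 * CIDs above 32 bits and CIDs already claimed by another transport or another
 * vhost_vsock instance are rejected.
 */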
static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
{
	struct vhost_vsock *other;

	/* Refuse reserved CIDs */
	if (guest_cid <= VMADDR_CID_HOST ||
	    guest_cid == U32_MAX)
		return -EINVAL;

	/* 64-bit CIDs are not yet supported */
	if (guest_cid > U32_MAX)
		return -EINVAL;

	/* Refuse if CID is assigned to the guest->host transport (i.e. nested
	 * VM), to make the loopback work.
	 */
	if (vsock_find_cid(guest_cid))
		return -EADDRINUSE;

	/* Refuse if CID is already in use */
	mutex_lock(&vhost_vsock_mutex);
	other = vhost_vsock_get(guest_cid);
	if (other && other != vsock) {
		mutex_unlock(&vhost_vsock_mutex);
		return -EADDRINUSE;
	}

	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);

	vsock->guest_cid = guest_cid;
	hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid);
	mutex_unlock(&vhost_vsock_mutex);

	return 0;
}
static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
{
	struct vhost_virtqueue *vq;
	int i;

	if (features & ~VHOST_VSOCK_FEATURES)
		return -EOPNOTSUPP;

	mutex_lock(&vsock->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&vsock->dev)) {
		goto err;
	}

	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
		if (vhost_init_device_iotlb(&vsock->dev))
			goto err;
	}

	vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET);

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];
		mutex_lock(&vq->mutex);
		vq->acked_features = features;
		mutex_unlock(&vq->mutex);
	}
	mutex_unlock(&vsock->dev.mutex);
	return 0;

err:
	mutex_unlock(&vsock->dev.mutex);
	return -EFAULT;
}
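
/* Device ioctl dispatcher. A rough sketch of the sequence a VMM typically
 * issues (the exact ordering varies and is not mandated by this driver):
 *
 *	int fd = open("/dev/vhost-vsock", O_RDWR);
 *	ioctl(fd, VHOST_SET_OWNER, NULL);
 *	ioctl(fd, VHOST_GET_FEATURES, &features);
 *	ioctl(fd, VHOST_SET_FEATURES, &features);   - negotiated subset
 *	ioctl(fd, VHOST_VSOCK_SET_GUEST_CID, &cid); - u64, must be >= 3
 *	ioctl(fd, VHOST_SET_MEM_TABLE, mem);        - guest memory layout
 *	per virtqueue: VHOST_SET_VRING_NUM/ADDR/BASE/KICK/CALL
 *	ioctl(fd, VHOST_VSOCK_SET_RUNNING, &one);   - start the device
 *
 * Generic vhost and vring ioctls fall through to vhost_dev_ioctl() and
 * vhost_vring_ioctl() in the default case below.
 */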
static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
				  unsigned long arg)
{
	struct vhost_vsock *vsock = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 guest_cid;
	u64 features;
	int start;
	int r;

	switch (ioctl) {
	case VHOST_VSOCK_SET_GUEST_CID:
		if (copy_from_user(&guest_cid, argp, sizeof(guest_cid)))
			return -EFAULT;
		return vhost_vsock_set_cid(vsock, guest_cid);
	case VHOST_VSOCK_SET_RUNNING:
		if (copy_from_user(&start, argp, sizeof(start)))
			return -EFAULT;
		if (start)
			return vhost_vsock_start(vsock);
		else
			return vhost_vsock_stop(vsock, true);
	case VHOST_GET_FEATURES:
		features = VHOST_VSOCK_FEATURES;
		if (copy_to_user(argp, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, argp, sizeof(features)))
			return -EFAULT;
		return vhost_vsock_set_features(vsock, features);
	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_VSOCK_BACKEND_FEATURES;
		if (copy_to_user(argp, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_BACKEND_FEATURES:
		if (copy_from_user(&features, argp, sizeof(features)))
			return -EFAULT;
		if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
			return -EOPNOTSUPP;
		vhost_set_backend_features(&vsock->dev, features);
		return 0;
	default:
		mutex_lock(&vsock->dev.mutex);
		r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&vsock->dev, ioctl, argp);
		else
			vhost_vsock_flush(vsock);
		mutex_unlock(&vsock->dev.mutex);
		return r;
	}
}
static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;
	int noblock = file->f_flags & O_NONBLOCK;

	return vhost_chr_read_iter(dev, to, noblock);
}

static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;

	return vhost_chr_write_iter(dev, from);
}

static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait)
{
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;

	return vhost_chr_poll(file, dev, wait);
}

static const struct file_operations vhost_vsock_fops = {
	.owner          = THIS_MODULE,
	.open           = vhost_vsock_dev_open,
	.release        = vhost_vsock_dev_release,
	.llseek         = noop_llseek,
	.unlocked_ioctl = vhost_vsock_dev_ioctl,
	.compat_ioctl   = compat_ptr_ioctl,
	.read_iter      = vhost_vsock_chr_read_iter,
	.write_iter     = vhost_vsock_chr_write_iter,
	.poll           = vhost_vsock_chr_poll,
};

static struct miscdevice vhost_vsock_misc = {
	.minor = VHOST_VSOCK_MINOR,
	.name = "vhost-vsock",
	.fops = &vhost_vsock_fops,
};
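
/* Register the host-to-guest (H2G) transport with the vsock core and expose
 * /dev/vhost-vsock through the misc device so a VMM can instantiate devices.
 */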
static int __init vhost_vsock_init(void)
{
	int ret;

	ret = vsock_core_register(&vhost_transport.transport,
				  VSOCK_TRANSPORT_F_H2G);
	if (ret < 0)
		return ret;

	ret = misc_register(&vhost_vsock_misc);
	if (ret) {
		vsock_core_unregister(&vhost_transport.transport);
		return ret;
	}

	return 0;
}

static void __exit vhost_vsock_exit(void)
{
	misc_deregister(&vhost_vsock_misc);
	vsock_core_unregister(&vhost_transport.transport);
}

module_init(vhost_vsock_init);
module_exit(vhost_vsock_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("vhost transport for vsock");
MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR);
MODULE_ALIAS("devname:vhost-vsock");