// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "memmap.h"

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

struct kmem_cache *io_buf_cachep;

struct io_provide_buf {
	struct file			*file;
	__u64				addr;
	__u32				len;
	__u32				bgid;
	__u32				nbufs;
	__u16				bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
{
	lockdep_assert_held(&ctx->uring_lock);

	return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
{
	/*
	 * Store buffer group ID and finally mark the list as visible.
	 * The normal lookup doesn't care about the visibility as we're
	 * always under the ->uring_lock, but the RCU lookup from mmap does.
	 */
	bl->bgid = bgid;
	atomic_set(&bl->refs, 1);
	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	struct io_buffer *buf;

	io_ring_submit_lock(ctx, issue_flags);

	buf = req->kbuf;
	bl = io_buffer_get_list(ctx, buf->bgid);
	list_add(&buf->list, &bl->buf_list);
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	req->buf_index = buf->bgid;

	io_ring_submit_unlock(ctx, issue_flags);
	return true;
}

void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
{
	/*
	 * We can add this buffer back to two lists:
	 *
	 * 1) The io_buffers_cache list. This one is protected by the
	 *    ctx->uring_lock. If we already hold this lock, add back to this
	 *    list as we can grab it from issue as well.
	 * 2) The io_buffers_comp list. This one is protected by the
	 *    ctx->completion_lock.
	 *
	 * We migrate buffers from the comp_list to the issue cache list
	 * when we need one.
	 */
	if (issue_flags & IO_URING_F_UNLOCKED) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock(&ctx->completion_lock);
		__io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
		spin_unlock(&ctx->completion_lock);
	} else {
		lockdep_assert_held(&req->ctx->uring_lock);

		__io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
	}
}
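
/*
 * A simplified view of the legacy (non-ring) buffer round trip implied by
 * the comment above; this is a reading of the helpers in this file, not a
 * normative description:
 *
 *	issue:       io_provided_buffer_select() pops a buffer off bl->buf_list
 *	completion:  __io_put_kbuf() returns it to ctx->io_buffers_cache when
 *		     the uring_lock is held, or to ctx->io_buffers_comp under
 *		     the completion_lock otherwise
 *	reuse:       io_refill_buffer_cache() splices io_buffers_comp back into
 *		     io_buffers_cache the next time the cache runs dry
 */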

static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl)
{
	if (!list_empty(&bl->buf_list)) {
		struct io_buffer *kbuf;

		kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_del(&kbuf->list);
		if (*len == 0 || *len > kbuf->len)
			*len = kbuf->len;
		if (list_empty(&bl->buf_list))
			req->flags |= REQ_F_BL_EMPTY;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
		req->buf_index = kbuf->bid;
		return u64_to_user_ptr(kbuf->addr);
	}
	return NULL;
}

static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
				      struct io_buffer_list *bl,
				      struct iovec *iov)
{
	void __user *buf;

	buf = io_provided_buffer_select(req, len, bl);
	if (unlikely(!buf))
		return -ENOBUFS;

	iov[0].iov_base = buf;
	iov[0].iov_len = *len;
	return 1;
}

static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					  struct io_buffer_list *bl,
					  unsigned int issue_flags)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	__u16 tail, head = bl->head;
	struct io_uring_buf *buf;
	void __user *ret;

	tail = smp_load_acquire(&br->tail);
	if (unlikely(tail == head))
		return NULL;

	if (head + 1 == tail)
		req->flags |= REQ_F_BL_EMPTY;

	buf = io_ring_head_to_buf(br, head, bl->mask);
	if (*len == 0 || *len > buf->len)
		*len = buf->len;
	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
	req->buf_list = bl;
	req->buf_index = buf->bid;
	ret = u64_to_user_ptr(buf->addr);

	if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
		/*
		 * If we came in unlocked, we have no choice but to consume the
		 * buffer here, otherwise nothing ensures that the buffer won't
		 * get used by others. This does mean it'll be pinned until the
		 * IO completes; coming in unlocked means we're being called
		 * from io-wq context and there may be further retries in async
		 * hybrid mode. For the locked case, the caller must call commit
		 * when the transfer completes (or if we get -EAGAIN and must
		 * poll or retry).
		 */
		io_kbuf_commit(req, bl, *len, 1);
		req->buf_list = NULL;
	}
	return ret;
}
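
/*
 * For context, a minimal sketch of the userspace producer this function
 * consumes from, assuming a ring already registered for this buffer group
 * and mapped at `br` (buffer allocation and error handling omitted;
 * liburing wraps this in io_uring_buf_ring_add()/io_uring_buf_ring_advance()).
 * The release store on br->tail pairs with the smp_load_acquire() above:
 *
 *	struct io_uring_buf *slot = &br->bufs[tail & mask];
 *
 *	slot->addr = (unsigned long) addr;	// buffer to hand to the kernel
 *	slot->len = len;
 *	slot->bid = bid;			// comes back in cqe->flags
 *	tail++;
 *	__atomic_store_n(&br->tail, tail, __ATOMIC_RELEASE);	// publish
 */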

void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	void __user *ret = NULL;

	io_ring_submit_lock(req->ctx, issue_flags);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (likely(bl)) {
		if (bl->flags & IOBL_BUF_RING)
			ret = io_ring_buffer_select(req, len, bl, issue_flags);
		else
			ret = io_provided_buffer_select(req, len, bl);
	}
	io_ring_submit_unlock(req->ctx, issue_flags);
	return ret;
}

/* cap it at a reasonable 256, will be one page even for 4K */
#define PEEK_MAX_IMPORT		256

static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
				struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br = bl->buf_ring;
	struct iovec *iov = arg->iovs;
	int nr_iovs = arg->nr_iovs;
	__u16 nr_avail, tail, head;
	struct io_uring_buf *buf;

	tail = smp_load_acquire(&br->tail);
	head = bl->head;
	nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
	if (unlikely(!nr_avail))
		return -ENOBUFS;

	buf = io_ring_head_to_buf(br, head, bl->mask);
	if (arg->max_len) {
		u32 len = READ_ONCE(buf->len);

		if (unlikely(!len))
			return -ENOBUFS;
		/*
		 * Limit incremental buffers to 1 segment. No point trying
		 * to peek ahead and map more than we need, when the buffers
		 * themselves should be large when set up with
		 * IOU_PBUF_RING_INC.
		 */
		if (bl->flags & IOBL_INC) {
			nr_avail = 1;
		} else {
			size_t needed;

			needed = (arg->max_len + len - 1) / len;
			needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
			if (nr_avail > needed)
				nr_avail = needed;
		}
	}

	/*
	 * Only alloc a bigger array if we know we have data to map, e.g. not
	 * a speculative peek operation.
	 */
	if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
		iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
		if (unlikely(!iov))
			return -ENOMEM;
		if (arg->mode & KBUF_MODE_FREE)
			kfree(arg->iovs);
		arg->iovs = iov;
		nr_iovs = nr_avail;
	} else if (nr_avail < nr_iovs) {
		nr_iovs = nr_avail;
	}

	/* set it to max, if not set, so we can use it unconditionally */
	if (!arg->max_len)
		arg->max_len = INT_MAX;

	req->buf_index = buf->bid;
	do {
		u32 len = buf->len;

		/* truncate end piece, if needed, for non-partial buffers */
		if (len > arg->max_len) {
			len = arg->max_len;
			if (!(bl->flags & IOBL_INC))
				buf->len = len;
		}

		iov->iov_base = u64_to_user_ptr(buf->addr);
		iov->iov_len = len;
		iov++;

		arg->out_len += len;
		arg->max_len -= len;
		if (!arg->max_len)
			break;

		buf = io_ring_head_to_buf(br, ++head, bl->mask);
	} while (--nr_iovs);

	if (head == tail)
		req->flags |= REQ_F_BL_EMPTY;

	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	return iov - arg->iovs;
}
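
/*
 * A worked example of the sizing logic above, with illustrative numbers:
 * for arg->max_len = 10000 and ring buffers of len = 4096, needed =
 * (10000 + 4095) / 4096 = 3, so at most three iovecs are mapped even if
 * more buffers sit in the ring; PEEK_MAX_IMPORT caps the count at 256
 * either way. For an IOBL_INC ring, the peek always maps a single segment.
 */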

int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
		      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = -ENOENT;

	io_ring_submit_lock(ctx, issue_flags);
	bl = io_buffer_get_list(ctx, req->buf_index);
	if (unlikely(!bl))
		goto out_unlock;

	if (bl->flags & IOBL_BUF_RING) {
		ret = io_ring_buffers_peek(req, arg, bl);
		/*
		 * Don't recycle these buffers if we need to go through poll.
		 * Nobody else can use them anyway, and holding on to provided
		 * buffers for a send/write operation would happen on the app
		 * side anyway with normal buffers. Besides, we already
		 * committed them, they cannot be put back in the queue.
		 */
		if (ret > 0) {
			req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
			io_kbuf_commit(req, bl, arg->out_len, ret);
		}
	} else {
		ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
	}
out_unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}

int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	bl = io_buffer_get_list(ctx, req->buf_index);
	if (unlikely(!bl))
		return -ENOENT;

	if (bl->flags & IOBL_BUF_RING) {
		ret = io_ring_buffers_peek(req, arg, bl);
		if (ret > 0)
			req->flags |= REQ_F_BUFFERS_COMMIT;
		return ret;
	}

	/* don't support multiple buffer selections for legacy */
	return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
}

static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	if (bl->flags & IOBL_BUF_RING) {
		i = bl->buf_ring->tail - bl->head;
		if (bl->buf_nr_pages) {
			int j;

			if (!(bl->flags & IOBL_MMAP)) {
				for (j = 0; j < bl->buf_nr_pages; j++)
					unpin_user_page(bl->buf_pages[j]);
			}
			io_pages_unmap(bl->buf_ring, &bl->buf_pages,
				       &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
			bl->flags &= ~IOBL_MMAP;
		}
		/* make sure it's seen as empty */
		INIT_LIST_HEAD(&bl->buf_list);
		bl->flags &= ~IOBL_BUF_RING;
		return i;
	}

	/* protects io_buffers_cache */
	lockdep_assert_held(&ctx->uring_lock);

	while (!list_empty(&bl->buf_list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
		list_move(&nxt->list, &ctx->io_buffers_cache);
		if (++i == nbufs)
			return i;
		cond_resched();
	}

	return i;
}

void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	if (atomic_dec_and_test(&bl->refs)) {
		__io_remove_buffers(ctx, bl, -1U);
		kfree_rcu(bl, rcu);
	}
}

void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	struct io_buffer_list *bl;
	struct list_head *item, *tmp;
	struct io_buffer *buf;
	unsigned long index;

	xa_for_each(&ctx->io_bl_xa, index, bl) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		io_put_bl(ctx, bl);
	}

	/*
	 * Move deferred locked entries to cache before pruning
	 */
	spin_lock(&ctx->completion_lock);
	if (!list_empty(&ctx->io_buffers_comp))
		list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
	spin_unlock(&ctx->completion_lock);

	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
		buf = list_entry(item, struct io_buffer, list);
		kmem_cache_free(io_buf_cachep, buf);
	}
}

static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
	xa_erase(&ctx->io_bl_xa, bl->bgid);
	io_put_bl(ctx, bl);
}

int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	ret = -ENOENT;
	bl = io_buffer_get_list(ctx, p->bgid);
	if (bl) {
		ret = -EINVAL;
		/* can't use provide/remove buffers command on mapped buffers */
		if (!(bl->flags & IOBL_BUF_RING))
			ret = __io_remove_buffers(ctx, bl, p->nbufs);
	}
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	u64 tmp;

	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > MAX_BIDS_PER_BGID)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
			       &size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
		return -EINVAL;
	p->bid = tmp;
	return 0;
}
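
/*
 * For reference, how the SQE fields map for IORING_OP_PROVIDE_BUFFERS as
 * decoded by the prep function above (a userspace sketch using raw SQE
 * fields; liburing's io_uring_prep_provide_buffers() fills in the same
 * thing):
 *
 *	sqe->opcode	= IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd		= nbufs;	// number of buffers, 1..65536
 *	sqe->addr	= (__u64) base;	// start of the contiguous region
 *	sqe->len	= buf_len;	// length of each buffer
 *	sqe->buf_group	= bgid;		// buffer group ID
 *	sqe->off	= first_bid;	// BID of the first buffer, <= USHRT_MAX
 */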

#define IO_BUFFER_ALLOC_BATCH 64

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
	struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
	int allocated;

	/*
	 * Completions that don't happen inline (e.g. not under the uring_lock)
	 * will add to ->io_buffers_comp. If we don't have any free buffers,
	 * check the completion list and splice those entries first.
	 */
	if (!list_empty_careful(&ctx->io_buffers_comp)) {
		spin_lock(&ctx->completion_lock);
		if (!list_empty(&ctx->io_buffers_comp)) {
			list_splice_init(&ctx->io_buffers_comp,
					 &ctx->io_buffers_cache);
			spin_unlock(&ctx->completion_lock);
			return 0;
		}
		spin_unlock(&ctx->completion_lock);
	}

	/*
	 * No free buffers and no completion entries either. Allocate a new
	 * batch of buffer entries and add those to our freelist.
	 */
	allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
					  ARRAY_SIZE(bufs), (void **) bufs);
	if (unlikely(!allocated)) {
		/*
		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
		 * retry single alloc to be on the safe side.
		 */
		bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
		if (!bufs[0])
			return -ENOMEM;
		allocated = 1;
	}

	while (allocated)
		list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);

	return 0;
}

static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
			  struct io_buffer_list *bl)
{
	struct io_buffer *buf;
	u64 addr = pbuf->addr;
	int i, bid = pbuf->bid;

	for (i = 0; i < pbuf->nbufs; i++) {
		if (list_empty(&ctx->io_buffers_cache) &&
		    io_refill_buffer_cache(ctx))
			break;
		buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
				       list);
		list_move_tail(&buf->list, &bl->buf_list);
		buf->addr = addr;
		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
		buf->bid = bid;
		buf->bgid = pbuf->bgid;
		addr += pbuf->len;
		bid++;
		cond_resched();
	}

	return i ? 0 : -ENOMEM;
}
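
/*
 * Example of the layout io_add_buffers() produces, with illustrative values:
 * for pbuf->addr = 0x100000, pbuf->len = 4096, pbuf->nbufs = 3 and
 * pbuf->bid = 0, the group gains buffers at 0x100000, 0x101000 and 0x102000
 * with BIDs 0, 1 and 2; each buffer's usable length is capped at MAX_RW_COUNT.
 */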

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer_list *bl;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);

	bl = io_buffer_get_list(ctx, p->bgid);
	if (unlikely(!bl)) {
		bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
		if (!bl) {
			ret = -ENOMEM;
			goto err;
		}
		INIT_LIST_HEAD(&bl->buf_list);
		ret = io_buffer_add_list(ctx, bl, p->bgid);
		if (ret) {
			/*
			 * Doesn't need rcu free as it was never visible, but
			 * let's keep it consistent throughout.
			 */
			kfree_rcu(bl, rcu);
			goto err;
		}
	}
	/* can't add buffers via this command for a mapped buffer ring */
	if (bl->flags & IOBL_BUF_RING) {
		ret = -EINVAL;
		goto err;
	}

	ret = io_add_buffers(ctx, p, bl);
err:
	io_ring_submit_unlock(ctx, issue_flags);

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
			    struct io_buffer_list *bl)
{
	struct io_uring_buf_ring *br = NULL;
	struct page **pages;
	int nr_pages, ret;

	pages = io_pin_pages(reg->ring_addr,
			     flex_array_size(br, bufs, reg->ring_entries),
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!br) {
		ret = -ENOMEM;
		goto error_unpin;
	}

#ifdef SHM_COLOUR
	/*
	 * On platforms that have specific aliasing requirements, SHM_COLOUR
	 * is set and we must guarantee that the kernel and user side align
	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
	 * the application mmaps the provided ring buffer. Fail the request
	 * if we, by chance, don't end up with aligned addresses. The app
	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
	 * this transparently.
	 */
	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
		ret = -EINVAL;
		goto error_unpin;
	}
#endif
	bl->buf_pages = pages;
	bl->buf_nr_pages = nr_pages;
	bl->buf_ring = br;
	bl->flags |= IOBL_BUF_RING;
	bl->flags &= ~IOBL_MMAP;
	return 0;
error_unpin:
	unpin_user_pages(pages, nr_pages);
	kvfree(pages);
	vunmap(br);
	return ret;
}

static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
			      struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
{
	size_t ring_size;

	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);

	bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
	if (IS_ERR(bl->buf_ring)) {
		bl->buf_ring = NULL;
		return -ENOMEM;
	}

	bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
	return 0;
}

int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
		return -EINVAL;
	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
			return -EEXIST;
		io_destroy_bl(ctx, bl);
	}

	free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
	if (!bl)
		return -ENOMEM;

	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(ctx, &reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;
		if (reg.flags & IOU_PBUF_RING_INC)
			bl->flags |= IOBL_INC;

		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree_rcu(free_bl, rcu);
	return ret;
}
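
/*
 * A minimal userspace sketch of registering a ring that this function
 * accepts, using the raw register syscall (liburing's
 * io_uring_register_buf_ring() / io_uring_setup_buf_ring() wrap this).
 * Assumes `ring_fd` is an io_uring fd and `br` points to a page-aligned,
 * zeroed allocation of `8 * sizeof(struct io_uring_buf)` bytes:
 *
 *	struct io_uring_buf_reg reg = {
 *		.ring_addr	= (unsigned long) br,
 *		.ring_entries	= 8,		// power of 2, below 65536
 *		.bgid		= 1,		// buffer group ID
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_PBUF_RING, &reg, 1);
 *
 * With IOU_PBUF_RING_MMAP in reg.flags, ring_addr must instead be zero and
 * the kernel-allocated ring is mmap'ed back by the application (see
 * io_pbuf_mmap() below).
 */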

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	lockdep_assert_held(&ctx->uring_lock);

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!(bl->flags & IOBL_BUF_RING))
		return -EINVAL;

	xa_erase(&ctx->io_bl_xa, bl->bgid);
	io_put_bl(ctx, bl);
	return 0;
}

int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_status buf_status;
	struct io_buffer_list *bl;
	int i;

	if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
		return -EFAULT;

	for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
		if (buf_status.resv[i])
			return -EINVAL;

	bl = io_buffer_get_list(ctx, buf_status.buf_group);
	if (!bl)
		return -ENOENT;
	if (!(bl->flags & IOBL_BUF_RING))
		return -EINVAL;

	buf_status.head = bl->head;
	if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
		return -EFAULT;

	return 0;
}
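
/*
 * The matching query from userspace, shown with the raw register syscall;
 * the returned head tells an application how far the kernel has consumed
 * its provided buffer ring for that group:
 *
 *	struct io_uring_buf_status st = { .buf_group = 1 };
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_PBUF_STATUS, &st, 1);
 *	// st.head now mirrors bl->head for buffer group 1
 */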

struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
				      unsigned long bgid)
{
	struct io_buffer_list *bl;
	bool ret;

	/*
	 * We have to be a bit careful here - we're inside mmap and cannot grab
	 * the uring_lock. This means the buffer_list could be simultaneously
	 * going away, if someone is trying to be sneaky. Look it up under rcu
	 * so we know it's not going away, and attempt to grab a reference to
	 * it. If the ref is already zero, then fail the mapping. If successful,
	 * the caller will call io_put_bl() to drop the reference at the end.
	 * This may then safely free the buffer_list (and drop the pages) at
	 * that point; vm_insert_pages() would've already grabbed the
	 * necessary vma references.
	 */
	rcu_read_lock();
	bl = xa_load(&ctx->io_bl_xa, bgid);
	/* must be a mmap'able buffer ring and have pages */
	ret = false;
	if (bl && bl->flags & IOBL_MMAP)
		ret = atomic_inc_not_zero(&bl->refs);
	rcu_read_unlock();

	if (ret)
		return bl;

	return ERR_PTR(-EINVAL);
}

int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
	struct io_buffer_list *bl;
	int bgid, ret;

	bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
	bl = io_pbuf_get_bl(ctx, bgid);
	if (IS_ERR(bl))
		return PTR_ERR(bl);

	ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
	io_put_bl(ctx, bl);
	return ret;
}
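
/*
 * For reference, the userspace side of the offset decoded above: to map a
 * kernel-allocated (IOU_PBUF_RING_MMAP) ring for buffer group `bgid`, the
 * application encodes the group ID in the mmap offset, e.g.:
 *
 *	off_t off = IORING_OFF_PBUF_RING |
 *		    ((__u64) bgid << IORING_OFF_PBUF_SHIFT);
 *	br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
 *		  MAP_SHARED | MAP_POPULATE, ring_fd, off);
 *
 * This is the inverse of the bgid extraction in io_pbuf_mmap().
 */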