// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

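/*
 * Handle IORING_REGISTER_PROBE: fill the user-supplied io_uring_probe
 * structure with the set of opcodes this kernel build supports.
 */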
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

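/*
 * Drop a previously registered personality: remove the credentials stored
 * under 'id' and release the reference taken at registration time.
 */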
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

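/*
 * Snapshot the current task's credentials and store them in the ctx,
 * returning the allocated personality id on success.
 */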
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

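/*
 * Install opcode/flag restrictions for a ring that was created with
 * IORING_SETUP_R_DISABLED. Only a single registration is allowed, and it
 * must happen before the ring is enabled.
 */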
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}
			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}
			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

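/*
 * Enable a ring that was set up with IORING_SETUP_R_DISABLED, applying any
 * registered restrictions and waking an SQPOLL thread that may be waiting.
 */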
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

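/*
 * Apply a CPU affinity mask to the io-wq workers, or to the SQPOLL thread's
 * workqueue if the ring was set up with IORING_SETUP_SQPOLL. The unregister
 * path passes a NULL mask to reset the affinity.
 */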
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

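/*
 * Update the per-ctx limits on bounded/unbounded io-wq workers. For SQPOLL
 * rings the limits are applied to the SQPOLL task's io-wq; otherwise they
 * are propagated to every task registered with this ring, and the values
 * previously in effect are copied back to userspace.
 */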
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

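/*
 * Select the clock source this ring uses; only CLOCK_MONOTONIC and
 * CLOCK_BOOTTIME are accepted, and the reserved fields must be zero.
 */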
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

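/*
 * Dispatch a single io_uring_register() opcode with ctx->uring_lock held,
 * after validating registration restrictions for restricted rings.
 */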
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

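/*
 * Syscall entry point: resolve the ring file (normal or registered fd),
 * take the uring_lock, and dispatch the requested registration opcode.
 */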
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}