register.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

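/*
 * Answer an IORING_REGISTER_PROBE request: report the highest supported
 * opcode, mark each opcode this kernel implements with
 * IO_URING_OP_SUPPORTED, and copy the probe structure back to userspace.
 */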
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

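/*
 * Register the current task's credentials with the ring and return the
 * allocated personality id (or a negative error on failure).
 */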
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

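/*
 * Apply a userspace-supplied array of io_uring_restriction entries to the
 * ring: allowed register opcodes, allowed SQE opcodes, and allowed/required
 * SQE flags. Only permitted while the ring is still IORING_SETUP_R_DISABLED,
 * and only one registration is accepted per ring.
 */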
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}

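/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED: latch the
 * submitter task for IORING_SETUP_SINGLE_ISSUER, activate any registered
 * restrictions and wake a waiting SQPOLL thread.
 */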
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

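/*
 * Copy a CPU mask from userspace (handling compat callers) and apply it as
 * the io-wq worker affinity, either for the current task or, with SQPOLL,
 * for the SQPOLL thread's workers.
 */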
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

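/*
 * Update the maximum number of bounded/unbounded io-wq workers, copying the
 * previous limits back to userspace. For SQPOLL rings the limits apply to
 * the SQPOLL task's io-wq; otherwise they are propagated to every task
 * registered with this ring.
 */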
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

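/*
 * Select the clock this ring uses: CLOCK_MONOTONIC (the default) or
 * CLOCK_BOOTTIME.
 */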
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

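/*
 * Core of io_uring_register(): check the opcode against any registered
 * restrictions and dispatch to the per-opcode handler. Called with
 * ctx->uring_lock held; some handlers drop and reacquire it.
 */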
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

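/*
 * io_uring_register() entry point: resolve the ring file (normal or
 * registered fd), grab the uring_lock and dispatch the registration request.
 */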
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}