syscalls.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. #include <linux/syscalls.h>
  3. #include <linux/time_namespace.h>
  4. #include "futex.h"
  5. /*
  6. * Support for robust futexes: the kernel cleans up held futexes at
  7. * thread exit time.
  8. *
  9. * Implementation: user-space maintains a per-thread list of locks it
  10. * is holding. Upon do_exit(), the kernel carefully walks this list,
  11. * and marks all locks that are owned by this thread with the
  12. * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  13. * always manipulated with the lock held, so the list is private and
  14. * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  15. * field, to allow the kernel to clean up if the thread dies after
  16. * acquiring the lock, but just before it could have added itself to
  17. * the list. There can only be one such pending lock.
  18. */
  19. /**
  20. * sys_set_robust_list() - Set the robust-futex list head of a task
  21. * @head: pointer to the list-head
  22. * @len: length of the list-head, as userspace expects
  23. */
  24. SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
  25. size_t, len)
  26. {
  27. /*
  28. * The kernel knows only one size for now:
  29. */
  30. if (unlikely(len != sizeof(*head)))
  31. return -EINVAL;
  32. current->robust_list = head;
  33. return 0;
  34. }
  35. /**
  36. * sys_get_robust_list() - Get the robust-futex list head of a task
  37. * @pid: pid of the process [zero for current task]
  38. * @head_ptr: pointer to a list-head pointer, the kernel fills it in
  39. * @len_ptr: pointer to a length field, the kernel fills in the header size
  40. */
  41. SYSCALL_DEFINE3(get_robust_list, int, pid,
  42. struct robust_list_head __user * __user *, head_ptr,
  43. size_t __user *, len_ptr)
  44. {
  45. struct robust_list_head __user *head;
  46. unsigned long ret;
  47. struct task_struct *p;
  48. rcu_read_lock();
  49. ret = -ESRCH;
  50. if (!pid)
  51. p = current;
  52. else {
  53. p = find_task_by_vpid(pid);
  54. if (!p)
  55. goto err_unlock;
  56. }
  57. ret = -EPERM;
  58. if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
  59. goto err_unlock;
  60. head = p->robust_list;
  61. rcu_read_unlock();
  62. if (put_user(sizeof(*head), len_ptr))
  63. return -EFAULT;
  64. return put_user(head, head_ptr);
  65. err_unlock:
  66. rcu_read_unlock();
  67. return ret;
  68. }
  69. long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
  70. u32 __user *uaddr2, u32 val2, u32 val3)
  71. {
  72. unsigned int flags = futex_to_flags(op);
  73. int cmd = op & FUTEX_CMD_MASK;
  74. if (flags & FLAGS_CLOCKRT) {
  75. if (cmd != FUTEX_WAIT_BITSET &&
  76. cmd != FUTEX_WAIT_REQUEUE_PI &&
  77. cmd != FUTEX_LOCK_PI2)
  78. return -ENOSYS;
  79. }
  80. switch (cmd) {
  81. case FUTEX_WAIT:
  82. val3 = FUTEX_BITSET_MATCH_ANY;
  83. fallthrough;
  84. case FUTEX_WAIT_BITSET:
  85. return futex_wait(uaddr, flags, val, timeout, val3);
  86. case FUTEX_WAKE:
  87. val3 = FUTEX_BITSET_MATCH_ANY;
  88. fallthrough;
  89. case FUTEX_WAKE_BITSET:
  90. return futex_wake(uaddr, flags, val, val3);
  91. case FUTEX_REQUEUE:
  92. return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0);
  93. case FUTEX_CMP_REQUEUE:
  94. return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0);
  95. case FUTEX_WAKE_OP:
  96. return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
  97. case FUTEX_LOCK_PI:
  98. flags |= FLAGS_CLOCKRT;
  99. fallthrough;
  100. case FUTEX_LOCK_PI2:
  101. return futex_lock_pi(uaddr, flags, timeout, 0);
  102. case FUTEX_UNLOCK_PI:
  103. return futex_unlock_pi(uaddr, flags);
  104. case FUTEX_TRYLOCK_PI:
  105. return futex_lock_pi(uaddr, flags, NULL, 1);
  106. case FUTEX_WAIT_REQUEUE_PI:
  107. val3 = FUTEX_BITSET_MATCH_ANY;
  108. return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
  109. uaddr2);
  110. case FUTEX_CMP_REQUEUE_PI:
  111. return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1);
  112. }
  113. return -ENOSYS;
  114. }
  115. static __always_inline bool futex_cmd_has_timeout(u32 cmd)
  116. {
  117. switch (cmd) {
  118. case FUTEX_WAIT:
  119. case FUTEX_LOCK_PI:
  120. case FUTEX_LOCK_PI2:
  121. case FUTEX_WAIT_BITSET:
  122. case FUTEX_WAIT_REQUEUE_PI:
  123. return true;
  124. }
  125. return false;
  126. }
  127. static __always_inline int
  128. futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
  129. {
  130. if (!timespec64_valid(ts))
  131. return -EINVAL;
  132. *t = timespec64_to_ktime(*ts);
  133. if (cmd == FUTEX_WAIT)
  134. *t = ktime_add_safe(ktime_get(), *t);
  135. else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
  136. *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
  137. return 0;
  138. }
  139. SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
  140. const struct __kernel_timespec __user *, utime,
  141. u32 __user *, uaddr2, u32, val3)
  142. {
  143. int ret, cmd = op & FUTEX_CMD_MASK;
  144. ktime_t t, *tp = NULL;
  145. struct timespec64 ts;
  146. if (utime && futex_cmd_has_timeout(cmd)) {
  147. if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
  148. return -EFAULT;
  149. if (get_timespec64(&ts, utime))
  150. return -EFAULT;
  151. ret = futex_init_timeout(cmd, op, &ts, &t);
  152. if (ret)
  153. return ret;
  154. tp = &t;
  155. }
  156. return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
  157. }
  158. /**
  159. * futex_parse_waitv - Parse a waitv array from userspace
  160. * @futexv: Kernel side list of waiters to be filled
  161. * @uwaitv: Userspace list to be parsed
  162. * @nr_futexes: Length of futexv
  163. * @wake: Wake to call when futex is woken
  164. * @wake_data: Data for the wake handler
  165. *
  166. * Return: Error code on failure, 0 on success
  167. */
  168. int futex_parse_waitv(struct futex_vector *futexv,
  169. struct futex_waitv __user *uwaitv,
  170. unsigned int nr_futexes, futex_wake_fn *wake,
  171. void *wake_data)
  172. {
  173. struct futex_waitv aux;
  174. unsigned int i;
  175. for (i = 0; i < nr_futexes; i++) {
  176. unsigned int flags;
  177. if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
  178. return -EFAULT;
  179. if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved)
  180. return -EINVAL;
  181. flags = futex2_to_flags(aux.flags);
  182. if (!futex_flags_valid(flags))
  183. return -EINVAL;
  184. if (!futex_validate_input(flags, aux.val))
  185. return -EINVAL;
  186. futexv[i].w.flags = flags;
  187. futexv[i].w.val = aux.val;
  188. futexv[i].w.uaddr = aux.uaddr;
  189. futexv[i].q = futex_q_init;
  190. futexv[i].q.wake = wake;
  191. futexv[i].q.wake_data = wake_data;
  192. }
  193. return 0;
  194. }
  195. static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
  196. clockid_t clockid, struct hrtimer_sleeper *to)
  197. {
  198. int flag_clkid = 0, flag_init = 0;
  199. struct timespec64 ts;
  200. ktime_t time;
  201. int ret;
  202. if (!timeout)
  203. return 0;
  204. if (clockid == CLOCK_REALTIME) {
  205. flag_clkid = FLAGS_CLOCKRT;
  206. flag_init = FUTEX_CLOCK_REALTIME;
  207. }
  208. if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
  209. return -EINVAL;
  210. if (get_timespec64(&ts, timeout))
  211. return -EFAULT;
  212. /*
  213. * Since there's no opcode for futex_waitv, use
  214. * FUTEX_WAIT_BITSET that uses absolute timeout as well
  215. */
  216. ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
  217. if (ret)
  218. return ret;
  219. futex_setup_timer(&time, to, flag_clkid, 0);
  220. return 0;
  221. }
  222. static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
  223. {
  224. hrtimer_cancel(&to->timer);
  225. destroy_hrtimer_on_stack(&to->timer);
  226. }
  227. /**
  228. * sys_futex_waitv - Wait on a list of futexes
  229. * @waiters: List of futexes to wait on
  230. * @nr_futexes: Length of futexv
  231. * @flags: Flag for timeout (monotonic/realtime)
  232. * @timeout: Optional absolute timeout.
  233. * @clockid: Clock to be used for the timeout, realtime or monotonic.
  234. *
  235. * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
  236. * if a futex_wake() is performed at any uaddr. The syscall returns immediately
  237. * if any waiter has *uaddr != val. *timeout is an optional timeout value for
  238. * the operation. Each waiter has individual flags. The `flags` argument for
  239. * the syscall should be used solely for specifying the timeout as realtime, if
  240. * needed. Flags for private futexes, sizes, etc. should be used on the
  241. * individual flags of each waiter.
  242. *
  243. * Returns the array index of one of the woken futexes. No further information
  244. * is provided: any number of other futexes may also have been woken by the
  245. * same event, and if more than one futex was woken, the retrned index may
  246. * refer to any one of them. (It is not necessaryily the futex with the
  247. * smallest index, nor the one most recently woken, nor...)
  248. */
  249. SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
  250. unsigned int, nr_futexes, unsigned int, flags,
  251. struct __kernel_timespec __user *, timeout, clockid_t, clockid)
  252. {
  253. struct hrtimer_sleeper to;
  254. struct futex_vector *futexv;
  255. int ret;
  256. /* This syscall supports no flags for now */
  257. if (flags)
  258. return -EINVAL;
  259. if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
  260. return -EINVAL;
  261. if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
  262. return ret;
  263. futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
  264. if (!futexv) {
  265. ret = -ENOMEM;
  266. goto destroy_timer;
  267. }
  268. ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark,
  269. NULL);
  270. if (!ret)
  271. ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
  272. kfree(futexv);
  273. destroy_timer:
  274. if (timeout)
  275. futex2_destroy_timeout(&to);
  276. return ret;
  277. }
  278. /*
  279. * sys_futex_wake - Wake a number of futexes
  280. * @uaddr: Address of the futex(es) to wake
  281. * @mask: bitmask
  282. * @nr: Number of the futexes to wake
  283. * @flags: FUTEX2 flags
  284. *
  285. * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
  286. * futex2 family of calls.
  287. */
  288. SYSCALL_DEFINE4(futex_wake,
  289. void __user *, uaddr,
  290. unsigned long, mask,
  291. int, nr,
  292. unsigned int, flags)
  293. {
  294. if (flags & ~FUTEX2_VALID_MASK)
  295. return -EINVAL;
  296. flags = futex2_to_flags(flags);
  297. if (!futex_flags_valid(flags))
  298. return -EINVAL;
  299. if (!futex_validate_input(flags, mask))
  300. return -EINVAL;
  301. return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask);
  302. }
  303. /*
  304. * sys_futex_wait - Wait on a futex
  305. * @uaddr: Address of the futex to wait on
  306. * @val: Value of @uaddr
  307. * @mask: bitmask
  308. * @flags: FUTEX2 flags
  309. * @timeout: Optional absolute timeout
  310. * @clockid: Clock to be used for the timeout, realtime or monotonic
  311. *
  312. * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
  313. * futex2 familiy of calls.
  314. */
  315. SYSCALL_DEFINE6(futex_wait,
  316. void __user *, uaddr,
  317. unsigned long, val,
  318. unsigned long, mask,
  319. unsigned int, flags,
  320. struct __kernel_timespec __user *, timeout,
  321. clockid_t, clockid)
  322. {
  323. struct hrtimer_sleeper to;
  324. int ret;
  325. if (flags & ~FUTEX2_VALID_MASK)
  326. return -EINVAL;
  327. flags = futex2_to_flags(flags);
  328. if (!futex_flags_valid(flags))
  329. return -EINVAL;
  330. if (!futex_validate_input(flags, val) ||
  331. !futex_validate_input(flags, mask))
  332. return -EINVAL;
  333. if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to)))
  334. return ret;
  335. ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask);
  336. if (timeout)
  337. futex2_destroy_timeout(&to);
  338. return ret;
  339. }
  340. /*
  341. * sys_futex_requeue - Requeue a waiter from one futex to another
  342. * @waiters: array describing the source and destination futex
  343. * @flags: unused
  344. * @nr_wake: number of futexes to wake
  345. * @nr_requeue: number of futexes to requeue
  346. *
  347. * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
  348. * futex2 family of calls.
  349. */
  350. SYSCALL_DEFINE4(futex_requeue,
  351. struct futex_waitv __user *, waiters,
  352. unsigned int, flags,
  353. int, nr_wake,
  354. int, nr_requeue)
  355. {
  356. struct futex_vector futexes[2];
  357. u32 cmpval;
  358. int ret;
  359. if (flags)
  360. return -EINVAL;
  361. if (!waiters)
  362. return -EINVAL;
  363. ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
  364. if (ret)
  365. return ret;
  366. cmpval = futexes[0].w.val;
  367. return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
  368. u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags,
  369. nr_wake, nr_requeue, &cmpval, 0);
  370. }
  371. #ifdef CONFIG_COMPAT
  372. COMPAT_SYSCALL_DEFINE2(set_robust_list,
  373. struct compat_robust_list_head __user *, head,
  374. compat_size_t, len)
  375. {
  376. if (unlikely(len != sizeof(*head)))
  377. return -EINVAL;
  378. current->compat_robust_list = head;
  379. return 0;
  380. }
  381. COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
  382. compat_uptr_t __user *, head_ptr,
  383. compat_size_t __user *, len_ptr)
  384. {
  385. struct compat_robust_list_head __user *head;
  386. unsigned long ret;
  387. struct task_struct *p;
  388. rcu_read_lock();
  389. ret = -ESRCH;
  390. if (!pid)
  391. p = current;
  392. else {
  393. p = find_task_by_vpid(pid);
  394. if (!p)
  395. goto err_unlock;
  396. }
  397. ret = -EPERM;
  398. if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
  399. goto err_unlock;
  400. head = p->compat_robust_list;
  401. rcu_read_unlock();
  402. if (put_user(sizeof(*head), len_ptr))
  403. return -EFAULT;
  404. return put_user(ptr_to_compat(head), head_ptr);
  405. err_unlock:
  406. rcu_read_unlock();
  407. return ret;
  408. }
  409. #endif /* CONFIG_COMPAT */
  410. #ifdef CONFIG_COMPAT_32BIT_TIME
  411. SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
  412. const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
  413. u32, val3)
  414. {
  415. int ret, cmd = op & FUTEX_CMD_MASK;
  416. ktime_t t, *tp = NULL;
  417. struct timespec64 ts;
  418. if (utime && futex_cmd_has_timeout(cmd)) {
  419. if (get_old_timespec32(&ts, utime))
  420. return -EFAULT;
  421. ret = futex_init_timeout(cmd, op, &ts, &t);
  422. if (ret)
  423. return ret;
  424. tp = &t;
  425. }
  426. return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
  427. }
  428. #endif /* CONFIG_COMPAT_32BIT_TIME */