pi.c

// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

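/*
 * For reference when reading the table above: the user space futex word
 * packs the owner TID together with two status bits. A minimal decoding
 * sketch (illustrative only, not part of this file; the masks are the
 * UAPI constants from <linux/futex.h>):
 *
 *	u32 uval;					// value read from *uaddr
 *	pid_t utid   = uval & FUTEX_TID_MASK;		// the "uTID" column
 *	bool uodied  = uval & FUTEX_OWNER_DIED;		// the "uODIED" column
 *	bool waiters = uval & FUTEX_WAITERS;		// waiters queued in the kernel
 */
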
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0                          CPU1
	 *
	 * sys_exit()                    sys_futex()
	 *  do_exit()                     futex_lock_pi()
	 *                                 futex_lock_pi_atomic()
	 *   exit_signals(tsk)              No waiters:
	 *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
	 *  mm_release(tsk)                 Set waiter bit
	 *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
	 *      Set owner died              attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;           tsk = get_task(PID);
	 *   }                               if (!tsk->flags & PF_EXITING) {
	 *   ...                               attach();
	 *   tsk->futex_state =              } else {
	 *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *                                        FUTEX_STATE_DEAD)
	 *                                       return -EAGAIN;
	 *                                     return -ESRCH; <--- FAIL
	 *                                   }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);

		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if (unlikely((uval & FUTEX_TID_MASK) == vpid))
		return -EDEADLK;

	if (unlikely(should_fail_futex(true)))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
 */
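/*
 * For orientation, a minimal sketch of the userspace side that ends up
 * in futex_lock_pi() below (illustrative only, not part of this file;
 * futex_word and the raw syscall invocation are assumptions):
 *
 *	u32 expected = 0;
 *	// Uncontended fast path: 0 -> TID in pure userspace.
 *	if (!atomic_compare_exchange_strong(&futex_word, &expected, gettid())) {
 *		// Contended: ask the kernel to block us with PI boosting.
 *		syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 */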
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb, current);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	/*
	 * Must be done before we enqueue the waiter. This is unfortunately
	 * done under the hb lock, but that *should* work because it does
	 * nothing.
	 */
	rt_mutex_pre_schedule();

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even though we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * unwind the above; however, we cannot lock hb->lock because
	 * rt_mutex already has a waiter enqueued and hb->lock can itself try
	 * and enqueue an rt_waiter through rtlock.
	 *
	 * Doing the cleanup without holding hb->lock can cause inconsistent
	 * state between hb and pi_state, but only in the direction of not
	 * seeing a waiter that is leaving.
	 *
	 * See futex_unlock_pi(), it deals with this inconsistency.
	 *
	 * There be dragons here, since we must deal with the inconsistency on
	 * the way out (here), it is impossible to detect/warn about the race
	 * the other way around (missing an incoming waiter).
	 *
	 * What could possibly go wrong...
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

	/*
	 * Now that the rt_waiter has been dequeued, it is safe to use
	 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
	 * the rt_mutex waiter state.
	 */
	spin_lock(q.lock_ptr);
	/*
	 * Waiter is unqueued.
	 */
	rt_mutex_post_schedule();

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
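/*
 * Userspace counterpart sketch (illustrative only, not part of this
 * file; futex_word is an assumption): the fast path is a TID -> 0
 * cmpxchg, and only a failed transition, e.g. because FUTEX_WAITERS
 * is set, takes the slow path below:
 *
 *	u32 expected = gettid();
 *	if (!atomic_compare_exchange_strong(&futex_word, &expected, 0))
 *		syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 */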
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);
retry_hb:

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular; this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving. The entry needs to be removed from the list so a
		 * new futex_lock_pi() is not using this stale PI-state while
		 * the futex is available in user space again.
		 * There can be more than one task on its way out so it needs
		 * to retry.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			__futex_unqueue(top_waiter);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto retry_hb;
		}

		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}