  1. // SPDX-License-Identifier: GPL-2.0
  2. /* kernel/rwsem.c: R/W semaphores, public implementation
  3. *
  4. * Written by David Howells (dhowells@redhat.com).
  5. * Derived from asm-i386/semaphore.h
  6. *
  7. * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
  8. * and Michel Lespinasse <walken@google.com>
  9. *
  10. * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
  11. * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
  12. *
  13. * Rwsem count bit fields re-definition and rwsem rearchitecture by
  14. * Waiman Long <longman@redhat.com> and
  15. * Peter Zijlstra <peterz@infradead.org>.
  16. */
  17. #include <linux/types.h>
  18. #include <linux/kernel.h>
  19. #include <linux/sched.h>
  20. #include <linux/sched/rt.h>
  21. #include <linux/sched/task.h>
  22. #include <linux/sched/debug.h>
  23. #include <linux/sched/wake_q.h>
  24. #include <linux/sched/signal.h>
  25. #include <linux/sched/clock.h>
  26. #include <linux/export.h>
  27. #include <linux/rwsem.h>
  28. #include <linux/atomic.h>
  29. #include <trace/events/lock.h>
  30. #ifndef CONFIG_PREEMPT_RT
  31. #include "lock_events.h"
  32. /*
  33. * The least significant 2 bits of the owner value have the following
  34. * meanings when set.
  35. * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
  36. * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
  37. *
  38. * When the rwsem is reader-owned and a spinning writer has timed out,
  39. * the nonspinnable bit will be set to disable optimistic spinning.
  40. * When a writer acquires a rwsem, it puts its task_struct pointer
  41. * into the owner field. It is cleared after an unlock.
  42. *
  43. * When a reader acquires a rwsem, it will also put its task_struct
  44. * pointer into the owner field with the RWSEM_READER_OWNED bit set.
  45. * On unlock, the owner field will largely be left untouched. So
  46. * for a free or reader-owned rwsem, the owner value may contain
  47. * information about the last reader that acquired the rwsem.
  48. *
  49. * That information may be helpful in debugging cases where the system
  50. * seems to hang on a reader-owned rwsem, especially if only one reader
  51. * is involved. Ideally we would like to track all the readers that own
  52. * a rwsem, but the overhead is simply too big.
  53. *
  54. * Fast-path reader optimistic lock stealing is supported when the rwsem
  55. * was previously owned by a writer and the following conditions are met:
  56. * - rwsem is not currently writer owned
  57. * - the handoff isn't set.
  58. */
  59. #define RWSEM_READER_OWNED (1UL << 0)
  60. #define RWSEM_NONSPINNABLE (1UL << 1)
  61. #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
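/*
 * For illustration: the owner word packs a task_struct pointer together
 * with the two flag bits above. task_struct allocations are at least
 * word aligned, so the low two bits of the pointer are free to carry
 * the flags, e.g.:
 *
 *	owner = (unsigned long)current | RWSEM_READER_OWNED;
 *	task  = (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
 *
 * rwsem_owner_flags() below does exactly this unpacking.
 */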
  62. #ifdef CONFIG_DEBUG_RWSEMS
  63. # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
  64. if (!debug_locks_silent && \
  65. WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
  66. #c, atomic_long_read(&(sem)->count), \
  67. (unsigned long) sem->magic, \
  68. atomic_long_read(&(sem)->owner), (long)current, \
  69. list_empty(&(sem)->wait_list) ? "" : "not ")) \
  70. debug_locks_off(); \
  71. } while (0)
  72. #else
  73. # define DEBUG_RWSEMS_WARN_ON(c, sem)
  74. #endif
  75. /*
  76. * On 64-bit architectures, the bit definitions of the count are:
  77. *
  78. * Bit 0 - writer locked bit
  79. * Bit 1 - waiters present bit
  80. * Bit 2 - lock handoff bit
  81. * Bits 3-7 - reserved
  82. * Bits 8-62 - 55-bit reader count
  83. * Bit 63 - read fail bit
  84. *
  85. * On 32-bit architectures, the bit definitions of the count are:
  86. *
  87. * Bit 0 - writer locked bit
  88. * Bit 1 - waiters present bit
  89. * Bit 2 - lock handoff bit
  90. * Bits 3-7 - reserved
  91. * Bits 8-30 - 23-bit reader count
  92. * Bit 31 - read fail bit
  93. *
  94. * It is not likely that the most significant bit (read fail bit) will ever
  95. * be set. This guard bit is still checked anyway in the down_read() fastpath
  96. * just in case we need to use up more of the reader bits for other purposes
  97. * in the future.
  98. *
  99. * atomic_long_fetch_add() is used to obtain the reader lock, whereas
  100. * atomic_long_cmpxchg() will be used to obtain the writer lock.
  101. *
  102. * There are three places where the lock handoff bit may be set or cleared.
  103. * 1) rwsem_mark_wake() for readers -- set, clear
  104. * 2) rwsem_try_write_lock() for writers -- set, clear
  105. * 3) rwsem_del_waiter() -- clear
  106. *
  107. * For all the above cases, wait_lock will be held. A writer must also
  108. * be the first one in the wait_list to be eligible for setting the handoff
  109. * bit. So concurrent setting/clearing of handoff bit is not possible.
  110. */
  111. #define RWSEM_WRITER_LOCKED (1UL << 0)
  112. #define RWSEM_FLAG_WAITERS (1UL << 1)
  113. #define RWSEM_FLAG_HANDOFF (1UL << 2)
  114. #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
  115. #define RWSEM_READER_SHIFT 8
  116. #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
  117. #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
  118. #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
  119. #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
  120. #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
  121. RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
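/*
 * Worked example of the count layout above (for illustration): with
 * RWSEM_READER_SHIFT == 8, each active reader adds 0x100 to the count.
 *
 *	count == 0x0001  ->  write-locked, no waiters
 *	count == 0x0302  ->  3 active readers, waiters queued
 *	count == 0x0007  ->  write-locked, waiters queued, handoff requested
 */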
  122. /*
  123. * All writes to owner are protected by WRITE_ONCE() to make sure that
  124. * store tearing can't happen as optimistic spinners may read and use
  125. * the owner value concurrently without lock. Read from owner, however,
  126. * may not need READ_ONCE() as long as the pointer value is only used
  127. * for comparison and isn't being dereferenced.
  128. *
  129. * Both rwsem_{set,clear}_owner() functions should be in the same
  130. * preempt disable section as the atomic op that changes sem->count.
  131. */
  132. static inline void rwsem_set_owner(struct rw_semaphore *sem)
  133. {
  134. lockdep_assert_preemption_disabled();
  135. atomic_long_set(&sem->owner, (long)current);
  136. }
  137. static inline void rwsem_clear_owner(struct rw_semaphore *sem)
  138. {
  139. lockdep_assert_preemption_disabled();
  140. atomic_long_set(&sem->owner, 0);
  141. }
  142. /*
  143. * Test the flags in the owner field.
  144. */
  145. static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
  146. {
  147. return atomic_long_read(&sem->owner) & flags;
  148. }
  149. /*
  150. * The task_struct pointer of the last owning reader will be left in
  151. * the owner field.
  152. *
  153. * Note that the owner value just indicates that the task has owned the
  154. * rwsem previously; it may not be the real owner or one of the real owners
  155. * anymore when that field is examined, so take it with a grain of salt.
  156. *
  157. * The reader non-spinnable bit is preserved.
  158. */
  159. static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
  160. struct task_struct *owner)
  161. {
  162. unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
  163. (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
  164. atomic_long_set(&sem->owner, val);
  165. }
  166. static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
  167. {
  168. __rwsem_set_reader_owned(sem, current);
  169. }
  170. #ifdef CONFIG_DEBUG_RWSEMS
  171. /*
  172. * Return just the real task structure pointer of the owner
  173. */
  174. static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
  175. {
  176. return (struct task_struct *)
  177. (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
  178. }
  179. /*
  180. * Return true if the rwsem is owned by a reader.
  181. */
  182. static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
  183. {
  184. /*
  185. * Check the count to see if it is write-locked.
  186. */
  187. long count = atomic_long_read(&sem->count);
  188. if (count & RWSEM_WRITER_MASK)
  189. return false;
  190. return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
  191. }
  192. /*
  193. * With CONFIG_DEBUG_RWSEMS configured, this makes sure that if there
  194. * is a task pointer in the owner field of a reader-owned rwsem, it is the
  195. * real owner or one of the real owners. The only exception is when the
  196. * unlock is done by up_read_non_owner().
  197. */
  198. static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
  199. {
  200. unsigned long val = atomic_long_read(&sem->owner);
  201. while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
  202. if (atomic_long_try_cmpxchg(&sem->owner, &val,
  203. val & RWSEM_OWNER_FLAGS_MASK))
  204. return;
  205. }
  206. }
  207. #else
  208. static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
  209. {
  210. }
  211. #endif
  212. /*
  213. * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag
  214. * remains set. Otherwise, the operation will be aborted.
  215. */
  216. static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
  217. {
  218. unsigned long owner = atomic_long_read(&sem->owner);
  219. do {
  220. if (!(owner & RWSEM_READER_OWNED))
  221. break;
  222. if (owner & RWSEM_NONSPINNABLE)
  223. break;
  224. } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
  225. owner | RWSEM_NONSPINNABLE));
  226. }
  227. static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
  228. {
  229. *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
  230. if (WARN_ON_ONCE(*cntp < 0))
  231. rwsem_set_nonspinnable(sem);
  232. if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
  233. rwsem_set_reader_owned(sem);
  234. return true;
  235. }
  236. return false;
  237. }
  238. static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
  239. {
  240. long tmp = RWSEM_UNLOCKED_VALUE;
  241. if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
  242. rwsem_set_owner(sem);
  243. return true;
  244. }
  245. return false;
  246. }
  247. /*
  248. * Return the real task structure pointer of the owner and the embedded
  249. * flags in the owner. pflags must be non-NULL.
  250. */
  251. static inline struct task_struct *
  252. rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
  253. {
  254. unsigned long owner = atomic_long_read(&sem->owner);
  255. *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
  256. return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
  257. }
  258. /*
  259. * Guide to the rw_semaphore's count field.
  260. *
  261. * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
  262. * by a writer.
  263. *
  264. * The lock is owned by readers when
  265. * (1) the RWSEM_WRITER_LOCKED isn't set in count,
  266. * (2) some of the reader bits are set in count, and
  267. * (3) the owner field has the RWSEM_READER_OWNED bit set.
  268. *
  269. * Having some reader bits set is not enough to guarantee a reader-owned
  270. * lock as the readers may be in the process of backing out from the count
  271. * and a writer has just released the lock. So another writer may steal
  272. * the lock immediately after that.
  273. */
  274. /*
  275. * Initialize an rwsem:
  276. */
  277. void __init_rwsem(struct rw_semaphore *sem, const char *name,
  278. struct lock_class_key *key)
  279. {
  280. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  281. /*
  282. * Make sure we are not reinitializing a held semaphore:
  283. */
  284. debug_check_no_locks_freed((void *)sem, sizeof(*sem));
  285. lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
  286. #endif
  287. #ifdef CONFIG_DEBUG_RWSEMS
  288. sem->magic = sem;
  289. #endif
  290. atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
  291. raw_spin_lock_init(&sem->wait_lock);
  292. INIT_LIST_HEAD(&sem->wait_list);
  293. atomic_long_set(&sem->owner, 0L);
  294. #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
  295. osq_lock_init(&sem->osq);
  296. #endif
  297. }
  298. EXPORT_SYMBOL(__init_rwsem);
  299. enum rwsem_waiter_type {
  300. RWSEM_WAITING_FOR_WRITE,
  301. RWSEM_WAITING_FOR_READ
  302. };
  303. struct rwsem_waiter {
  304. struct list_head list;
  305. struct task_struct *task;
  306. enum rwsem_waiter_type type;
  307. unsigned long timeout;
  308. bool handoff_set;
  309. };
  310. #define rwsem_first_waiter(sem) \
  311. list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
  312. enum rwsem_wake_type {
  313. RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
  314. RWSEM_WAKE_READERS, /* Wake readers only */
  315. RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
  316. };
  317. /*
  318. * The typical HZ value is either 250 or 1000. So set the minimum waiting
  319. * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
  320. * queue before initiating the handoff protocol.
  321. */
  322. #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
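/*
 * For illustration, DIV_ROUND_UP(HZ, 250) evaluates to:
 *	HZ = 1000  ->  4 jiffies  (4 ms)
 *	HZ =  250  ->  1 jiffy    (4 ms)
 *	HZ =  100  ->  1 jiffy    (10 ms, the one-jiffy floor exceeds 4 ms)
 */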
  323. /*
  324. * Magic number to batch-wakeup waiting readers, even when writers are
  325. * also present in the queue. This both limits the amount of work the
  326. * waking thread must do and also prevents any potential counter overflow,
  327. * however unlikely.
  328. */
  329. #define MAX_READERS_WAKEUP 0x100
  330. static inline void
  331. rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
  332. {
  333. lockdep_assert_held(&sem->wait_lock);
  334. list_add_tail(&waiter->list, &sem->wait_list);
  335. /* caller will set RWSEM_FLAG_WAITERS */
  336. }
  337. /*
  338. * Remove a waiter from the wait_list and clear flags.
  339. *
  340. * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
  341. * this function. Modify with care.
  342. *
  343. * Return: true if wait_list isn't empty and false otherwise
  344. */
  345. static inline bool
  346. rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
  347. {
  348. lockdep_assert_held(&sem->wait_lock);
  349. list_del(&waiter->list);
  350. if (likely(!list_empty(&sem->wait_list)))
  351. return true;
  352. atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
  353. return false;
  354. }
  355. /*
  356. * handle the lock release when processes blocked on it can now run
  357. * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
  358. * have been set.
  359. * - there must be someone on the queue
  360. * - the wait_lock must be held by the caller
  361. * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
  362. * to actually wake up the blocked task(s) and drop the reference count,
  363. * preferably when the wait_lock is released
  364. * - woken process blocks are discarded from the list after waiter->task is zeroed
  365. * - writers are only marked woken if downgrading is false
  366. *
  367. * Implies rwsem_del_waiter() for all woken readers.
  368. */
  369. static void rwsem_mark_wake(struct rw_semaphore *sem,
  370. enum rwsem_wake_type wake_type,
  371. struct wake_q_head *wake_q)
  372. {
  373. struct rwsem_waiter *waiter, *tmp;
  374. long oldcount, woken = 0, adjustment = 0;
  375. struct list_head wlist;
  376. lockdep_assert_held(&sem->wait_lock);
  377. /*
  378. * Take a peek at the queue head waiter such that we can determine
  379. * the wakeup(s) to perform.
  380. */
  381. waiter = rwsem_first_waiter(sem);
  382. if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
  383. if (wake_type == RWSEM_WAKE_ANY) {
  384. /*
  385. * Mark the writer at the front of the queue for wakeup.
  386. * Until the task is actually awoken later by the
  387. * caller, other writers are able to steal it.
  388. * Readers, on the other hand, will block as they
  389. * will notice the queued writer.
  390. */
  391. wake_q_add(wake_q, waiter->task);
  392. lockevent_inc(rwsem_wake_writer);
  393. }
  394. return;
  395. }
  396. /*
  397. * No reader wakeup if there are too many of them already.
  398. */
  399. if (unlikely(atomic_long_read(&sem->count) < 0))
  400. return;
  401. /*
  402. * Writers might steal the lock before we grant it to the next reader.
  403. * We prefer to do the first reader grant before counting readers
  404. * so we can bail out early if a writer stole the lock.
  405. */
  406. if (wake_type != RWSEM_WAKE_READ_OWNED) {
  407. struct task_struct *owner;
  408. adjustment = RWSEM_READER_BIAS;
  409. oldcount = atomic_long_fetch_add(adjustment, &sem->count);
  410. if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
  411. /*
  412. * When we've been waiting "too" long (for writers
  413. * to give up the lock), request a HANDOFF to
  414. * force the issue.
  415. */
  416. if (time_after(jiffies, waiter->timeout)) {
  417. if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
  418. adjustment -= RWSEM_FLAG_HANDOFF;
  419. lockevent_inc(rwsem_rlock_handoff);
  420. }
  421. waiter->handoff_set = true;
  422. }
  423. atomic_long_add(-adjustment, &sem->count);
  424. return;
  425. }
  426. /*
  427. * Set it to reader-owned to give spinners an early
  428. * indication that readers now have the lock.
  429. * The reader nonspinnable bit seen at slowpath entry of
  430. * the reader is copied over.
  431. */
  432. owner = waiter->task;
  433. __rwsem_set_reader_owned(sem, owner);
  434. }
  435. /*
  436. * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
  437. * queue. We know the number woken will be at least 1, as accounted
  438. * for above. Note we increment the 'active part' of the count by the
  439. * number of readers before waking any processes up.
  440. *
  441. * This is an adaptation of the phase-fair R/W locks where at the
  442. * reader phase (first waiter is a reader), all readers are eligible
  443. * to acquire the lock at the same time irrespective of their order
  444. * in the queue. The writers acquire the lock according to their
  445. * order in the queue.
  446. *
  447. * We have to do wakeup in 2 passes to prevent the possibility that
  448. * the reader count may be decremented before it is incremented. This
  449. * is because the to-be-woken waiter may not have slept yet. So it
  450. * may see waiter->task cleared, finish its critical section and
  451. * do an unlock before the reader count is incremented.
  452. *
  453. * 1) Collect the read-waiters in a separate list, count them and
  454. * fully increment the reader count in rwsem.
  455. * 2) For each waiter in the new list, clear waiter->task and
  456. * put them into wake_q to be woken up later.
  457. */
  458. INIT_LIST_HEAD(&wlist);
  459. list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
  460. if (waiter->type == RWSEM_WAITING_FOR_WRITE)
  461. continue;
  462. woken++;
  463. list_move_tail(&waiter->list, &wlist);
  464. /*
  465. * Limit # of readers that can be woken up per wakeup call.
  466. */
  467. if (unlikely(woken >= MAX_READERS_WAKEUP))
  468. break;
  469. }
  470. adjustment = woken * RWSEM_READER_BIAS - adjustment;
  471. lockevent_cond_inc(rwsem_wake_reader, woken);
  472. oldcount = atomic_long_read(&sem->count);
  473. if (list_empty(&sem->wait_list)) {
  474. /*
  475. * Combined with list_move_tail() above, this implies
  476. * rwsem_del_waiter().
  477. */
  478. adjustment -= RWSEM_FLAG_WAITERS;
  479. if (oldcount & RWSEM_FLAG_HANDOFF)
  480. adjustment -= RWSEM_FLAG_HANDOFF;
  481. } else if (woken) {
  482. /*
  483. * When we've woken a reader, we no longer need to force
  484. * writers to give up the lock and we can clear HANDOFF.
  485. */
  486. if (oldcount & RWSEM_FLAG_HANDOFF)
  487. adjustment -= RWSEM_FLAG_HANDOFF;
  488. }
  489. if (adjustment)
  490. atomic_long_add(adjustment, &sem->count);
  491. /* 2nd pass */
  492. list_for_each_entry_safe(waiter, tmp, &wlist, list) {
  493. struct task_struct *tsk;
  494. tsk = waiter->task;
  495. get_task_struct(tsk);
  496. /*
  497. * Ensure calling get_task_struct() before setting the reader
  498. * waiter to nil such that rwsem_down_read_slowpath() cannot
  499. * race with do_exit() by always holding a reference count
  500. * to the task to wakeup.
  501. */
  502. smp_store_release(&waiter->task, NULL);
  503. /*
  504. * Ensure issuing the wakeup (either by us or someone else)
  505. * after setting the reader waiter to nil.
  506. */
  507. wake_q_add_safe(wake_q, tsk);
  508. }
  509. }
  510. /*
  511. * Remove a waiter and try to wake up other waiters in the wait queue
  512. * This function is called from the out_nolock path of both the reader and
  513. * writer slowpaths with wait_lock held. It releases the wait_lock and
  514. * optionally wake up waiters before it returns.
  515. */
  516. static inline void
  517. rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
  518. struct wake_q_head *wake_q)
  519. __releases(&sem->wait_lock)
  520. {
  521. bool first = rwsem_first_waiter(sem) == waiter;
  522. wake_q_init(wake_q);
  523. /*
  524. * If the wait_list isn't empty and the waiter to be deleted is
  525. * the first waiter, we wake up the remaining waiters as they may
  526. * be eligible to acquire or spin on the lock.
  527. */
  528. if (rwsem_del_waiter(sem, waiter) && first)
  529. rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
  530. raw_spin_unlock_irq(&sem->wait_lock);
  531. if (!wake_q_empty(wake_q))
  532. wake_up_q(wake_q);
  533. }
  534. /*
  535. * This function must be called with the sem->wait_lock held to prevent
  536. * race conditions between checking the rwsem wait list and setting the
  537. * sem->count accordingly.
  538. *
  539. * Implies rwsem_del_waiter() on success.
  540. */
  541. static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
  542. struct rwsem_waiter *waiter)
  543. {
  544. struct rwsem_waiter *first = rwsem_first_waiter(sem);
  545. long count, new;
  546. lockdep_assert_held(&sem->wait_lock);
  547. count = atomic_long_read(&sem->count);
  548. do {
  549. bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
  550. if (has_handoff) {
  551. /*
  552. * Honor handoff bit and yield only when the first
  553. * waiter is the one that set it. Otherwise, we
  554. * still try to acquire the rwsem.
  555. */
  556. if (first->handoff_set && (waiter != first))
  557. return false;
  558. }
  559. new = count;
  560. if (count & RWSEM_LOCK_MASK) {
  561. /*
  562. * A waiter (first or not) can set the handoff bit
  563. * if it is an RT task or has waited in the wait queue
  564. * for too long.
  565. */
  566. if (has_handoff || (!rt_or_dl_task(waiter->task) &&
  567. !time_after(jiffies, waiter->timeout)))
  568. return false;
  569. new |= RWSEM_FLAG_HANDOFF;
  570. } else {
  571. new |= RWSEM_WRITER_LOCKED;
  572. new &= ~RWSEM_FLAG_HANDOFF;
  573. if (list_is_singular(&sem->wait_list))
  574. new &= ~RWSEM_FLAG_WAITERS;
  575. }
  576. } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
  577. /*
  578. * We have either acquired the lock with handoff bit cleared or set
  579. * the handoff bit. Only the first waiter can have its handoff_set
  580. * set here to enable optimistic spinning in the slowpath loop.
  581. */
  582. if (new & RWSEM_FLAG_HANDOFF) {
  583. first->handoff_set = true;
  584. lockevent_inc(rwsem_wlock_handoff);
  585. return false;
  586. }
  587. /*
  588. * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
  589. * success.
  590. */
  591. list_del(&waiter->list);
  592. rwsem_set_owner(sem);
  593. return true;
  594. }
  595. /*
  596. * The rwsem_spin_on_owner() function returns the following 4 values
  597. * depending on the lock owner state.
  598. * OWNER_NULL : owner is currently NULL
  599. * OWNER_WRITER: when owner changes and is a writer
  600. * OWNER_READER: when owner changes and the new owner may be a reader.
  601. * OWNER_NONSPINNABLE:
  602. * when optimistic spinning has to stop because either the
  603. * owner stops running, is unknown, or its timeslice has
  604. * been used up.
  605. */
  606. enum owner_state {
  607. OWNER_NULL = 1 << 0,
  608. OWNER_WRITER = 1 << 1,
  609. OWNER_READER = 1 << 2,
  610. OWNER_NONSPINNABLE = 1 << 3,
  611. };
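/*
 * Note: the owner_state values are distinct bits rather than sequential
 * enumerators so that several states can be tested with one mask, e.g.
 * the OWNER_SPINNABLE check in rwsem_optimistic_spin() below.
 */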
  612. #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
  613. /*
  614. * Try to acquire the write lock before the writer has been put on the wait queue.
  615. */
  616. static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
  617. {
  618. long count = atomic_long_read(&sem->count);
  619. while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
  620. if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
  621. count | RWSEM_WRITER_LOCKED)) {
  622. rwsem_set_owner(sem);
  623. lockevent_inc(rwsem_opt_lock);
  624. return true;
  625. }
  626. }
  627. return false;
  628. }
  629. static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
  630. {
  631. struct task_struct *owner;
  632. unsigned long flags;
  633. bool ret = true;
  634. if (need_resched()) {
  635. lockevent_inc(rwsem_opt_fail);
  636. return false;
  637. }
  638. /*
  639. * Disabled preemption is equivalent to an RCU read-side critical section,
  640. * thus the task_struct structure won't go away.
  641. */
  642. owner = rwsem_owner_flags(sem, &flags);
  643. /*
  644. * Don't check the read-owner as the entry may be stale.
  645. */
  646. if ((flags & RWSEM_NONSPINNABLE) ||
  647. (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
  648. ret = false;
  649. lockevent_cond_inc(rwsem_opt_fail, !ret);
  650. return ret;
  651. }
  652. #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
  653. static inline enum owner_state
  654. rwsem_owner_state(struct task_struct *owner, unsigned long flags)
  655. {
  656. if (flags & RWSEM_NONSPINNABLE)
  657. return OWNER_NONSPINNABLE;
  658. if (flags & RWSEM_READER_OWNED)
  659. return OWNER_READER;
  660. return owner ? OWNER_WRITER : OWNER_NULL;
  661. }
  662. static noinline enum owner_state
  663. rwsem_spin_on_owner(struct rw_semaphore *sem)
  664. {
  665. struct task_struct *new, *owner;
  666. unsigned long flags, new_flags;
  667. enum owner_state state;
  668. lockdep_assert_preemption_disabled();
  669. owner = rwsem_owner_flags(sem, &flags);
  670. state = rwsem_owner_state(owner, flags);
  671. if (state != OWNER_WRITER)
  672. return state;
  673. for (;;) {
  674. /*
  675. * When a waiting writer sets the handoff flag, it may spin
  676. * on the owner as well. Once that writer acquires the lock,
  677. * we can spin on it. So we don't need to quit even when the
  678. * handoff bit is set.
  679. */
  680. new = rwsem_owner_flags(sem, &new_flags);
  681. if ((new != owner) || (new_flags != flags)) {
  682. state = rwsem_owner_state(new, new_flags);
  683. break;
  684. }
  685. /*
  686. * Ensure we emit the owner->on_cpu dereference _after_
  687. * checking that sem->owner still matches owner; if that fails,
  688. * owner might point to free()d memory. If it still matches,
  689. * our spinning context has already disabled preemption, which
  690. * is equivalent to an RCU read-side critical section and ensures
  691. * the memory stays valid.
  692. */
  693. barrier();
  694. if (need_resched() || !owner_on_cpu(owner)) {
  695. state = OWNER_NONSPINNABLE;
  696. break;
  697. }
  698. cpu_relax();
  699. }
  700. return state;
  701. }
  702. /*
  703. * Calculate reader-owned rwsem spinning threshold for writer
  704. *
  705. * The more readers own the rwsem, the longer it will take for them to
  706. * wind down and free the rwsem. So the empirical formula used to
  707. * determine the actual spinning time limit here is:
  708. *
  709. * Spinning threshold = (10 + nr_readers/2)us
  710. *
  711. * The limit is capped to a maximum of 25us (30 readers). This is just
  712. * a heuristic and is subject to change in the future.
  713. */
  714. static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
  715. {
  716. long count = atomic_long_read(&sem->count);
  717. int readers = count >> RWSEM_READER_SHIFT;
  718. u64 delta;
  719. if (readers > 30)
  720. readers = 30;
  721. delta = (20 + readers) * NSEC_PER_USEC / 2;
  722. return sched_clock() + delta;
  723. }
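/*
 * Worked example of the formula above (for illustration): with 8 readers,
 * delta = (20 + 8) * NSEC_PER_USEC / 2 = 14000 ns, i.e. the (10 + 8/2) us
 * = 14 us given by the formula. With 30 or more readers the cap yields
 * (20 + 30) / 2 = 25 us.
 */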
  724. static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
  725. {
  726. bool taken = false;
  727. int prev_owner_state = OWNER_NULL;
  728. int loop = 0;
  729. u64 rspin_threshold = 0;
  730. /* sem->wait_lock should not be held when doing optimistic spinning */
  731. if (!osq_lock(&sem->osq))
  732. goto done;
  733. /*
  734. * Optimistically spin on the owner field and attempt to acquire the
  735. * lock whenever the owner changes. Spinning will be stopped when:
  736. * 1) the owning writer isn't running; or
  737. * 2) readers own the lock and spinning time has exceeded the limit.
  738. */
  739. for (;;) {
  740. enum owner_state owner_state;
  741. owner_state = rwsem_spin_on_owner(sem);
  742. if (!(owner_state & OWNER_SPINNABLE))
  743. break;
  744. /*
  745. * Try to acquire the lock
  746. */
  747. taken = rwsem_try_write_lock_unqueued(sem);
  748. if (taken)
  749. break;
  750. /*
  751. * Time-based reader-owned rwsem optimistic spinning
  752. */
  753. if (owner_state == OWNER_READER) {
  754. /*
  755. * Re-initialize rspin_threshold every time
  756. * the owner state changes from non-reader to reader.
  757. * This allows a writer to steal the lock in between
  758. * 2 reader phases and have the threshold reset at
  759. * the beginning of the 2nd reader phase.
  760. */
  761. if (prev_owner_state != OWNER_READER) {
  762. if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
  763. break;
  764. rspin_threshold = rwsem_rspin_threshold(sem);
  765. loop = 0;
  766. }
  767. /*
  768. * Check time threshold once every 16 iterations to
  769. * avoid calling sched_clock() too frequently so
  770. * as to reduce the average latency between the times
  771. * when the lock becomes free and when the spinner
  772. * is ready to do a trylock.
  773. */
  774. else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
  775. rwsem_set_nonspinnable(sem);
  776. lockevent_inc(rwsem_opt_nospin);
  777. break;
  778. }
  779. }
  780. /*
  781. * An RT task cannot do optimistic spinning if it cannot
  782. * be sure the lock holder is running or live-lock may
  783. * happen if the current task and the lock holder happen
  784. * to run on the same CPU. However, aborting optimistic
  785. * spinning whenever a NULL owner is detected may miss some
  786. * opportunities where spinning can continue without causing
  787. * problems.
  788. *
  789. * There are 2 possible cases where an RT task may be able
  790. * to continue spinning.
  791. *
  792. * 1) The lock owner is in the process of releasing the
  793. * lock, sem->owner is cleared but the lock has not
  794. * been released yet.
  795. * 2) The lock was free and the owner cleared, but another
  796. * task just came in and acquired the lock before
  797. * we tried to get it. The new owner may be a spinnable
  798. * writer.
  799. *
  800. * To take advantage of the two scenarios listed above, the RT
  801. * task is made to retry one more time to see if it can
  802. * acquire the lock or continue spinning on the new owning
  803. * writer. Of course, if the time lag is long enough or the
  804. * new owner is not a writer or spinnable, the RT task will
  805. * quit spinning.
  806. *
  807. * If the owner is a writer, the need_resched() check is
  808. * done inside rwsem_spin_on_owner(). If the owner is not
  809. * a writer, need_resched() check needs to be done here.
  810. */
  811. if (owner_state != OWNER_WRITER) {
  812. if (need_resched())
  813. break;
  814. if (rt_or_dl_task(current) &&
  815. (prev_owner_state != OWNER_WRITER))
  816. break;
  817. }
  818. prev_owner_state = owner_state;
  819. /*
  820. * The cpu_relax() call is a compiler barrier which forces
  821. * everything in this loop to be re-loaded. We don't need
  822. * memory barriers as we'll eventually observe the right
  823. * values at the cost of a few extra spins.
  824. */
  825. cpu_relax();
  826. }
  827. osq_unlock(&sem->osq);
  828. done:
  829. lockevent_cond_inc(rwsem_opt_fail, !taken);
  830. return taken;
  831. }
  832. /*
  833. * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
  834. * only be called when the reader count reaches 0.
  835. */
  836. static inline void clear_nonspinnable(struct rw_semaphore *sem)
  837. {
  838. if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
  839. atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
  840. }
  841. #else
  842. static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
  843. {
  844. return false;
  845. }
  846. static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
  847. {
  848. return false;
  849. }
  850. static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
  851. static inline enum owner_state
  852. rwsem_spin_on_owner(struct rw_semaphore *sem)
  853. {
  854. return OWNER_NONSPINNABLE;
  855. }
  856. #endif
  857. /*
  858. * Prepare to wake up waiter(s) in the wait queue by putting them into the
  859. * given wake_q if the rwsem lock owner isn't a writer. If the rwsem is
  860. * likely reader-owned, wake up the read lock waiters at the queue front;
  861. * otherwise, wake up whichever waiter is at the front.
  862. * This is being called from both reader and writer slow paths.
  863. */
  864. static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
  865. struct wake_q_head *wake_q)
  866. {
  867. enum rwsem_wake_type wake_type;
  868. if (count & RWSEM_WRITER_MASK)
  869. return;
  870. if (count & RWSEM_READER_MASK) {
  871. wake_type = RWSEM_WAKE_READERS;
  872. } else {
  873. wake_type = RWSEM_WAKE_ANY;
  874. clear_nonspinnable(sem);
  875. }
  876. rwsem_mark_wake(sem, wake_type, wake_q);
  877. }
  878. /*
  879. * Wait for the read lock to be granted
  880. */
  881. static struct rw_semaphore __sched *
  882. rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
  883. {
  884. long adjustment = -RWSEM_READER_BIAS;
  885. long rcnt = (count >> RWSEM_READER_SHIFT);
  886. struct rwsem_waiter waiter;
  887. DEFINE_WAKE_Q(wake_q);
  888. /*
  889. * To prevent a constant stream of readers from starving a sleeping
  890. * writer, don't attempt optimistic lock stealing if the lock is
  891. * very likely owned by readers.
  892. */
  893. if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
  894. (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
  895. goto queue;
  896. /*
  897. * Reader optimistic lock stealing.
  898. */
  899. if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
  900. rwsem_set_reader_owned(sem);
  901. lockevent_inc(rwsem_rlock_steal);
  902. /*
  903. * Wake up other readers in the wait queue if it is
  904. * the first reader.
  905. */
  906. if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
  907. raw_spin_lock_irq(&sem->wait_lock);
  908. if (!list_empty(&sem->wait_list))
  909. rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
  910. &wake_q);
  911. raw_spin_unlock_irq(&sem->wait_lock);
  912. wake_up_q(&wake_q);
  913. }
  914. return sem;
  915. }
  916. queue:
  917. waiter.task = current;
  918. waiter.type = RWSEM_WAITING_FOR_READ;
  919. waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
  920. waiter.handoff_set = false;
  921. raw_spin_lock_irq(&sem->wait_lock);
  922. if (list_empty(&sem->wait_list)) {
  923. /*
  924. * In case the wait queue is empty and the lock isn't owned
  925. * by a writer, this reader can exit the slowpath and return
  926. * immediately as its RWSEM_READER_BIAS has already been set
  927. * in the count.
  928. */
  929. if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
  930. /* Provide lock ACQUIRE */
  931. smp_acquire__after_ctrl_dep();
  932. raw_spin_unlock_irq(&sem->wait_lock);
  933. rwsem_set_reader_owned(sem);
  934. lockevent_inc(rwsem_rlock_fast);
  935. return sem;
  936. }
  937. adjustment += RWSEM_FLAG_WAITERS;
  938. }
  939. rwsem_add_waiter(sem, &waiter);
  940. /* we're now waiting on the lock, but no longer actively locking */
  941. count = atomic_long_add_return(adjustment, &sem->count);
  942. rwsem_cond_wake_waiter(sem, count, &wake_q);
  943. raw_spin_unlock_irq(&sem->wait_lock);
  944. if (!wake_q_empty(&wake_q))
  945. wake_up_q(&wake_q);
  946. trace_contention_begin(sem, LCB_F_READ);
  947. /* wait to be given the lock */
  948. for (;;) {
  949. set_current_state(state);
  950. if (!smp_load_acquire(&waiter.task)) {
  951. /* Matches rwsem_mark_wake()'s smp_store_release(). */
  952. break;
  953. }
  954. if (signal_pending_state(state, current)) {
  955. raw_spin_lock_irq(&sem->wait_lock);
  956. if (waiter.task)
  957. goto out_nolock;
  958. raw_spin_unlock_irq(&sem->wait_lock);
  959. /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
  960. break;
  961. }
  962. schedule_preempt_disabled();
  963. lockevent_inc(rwsem_sleep_reader);
  964. }
  965. __set_current_state(TASK_RUNNING);
  966. lockevent_inc(rwsem_rlock);
  967. trace_contention_end(sem, 0);
  968. return sem;
  969. out_nolock:
  970. rwsem_del_wake_waiter(sem, &waiter, &wake_q);
  971. __set_current_state(TASK_RUNNING);
  972. lockevent_inc(rwsem_rlock_fail);
  973. trace_contention_end(sem, -EINTR);
  974. return ERR_PTR(-EINTR);
  975. }
  976. /*
  977. * Wait until we successfully acquire the write lock
  978. */
  979. static struct rw_semaphore __sched *
  980. rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
  981. {
  982. struct rwsem_waiter waiter;
  983. DEFINE_WAKE_Q(wake_q);
  984. /* do optimistic spinning and steal lock if possible */
  985. if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
  986. /* rwsem_optimistic_spin() implies ACQUIRE on success */
  987. return sem;
  988. }
  989. /*
  990. * Optimistic spinning failed, proceed to the slowpath
  991. * and block until we can acquire the sem.
  992. */
  993. waiter.task = current;
  994. waiter.type = RWSEM_WAITING_FOR_WRITE;
  995. waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
  996. waiter.handoff_set = false;
  997. raw_spin_lock_irq(&sem->wait_lock);
  998. rwsem_add_waiter(sem, &waiter);
  999. /* we're now waiting on the lock */
  1000. if (rwsem_first_waiter(sem) != &waiter) {
  1001. rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
  1002. &wake_q);
  1003. if (!wake_q_empty(&wake_q)) {
  1004. /*
  1005. * We want to minimize wait_lock hold time especially
  1006. * when a large number of readers are to be woken up.
  1007. */
  1008. raw_spin_unlock_irq(&sem->wait_lock);
  1009. wake_up_q(&wake_q);
  1010. raw_spin_lock_irq(&sem->wait_lock);
  1011. }
  1012. } else {
  1013. atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
  1014. }
  1015. /* wait until we successfully acquire the lock */
  1016. set_current_state(state);
  1017. trace_contention_begin(sem, LCB_F_WRITE);
  1018. for (;;) {
  1019. if (rwsem_try_write_lock(sem, &waiter)) {
  1020. /* rwsem_try_write_lock() implies ACQUIRE on success */
  1021. break;
  1022. }
  1023. raw_spin_unlock_irq(&sem->wait_lock);
  1024. if (signal_pending_state(state, current))
  1025. goto out_nolock;
  1026. /*
  1027. * After setting the handoff bit and failing to acquire
  1028. * the lock, attempt to spin on the owner to accelerate lock
  1029. * transfer. If the previous owner is an on-cpu writer and it
  1030. * has just released the lock, OWNER_NULL will be returned.
  1031. * In this case, we attempt to acquire the lock again
  1032. * without sleeping.
  1033. */
  1034. if (waiter.handoff_set) {
  1035. enum owner_state owner_state;
  1036. owner_state = rwsem_spin_on_owner(sem);
  1037. if (owner_state == OWNER_NULL)
  1038. goto trylock_again;
  1039. }
  1040. schedule_preempt_disabled();
  1041. lockevent_inc(rwsem_sleep_writer);
  1042. set_current_state(state);
  1043. trylock_again:
  1044. raw_spin_lock_irq(&sem->wait_lock);
  1045. }
  1046. __set_current_state(TASK_RUNNING);
  1047. raw_spin_unlock_irq(&sem->wait_lock);
  1048. lockevent_inc(rwsem_wlock);
  1049. trace_contention_end(sem, 0);
  1050. return sem;
  1051. out_nolock:
  1052. __set_current_state(TASK_RUNNING);
  1053. raw_spin_lock_irq(&sem->wait_lock);
  1054. rwsem_del_wake_waiter(sem, &waiter, &wake_q);
  1055. lockevent_inc(rwsem_wlock_fail);
  1056. trace_contention_end(sem, -EINTR);
  1057. return ERR_PTR(-EINTR);
  1058. }
  1059. /*
  1060. * handle waking up a waiter on the semaphore
  1061. * - up_read/up_write has decremented the active part of count if we come here
  1062. */
  1063. static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
  1064. {
  1065. unsigned long flags;
  1066. DEFINE_WAKE_Q(wake_q);
  1067. raw_spin_lock_irqsave(&sem->wait_lock, flags);
  1068. if (!list_empty(&sem->wait_list))
  1069. rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
  1070. raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
  1071. wake_up_q(&wake_q);
  1072. return sem;
  1073. }
  1074. /*
  1075. * downgrade a write lock into a read lock
  1076. * - caller incremented waiting part of count and discovered it still negative
  1077. * - just wake up any readers at the front of the queue
  1078. */
  1079. static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
  1080. {
  1081. unsigned long flags;
  1082. DEFINE_WAKE_Q(wake_q);
  1083. raw_spin_lock_irqsave(&sem->wait_lock, flags);
  1084. if (!list_empty(&sem->wait_list))
  1085. rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
  1086. raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
  1087. wake_up_q(&wake_q);
  1088. return sem;
  1089. }
  1090. /*
  1091. * lock for reading
  1092. */
  1093. static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
  1094. {
  1095. int ret = 0;
  1096. long count;
  1097. preempt_disable();
  1098. if (!rwsem_read_trylock(sem, &count)) {
  1099. if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
  1100. ret = -EINTR;
  1101. goto out;
  1102. }
  1103. DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
  1104. }
  1105. out:
  1106. preempt_enable();
  1107. return ret;
  1108. }
  1109. static __always_inline void __down_read(struct rw_semaphore *sem)
  1110. {
  1111. __down_read_common(sem, TASK_UNINTERRUPTIBLE);
  1112. }
  1113. static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
  1114. {
  1115. return __down_read_common(sem, TASK_INTERRUPTIBLE);
  1116. }
  1117. static __always_inline int __down_read_killable(struct rw_semaphore *sem)
  1118. {
  1119. return __down_read_common(sem, TASK_KILLABLE);
  1120. }
  1121. static inline int __down_read_trylock(struct rw_semaphore *sem)
  1122. {
  1123. int ret = 0;
  1124. long tmp;
  1125. DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
  1126. preempt_disable();
  1127. tmp = atomic_long_read(&sem->count);
  1128. while (!(tmp & RWSEM_READ_FAILED_MASK)) {
  1129. if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
  1130. tmp + RWSEM_READER_BIAS)) {
  1131. rwsem_set_reader_owned(sem);
  1132. ret = 1;
  1133. break;
  1134. }
  1135. }
  1136. preempt_enable();
  1137. return ret;
  1138. }
  1139. /*
  1140. * lock for writing
  1141. */
  1142. static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
  1143. {
  1144. int ret = 0;
  1145. preempt_disable();
  1146. if (unlikely(!rwsem_write_trylock(sem))) {
  1147. if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
  1148. ret = -EINTR;
  1149. }
  1150. preempt_enable();
  1151. return ret;
  1152. }
  1153. static __always_inline void __down_write(struct rw_semaphore *sem)
  1154. {
  1155. __down_write_common(sem, TASK_UNINTERRUPTIBLE);
  1156. }
  1157. static __always_inline int __down_write_killable(struct rw_semaphore *sem)
  1158. {
  1159. return __down_write_common(sem, TASK_KILLABLE);
  1160. }
  1161. static inline int __down_write_trylock(struct rw_semaphore *sem)
  1162. {
  1163. int ret;
  1164. preempt_disable();
  1165. DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
  1166. ret = rwsem_write_trylock(sem);
  1167. preempt_enable();
  1168. return ret;
  1169. }
  1170. /*
  1171. * unlock after reading
  1172. */
  1173. static inline void __up_read(struct rw_semaphore *sem)
  1174. {
  1175. long tmp;
  1176. DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
  1177. DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
  1178. preempt_disable();
  1179. rwsem_clear_reader_owned(sem);
  1180. tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
  1181. DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
  1182. if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
  1183. RWSEM_FLAG_WAITERS)) {
  1184. clear_nonspinnable(sem);
  1185. rwsem_wake(sem);
  1186. }
  1187. preempt_enable();
  1188. }
  1189. /*
  1190. * unlock after writing
  1191. */
  1192. static inline void __up_write(struct rw_semaphore *sem)
  1193. {
  1194. long tmp;
  1195. DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
  1196. /*
  1197. * sem->owner may differ from current if the ownership is transferred
  1198. * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
  1199. */
  1200. DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
  1201. !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
  1202. preempt_disable();
  1203. rwsem_clear_owner(sem);
  1204. tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
  1205. if (unlikely(tmp & RWSEM_FLAG_WAITERS))
  1206. rwsem_wake(sem);
  1207. preempt_enable();
  1208. }
  1209. /*
  1210. * downgrade write lock to read lock
  1211. */
  1212. static inline void __downgrade_write(struct rw_semaphore *sem)
  1213. {
  1214. long tmp;
  1215. /*
  1216. * When downgrading from exclusive to shared ownership,
  1217. * anything inside the write-locked region cannot leak
  1218. * into the read side. In contrast, anything in the
  1219. * read-locked region is ok to be re-ordered into the
  1220. * write side. As such, rely on RELEASE semantics.
  1221. */
  1222. DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
  1223. preempt_disable();
  1224. tmp = atomic_long_fetch_add_release(
  1225. -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
  1226. rwsem_set_reader_owned(sem);
  1227. if (tmp & RWSEM_FLAG_WAITERS)
  1228. rwsem_downgrade_wake(sem);
  1229. preempt_enable();
  1230. }
  1231. #else /* !CONFIG_PREEMPT_RT */
  1232. #define RT_MUTEX_BUILD_MUTEX
  1233. #include "rtmutex.c"
  1234. #define rwbase_set_and_save_current_state(state) \
  1235. set_current_state(state)
  1236. #define rwbase_restore_current_state() \
  1237. __set_current_state(TASK_RUNNING)
  1238. #define rwbase_rtmutex_lock_state(rtm, state) \
  1239. __rt_mutex_lock(rtm, state)
  1240. #define rwbase_rtmutex_slowlock_locked(rtm, state) \
  1241. __rt_mutex_slowlock_locked(rtm, NULL, state)
  1242. #define rwbase_rtmutex_unlock(rtm) \
  1243. __rt_mutex_unlock(rtm)
  1244. #define rwbase_rtmutex_trylock(rtm) \
  1245. __rt_mutex_trylock(rtm)
  1246. #define rwbase_signal_pending_state(state, current) \
  1247. signal_pending_state(state, current)
  1248. #define rwbase_pre_schedule() \
  1249. rt_mutex_pre_schedule()
  1250. #define rwbase_schedule() \
  1251. rt_mutex_schedule()
  1252. #define rwbase_post_schedule() \
  1253. rt_mutex_post_schedule()
  1254. #include "rwbase_rt.c"
  1255. void __init_rwsem(struct rw_semaphore *sem, const char *name,
  1256. struct lock_class_key *key)
  1257. {
  1258. init_rwbase_rt(&(sem)->rwbase);
  1259. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  1260. debug_check_no_locks_freed((void *)sem, sizeof(*sem));
  1261. lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
  1262. #endif
  1263. }
  1264. EXPORT_SYMBOL(__init_rwsem);
  1265. static inline void __down_read(struct rw_semaphore *sem)
  1266. {
  1267. rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
  1268. }
  1269. static inline int __down_read_interruptible(struct rw_semaphore *sem)
  1270. {
  1271. return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
  1272. }
  1273. static inline int __down_read_killable(struct rw_semaphore *sem)
  1274. {
  1275. return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
  1276. }
  1277. static inline int __down_read_trylock(struct rw_semaphore *sem)
  1278. {
  1279. return rwbase_read_trylock(&sem->rwbase);
  1280. }
  1281. static inline void __up_read(struct rw_semaphore *sem)
  1282. {
  1283. rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
  1284. }
  1285. static inline void __sched __down_write(struct rw_semaphore *sem)
  1286. {
  1287. rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
  1288. }
  1289. static inline int __sched __down_write_killable(struct rw_semaphore *sem)
  1290. {
  1291. return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
  1292. }
  1293. static inline int __down_write_trylock(struct rw_semaphore *sem)
  1294. {
  1295. return rwbase_write_trylock(&sem->rwbase);
  1296. }
  1297. static inline void __up_write(struct rw_semaphore *sem)
  1298. {
  1299. rwbase_write_unlock(&sem->rwbase);
  1300. }
  1301. static inline void __downgrade_write(struct rw_semaphore *sem)
  1302. {
  1303. rwbase_write_downgrade(&sem->rwbase);
  1304. }
  1305. /* Debug stubs for the common API */
  1306. #define DEBUG_RWSEMS_WARN_ON(c, sem)
  1307. static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
  1308. struct task_struct *owner)
  1309. {
  1310. }
  1311. static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
  1312. {
  1313. int count = atomic_read(&sem->rwbase.readers);
  1314. return count < 0 && count != READER_BIAS;
  1315. }
  1316. #endif /* CONFIG_PREEMPT_RT */
  1317. /*
  1318. * lock for reading
  1319. */
  1320. void __sched down_read(struct rw_semaphore *sem)
  1321. {
  1322. might_sleep();
  1323. rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
  1324. LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
  1325. }
  1326. EXPORT_SYMBOL(down_read);
  1327. int __sched down_read_interruptible(struct rw_semaphore *sem)
  1328. {
  1329. might_sleep();
  1330. rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
  1331. if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
  1332. rwsem_release(&sem->dep_map, _RET_IP_);
  1333. return -EINTR;
  1334. }
  1335. return 0;
  1336. }
  1337. EXPORT_SYMBOL(down_read_interruptible);
  1338. int __sched down_read_killable(struct rw_semaphore *sem)
  1339. {
  1340. might_sleep();
  1341. rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
  1342. if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
  1343. rwsem_release(&sem->dep_map, _RET_IP_);
  1344. return -EINTR;
  1345. }
  1346. return 0;
  1347. }
  1348. EXPORT_SYMBOL(down_read_killable);
  1349. /*
  1350. * trylock for reading -- returns 1 if successful, 0 if contention
  1351. */
  1352. int down_read_trylock(struct rw_semaphore *sem)
  1353. {
  1354. int ret = __down_read_trylock(sem);
  1355. if (ret == 1)
  1356. rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
  1357. return ret;
  1358. }
  1359. EXPORT_SYMBOL(down_read_trylock);
  1360. /*
  1361. * lock for writing
  1362. */
  1363. void __sched down_write(struct rw_semaphore *sem)
  1364. {
  1365. might_sleep();
  1366. rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
  1367. LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
  1368. }
  1369. EXPORT_SYMBOL(down_write);
  1370. /*
  1371. * lock for writing
  1372. */
  1373. int __sched down_write_killable(struct rw_semaphore *sem)
  1374. {
  1375. might_sleep();
  1376. rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
  1377. if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
  1378. __down_write_killable)) {
  1379. rwsem_release(&sem->dep_map, _RET_IP_);
  1380. return -EINTR;
  1381. }
  1382. return 0;
  1383. }
  1384. EXPORT_SYMBOL(down_write_killable);
  1385. /*
  1386. * trylock for writing -- returns 1 if successful, 0 if contention
  1387. */
  1388. int down_write_trylock(struct rw_semaphore *sem)
  1389. {
  1390. int ret = __down_write_trylock(sem);
  1391. if (ret == 1)
  1392. rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
  1393. return ret;
  1394. }
  1395. EXPORT_SYMBOL(down_write_trylock);
  1396. /*
  1397. * release a read lock
  1398. */
  1399. void up_read(struct rw_semaphore *sem)
  1400. {
  1401. rwsem_release(&sem->dep_map, _RET_IP_);
  1402. __up_read(sem);
  1403. }
  1404. EXPORT_SYMBOL(up_read);
  1405. /*
  1406. * release a write lock
  1407. */
  1408. void up_write(struct rw_semaphore *sem)
  1409. {
  1410. rwsem_release(&sem->dep_map, _RET_IP_);
  1411. __up_write(sem);
  1412. }
  1413. EXPORT_SYMBOL(up_write);
  1414. /*
  1415. * downgrade write lock to read lock
  1416. */
  1417. void downgrade_write(struct rw_semaphore *sem)
  1418. {
  1419. lock_downgrade(&sem->dep_map, _RET_IP_);
  1420. __downgrade_write(sem);
  1421. }
  1422. EXPORT_SYMBOL(downgrade_write);
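/*
 * Typical usage sketch of the public API (for illustration only; foo_rwsem
 * and the surrounding code are hypothetical):
 *
 *	static DECLARE_RWSEM(foo_rwsem);
 *
 *	down_read(&foo_rwsem);          shared (read-side) critical section
 *	up_read(&foo_rwsem);
 *
 *	down_write(&foo_rwsem);         exclusive (write-side) critical section
 *	downgrade_write(&foo_rwsem);    continue as a reader without dropping
 *	up_read(&foo_rwsem);            the lock in between
 */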
  1423. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  1424. void down_read_nested(struct rw_semaphore *sem, int subclass)
  1425. {
  1426. might_sleep();
  1427. rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
  1428. LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
  1429. }
  1430. EXPORT_SYMBOL(down_read_nested);
  1431. int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
  1432. {
  1433. might_sleep();
  1434. rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
  1435. if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
  1436. rwsem_release(&sem->dep_map, _RET_IP_);
  1437. return -EINTR;
  1438. }
  1439. return 0;
  1440. }
  1441. EXPORT_SYMBOL(down_read_killable_nested);
  1442. void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
  1443. {
  1444. might_sleep();
  1445. rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
  1446. LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
  1447. }
  1448. EXPORT_SYMBOL(_down_write_nest_lock);
  1449. void down_read_non_owner(struct rw_semaphore *sem)
  1450. {
  1451. might_sleep();
  1452. __down_read(sem);
  1453. /*
  1454. * The owner value for a reader-owned lock is mostly for debugging
  1455. * purposes only and is not critical to the correct functioning of
  1456. * rwsem. So it is perfectly fine to set it in a preempt-enabled
  1457. * context here.
  1458. */
  1459. __rwsem_set_reader_owned(sem, NULL);
  1460. }
  1461. EXPORT_SYMBOL(down_read_non_owner);
  1462. void down_write_nested(struct rw_semaphore *sem, int subclass)
  1463. {
  1464. might_sleep();
  1465. rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
  1466. LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
  1467. }
  1468. EXPORT_SYMBOL(down_write_nested);
  1469. int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
  1470. {
  1471. might_sleep();
  1472. rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
  1473. if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
  1474. __down_write_killable)) {
  1475. rwsem_release(&sem->dep_map, _RET_IP_);
  1476. return -EINTR;
  1477. }
  1478. return 0;
  1479. }
  1480. EXPORT_SYMBOL(down_write_killable_nested);
  1481. void up_read_non_owner(struct rw_semaphore *sem)
  1482. {
  1483. DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
  1484. __up_read(sem);
  1485. }
  1486. EXPORT_SYMBOL(up_read_non_owner);
  1487. #endif