rcuref.c 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * rcuref - A scalable reference count implementation for RCU managed objects
  4. *
  5. * rcuref is provided to replace open coded reference count implementations
  6. * based on atomic_t. It protects explicitly RCU managed objects which can
  7. * be visible even after the last reference has been dropped and the object
  8. * is heading towards destruction.
  9. *
  10. * A common usage pattern is:
  11. *
  12. * get()
  13. *     rcu_read_lock();
  14. *     p = get_ptr();
  15. *     if (p && !atomic_inc_not_zero(&p->refcnt))
  16. *             p = NULL;
  17. *     rcu_read_unlock();
  18. *     return p;
  19. *
  20. * put()
  21. *     if (!atomic_dec_return(&p->refcnt)) {
  22. *             remove_ptr(p);
  23. *             kfree_rcu(p, rcu);
  24. *     }
  25. *
  26. * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
  27. * O(N^2) behaviour under contention with N concurrent operations.
  28. *
  29. * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
  30. * better under contention.
  31. *
  32. * Why not refcount?
  33. * =================
  34. *
  35. * In principle it should be possible to make refcount use the rcuref
  36. * scheme, but the destruction race described below cannot be prevented
  37. * unless the protected object is RCU managed.
  38. *
  39. * Theory of operation
  40. * ===================
  41. *
  42. * rcuref uses an unsigned integer reference counter. As long as the
  43. * counter value is greater than or equal to RCUREF_ONEREF and not larger
  44. * than RCUREF_MAXREF the reference is alive:
  45. *
  46. * ONEREF MAXREF                SATURATED             RELEASED   DEAD       NOREF
  47. * 0      0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
  48. * <-----valid-----> <-------saturation zone--------> <----------dead zone----------->
  49. *
  50. * The get() and put() operations do unconditional increments and
  51. * decrements. The result is checked after the operation. This optimizes
  52. * for the fast path.
  53. *
  54. * If the reference count is saturated or dead, then the increments and
  55. * decrements are not harmful as the reference count still stays in the
  56. * respective zones and is always set back to SATURATED resp. DEAD. The
  57. * zones have room for 2^28 racing operations in each direction, which
  58. * makes it practically impossible to escape the zones.
  59. *
  60. * Once the last reference is dropped the reference count becomes
  61. * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
  62. * slowpath then tries to set the reference count from RCUREF_NOREF to
  63. * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
  64. * concurrent rcuref_get() can acquire the reference count and bring it
  65. * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
  66. *
  67. * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
  68. * DEAD + 1, which is inside the dead zone. If that happens the reference
  69. * count is put back to DEAD.
  70. *
  71. * The actual race is possible due to the unconditional increment and
  72. * decrements in rcuref_get() and rcuref_put():
  73. *
  74. * T1                                     T2
  75. * get()                                  put()
  76. *                                        if (atomic_add_negative(-1, &ref->refcnt))
  77. *                          succeeds->      atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
  78. *
  79. * atomic_add_negative(1, &ref->refcnt);   <- Elevates refcount to DEAD + 1
  80. *
  81. * As the result of T1's add is negative, the get() goes into the slow path
  82. * and observes refcnt being in the dead zone which makes the operation fail.
  83. *
  84. * Possible critical states:
  85. *
  86. * Context  Counter  References  Operation
  87. * T1       0        1           init()
  88. * T2       1        2           get()
  89. * T1       0        1           put()
  90. * T2       -1       0           put() tries to mark dead
  91. * T1       0        1           get()
  92. * T2       0        1           put() mark dead fails
  93. * T1       -1       0           put() tries to mark dead
  94. * T1       DEAD     0           put() mark dead succeeds
  95. * T2       DEAD+1   0           get() fails and puts it back to DEAD
  96. *
  97. * Of course there are more complex scenarios, but the above illustrates
  98. * the working principle. The rest is left to the imagination of the
  99. * reader.
  100. *
  101. * Deconstruction race
  102. * ===================
  103. *
  104. * The release operation must be protected by prohibiting a grace period in
  105. * order to prevent a possible use after free:
  106. *
  107. * T1                                      T2
  108. * put()                                   get()
  109. * // ref->refcnt = ONEREF
  110. * if (!atomic_add_negative(-1, &ref->refcnt))
  111. *         return false;                   <- Not taken
  112. *
  113. * // ref->refcnt == NOREF
  114. * --> preemption
  115. *                                         // Elevates ref->refcnt to ONEREF
  116. *                                         if (!atomic_add_negative(1, &ref->refcnt))
  117. *                                                 return true;    <- taken
  118. *
  119. *                                         if (put(&p->ref)) {     <-- Succeeds
  120. *                                                 remove_pointer(p);
  121. *                                                 kfree_rcu(p, rcu);
  122. *                                         }
  123. *
  124. *         RCU grace period ends, object is freed
  125. *
  126. * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);      <- UAF
  127. *
  128. * This is prevented by disabling preemption around the put() operation as
  129. * that's in most kernel configurations cheaper than a rcu_read_lock() /
  130. * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
  131. * prevents the grace period which keeps the object alive until all put()
  132. * operations complete.
  133. *
  134. * Saturation protection
  135. * =====================
  136. *
  137. * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
  138. * Once this is exceeded the reference count becomes stale by setting it
  139. * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
  140. * wrap arounds which obviously cause worse problems than a memory
  141. * leak. When saturation is reached a warning is emitted.
  142. *
  143. * Race conditions
  144. * ===============
  145. *
  146. * All reference count increment/decrement operations are unconditional and
  147. * only verified after the fact. This optimizes for the good case and takes
  148. * the occasional race vs. a dead or already saturated refcount into
  149. * account. The saturation and dead zones are large enough to accommodate
  150. * for that.
  151. *
  152. * Memory ordering
  153. * ===============
  154. *
  155. * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
  156. * and provide only what is strictly required for refcounts.
  157. *
  158. * The increments are fully relaxed; these will not provide ordering. The
  159. * rationale is that whatever is used to obtain the object to increase the
  160. * reference count on will provide the ordering. For locked data
  161. * structures, it's the lock acquire, for RCU/lockless data structures it's
  162. * the dependent load.
  163. *
  164. * rcuref_get() provides a control dependency ordering future stores which
  165. * ensures that the object is not modified when acquiring a reference
  166. * fails.
  167. *
  168. * rcuref_put() provides release order, i.e. all prior loads and stores
  169. * will be issued before. It also provides a control dependency ordering
  170. * against the subsequent destruction of the object.
  171. *
  172. * If rcuref_put() successfully dropped the last reference and marked the
  173. * object DEAD it also provides acquire ordering.
  174. */
  175. #include <linux/export.h>
  176. #include <linux/rcuref.h>
  177. /**
  178. * rcuref_get_slowpath - Slowpath of rcuref_get()
  179. * @ref: Pointer to the reference count
  180. *
  181. * Invoked when the reference count is outside of the valid zone.
  182. *
  183. * Return:
  184. * False if the reference count was already marked dead
  185. *
  186. * True if the reference count is saturated, which prevents the
  187. * object from being deconstructed ever.
  188. */
  189. bool rcuref_get_slowpath(rcuref_t *ref)
  190. {
  191. unsigned int cnt = atomic_read(&ref->refcnt);
  192. /*
  193. * If the reference count was already marked dead, undo the
  194. * increment so it stays in the middle of the dead zone and return
  195. * fail.
  196. */
  197. if (cnt >= RCUREF_RELEASED) {
  198. atomic_set(&ref->refcnt, RCUREF_DEAD);
  199. return false;
  200. }
  201. /*
  202. * If it was saturated, warn and mark it so. In case the increment
  203. * was already on a saturated value restore the saturation
  204. * marker. This keeps it in the middle of the saturation zone and
  205. * prevents the reference count from overflowing. This leaks the
  206. * object memory, but prevents the obvious reference count overflow
  207. * damage.
  208. */
  209. if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
  210. atomic_set(&ref->refcnt, RCUREF_SATURATED);
  211. return true;
  212. }
  213. EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
  214. /**
  215. * rcuref_put_slowpath - Slowpath of __rcuref_put()
  216. * @ref: Pointer to the reference count
  217. *
  218. * Invoked when the reference count is outside of the valid zone.
  219. *
  220. * Return:
  221. * True if this was the last reference with no future references
  222. * possible. This signals the caller that it can safely schedule the
  223. * object, which is protected by the reference counter, for
  224. * deconstruction.
  225. *
  226. * False if there are still active references or the put() raced
  227. * with a concurrent get()/put() pair. Caller is not allowed to
  228. * deconstruct the protected object.
  229. */
  230. bool rcuref_put_slowpath(rcuref_t *ref)
  231. {
  232. unsigned int cnt = atomic_read(&ref->refcnt);
  233. /* Did this drop the last reference? */
  234. if (likely(cnt == RCUREF_NOREF)) {
  235. /*
  236. * Carefully try to set the reference count to RCUREF_DEAD.
  237. *
  238. * This can fail if a concurrent get() operation has
  239. * elevated it again or the corresponding put() even marked
  240. * it dead already. Both are valid situations and do not
  241. * require a retry. If this fails the caller is not
  242. * allowed to deconstruct the object.
  243. */
  244. if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
  245. return false;
  246. /*
  247. * The caller can safely schedule the object for
  248. * deconstruction. Provide acquire ordering.
  249. */
  250. smp_acquire__after_ctrl_dep();
  251. return true;
  252. }
  253. /*
  254. * If the reference count was already in the dead zone, then this
  255. * put() operation is imbalanced. Warn, put the reference count back to
  256. * DEAD and tell the caller to not deconstruct the object.
  257. */
  258. if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
  259. atomic_set(&ref->refcnt, RCUREF_DEAD);
  260. return false;
  261. }
  262. /*
  263. * This is a put() operation on a saturated refcount. Restore the
  264. * mean saturation value and tell the caller to not deconstruct the
  265. * object.
  266. */
  267. if (cnt > RCUREF_MAXREF)
  268. atomic_set(&ref->refcnt, RCUREF_SATURATED);
  269. return false;
  270. }
  271. EXPORT_SYMBOL_GPL(rcuref_put_slowpath);