shrinker.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

static inline int shrinker_unit_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}

static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	struct shrinker_info_unit **unit;
	int nr, i;

	if (!info)
		return;

	unit = info->unit;
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

	for (i = start; i < nr; i++) {
		if (!unit[i])
			break;

		kfree(unit[i]);
		unit[i] = NULL;
	}
}

static inline int shrinker_unit_alloc(struct shrinker_info *new,
				      struct shrinker_info *old, int nid)
{
	struct shrinker_info_unit *unit;
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
	int i;

	for (i = start; i < nr; i++) {
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
		if (!unit) {
			shrinker_unit_free(new, start);
			return -ENOMEM;
		}

		new->unit[i] = unit;
	}

	return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
							   GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid)) {
			kvfree(info);
			goto err;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	free_shrinker_info(memcg);
	return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

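/*
 * Illustrative note (assuming SHRINKER_UNIT_BITS is BITS_PER_LONG, i.e. 64 on
 * a 64-bit build): growing a shrinker_info whose map_nr_max is 64 to a
 * new_nr_max of 128 means new_size covers two unit pointers while old_size
 * covers one. The existing unit pointer is copied by the memcpy() in
 * expand_one_shrinker_info() and shrinker_unit_alloc() allocates only the one
 * additional unit, so already accumulated bits and nr_deferred counts are
 * preserved across the expansion.
 */
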
static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}

static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}

static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}

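/*
 * Worked example of the id <-> (index, offset) mapping above, assuming
 * SHRINKER_UNIT_BITS is BITS_PER_LONG (64 on a 64-bit build): shrinker_id 130
 * gives index 130 / 64 = 2 and offset 130 % 64 = 2, i.e. bit 2 of the map in
 * info->unit[2]; calc_shrinker_id(2, 2) = 2 * 64 + 2 = 130 recovers the id.
 */
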
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}

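/*
 * Typical caller (sketch; see also the barrier diagram in shrink_slab_memcg()):
 * when the first object is added to a memcg-aware LRU, e.g. from
 * list_lru_add(), the owner calls set_shrinker_bit(memcg, nid, shrinker_id) so
 * that the next shrink_slab_memcg() pass on this node visits that shrinker
 * again instead of treating it as empty.
 */
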
static DEFINE_IDR(shrinker_idr);

static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent concurrent shrinker_info expansion */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}

#else
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

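	/*
	 * Worked example (illustrative, assuming the default DEF_PRIORITY of
	 * 12 and DEFAULT_SEEKS of 2): with freeable = 10000 objects,
	 * delta = (10000 >> 12) * 4 / 2 = 2 * 4 / 2 = 4 objects of new work
	 * are added this pass; at priority 0 the whole cache becomes eligible.
	 */
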
	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

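	/*
	 * For example (illustrative numbers): if nr = 100 objects were
	 * deferred from earlier passes, delta = 4 new objects of work were
	 * added and scanned = 64 objects were actually scanned above, then
	 * next_deferred = 100 + 4 - 64 = 40 is carried over, subject to the
	 * 2 * freeable cap.
	 */
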
	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in
	 * expand_one_shrinker_info(), so rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of the memcg, so the
	 * memcg will not be destroyed, and of course the shrinker_info_unit
	 * will not be freed.
	 *
	 * So in the memcg shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the
	 *         shrinker_info.
	 * step 2: after getting the shrinker_info_unit we can safely release
	 *         the RCU lock.
	 * step 3: traverse the bitmap and calculate the shrinker_id.
	 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 * step 5: use the shrinker_id to find the shrinker, then use
	 *         shrinker_try_get() to guarantee existence of the shrinker,
	 *         then we can release the RCU lock to call do_shrink_slab(),
	 *         which may sleep.
	 * step 6: do shrinker_put() paired with step 5 to put the refcount;
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 *         Note: this differs from the global shrink; we don't need to
	 *         acquire the RCU lock to guarantee existence of this
	 *         shrinker, because we don't need to use this shrinker to
	 *         traverse the next shrinker in the bitmap.
	 * step 7: we have already exited the RCU read-side critical section
	 *         before calling do_shrink_slab(), and the shrinker_info may
	 *         be released in expand_one_shrinker_info(), so go back to
	 *         step 1 to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB)) {
				/* Drop the reference taken above before skipping */
				shrinker_put(shrinker);
				continue;
			}

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);

				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()       shrink_slab_memcg()
				 *   list_add_tail()      clear_bit()
				 *   <MB>                 <MB>
				 *   set_bit()            do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}

#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority; the number of freeable objects is shifted right
 * by @priority to obtain the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via the "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false; we would then run only the memcg
	 * slab shrink and skip the global shrink, which may result in
	 * premature OOM.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * lockless algorithm of global shrink.
	 *
	 * In the unregistration step, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *         and the validity of the shrinker_list walk.
	 * step 2: use shrinker_try_get() to try to get the refcount; if
	 *         successful, the existence of the shrinker is also
	 *         guaranteed, so we can release the RCU lock to call
	 *         do_shrink_slab(), which may sleep.
	 * step 3: *MUST* reacquire the RCU lock before calling shrinker_put(),
	 *         which ensures that neither this shrinker nor the next
	 *         shrinker will be freed in the next traversal operation.
	 * step 4: do shrinker_put() paired with step 2 to put the refcount;
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}

struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fall back to a non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		return shrinker;
	}

non_memcg:
	/*
	 * nr_deferred is tracked per memcg for memcg-aware shrinkers, so only
	 * allocate nr_deferred here in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by the kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

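/*
 * Minimal usage sketch of the allocation/registration lifecycle implemented
 * above (illustrative only; my_cache, my_cache_count() and my_cache_scan()
 * are hypothetical names for a caller's data and callbacks):
 *
 *	struct shrinker *s;
 *
 *	s = shrinker_alloc(SHRINKER_NUMA_AWARE, "my-cache");
 *	if (!s)
 *		return -ENOMEM;
 *	s->count_objects = my_cache_count;
 *	s->scan_objects = my_cache_scan;
 *	s->private_data = my_cache;
 *	shrinker_register(s);
 *
 * and, when the cache is torn down:
 *
 *	shrinker_free(s);
 */
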
static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}

void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);

		/*
		 * Wait for all lookups of the shrinker to complete. After that,
		 * no shrinker is running or will run again, so it is safe to
		 * free the shrinker asynchronously via RCU and to free the
		 * structure where the shrinker is located, such as a
		 * super_block.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);