page_idle.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>

#define BITMAP_CHUNK_SIZE	sizeof(u64)
#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)

/*
 * Idle page tracking only considers user memory pages; for other types of
 * pages the idle flag is always unset and an attempt to set it is silently
 * ignored.
 *
 * We treat a page as a user memory page if it is on an LRU list, because it is
 * always safe to pass such a page to rmap_walk(), which is essential for idle
 * page tracking. With such an indicator of user pages we can skip isolated
 * pages, but since there are not usually many of them, it will hardly affect
 * the overall result.
 *
 * This function tries to get a user memory page by pfn as described above.
 */
static struct page *page_idle_get_page(unsigned long pfn)
{
	struct page *page;
	struct zone *zone;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
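	/*
	 * Only LRU pages are of interest, and the reference must be taken
	 * with get_page_unless_zero() so that a page whose refcount has
	 * already dropped to zero (i.e. one being freed) is never pinned.
	 */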
	if (!page || !PageLRU(page) ||
	    !get_page_unless_zero(page))
		return NULL;

	zone = page_zone(page);
	spin_lock_irq(zone_lru_lock(zone));
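	/*
	 * Recheck PageLRU under the LRU lock: the page may have been
	 * isolated from the LRU between the unlocked check above and
	 * taking the reference, in which case it must not be used.
	 */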
	if (unlikely(!PageLRU(page))) {
		put_page(page);
		page = NULL;
	}
	spin_unlock_irq(zone_lru_lock(zone));
	return page;
}

static bool page_idle_clear_pte_refs_one(struct page *page,
					struct vm_area_struct *vma,
					unsigned long addr, void *arg)
{
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
		.address = addr,
	};
	bool referenced = false;

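	/*
	 * Visit every PTE (or PMD, for a PMD-mapped THP) mapping this page
	 * within the VMA, testing and clearing the accessed bit and
	 * notifying secondary MMUs at each mapping.
	 */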
	while (page_vma_mapped_walk(&pvmw)) {
		addr = pvmw.address;
		if (pvmw.pte) {
			/*
			 * For a PTE-mapped THP, if one sub page is
			 * referenced, the whole THP is considered
			 * referenced.
			 */
			if (ptep_clear_young_notify(vma, addr, pvmw.pte))
				referenced = true;
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
				referenced = true;
		} else {
			/* unexpected pmd-mapped page? */
			WARN_ON_ONCE(1);
		}
	}

	if (referenced) {
		clear_page_idle(page);
		/*
		 * We cleared the referenced bit in a mapping to this page. To
		 * avoid interference with page reclaim, mark it young so that
		 * page_referenced() will return > 0.
		 */
		set_page_young(page);
	}
	return true;
}

static void page_idle_clear_pte_refs(struct page *page)
{
	/*
	 * Since rwc.arg is unused, rwc is effectively immutable, so we
	 * can make it static const to save some cycles and stack.
	 */
	static const struct rmap_walk_control rwc = {
		.rmap_one = page_idle_clear_pte_refs_one,
		.anon_lock = page_lock_anon_vma_read,
	};
	bool need_lock;

	if (!page_mapped(page) ||
	    !page_rmapping(page))
		return;

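	/*
	 * Plain anonymous pages can be rmap-walked under the anon_vma read
	 * lock taken via page_lock_anon_vma_read(); file-backed and KSM
	 * pages need the page lock to stabilize their mapping.
	 */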
	need_lock = !PageAnon(page) || PageKsm(page);
	if (need_lock && !trylock_page(page))
		return;

	rmap_walk(page, (struct rmap_walk_control *)&rwc);

	if (need_lock)
		unlock_page(page);
}

static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
				     struct bin_attribute *attr, char *buf,
				     loff_t pos, size_t count)
{
	u64 *out = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

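	/*
	 * Each bit of the file maps to one pfn, so the byte at file
	 * offset 'pos' covers pfns pos * 8 through pos * 8 + 7.
	 */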
	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return 0;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = max_pfn;

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
		if (!bit)
			*out = 0ULL;
		page = page_idle_get_page(pfn);
		if (page) {
			if (page_is_idle(page)) {
				/*
				 * The page might have been referenced via a
				 * pte, in which case it is not idle. Clear
				 * refs and recheck.
				 */
				page_idle_clear_pte_refs(page);
				if (page_is_idle(page))
					*out |= 1ULL << bit;
			}
			put_page(page);
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			out++;
		cond_resched();
	}
	return (char *)out - buf;
}

static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
				      struct bin_attribute *attr, char *buf,
				      loff_t pos, size_t count)
{
	const u64 *in = (u64 *)buf;
	struct page *page;
	unsigned long pfn, end_pfn;
	int bit;

	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
		return -EINVAL;

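	/*
	 * Unlike the read side, which returns 0 (EOF) past the end of the
	 * bitmap, a write starting beyond max_pfn addresses no existing
	 * page frame and fails with -ENXIO.
	 */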
	pfn = pos * BITS_PER_BYTE;
	if (pfn >= max_pfn)
		return -ENXIO;

	end_pfn = pfn + count * BITS_PER_BYTE;
	if (end_pfn > max_pfn)
		end_pfn = max_pfn;

	for (; pfn < end_pfn; pfn++) {
		bit = pfn % BITMAP_CHUNK_BITS;
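		/*
		 * Only bits set to 1 request an action; a zero bit leaves
		 * the corresponding page untouched.
		 */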
		if ((*in >> bit) & 1) {
			page = page_idle_get_page(pfn);
			if (page) {
				page_idle_clear_pte_refs(page);
				set_page_idle(page);
				put_page(page);
			}
		}
		if (bit == BITMAP_CHUNK_BITS - 1)
			in++;
		cond_resched();
	}
	return (char *)in - buf;
}
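
/*
 * For illustration only (not part of the kernel source): a minimal
 * userspace sketch of the intended workflow against the sysfs file this
 * module exports, /sys/kernel/mm/page_idle/bitmap. Writing a chunk of
 * all-ones marks the covered pfns idle; reading the same chunk back
 * later shows which of them were not referenced in the meantime.
 *
 *	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
 *	uint64_t chunk = ~0ULL;
 *
 *	pwrite(fd, &chunk, sizeof(chunk), 0);	(mark pfns 0..63 idle)
 *	(... let the workload run for a while ...)
 *	pread(fd, &chunk, sizeof(chunk), 0);	(bit i set => pfn i still idle)
 */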

static struct bin_attribute page_idle_bitmap_attr =
		__BIN_ATTR(bitmap, 0600,
			   page_idle_bitmap_read, page_idle_bitmap_write, 0);

static struct bin_attribute *page_idle_bin_attrs[] = {
	&page_idle_bitmap_attr,
	NULL,
};

static const struct attribute_group page_idle_attr_group = {
	.bin_attrs	= page_idle_bin_attrs,
	.name		= "page_idle",
};

#ifndef CONFIG_64BIT
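/*
 * On 64-bit, the idle and young bits live directly in page->flags. On
 * 32-bit there is no room left there, so the bits are kept in page_ext
 * instead and the page_ext subsystem must allocate space for them.
 */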
static bool need_page_idle(void)
{
	return true;
}

struct page_ext_operations page_idle_ops = {
	.need = need_page_idle,
};
#endif

static int __init page_idle_init(void)
{
	int err;

	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
	if (err) {
		pr_err("page_idle: register sysfs failed\n");
		return err;
	}
	return 0;
}

subsys_initcall(page_idle_init);