pagewalk.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * We want to know the real level where an entry is located ignoring any
 * folding of levels which may be happening. For example, if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
	if (depth == 3 && PTRS_PER_PMD == 1)
		depth = 2;
	if (depth == 2 && PTRS_PER_PUD == 1)
		depth = 1;
	if (depth == 1 && PTRS_PER_P4D == 1)
		depth = 0;
	return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	for (;;) {
		err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
		if (err)
			break;
		if (addr >= end - PAGE_SIZE)
			break;
		addr += PAGE_SIZE;
		pte++;
	}
	return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pte_t *pte;
	int err = 0;
	spinlock_t *ptl;

	if (walk->no_vma) {
		/*
		 * pte_offset_map() might apply user-specific validation.
		 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
		 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
		 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
		 */
		if (walk->mm == &init_mm || addr >= TASK_SIZE)
			pte = pte_offset_kernel(pmd, addr);
		else
			pte = pte_offset_map(pmd, addr);
		if (pte) {
			err = walk_pte_range_inner(pte, addr, end, walk);
			if (walk->mm != &init_mm && addr < TASK_SIZE)
				pte_unmap(pte);
		}
	} else {
		pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
		if (pte) {
			err = walk_pte_range_inner(pte, addr, end, walk);
			pte_unmap_unlock(pte, ptl);
		}
	}
	if (!pte)
		walk->action = ACTION_AGAIN;
	return err;
}
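
/*
 * Illustrative sketch (hypothetical helper, not part of the pagewalk API):
 * with a folded p4d level (PTRS_PER_P4D == 1), a hole detected while
 * iterating p4d entries is reported to ->pte_hole() at pgd depth, which is
 * what real_depth() computes.
 */
#if 0	/* illustrative only, not built */
static void real_depth_example(void)
{
	/* depth 1 (p4d) collapses to 0 (pgd) when the p4d level is folded */
	if (PTRS_PER_P4D == 1)
		WARN_ON(real_depth(1) != 0);
}
#endif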
static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(3);

	pmd = pmd_offset(pud, addr);
	do {
again:
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		/*
		 * This implies that each ->pmd_entry() handler
		 * needs to know about pmd_trans_huge() pmds
		 */
		if (ops->pmd_entry)
			err = ops->pmd_entry(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		/*
		 * Check this here so we only break down trans_huge
		 * pages when we _need_ to
		 */
		if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pmd(walk->vma, pmd, addr);

		err = walk_pte_range(pmd, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

	} while (pmd++, addr = next, addr != end);

	return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pud_t *pud;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(2);

	pud = pud_offset(p4d, addr);
	do {
again:
		next = pud_addr_end(addr, end);
		if (pud_none(*pud)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}

		walk->action = ACTION_SUBTREE;

		if (ops->pud_entry)
			err = ops->pud_entry(pud, addr, next, walk);
		if (err)
			break;

		if (walk->action == ACTION_AGAIN)
			goto again;

		if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
		    walk->action == ACTION_CONTINUE ||
		    !(ops->pmd_entry || ops->pte_entry))
			continue;

		if (walk->vma)
			split_huge_pud(walk->vma, pud, addr);
		if (pud_none(*pud))
			goto again;

		err = walk_pmd_range(pud, addr, next, walk);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}
static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;
	int depth = real_depth(1);

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, depth, walk);
			if (err)
				break;
			continue;
		}
		if (ops->p4d_entry) {
			err = ops->p4d_entry(p4d, addr, next, walk);
			if (err)
				break;
		}
		if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_pud_range(p4d, addr, next, walk);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	pgd_t *pgd;
	unsigned long next;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	if (walk->pgd)
		pgd = walk->pgd + pgd_index(addr);
	else
		pgd = pgd_offset(walk->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd)) {
			if (ops->pte_hole)
				err = ops->pte_hole(addr, next, 0, walk);
			if (err)
				break;
			continue;
		}
		if (ops->pgd_entry) {
			err = ops->pgd_entry(pgd, addr, next, walk);
			if (err)
				break;
		}
		if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
			err = walk_p4d_range(pgd, addr, next, walk);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	return err;
}
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
				       unsigned long end)
{
	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);

	return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	unsigned long next;
	unsigned long hmask = huge_page_mask(h);
	unsigned long sz = huge_page_size(h);
	pte_t *pte;
	const struct mm_walk_ops *ops = walk->ops;
	int err = 0;

	hugetlb_vma_lock_read(vma);
	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = hugetlb_walk(vma, addr & hmask, sz);
		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		else if (ops->pte_hole)
			err = ops->pte_hole(addr, next, -1, walk);
		if (err)
			break;
	} while (addr = next, addr != end);
	hugetlb_vma_unlock_read(vma);

	return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
			      struct mm_walk *walk)
{
	return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			  struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->test_walk)
		return ops->test_walk(start, end, walk);

	/*
	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
	 * range, so we don't walk over it as we do for normal vmas. However,
	 * some callers are interested in handling the hole range and they
	 * don't want to just ignore any single address range. Such users
	 * certainly define their ->pte_hole() callbacks, so let's delegate
	 * them to handle vma(VM_PFNMAP).
	 */
	if (vma->vm_flags & VM_PFNMAP) {
		int err = 1;

		if (ops->pte_hole)
			err = ops->pte_hole(start, end, -1, walk);
		return err ? err : 1;
	}
	return 0;
}
static int __walk_page_range(unsigned long start, unsigned long end,
			     struct mm_walk *walk)
{
	int err = 0;
	struct vm_area_struct *vma = walk->vma;
	const struct mm_walk_ops *ops = walk->ops;

	if (ops->pre_vma) {
		err = ops->pre_vma(start, end, walk);
		if (err)
			return err;
	}

	if (is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

	if (ops->post_vma)
		ops->post_vma(walk);

	return err;
}

static inline void process_mm_walk_lock(struct mm_struct *mm,
					enum page_walk_lock walk_lock)
{
	if (walk_lock == PGWALK_RDLOCK)
		mmap_assert_locked(mm);
	else
		mmap_assert_write_locked(mm);
}

static inline void process_vma_walk_lock(struct vm_area_struct *vma,
					 enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
	switch (walk_lock) {
	case PGWALK_WRLOCK:
		vma_start_write(vma);
		break;
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);
		break;
	case PGWALK_RDLOCK:
		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
		break;
	}
#endif
}
/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
{
	int err = 0;
	unsigned long next;
	struct vm_area_struct *vma;
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.private	= private,
	};

	if (start >= end)
		return -EINVAL;

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);

	vma = find_vma(walk.mm, start);
	do {
		if (!vma) { /* after the last vma */
			walk.vma = NULL;
			next = end;
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else if (start < vma->vm_start) { /* outside vma */
			walk.vma = NULL;
			next = min(end, vma->vm_start);
			if (ops->pte_hole)
				err = ops->pte_hole(start, next, -1, &walk);
		} else { /* inside vma */
			process_vma_walk_lock(vma, ops->walk_lock);
			walk.vma = vma;
			next = min(end, vma->vm_end);
			vma = find_vma(mm, vma->vm_end);

			err = walk_page_test(start, next, &walk);
			if (err > 0) {
				/*
				 * positive return values are purely for
				 * controlling the pagewalk, so should never
				 * be passed to the callers.
				 */
				err = 0;
				continue;
			}
			if (err < 0)
				break;

			err = __walk_page_range(start, next, &walk);
		}
		if (err)
			break;
	} while (start = next, start < end);
	return err;
}
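
/*
 * Illustrative sketch of how a caller might use walk_page_range(): a
 * hypothetical pte_entry callback that counts present PTEs in a range.
 * The names (count_present_pte, count_present_ops, count_present_pages)
 * are made up for illustration only. The walk runs under the mmap read
 * lock to match PGWALK_RDLOCK, as documented above.
 */
#if 0	/* illustrative only, not built */
static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(ptep_get(pte)))
		(*count)++;
	return 0;	/* 0: keep walking */
}

static const struct mm_walk_ops count_present_ops = {
	.pte_entry	= count_present_pte,
	.walk_lock	= PGWALK_RDLOCK,
};

static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	mmap_read_lock(mm);
	walk_page_range(mm, start, end, &count_present_ops, &count);
	mmap_read_unlock(mm);
	return count;
}
#endif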
/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm:		mm_struct representing the target process of page table walk
 * @start:	start address of the virtual address range
 * @end:	end address of the virtual address range
 * @ops:	operation to call during the walk
 * @pgd:	pgd to walk if different from mm->pgd
 * @private:	private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 *
 * Note: Be careful when walking the kernel page tables. The caller may need
 * to take other effective measures (the mmap lock may be insufficient) to
 * prevent the intermediate kernel page tables belonging to the specified
 * address range from being freed (e.g. by memory hot-remove).
 */
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
			  pgd_t *pgd,
			  void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= mm,
		.pgd		= pgd,
		.private	= private,
		.no_vma		= true
	};

	if (start >= end || !walk.mm)
		return -EINVAL;

	/*
	 * 1) For walking the user virtual address space:
	 *
	 * The mmap lock protects the page walker from changes to the page
	 * tables during the walk. However a read lock is insufficient to
	 * protect those areas which don't have a VMA as munmap() detaches
	 * the VMAs before downgrading to a read lock and actually tearing
	 * down PTEs/page tables. In that case, the mmap write lock should
	 * be held.
	 *
	 * 2) For walking the kernel virtual address space:
	 *
	 * The kernel's intermediate page tables are usually not freed, so
	 * the mmap read lock is sufficient. But there are some exceptions,
	 * e.g. memory hot-remove, in which case the mmap lock is insufficient
	 * to prevent the intermediate kernel page tables belonging to the
	 * specified address range from being freed. The caller should take
	 * other actions to prevent this race.
	 */
	if (mm == &init_mm)
		mmap_assert_locked(walk.mm);
	else
		mmap_assert_write_locked(walk.mm);

	return walk_pgd_range(start, end, &walk);
}
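
/*
 * Illustrative sketch of walking a kernel virtual address range with
 * walk_page_range_novma(): a hypothetical pte_hole callback that reports
 * unmapped subtrees. The names (report_hole, dump_holes_ops,
 * dump_kernel_holes) are made up for illustration only. As noted above,
 * init_mm only needs the mmap lock held for read here, and protection
 * against e.g. memory hot-remove is the caller's responsibility.
 */
#if 0	/* illustrative only, not built */
static int report_hole(unsigned long addr, unsigned long next, int depth,
		       struct mm_walk *walk)
{
	/*
	 * depth is the page-table level of the missing entry (0 == pgd),
	 * already adjusted for folded levels via real_depth().
	 */
	pr_info("hole %#lx-%#lx at depth %d\n", addr, next, depth);
	return 0;
}

static const struct mm_walk_ops dump_holes_ops = {
	.pte_hole	= report_hole,
};

static void dump_kernel_holes(unsigned long start, unsigned long end)
{
	mmap_read_lock(&init_mm);
	walk_page_range_novma(&init_mm, start, end, &dump_holes_ops,
			      NULL, NULL);
	mmap_read_unlock(&init_mm);
}
#endif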
int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
			unsigned long end, const struct mm_walk_ops *ops,
			void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};

	if (start >= end || !walk.mm)
		return -EINVAL;
	if (start < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.mm		= vma->vm_mm,
		.vma		= vma,
		.private	= private,
	};

	if (!walk.mm)
		return -EINVAL;

	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping:	Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr:		Number of incremental page offsets to cover
 * @ops:	operation to call during the walk
 * @private:	private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
		      pgoff_t nr, const struct mm_walk_ops *ops,
		      void *private)
{
	struct mm_walk walk = {
		.ops		= ops,
		.private	= private,
	};
	struct vm_area_struct *vma;
	pgoff_t vba, vea, cba, cea;
	unsigned long start_addr, end_addr;
	int err = 0;

	lockdep_assert_held(&mapping->i_mmap_rwsem);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
				  first_index + nr - 1) {
		/* Clip to the vma */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma);
		cba = first_index;
		cba = max(cba, vba);
		cea = first_index + nr;
		cea = min(cea, vea);

		start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
		end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
		if (start_addr >= end_addr)
			continue;

		walk.vma = vma;
		walk.mm = vma->vm_mm;

		err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
		if (err > 0) {
			err = 0;
			break;
		} else if (err < 0)
			break;

		err = __walk_page_range(start_addr, end_addr, &walk);
		if (err)
			break;
	}

	return err;
}
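
/*
 * Illustrative sketch of calling walk_page_mapping() for a single file page.
 * The names (note_mapping, note_mapping_ops, count_file_page_mappings) are
 * made up for illustration only. As documented above, the walk runs under
 * @mapping->i_mmap_rwsem rather than any mmap_lock.
 */
#if 0	/* illustrative only, not built */
static int note_mapping(pte_t *pte, unsigned long addr, unsigned long next,
			struct mm_walk *walk)
{
	unsigned long *mapcount = walk->private;

	if (pte_present(ptep_get(pte)))
		(*mapcount)++;
	return 0;
}

static const struct mm_walk_ops note_mapping_ops = {
	.pte_entry	= note_mapping,
};

/* Count how many user PTEs currently map the page at @index of @mapping. */
static unsigned long count_file_page_mappings(struct address_space *mapping,
					      pgoff_t index)
{
	unsigned long mapcount = 0;

	i_mmap_lock_read(mapping);
	walk_page_mapping(mapping, index, 1, &note_mapping_ops, &mapcount);
	i_mmap_unlock_read(mapping);
	return mapcount;
}
#endif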
/**
 * folio_walk_start - walk the page tables to a folio
 * @fw:		filled with information on success.
 * @vma:	the VMA.
 * @addr:	the virtual address to use for the page table walk.
 * @flags:	flags modifying which folios to walk to.
 *
 * Walk the page tables using @addr in a given @vma to a mapped folio and
 * return the folio, making sure that the page table entry referenced by
 * @addr cannot change until folio_walk_end() has been called.
 *
 * By default, this function returns only folios that are not special (e.g., not
 * the zeropage) and never returns folios that are supposed to be ignored by the
 * VM as documented by vm_normal_page(). If requested, zeropages will be
 * returned as well.
 *
 * By default, this function only considers present page table entries.
 * If requested, it will also consider migration entries.
 *
 * If this function returns NULL it might either indicate "there is nothing" or
 * "there is nothing suitable".
 *
 * On success, @fw is filled and the function returns the folio while the PTL
 * is still held and folio_walk_end() must be called to clean up,
 * releasing any held locks. The returned folio must *not* be used after the
 * call to folio_walk_end(), unless a short-term folio reference is taken before
 * that call.
 *
 * @fw->page will correspond to the page that is effectively referenced by
 * @addr. However, for migration entries and shared zeropages @fw->page is
 * set to NULL. Note that large folios might be mapped by multiple page table
 * entries, and this function will always only look up a single entry as
 * specified by @addr, which might or might not cover more than a single page of
 * the returned folio.
 *
 * This function must *not* be used as a naive replacement for
 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
 * to carelessly modify page content. This function may *only* be used to grab
 * short-term folio references, never to grab long-term folio references.
 *
 * Using the page table entry pointers in @fw for reading or modifying the
 * entry should be avoided where possible: however, there might be valid
 * use cases.
 *
 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
 * For example, PMD page table sharing might require prior unsharing. Also,
 * logical hugetlb entries might span multiple physical page table entries,
 * which *must* be modified in a single operation (set_huge_pte_at(),
 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
 * not correspond to the first physical entry of a logical hugetlb entry.
 *
 * The mmap lock must be held in read mode.
 *
 * Return: folio pointer on success, otherwise NULL.
 */
struct folio *folio_walk_start(struct folio_walk *fw,
		struct vm_area_struct *vma, unsigned long addr,
		folio_walk_flags_t flags)
{
	unsigned long entry_size;
	bool expose_page = true;
	struct page *page;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgd_t *pgdp;
	p4d_t *p4dp;

	mmap_assert_locked(vma->vm_mm);
	vma_pgtable_walk_begin(vma);

	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
		goto not_found;

	pgdp = pgd_offset(vma->vm_mm, addr);
	if (pgd_none_or_clear_bad(pgdp))
		goto not_found;

	p4dp = p4d_offset(pgdp, addr);
	if (p4d_none_or_clear_bad(p4dp))
		goto not_found;

	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	if (pud_none(pud))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pud_present(pud) || pud_leaf(pud))) {
		ptl = pud_lock(vma->vm_mm, pudp);
		pud = pudp_get(pudp);

		entry_size = PUD_SIZE;
		fw->level = FW_LEVEL_PUD;
		fw->pudp = pudp;
		fw->pud = pud;

		/*
		 * TODO: FW_MIGRATION support for PUD migration entries
		 * once there are relevant users.
		 */
		if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
			spin_unlock(ptl);
			goto not_found;
		} else if (!pud_leaf(pud)) {
			spin_unlock(ptl);
			goto pmd_table;
		}
		/*
		 * TODO: vm_normal_page_pud() will be handy once we want to
		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
		 */
		page = pud_page(pud);
		goto found;
	}

pmd_table:
	VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd))
		goto not_found;
	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
	    (!pmd_present(pmd) || pmd_leaf(pmd))) {
		ptl = pmd_lock(vma->vm_mm, pmdp);
		pmd = pmdp_get(pmdp);

		entry_size = PMD_SIZE;
		fw->level = FW_LEVEL_PMD;
		fw->pmdp = pmdp;
		fw->pmd = pmd;

		if (pmd_none(pmd)) {
			spin_unlock(ptl);
			goto not_found;
		} else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
			spin_unlock(ptl);
			goto pte_table;
		} else if (pmd_present(pmd)) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (page) {
				goto found;
			} else if ((flags & FW_ZEROPAGE) &&
				    is_huge_zero_pmd(pmd)) {
				page = pfn_to_page(pmd_pfn(pmd));
				expose_page = false;
				goto found;
			}
		} else if ((flags & FW_MIGRATION) &&
			   is_pmd_migration_entry(pmd)) {
			swp_entry_t entry = pmd_to_swp_entry(pmd);

			page = pfn_swap_entry_to_page(entry);
			expose_page = false;
			goto found;
		}
		spin_unlock(ptl);
		goto not_found;
	}

pte_table:
	VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
	if (!ptep)
		goto not_found;
	pte = ptep_get(ptep);

	entry_size = PAGE_SIZE;
	fw->level = FW_LEVEL_PTE;
	fw->ptep = ptep;
	fw->pte = pte;

	if (pte_present(pte)) {
		page = vm_normal_page(vma, addr, pte);
		if (page)
			goto found;
		if ((flags & FW_ZEROPAGE) &&
		    is_zero_pfn(pte_pfn(pte))) {
			page = pfn_to_page(pte_pfn(pte));
			expose_page = false;
			goto found;
		}
	} else if (!pte_none(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if ((flags & FW_MIGRATION) &&
		    is_migration_entry(entry)) {
			page = pfn_swap_entry_to_page(entry);
			expose_page = false;
			goto found;
		}
	}
	pte_unmap_unlock(ptep, ptl);
not_found:
	vma_pgtable_walk_end(vma);
	return NULL;
found:
	if (expose_page)
		/* Note: Offset from the mapped page, not the folio start. */
		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
	else
		fw->page = NULL;
	fw->ptl = ptl;
	return page_folio(page);
}
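
/*
 * Illustrative sketch of the folio_walk_start()/folio_walk_end() pattern
 * described above. The helper name (addr_maps_anon_folio) is made up for
 * illustration only. The caller must hold the mmap lock in read mode, and
 * the folio may only be used while the PTL is held unless a short-term
 * reference is taken first.
 */
#if 0	/* illustrative only, not built */
static bool addr_maps_anon_folio(struct vm_area_struct *vma, unsigned long addr)
{
	struct folio_walk fw;
	struct folio *folio;
	bool ret = false;

	folio = folio_walk_start(&fw, vma, addr, 0);
	if (folio) {
		/* The PTL protecting the page table entry is still held here. */
		ret = folio_test_anon(folio);
		folio_walk_end(&fw, vma);
	}
	return ret;
}
#endif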