book3s_64_mmu_radix.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include "book3s_hv.h"
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>
#include <asm/firmware.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
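
/*
 * With this geometry and a 52-bit effective address, the walk below
 * (see kvmppc_mmu_walk_radix_tree) consumes 13 + 9 + 9 bits from the
 * root down, then 5 bits at the lowest level for 64k pages (leaving a
 * 16-bit page offset) or 9 bits for 4k pages (leaving a 12-bit offset):
 *
 *     64k: EA[51:39] | EA[38:30] | EA[29:21] | EA[20:16] | 16-bit offset
 *      4k: EA[51:39] | EA[38:30] | EA[29:21] | EA[20:12] | 12-bit offset
 */
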
unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
                                              gva_t eaddr, void *to, void *from,
                                              unsigned long n)
{
        int old_pid, old_lpid;
        unsigned long quadrant, ret = n;
        bool is_load = !!to;

        if (kvmhv_is_nestedv2())
                return H_UNSUPPORTED;

        /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
        if (kvmhv_on_pseries())
                return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
                                          (to != NULL) ? __pa(to): 0,
                                          (from != NULL) ? __pa(from): 0, n);

        if (eaddr & (0xFFFUL << 52))
                return ret;

        quadrant = 1;
        if (!pid)
                quadrant = 2;
        if (is_load)
                from = (void *) (eaddr | (quadrant << 62));
        else
                to = (void *) (eaddr | (quadrant << 62));

        preempt_disable();

        asm volatile("hwsync" ::: "memory");
        isync();
        /* switch the lpid first to avoid running host with unallocated pid */
        old_lpid = mfspr(SPRN_LPID);
        if (old_lpid != lpid)
                mtspr(SPRN_LPID, lpid);
        if (quadrant == 1) {
                old_pid = mfspr(SPRN_PID);
                if (old_pid != pid)
                        mtspr(SPRN_PID, pid);
        }
        isync();

        pagefault_disable();
        if (is_load)
                ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
        else
                ret = __copy_to_user_inatomic((void __user *)to, from, n);
        pagefault_enable();

        asm volatile("hwsync" ::: "memory");
        isync();
        /* switch the pid first to avoid running host with unallocated pid */
        if (quadrant == 1 && pid != old_pid)
                mtspr(SPRN_PID, old_pid);
        if (lpid != old_lpid)
                mtspr(SPRN_LPID, old_lpid);
        isync();

        preempt_enable();

        return ret;
}
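
/*
 * Copy to or from guest effective address 'eaddr' using the vcpu's current
 * translation context: a non-NULL 'to' requests a load from the guest, a
 * non-NULL 'from' requests a store.  Quadrant 3 accesses use pid 0, and a
 * nested guest is accessed via its shadow LPID.
 */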
static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
                                          void *to, void *from, unsigned long n)
{
        int lpid = vcpu->kvm->arch.lpid;
        int pid;

        /* This would cause a data segment intr so don't allow the access */
        if (eaddr & (0x3FFUL << 52))
                return -EINVAL;

        /* Should we be using the nested lpid */
        if (vcpu->arch.nested)
                lpid = vcpu->arch.nested->shadow_lpid;

        /* If accessing quadrant 3 then pid is expected to be 0 */
        if (((eaddr >> 62) & 0x3) == 0x3)
                pid = 0;
        else
                pid = kvmppc_get_pid(vcpu);

        eaddr &= ~(0xFFFUL << 52);

        return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
                                 unsigned long n)
{
        long ret;

        ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
        if (ret > 0)
                memset(to + (n - ret), 0, ret);

        return ret;
}

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
                               unsigned long n)
{
        return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}
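
/*
 * Walk the radix tree whose root doubleword (RTS/RPDB/RPDS format) is given
 * in 'root', translating guest effective address 'eaddr'.  On success the
 * real address, page size and permissions are returned in *gpte and, if
 * pte_ret_p is non-NULL, *pte_ret_p returns the raw leaf PTE; if reading an
 * entry from guest memory fails, *pte_ret_p instead returns the guest
 * address of the entry that could not be read.
 */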
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
                               struct kvmppc_pte *gpte, u64 root,
                               u64 *pte_ret_p)
{
        struct kvm *kvm = vcpu->kvm;
        int ret, level, ps;
        unsigned long rts, bits, offset, index;
        u64 pte, base, gpa;
        __be64 rpte;

        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
        base = root & RPDB_MASK;

        offset = rts + 31;

        /* Current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;

        /* Walk each level of the radix tree */
        for (level = 3; level >= 0; --level) {
                u64 addr;
                /* Check a valid size */
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
                /* Check that low bits of page table base are zero */
                if (base & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
                /* Read the entry from guest memory */
                addr = base + (index * sizeof(rpte));

                kvm_vcpu_srcu_read_lock(vcpu);
                ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
                kvm_vcpu_srcu_read_unlock(vcpu);
                if (ret) {
                        if (pte_ret_p)
                                *pte_ret_p = addr;
                        return ret;
                }
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
                /* Check if a leaf entry */
                if (pte & _PAGE_PTE)
                        break;
                /* Get ready to walk the next level */
                base = pte & RPDB_MASK;
                bits = pte & RPDS_MASK;
        }

        /* Need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;

        /* We found a valid leaf PTE */
        /* Offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
        gpa |= eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;
        gpte->page_shift = offset;

        gpte->eaddr = eaddr;
        gpte->raddr = gpa;

        /* Work out permissions */
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);

        gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

        if (pte_ret_p)
                *pte_ret_p = pte;

        return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
                                     struct kvmppc_pte *gpte, u64 table,
                                     int table_index, u64 *pte_ret_p)
{
        struct kvm *kvm = vcpu->kvm;
        int ret;
        unsigned long size, ptbl, root;
        struct prtb_entry entry;

        if ((table & PRTS_MASK) > 24)
                return -EINVAL;
        size = 1ul << ((table & PRTS_MASK) + 12);

        /* Is the table big enough to contain this entry? */
        if ((table_index * sizeof(entry)) >= size)
                return -EINVAL;

        /* Read the table to find the root of the radix tree */
        ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
        kvm_vcpu_srcu_read_lock(vcpu);
        ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
        kvm_vcpu_srcu_read_unlock(vcpu);
        if (ret)
                return ret;

        /* Root is stored in the first double word */
        root = be64_to_cpu(entry.prtb0);

        return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}
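
/*
 * Process-scoped translation for the guest: pick the PID from the effective
 * address quadrant (quadrant 0 uses the guest's PID, quadrant 3 uses pid 0),
 * walk the guest's process table, then apply the privilege and AMR/IAMR
 * checks to the permissions returned in *gpte.
 */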
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                           struct kvmppc_pte *gpte, bool data, bool iswrite)
{
        u32 pid;
        u64 pte;
        int ret;

        /* Work out effective PID */
        switch (eaddr >> 62) {
        case 0:
                pid = kvmppc_get_pid(vcpu);
                break;
        case 3:
                pid = 0;
                break;
        default:
                return -EINVAL;
        }

        ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
                                vcpu->kvm->arch.process_table, pid, &pte);
        if (ret)
                return ret;

        /* Check privilege (applies only to process scoped translations) */
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
                        gpte->may_write = 0;
                        gpte->may_execute = 0;
                }
        } else {
                if (!(pte & _PAGE_PRIVILEGED)) {
                        /* Check AMR/IAMR to see if strict mode is in force */
                        if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
                                gpte->may_read = 0;
                        if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
                                gpte->may_write = 0;
                        if (vcpu->arch.iamr & (1ul << 62))
                                gpte->may_execute = 0;
                }
        }

        return 0;
}

void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
                             unsigned int pshift, u64 lpid)
{
        unsigned long psize = PAGE_SIZE;
        int psi;
        long rc;
        unsigned long rb;

        if (pshift)
                psize = 1UL << pshift;
        else
                pshift = PAGE_SHIFT;

        addr &= ~(psize - 1);

        if (!kvmhv_on_pseries()) {
                radix__flush_tlb_lpid_page(lpid, addr, psize);
                return;
        }

        psi = shift_to_mmu_psize(pshift);

        if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
                rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
                rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
                                        lpid, rb);
        } else {
                rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
                                            H_RPTI_TYPE_NESTED |
                                            H_RPTI_TYPE_TLB,
                                            psize_to_rpti_pgsize(psi),
                                            addr, addr + psize);
        }

        if (rc)
                pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}
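
/*
 * Flush the partition-scoped page walk cache for 'lpid', either directly
 * when running bare-metal or via hypercall when KVM itself is running
 * under a hypervisor (nested HV on pseries).
 */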
static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
{
        long rc;

        if (!kvmhv_on_pseries()) {
                radix__flush_pwc_lpid(lpid);
                return;
        }

        if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
                rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
                                        lpid, TLBIEL_INVAL_SET_LPID);
        else
                rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
                                            H_RPTI_TYPE_NESTED |
                                            H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
                                            0, -1UL);
        if (rc)
                pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
                                             unsigned long clr, unsigned long set,
                                             unsigned long addr, unsigned int shift)
{
        return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
                                    pte_t *ptep, pte_t pte)
{
        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
        pte_t *pte;

        pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
        /* pmd_populate() will only reference _pa(pte). */
        kmemleak_ignore(pte);

        return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
        kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
        pmd_t *pmd;

        pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
        /* pud_populate() will only reference _pa(pmd). */
        kmemleak_ignore(pmd);

        return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
        kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
                      unsigned int shift,
                      const struct kvm_memory_slot *memslot,
                      u64 lpid)
{
        unsigned long old;
        unsigned long gfn = gpa >> PAGE_SHIFT;
        unsigned long page_size = PAGE_SIZE;
        unsigned long hpa;

        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
        kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

        /* The following only applies to L1 entries */
        if (lpid != kvm->arch.lpid)
                return;

        if (!memslot) {
                memslot = gfn_to_memslot(kvm, gfn);
                if (!memslot)
                        return;
        }
        if (shift) { /* 1GB or 2MB page */
                page_size = 1ul << shift;
                if (shift == PMD_SHIFT)
                        kvm->stat.num_2M_pages--;
                else if (shift == PUD_SHIFT)
                        kvm->stat.num_1G_pages--;
        }

        gpa &= ~(page_size - 1);
        hpa = old & PTE_RPN_MASK;
        kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

        if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
                kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running.  The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
                                  u64 lpid)
{
        if (full) {
                memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
        } else {
                pte_t *p = pte;
                unsigned long it;

                for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
                        if (pte_val(*p) == 0)
                                continue;
                        kvmppc_unmap_pte(kvm, p,
                                         pte_pfn(*p) << PAGE_SHIFT,
                                         PAGE_SHIFT, NULL, lpid);
                }
        }

        kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
                                  u64 lpid)
{
        unsigned long im;
        pmd_t *p = pmd;

        for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
                if (!pmd_present(*p))
                        continue;
                if (pmd_leaf(*p)) {
                        if (full) {
                                pmd_clear(p);
                        } else {
                                WARN_ON_ONCE(1);
                                kvmppc_unmap_pte(kvm, (pte_t *)p,
                                         pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
                                         PMD_SHIFT, NULL, lpid);
                        }
                } else {
                        pte_t *pte;

                        pte = pte_offset_kernel(p, 0);
                        kvmppc_unmap_free_pte(kvm, pte, full, lpid);
                        pmd_clear(p);
                }
        }
        kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
                                  u64 lpid)
{
        unsigned long iu;
        pud_t *p = pud;

        for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
                if (!pud_present(*p))
                        continue;
                if (pud_leaf(*p)) {
                        pud_clear(p);
                } else {
                        pmd_t *pmd;

                        pmd = pmd_offset(p, 0);
                        kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
                        pud_clear(p);
                }
        }
        pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
{
        unsigned long ig;

        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                p4d_t *p4d = p4d_offset(pgd, 0);
                pud_t *pud;

                if (!p4d_present(*p4d))
                        continue;
                pud = pud_offset(p4d, 0);
                kvmppc_unmap_free_pud(kvm, pud, lpid);
                p4d_clear(p4d);
        }
}

void kvmppc_free_radix(struct kvm *kvm)
{
        if (kvm->arch.pgtable) {
                kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
                                          kvm->arch.lpid);
                pgd_free(kvm->mm, kvm->arch.pgtable);
                kvm->arch.pgtable = NULL;
        }
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
                                              unsigned long gpa, u64 lpid)
{
        pte_t *pte = pte_offset_kernel(pmd, 0);

        /*
         * Clearing the pmd entry then flushing the PWC ensures that the pte
         * page will no longer be cached by the MMU, so it can be freed
         * without flushing the PWC again.
         */
        pmd_clear(pmd);
        kvmppc_radix_flush_pwc(kvm, lpid);

        kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
                                              unsigned long gpa, u64 lpid)
{
        pmd_t *pmd = pmd_offset(pud, 0);

        /*
         * Clearing the pud entry then flushing the PWC ensures that the pmd
         * page and any children pte pages will no longer be cached by the
         * MMU, so they can be freed without flushing the PWC again.
         */
        pud_clear(pud);
        kvmppc_radix_flush_pwc(kvm, lpid);

        kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. RC bits, in the course of cleaning and
 * aging. And the write bit can change, either the access could have been
 * upgraded, or a read fault could happen concurrently with a write fault
 * that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
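
/*
 * Install 'pte' in the partition-scoped tree at 'gpa' for the given level
 * (0 = PAGE_SIZE, 1 = 2MB, 2 = 1GB).  Intermediate table pages are
 * allocated before taking kvm->mmu_lock; if an MMU notifier invalidation
 * has run since 'mmu_seq' was sampled, or if we lose a race with another
 * CPU installing a larger page, -EAGAIN is returned so the caller can let
 * the guest retry.
 */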
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
                      unsigned long gpa, unsigned int level,
                      unsigned long mmu_seq, u64 lpid,
                      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud, *new_pud = NULL;
        pmd_t *pmd, *new_pmd = NULL;
        pte_t *ptep, *new_ptep = NULL;
        int ret;

        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
        pgd = pgtable + pgd_index(gpa);
        p4d = p4d_offset(pgd, gpa);

        pud = NULL;
        if (p4d_present(*p4d))
                pud = pud_offset(p4d, gpa);
        else
                new_pud = pud_alloc_one(kvm->mm, gpa);

        pmd = NULL;
        if (pud && pud_present(*pud) && !pud_leaf(*pud))
                pmd = pmd_offset(pud, gpa);
        else if (level <= 1)
                new_pmd = kvmppc_pmd_alloc();

        if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd)))
                new_ptep = kvmppc_pte_alloc();

        /* Check if we might have been invalidated; let the guest retry if so */
        spin_lock(&kvm->mmu_lock);
        ret = -EAGAIN;
        if (mmu_invalidate_retry(kvm, mmu_seq))
                goto out_unlock;

        /* Now traverse again under the lock and change the tree */
        ret = -ENOMEM;
        if (p4d_none(*p4d)) {
                if (!new_pud)
                        goto out_unlock;
                p4d_populate(kvm->mm, p4d, new_pud);
                new_pud = NULL;
        }
        pud = pud_offset(p4d, gpa);
        if (pud_leaf(*pud)) {
                unsigned long hgpa = gpa & PUD_MASK;

                /* Check if we raced and someone else has set the same thing */
                if (level == 2) {
                        if (pud_raw(*pud) == pte_raw(pte)) {
                                ret = 0;
                                goto out_unlock;
                        }
                        /* Valid 1GB page here already, add our extra bits */
                        WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
                                                        PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, (pte_t *)pud,
                                              0, pte_val(pte), hgpa, PUD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
                /*
                 * If we raced with another CPU which has just put
                 * a 1GB pte in after we saw a pmd page, try again.
                 */
                if (!new_pmd) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
                /* Valid 1GB page here already, remove it */
                kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
                                 lpid);
        }
        if (level == 2) {
                if (!pud_none(*pud)) {
                        /*
                         * There's a page table page here, but we wanted to
                         * install a large page, so remove and free the page
                         * table page.
                         */
                        kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
                if (rmapp && n_rmap)
                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
        if (pud_none(*pud)) {
                if (!new_pmd)
                        goto out_unlock;
                pud_populate(kvm->mm, pud, new_pmd);
                new_pmd = NULL;
        }
        pmd = pmd_offset(pud, gpa);
        if (pmd_leaf(*pmd)) {
                unsigned long lgpa = gpa & PMD_MASK;

                /* Check if we raced and someone else has set the same thing */
                if (level == 1) {
                        if (pmd_raw(*pmd) == pte_raw(pte)) {
                                ret = 0;
                                goto out_unlock;
                        }
                        /* Valid 2MB page here already, add our extra bits */
                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
                                                        PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
                                              0, pte_val(pte), lgpa, PMD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
                /*
                 * If we raced with another CPU which has just put
                 * a 2MB pte in after we saw a pte page, try again.
                 */
                if (!new_ptep) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
                /* Valid 2MB page here already, remove it */
                kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
                                 lpid);
        }
        if (level == 1) {
                if (!pmd_none(*pmd)) {
                        /*
                         * There's a page table page here, but we wanted to
                         * install a large page, so remove and free the page
                         * table page.
                         */
                        kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
                if (rmapp && n_rmap)
                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
                ret = 0;
                goto out_unlock;
        }
        if (pmd_none(*pmd)) {
                if (!new_ptep)
                        goto out_unlock;
                pmd_populate(kvm->mm, pmd, new_ptep);
                new_ptep = NULL;
        }
        ptep = pte_offset_kernel(pmd, gpa);
        if (pte_present(*ptep)) {
                /* Check if someone else set the same thing */
                if (pte_raw(*ptep) == pte_raw(pte)) {
                        ret = 0;
                        goto out_unlock;
                }
                /* Valid page here already, add our extra bits */
                WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
                                                        PTE_BITS_MUST_MATCH);
                kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
                ret = 0;
                goto out_unlock;
        }
        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
        if (rmapp && n_rmap)
                kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
        ret = 0;

 out_unlock:
        spin_unlock(&kvm->mmu_lock);
        if (new_pud)
                pud_free(kvm->mm, new_pud);
        if (new_pmd)
                kvmppc_pmd_free(new_pmd);
        if (new_ptep)
                kvmppc_pte_free(new_ptep);
        return ret;
}

bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
                             unsigned long gpa, u64 lpid)
{
        unsigned long pgflags;
        unsigned int shift;
        pte_t *ptep;

        /*
         * Need to set an R or C bit in the 2nd-level tables;
         * since we are just helping out the hardware here,
         * it is sufficient to do what the hardware does.
         */
        pgflags = _PAGE_ACCESSED;
        if (writing)
                pgflags |= _PAGE_DIRTY;

        if (nested)
                ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
        else
                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

        if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
                kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
                return true;
        }
        return false;
}
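
/*
 * Resolve the host page backing guest physical address 'gpa' and insert a
 * partition-scoped PTE for it, using the largest page size that the host
 * mapping and gpa/hva alignment allow (unless dirty logging forces
 * PAGE_SIZE mappings).  The inserted PTE and its level are optionally
 * returned via *inserted_pte and *levelp for callers that need them.
 */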
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
                                   unsigned long gpa,
                                   struct kvm_memory_slot *memslot,
                                   bool writing, bool kvm_ro,
                                   pte_t *inserted_pte, unsigned int *levelp)
{
        struct kvm *kvm = vcpu->kvm;
        struct page *page = NULL;
        unsigned long mmu_seq;
        unsigned long hva, gfn = gpa >> PAGE_SHIFT;
        bool upgrade_write = false;
        bool *upgrade_p = &upgrade_write;
        pte_t pte, *ptep;
        unsigned int shift, level;
        int ret;
        bool large_enable;

        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_invalidate_seq;
        smp_rmb();

        /*
         * Do a fast check first, since __gfn_to_pfn_memslot doesn't
         * do it with !atomic && !async, which is how we call it.
         * We always ask for write permission since the common case
         * is that the page is writable.
         */
        hva = gfn_to_hva_memslot(memslot, gfn);
        if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
                upgrade_write = true;
        } else {
                unsigned long pfn;

                /* Call KVM generic code to do the slow-path check */
                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
                                           writing, upgrade_p, NULL);
                if (is_error_noslot_pfn(pfn))
                        return -EFAULT;
                page = NULL;
                if (pfn_valid(pfn)) {
                        page = pfn_to_page(pfn);
                        if (PageReserved(page))
                                page = NULL;
                }
        }

        /*
         * Read the PTE from the process' radix tree and use that
         * so we get the shift and attribute bits.
         */
        spin_lock(&kvm->mmu_lock);
        ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
        pte = __pte(0);
        if (ptep)
                pte = READ_ONCE(*ptep);
        spin_unlock(&kvm->mmu_lock);
        /*
         * If the PTE disappeared temporarily due to a THP
         * collapse, just return and let the guest try again.
         */
        if (!pte_present(pte)) {
                if (page)
                        put_page(page);
                return RESUME_GUEST;
        }

        /* If we're logging dirty pages, always map single pages */
        large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

        /* Get pte level from shift/size */
        if (large_enable && shift == PUD_SHIFT &&
            (gpa & (PUD_SIZE - PAGE_SIZE)) ==
            (hva & (PUD_SIZE - PAGE_SIZE))) {
                level = 2;
        } else if (large_enable && shift == PMD_SHIFT &&
                   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
                   (hva & (PMD_SIZE - PAGE_SIZE))) {
                level = 1;
        } else {
                level = 0;
                if (shift > PAGE_SHIFT) {
                        /*
                         * If the pte maps more than one page, bring over
                         * bits from the virtual address to get the real
                         * address of the specific single page we want.
                         */
                        unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
                        pte = __pte(pte_val(pte) | (hva & rpnmask));
                }
        }

        pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
        if (writing || upgrade_write) {
                if (pte_val(pte) & _PAGE_WRITE)
                        pte = __pte(pte_val(pte) | _PAGE_DIRTY);
        } else {
                pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
        }

        /* Allocate space in the tree and write the PTE */
        ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
                                mmu_seq, kvm->arch.lpid, NULL, NULL);
        if (inserted_pte)
                *inserted_pte = pte;
        if (levelp)
                *levelp = level;

        if (page) {
                if (!ret && (pte_val(pte) & _PAGE_WRITE))
                        set_page_dirty_lock(page);
                put_page(page);
        }

        /* Increment number of large pages if we (successfully) inserted one */
        if (!ret) {
                if (level == 1)
                        kvm->stat.num_2M_pages++;
                else if (level == 2)
                        kvm->stat.num_1G_pages++;
        }

        return ret;
}
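
/*
 * Handle a hypervisor page fault taken while the guest was running with
 * radix translation: reflect bad accesses back to the guest as DSIs,
 * forward faults for secure guests to the ultravisor, emulate MMIO for
 * addresses with no memslot, handle reference/change bit updates, and
 * otherwise instantiate a mapping for the faulting guest real address.
 */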
int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
                                   unsigned long ea, unsigned long dsisr)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long gpa, gfn;
        struct kvm_memory_slot *memslot;
        long ret;
        bool writing = !!(dsisr & DSISR_ISSTORE);
        bool kvm_ro = false;

        /* Check for unusual errors */
        if (dsisr & DSISR_UNSUPP_MMU) {
                pr_err("KVM: Got unsupported MMU fault\n");
                return -EFAULT;
        }
        if (dsisr & DSISR_BADACCESS) {
                /* Reflect to the guest as DSI */
                pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
                kvmppc_core_queue_data_storage(vcpu,
                                kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
                                ea, dsisr);
                return RESUME_GUEST;
        }

        /* Translate the logical address */
        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
        gpa &= ~0xF000000000000000ul;
        gfn = gpa >> PAGE_SHIFT;
        if (!(dsisr & DSISR_PRTABLE_FAULT))
                gpa |= ea & 0xfff;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return kvmppc_send_page_to_uv(kvm, gfn);

        /* Get the corresponding memslot */
        memslot = gfn_to_memslot(kvm, gfn);

        /* No memslot means it's an emulated MMIO region */
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
                if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
                             DSISR_SET_RC)) {
                        /*
                         * Bad address in guest page table tree, or other
                         * unusual error - reflect it to the guest as DSI.
                         */
                        kvmppc_core_queue_data_storage(vcpu,
                                        kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
                                        ea, dsisr);
                        return RESUME_GUEST;
                }
                return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
        }

        if (memslot->flags & KVM_MEM_READONLY) {
                if (writing) {
                        /* give the guest a DSI */
                        kvmppc_core_queue_data_storage(vcpu,
                                        kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
                                        ea, DSISR_ISSTORE | DSISR_PROTFAULT);
                        return RESUME_GUEST;
                }
                kvm_ro = true;
        }

        /* Failed to set the reference/change bits */
        if (dsisr & DSISR_SET_RC) {
                spin_lock(&kvm->mmu_lock);
                if (kvmppc_hv_handle_set_rc(kvm, false, writing,
                                            gpa, kvm->arch.lpid))
                        dsisr &= ~DSISR_SET_RC;
                spin_unlock(&kvm->mmu_lock);

                if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
                               DSISR_PROTFAULT | DSISR_SET_RC)))
                        return RESUME_GUEST;
        }

        /* Try to insert a pte */
        ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
                                             kvm_ro, NULL, NULL);

        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;
        return ret;
}

/* Called with kvm->mmu_lock held */
void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                     unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
                return;
        }

        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep))
                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                 kvm->arch.lpid);
}

/* Called with kvm->mmu_lock held */
bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                   unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        bool ref = false;
        unsigned long old, *rmapp;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;

        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
                                              gpa, shift);
                /* XXX need to flush tlb here? */
                /* Also clear bit in ptes in shadow pgtable for nested guests */
                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
                ref = true;
        }
        return ref;
}

/* Called with kvm->mmu_lock held */
bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        bool ref = false;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ref;

        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
                ref = true;
        return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                                struct kvm_memory_slot *memslot, int pagenum)
{
        unsigned long gfn = memslot->base_gfn + pagenum;
        unsigned long gpa = gfn << PAGE_SHIFT;
        pte_t *ptep, pte;
        unsigned int shift;
        int ret = 0;
        unsigned long old, *rmapp;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return ret;

        /*
         * For performance reasons we don't hold kvm->mmu_lock while walking the
         * partition scoped table.
         */
        ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
        if (!ptep)
                return 0;

        pte = READ_ONCE(*ptep);
        if (pte_present(pte) && pte_dirty(pte)) {
                spin_lock(&kvm->mmu_lock);
                /*
                 * Recheck the pte again
                 */
                if (pte_val(pte) != pte_val(*ptep)) {
                        /*
                         * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
                         * only find PAGE_SIZE pte entries here. We can continue
                         * to use the pte addr returned by above page table
                         * walk.
                         */
                        if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
                                spin_unlock(&kvm->mmu_lock);
                                return 0;
                        }
                }

                ret = 1;
                VM_BUG_ON(shift);
                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                              gpa, shift);
                kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
                /* Also clear bit in ptes in shadow pgtable for nested guests */
                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
                                               old & PTE_RPN_MASK,
                                               1UL << shift);
                spin_unlock(&kvm->mmu_lock);
        }
        return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                        struct kvm_memory_slot *memslot, unsigned long *map)
{
        unsigned long i, j;
        int npages;

        for (i = 0; i < memslot->npages; i = j) {
                npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

                /*
                 * Note that if npages > 0 then i must be a multiple of npages,
                 * since huge pages are only used to back the guest at guest
                 * real addresses that are a multiple of their size.
                 * Since we have at most one PTE covering any given guest
                 * real address, if npages > 1 we can skip to i + npages.
                 */
                j = i + 1;
                if (npages) {
                        set_dirty_bits(map, i, npages);
                        j = i + npages;
                }
        }
        return 0;
}
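
/*
 * Remove every mapping for the memslot from the partition-scoped tree and
 * bump the MMU invalidation sequence so that page faults which sampled the
 * old memslot cannot install stale PTEs afterwards.
 */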
void kvmppc_radix_flush_memslot(struct kvm *kvm,
                                const struct kvm_memory_slot *memslot)
{
        unsigned long n;
        pte_t *ptep;
        unsigned long gpa;
        unsigned int shift;

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
                kvmppc_uvmem_drop_pages(memslot, kvm, true);

        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
                return;

        gpa = memslot->base_gfn << PAGE_SHIFT;
        spin_lock(&kvm->mmu_lock);
        for (n = memslot->npages; n; --n) {
                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
                if (ptep && pte_present(*ptep))
                        kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
                                         kvm->arch.lpid);
                gpa += PAGE_SIZE;
        }
        /*
         * Increase the mmu notifier sequence number to prevent any page
         * fault that read the memslot earlier from writing a PTE.
         */
        kvm->mmu_invalidate_seq++;
        spin_unlock(&kvm->mmu_lock);
}
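
/*
 * Report the radix geometries and page-size encodings supported by KVM to
 * userspace (used to implement the KVM_PPC_GET_RMMU_INFO ioctl).  Each AP
 * encoding packs the page shift in the low bits and the "actual page size"
 * field in bits 31:29.
 */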
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
                                 int psize, int *indexp)
{
        if (!mmu_psize_defs[psize].shift)
                return;
        info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
                (mmu_psize_defs[psize].ap << 29);
        ++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
        int i;

        if (!radix_enabled())
                return -EINVAL;
        memset(info, 0, sizeof(*info));

        /* 4k page size */
        info->geometries[0].page_shift = 12;
        info->geometries[0].level_bits[0] = 9;
        for (i = 1; i < 4; ++i)
                info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
        /* 64k page size */
        info->geometries[1].page_shift = 16;
        for (i = 0; i < 4; ++i)
                info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

        i = 0;
        add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

        return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
        kvm->arch.pgtable = pgd_alloc(kvm->mm);
        if (!kvm->arch.pgtable)
                return -ENOMEM;
        return 0;
}

static void pte_ctor(void *addr)
{
        memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}
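
/*
 * The "radix" debugfs file dumps the partition-scoped page tables of the
 * VM and of any nested guests: one line per valid leaf entry, giving the
 * guest physical address, the raw PTE value and the page shift.
 */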
struct debugfs_radix_state {
        struct kvm      *kvm;
        struct mutex    mutex;
        unsigned long   gpa;
        int             lpid;
        int             chars_left;
        int             buf_index;
        char            buf[128];
        u8              hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
        struct kvm *kvm = inode->i_private;
        struct debugfs_radix_state *p;

        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        kvm_get_kvm(kvm);
        p->kvm = kvm;
        mutex_init(&p->mutex);
        file->private_data = p;

        return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
        struct debugfs_radix_state *p = file->private_data;

        kvm_put_kvm(p->kvm);
        kfree(p);
        return 0;
}

static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
                                  size_t len, loff_t *ppos)
{
        struct debugfs_radix_state *p = file->private_data;
        ssize_t ret, r;
        unsigned long n;
        struct kvm *kvm;
        unsigned long gpa;
        pgd_t *pgt;
        struct kvm_nested_guest *nested;
        pgd_t *pgdp;
        p4d_t p4d, *p4dp;
        pud_t pud, *pudp;
        pmd_t pmd, *pmdp;
        pte_t *ptep;
        int shift;
        unsigned long pte;

        kvm = p->kvm;
        if (!kvm_is_radix(kvm))
                return 0;

        ret = mutex_lock_interruptible(&p->mutex);
        if (ret)
                return ret;

        if (p->chars_left) {
                n = p->chars_left;
                if (n > len)
                        n = len;
                r = copy_to_user(buf, p->buf + p->buf_index, n);
                n -= r;
                p->chars_left -= n;
                p->buf_index += n;
                buf += n;
                len -= n;
                ret = n;
                if (r) {
                        if (!n)
                                ret = -EFAULT;
                        goto out;
                }
        }

        gpa = p->gpa;
        nested = NULL;
        pgt = NULL;
        while (len != 0 && p->lpid >= 0) {
                if (gpa >= RADIX_PGTABLE_RANGE) {
                        gpa = 0;
                        pgt = NULL;
                        if (nested) {
                                kvmhv_put_nested(nested);
                                nested = NULL;
                        }
                        p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
                        p->hdr = 0;
                        if (p->lpid < 0)
                                break;
                }
                if (!pgt) {
                        if (p->lpid == 0) {
                                pgt = kvm->arch.pgtable;
                        } else {
                                nested = kvmhv_get_nested(kvm, p->lpid, false);
                                if (!nested) {
                                        gpa = RADIX_PGTABLE_RANGE;
                                        continue;
                                }
                                pgt = nested->shadow_pgtable;
                        }
                }
                n = 0;
                if (!p->hdr) {
                        if (p->lpid > 0)
                                n = scnprintf(p->buf, sizeof(p->buf),
                                              "\nNested LPID %d: ", p->lpid);
                        n += scnprintf(p->buf + n, sizeof(p->buf) - n,
                                       "pgdir: %lx\n", (unsigned long)pgt);
                        p->hdr = 1;
                        goto copy;
                }

                pgdp = pgt + pgd_index(gpa);
                p4dp = p4d_offset(pgdp, gpa);
                p4d = READ_ONCE(*p4dp);
                if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
                        gpa = (gpa & P4D_MASK) + P4D_SIZE;
                        continue;
                }

                pudp = pud_offset(&p4d, gpa);
                pud = READ_ONCE(*pudp);
                if (!(pud_val(pud) & _PAGE_PRESENT)) {
                        gpa = (gpa & PUD_MASK) + PUD_SIZE;
                        continue;
                }
                if (pud_val(pud) & _PAGE_PTE) {
                        pte = pud_val(pud);
                        shift = PUD_SHIFT;
                        goto leaf;
                }

                pmdp = pmd_offset(&pud, gpa);
                pmd = READ_ONCE(*pmdp);
                if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
                        gpa = (gpa & PMD_MASK) + PMD_SIZE;
                        continue;
                }
                if (pmd_val(pmd) & _PAGE_PTE) {
                        pte = pmd_val(pmd);
                        shift = PMD_SHIFT;
                        goto leaf;
                }

                ptep = pte_offset_kernel(&pmd, gpa);
                pte = pte_val(READ_ONCE(*ptep));
                if (!(pte & _PAGE_PRESENT)) {
                        gpa += PAGE_SIZE;
                        continue;
                }
                shift = PAGE_SHIFT;
        leaf:
                n = scnprintf(p->buf, sizeof(p->buf),
                              " %lx: %lx %d\n", gpa, pte, shift);
                gpa += 1ul << shift;
        copy:
                p->chars_left = n;
                if (n > len)
                        n = len;
                r = copy_to_user(buf, p->buf, n);
                n -= r;
                p->chars_left -= n;
                p->buf_index = n;
                buf += n;
                len -= n;
                ret += n;
                if (r) {
                        if (!ret)
                                ret = -EFAULT;
                        break;
                }
        }
        p->gpa = gpa;
        if (nested)
                kvmhv_put_nested(nested);

 out:
        mutex_unlock(&p->mutex);
        return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
                                   size_t len, loff_t *ppos)
{
        return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
        .owner   = THIS_MODULE,
        .open    = debugfs_radix_open,
        .release = debugfs_radix_release,
        .read    = debugfs_radix_read,
        .write   = debugfs_radix_write,
        .llseek  = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
        debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
                            &debugfs_radix_fops);
}
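
/*
 * Create the kmem caches used for partition-scoped PTE and PMD table pages.
 * The alignment equals the object size so that each table page is naturally
 * aligned to its size, matching the radix requirement that table base
 * addresses have their low bits clear (as checked in the tree walk above).
 */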
int kvmppc_radix_init(void)
{
        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

        kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
        if (!kvm_pte_cache)
                return -ENOMEM;

        size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

        kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
        if (!kvm_pmd_cache) {
                kmem_cache_destroy(kvm_pte_cache);
                return -ENOMEM;
        }

        return 0;
}

void kvmppc_radix_exit(void)
{
        kmem_cache_destroy(kvm_pte_cache);
        kmem_cache_destroy(kvm_pmd_cache);
}