book3s_64_mmu_radix.c

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/kmemleak.h>	/* kmemleak_ignore() */

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
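
/*
 * Worked geometry, as enforced by kvmppc_mmu_radix_xlate() below: the
 * effective address space is 52 bits, and the four levels consume
 * 13 + 9 + 9 bits from the top plus either 5 or 9 bits at the lowest
 * level, leaving 52 - 31 - 5 = 16 bits (64k pages) or 52 - 31 - 9 = 12
 * bits (4k pages) of page offset.  A leaf at level 1 gives 2MB pages
 * (offset 21) and a leaf at level 2 gives 1GB pages (offset 30);
 * 512GB leaves at level 3 are rejected.
 *
 * kvmppc_mmu_radix_xlate() translates a guest effective address through
 * the guest's own (process-scoped) radix tree: it derives the PID from
 * the top two bits of the address, reads the guest process table to find
 * the tree root, walks the tree, and fills in *gpte with the guest real
 * address, page size and access permissions.
 */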
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                           struct kvmppc_pte *gpte, bool data, bool iswrite)
{
        struct kvm *kvm = vcpu->kvm;
        u32 pid;
        int ret, level, ps;
        __be64 prte, rpte;
        unsigned long ptbl;
        unsigned long root, pte, index;
        unsigned long rts, bits, offset;
        unsigned long gpa;
        unsigned long proc_tbl_size;

        /* Work out effective PID */
        switch (eaddr >> 62) {
        case 0:
                pid = vcpu->arch.pid;
                break;
        case 3:
                pid = 0;
                break;
        default:
                return -EINVAL;
        }

        proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
        if (pid * 16 >= proc_tbl_size)
                return -EINVAL;

        /* Read guest process table to find root of tree for effective PID */
        ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
        ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
        if (ret)
                return ret;

        root = be64_to_cpu(prte);
        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
                ((root & RTS2_MASK) >> RTS2_SHIFT);
        bits = root & RPDS_MASK;
        root = root & RPDB_MASK;
        offset = rts + 31;

        /* current implementations only support 52-bit space */
        if (offset != 52)
                return -EINVAL;

        for (level = 3; level >= 0; --level) {
                if (level && bits != p9_supported_radix_bits[level])
                        return -EINVAL;
                if (level == 0 && !(bits == 5 || bits == 9))
                        return -EINVAL;
                offset -= bits;
                index = (eaddr >> offset) & ((1UL << bits) - 1);
                /* check that low bits of page table base are zero */
                if (root & ((1UL << (bits + 3)) - 1))
                        return -EINVAL;
                ret = kvm_read_guest(kvm, root + index * 8,
                                     &rpte, sizeof(rpte));
                if (ret)
                        return ret;
                pte = __be64_to_cpu(rpte);
                if (!(pte & _PAGE_PRESENT))
                        return -ENOENT;
                if (pte & _PAGE_PTE)
                        break;
                bits = pte & 0x1f;
                root = pte & 0x0fffffffffffff00ul;
        }

        /* need a leaf at lowest level; 512GB pages not supported */
        if (level < 0 || level == 3)
                return -EINVAL;

        /* offset is now log base 2 of the page size */
        gpa = pte & 0x01fffffffffff000ul;
        if (gpa & ((1ul << offset) - 1))
                return -EINVAL;
        gpa += eaddr & ((1ul << offset) - 1);
        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
                if (offset == mmu_psize_defs[ps].shift)
                        break;
        gpte->page_size = ps;

        gpte->eaddr = eaddr;
        gpte->raddr = gpa;

        /* Work out permissions */
        gpte->may_read = !!(pte & _PAGE_READ);
        gpte->may_write = !!(pte & _PAGE_WRITE);
        gpte->may_execute = !!(pte & _PAGE_EXEC);
        if (kvmppc_get_msr(vcpu) & MSR_PR) {
                if (pte & _PAGE_PRIVILEGED) {
                        gpte->may_read = 0;
                        gpte->may_write = 0;
                        gpte->may_execute = 0;
                }
        } else {
                if (!(pte & _PAGE_PRIVILEGED)) {
                        /* Check AMR/IAMR to see if strict mode is in force */
                        if (vcpu->arch.amr & (1ul << 62))
                                gpte->may_read = 0;
                        if (vcpu->arch.amr & (1ul << 63))
                                gpte->may_write = 0;
                        if (vcpu->arch.iamr & (1ul << 62))
                                gpte->may_execute = 0;
                }
        }

        return 0;
}
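
/*
 * Low-level helpers for the partition-scoped (2nd-level) tree:
 * kvmppc_radix_tlbie_page() invalidates the TLB entry for one guest page
 * of the given size, kvmppc_radix_flush_pwc() flushes the page-walk cache
 * for this guest's LPID, and kvmppc_radix_update_pte() atomically clears
 * and sets bits in a partition-scoped PTE, returning the old value.
 */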
static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
                                    unsigned int pshift)
{
        unsigned long psize = PAGE_SIZE;

        if (pshift)
                psize = 1UL << pshift;
        addr &= ~(psize - 1);
        radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
}

static void kvmppc_radix_flush_pwc(struct kvm *kvm)
{
        radix__flush_pwc_lpid(kvm->arch.lpid);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
                                             unsigned long clr, unsigned long set,
                                             unsigned long addr, unsigned int shift)
{
        return __radix_pte_update(ptep, clr, set);
}

void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
                             pte_t *ptep, pte_t pte)
{
        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}
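
/*
 * PTE and PMD pages for the partition-scoped tree come from dedicated slab
 * caches (sized from RADIX_PTE_INDEX_SIZE and RADIX_PMD_INDEX_SIZE) that
 * are created in kvmppc_radix_init() and destroyed in kvmppc_radix_exit().
 */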
static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
        pte_t *pte;

        pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
        /* pmd_populate() will only reference _pa(pte). */
        kmemleak_ignore(pte);

        return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
        kmem_cache_free(kvm_pte_cache, ptep);
}

/* Like pmd_huge() and pmd_large(), but works regardless of config options */
static inline int pmd_is_leaf(pmd_t pmd)
{
        return !!(pmd_val(pmd) & _PAGE_PTE);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
        pmd_t *pmd;

        pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
        /* pud_populate() will only reference _pa(pmd). */
        kmemleak_ignore(pmd);

        return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
        kmem_cache_free(kvm_pmd_cache, pmdp);
}
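
/*
 * Tear down one leaf PTE in the partition-scoped tree: clear the entry,
 * invalidate the corresponding TLB entry, and if the page had been dirtied,
 * transfer that state into the memslot's dirty bitmap so it is not lost.
 */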
static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
                             unsigned long gpa, unsigned int shift)
{
        unsigned long page_size = 1ul << shift;
        unsigned long old;

        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
        kvmppc_radix_tlbie_page(kvm, gpa, shift);
        if (old & _PAGE_DIRTY) {
                unsigned long gfn = gpa >> PAGE_SHIFT;
                struct kvm_memory_slot *memslot;

                memslot = gfn_to_memslot(kvm, gfn);
                if (memslot && memslot->dirty_bitmap)
                        kvmppc_update_dirty_map(memslot, gfn, page_size);
        }
}
/*
 * kvmppc_free_p?d are used to free existing page tables, recursively
 * descending and clearing and freeing their children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of the page fault path
 * (full == false), valid ptes are not expected.  There is code to unmap them
 * and emit a warning if any are encountered, but by that point there may
 * already be data corruption due to the unexpected mappings.
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
{
        if (full) {
                memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
        } else {
                pte_t *p = pte;
                unsigned long it;

                for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
                        if (pte_val(*p) == 0)
                                continue;
                        WARN_ON_ONCE(1);
                        kvmppc_unmap_pte(kvm, p,
                                         pte_pfn(*p) << PAGE_SHIFT,
                                         PAGE_SHIFT);
                }
        }

        kvmppc_pte_free(pte);
}
static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
{
        unsigned long im;
        pmd_t *p = pmd;

        for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
                if (!pmd_present(*p))
                        continue;
                if (pmd_is_leaf(*p)) {
                        if (full) {
                                pmd_clear(p);
                        } else {
                                WARN_ON_ONCE(1);
                                kvmppc_unmap_pte(kvm, (pte_t *)p,
                                                 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
                                                 PMD_SHIFT);
                        }
                } else {
                        pte_t *pte;

                        pte = pte_offset_map(p, 0);
                        kvmppc_unmap_free_pte(kvm, pte, full);
                        pmd_clear(p);
                }
        }
        kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
{
        unsigned long iu;
        pud_t *p = pud;

        for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
                if (!pud_present(*p))
                        continue;
                if (pud_huge(*p)) {
                        pud_clear(p);
                } else {
                        pmd_t *pmd;

                        pmd = pmd_offset(p, 0);
                        kvmppc_unmap_free_pmd(kvm, pmd, true);
                        pud_clear(p);
                }
        }
        pud_free(kvm->mm, pud);
}

void kvmppc_free_radix(struct kvm *kvm)
{
        unsigned long ig;
        pgd_t *pgd;

        if (!kvm->arch.pgtable)
                return;
        pgd = kvm->arch.pgtable;
        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
                pud_t *pud;

                if (!pgd_present(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                kvmppc_unmap_free_pud(kvm, pud);
                pgd_clear(pgd);
        }
        pgd_free(kvm->mm, kvm->arch.pgtable);
        kvm->arch.pgtable = NULL;
}
static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
                                              unsigned long gpa)
{
        pte_t *pte = pte_offset_kernel(pmd, 0);

        /*
         * Clearing the pmd entry then flushing the PWC ensures that the pte
         * page can no longer be cached by the MMU, so it can be freed without
         * flushing the PWC again.
         */
        pmd_clear(pmd);
        kvmppc_radix_flush_pwc(kvm);

        kvmppc_unmap_free_pte(kvm, pte, false);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
                                              unsigned long gpa)
{
        pmd_t *pmd = pmd_offset(pud, 0);

        /*
         * Clearing the pud entry then flushing the PWC ensures that the pmd
         * page and any children pte pages will no longer be cached by the MMU,
         * so they can be freed without flushing the PWC again.
         */
        pud_clear(pud);
        kvmppc_radix_flush_pwc(kvm);

        kvmppc_unmap_free_pmd(kvm, pmd, false);
}

/*
 * A number of bits can legitimately differ between two faults on the same
 * partition-scoped entry: the R and C bits change in the course of cleaning
 * and aging, and the write bit can change because an access was upgraded or
 * because a read fault raced with a write fault that set those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
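
/*
 * Insert a leaf into the partition-scoped tree for guest real address gpa:
 * level 0 installs a normal PTE, level 1 a 2MB leaf at the PMD level, and
 * level 2 a 1GB leaf at the PUD level.  Intermediate table pages are
 * allocated up front, the mmu_notifier sequence number is rechecked under
 * kvm->mmu_lock, and races with other vcpus that have installed the same or
 * a conflicting translation in the meantime are resolved here.  Returns 0 on
 * success, -EAGAIN if the guest should simply retry the access, or -ENOMEM.
 */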
static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
                             unsigned int level, unsigned long mmu_seq)
{
        pgd_t *pgd;
        pud_t *pud, *new_pud = NULL;
        pmd_t *pmd, *new_pmd = NULL;
        pte_t *ptep, *new_ptep = NULL;
        int ret;

        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
        pgd = kvm->arch.pgtable + pgd_index(gpa);
        pud = NULL;
        if (pgd_present(*pgd))
                pud = pud_offset(pgd, gpa);
        else
                new_pud = pud_alloc_one(kvm->mm, gpa);

        pmd = NULL;
        if (pud && pud_present(*pud) && !pud_huge(*pud))
                pmd = pmd_offset(pud, gpa);
        else if (level <= 1)
                new_pmd = kvmppc_pmd_alloc();

        if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
                new_ptep = kvmppc_pte_alloc();

        /* Check if we might have been invalidated; let the guest retry if so */
        spin_lock(&kvm->mmu_lock);
        ret = -EAGAIN;
        if (mmu_notifier_retry(kvm, mmu_seq))
                goto out_unlock;

        /* Now traverse again under the lock and change the tree */
        ret = -ENOMEM;
        if (pgd_none(*pgd)) {
                if (!new_pud)
                        goto out_unlock;
                pgd_populate(kvm->mm, pgd, new_pud);
                new_pud = NULL;
        }
        pud = pud_offset(pgd, gpa);
        if (pud_huge(*pud)) {
                unsigned long hgpa = gpa & PUD_MASK;

                /* Check if we raced and someone else has set the same thing */
                if (level == 2) {
                        if (pud_raw(*pud) == pte_raw(pte)) {
                                ret = 0;
                                goto out_unlock;
                        }
                        /* Valid 1GB page here already, add our extra bits */
                        WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
                                     PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, (pte_t *)pud,
                                                0, pte_val(pte), hgpa, PUD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
                /*
                 * If we raced with another CPU which has just put
                 * a 1GB pte in after we saw a pmd page, try again.
                 */
                if (!new_pmd) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
                /* Valid 1GB page here already, remove it */
                kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
        }
        if (level == 2) {
                if (!pud_none(*pud)) {
                        /*
                         * There's a page table page here, but we wanted to
                         * install a large page, so remove and free the page
                         * table page.
                         */
                        kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
                ret = 0;
                goto out_unlock;
        }
        if (pud_none(*pud)) {
                if (!new_pmd)
                        goto out_unlock;
                pud_populate(kvm->mm, pud, new_pmd);
                new_pmd = NULL;
        }
        pmd = pmd_offset(pud, gpa);
        if (pmd_is_leaf(*pmd)) {
                unsigned long lgpa = gpa & PMD_MASK;

                /* Check if we raced and someone else has set the same thing */
                if (level == 1) {
                        if (pmd_raw(*pmd) == pte_raw(pte)) {
                                ret = 0;
                                goto out_unlock;
                        }
                        /* Valid 2MB page here already, add our extra bits */
                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
                                     PTE_BITS_MUST_MATCH);
                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
                                                0, pte_val(pte), lgpa, PMD_SHIFT);
                        ret = 0;
                        goto out_unlock;
                }
                /*
                 * If we raced with another CPU which has just put
                 * a 2MB pte in after we saw a pte page, try again.
                 */
                if (!new_ptep) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
                /* Valid 2MB page here already, remove it */
                kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
        }
        if (level == 1) {
                if (!pmd_none(*pmd)) {
                        /*
                         * There's a page table page here, but we wanted to
                         * install a large page, so remove and free the page
                         * table page.
                         */
                        kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
                }
                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
                ret = 0;
                goto out_unlock;
        }
        if (pmd_none(*pmd)) {
                if (!new_ptep)
                        goto out_unlock;
                pmd_populate(kvm->mm, pmd, new_ptep);
                new_ptep = NULL;
        }
        ptep = pte_offset_kernel(pmd, gpa);
        if (pte_present(*ptep)) {
                /* Check if someone else set the same thing */
                if (pte_raw(*ptep) == pte_raw(pte)) {
                        ret = 0;
                        goto out_unlock;
                }
                /* Valid page here already, add our extra bits */
                WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
                             PTE_BITS_MUST_MATCH);
                kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
                ret = 0;
                goto out_unlock;
        }
        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
        ret = 0;

 out_unlock:
        spin_unlock(&kvm->mmu_lock);
        if (new_pud)
                pud_free(kvm->mm, new_pud);
        if (new_pmd)
                kvmppc_pmd_free(new_pmd);
        if (new_ptep)
                kvmppc_pte_free(new_ptep);
        return ret;
}
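
/*
 * Hypervisor page fault handler for radix guests.  Unsupported and
 * bad-access faults are reflected to the guest as a DSI, and faults on
 * addresses with no memslot are handled as emulated MMIO.  Faults that only
 * need the R or C bit set are resolved by updating the existing
 * partition-scoped PTE.  Otherwise the host page backing the guest frame is
 * looked up (or faulted in) through the memslot, the corresponding Linux PTE
 * in the userspace process' table (vcpu->arch.pgdir) supplies the page size
 * and attribute bits, and the resulting translation is installed with
 * kvmppc_create_pte().
 */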
int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned long ea, unsigned long dsisr)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long mmu_seq;
        unsigned long gpa, gfn, hva;
        struct kvm_memory_slot *memslot;
        struct page *page = NULL;
        long ret;
        bool writing;
        bool upgrade_write = false;
        bool *upgrade_p = &upgrade_write;
        pte_t pte, *ptep;
        unsigned long pgflags;
        unsigned int shift, level;

        /* Check for unusual errors */
        if (dsisr & DSISR_UNSUPP_MMU) {
                pr_err("KVM: Got unsupported MMU fault\n");
                return -EFAULT;
        }
        if (dsisr & DSISR_BADACCESS) {
                /* Reflect to the guest as DSI */
                pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
                kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                return RESUME_GUEST;
        }

        /* Translate the logical address and get the page */
        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
        gpa &= ~0xF000000000000000ul;
        gfn = gpa >> PAGE_SHIFT;
        if (!(dsisr & DSISR_PRTABLE_FAULT))
                gpa |= ea & 0xfff;
        memslot = gfn_to_memslot(kvm, gfn);

        /* No memslot means it's an emulated MMIO region */
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
                if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
                             DSISR_SET_RC)) {
                        /*
                         * Bad address in guest page table tree, or other
                         * unusual error - reflect it to the guest as DSI.
                         */
                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                        return RESUME_GUEST;
                }
                return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
                                              dsisr & DSISR_ISSTORE);
        }

        writing = (dsisr & DSISR_ISSTORE) != 0;
        if (memslot->flags & KVM_MEM_READONLY) {
                if (writing) {
                        /* give the guest a DSI */
                        dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
                        return RESUME_GUEST;
                }
                upgrade_p = NULL;
        }

        if (dsisr & DSISR_SET_RC) {
                /*
                 * Need to set an R or C bit in the 2nd-level tables;
                 * since we are just helping out the hardware here,
                 * it is sufficient to do what the hardware does.
                 */
                pgflags = _PAGE_ACCESSED;
                if (writing)
                        pgflags |= _PAGE_DIRTY;
                /*
                 * We are walking the partition-scoped (2nd-level) table here.
                 * We can do this without disabling irq because kvm->mmu_lock
                 * is held across the walk below.
                 */
                spin_lock(&kvm->mmu_lock);
                ptep = __find_linux_pte(kvm->arch.pgtable,
                                        gpa, NULL, &shift);
                if (ptep && pte_present(*ptep) &&
                    (!writing || pte_write(*ptep))) {
                        kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
                                                gpa, shift);
                        dsisr &= ~DSISR_SET_RC;
                }
                spin_unlock(&kvm->mmu_lock);
                if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
                               DSISR_PROTFAULT | DSISR_SET_RC)))
                        return RESUME_GUEST;
        }

        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        /*
         * Do a fast check first, since __gfn_to_pfn_memslot doesn't
         * do it with !atomic && !async, which is how we call it.
         * We always ask for write permission since the common case
         * is that the page is writable.
         */
        hva = gfn_to_hva_memslot(memslot, gfn);
        if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
                upgrade_write = true;
        } else {
                unsigned long pfn;

                /* Call KVM generic code to do the slow-path check */
                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
                                           writing, upgrade_p);
                if (is_error_noslot_pfn(pfn))
                        return -EFAULT;
                page = NULL;
                if (pfn_valid(pfn)) {
                        page = pfn_to_page(pfn);
                        if (PageReserved(page))
                                page = NULL;
                }
        }

        /*
         * Read the PTE from the process' radix tree and use that
         * so we get the shift and attribute bits.
         */
        local_irq_disable();
        ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
        /*
         * If the PTE disappeared temporarily due to a THP
         * collapse, just return and let the guest try again.
         */
        if (!ptep) {
                local_irq_enable();
                if (page)
                        put_page(page);
                return RESUME_GUEST;
        }
        pte = *ptep;
        local_irq_enable();

        /* Get pte level from shift/size */
        if (shift == PUD_SHIFT &&
            (gpa & (PUD_SIZE - PAGE_SIZE)) ==
            (hva & (PUD_SIZE - PAGE_SIZE))) {
                level = 2;
        } else if (shift == PMD_SHIFT &&
                   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
                   (hva & (PMD_SIZE - PAGE_SIZE))) {
                level = 1;
        } else {
                level = 0;
                if (shift > PAGE_SHIFT) {
                        /*
                         * If the pte maps more than one page, bring over
                         * bits from the virtual address to get the real
                         * address of the specific single page we want.
                         */
                        unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
                        pte = __pte(pte_val(pte) | (hva & rpnmask));
                }
        }

        pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
        if (writing || upgrade_write) {
                if (pte_val(pte) & _PAGE_WRITE)
                        pte = __pte(pte_val(pte) | _PAGE_DIRTY);
        } else {
                pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
        }

        /* Allocate space in the tree and write the PTE */
        ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);

        if (page) {
                if (!ret && (pte_val(pte) & _PAGE_WRITE))
                        set_page_dirty_lock(page);
                put_page(page);
        }

        if (ret == 0 || ret == -EAGAIN)
                ret = RESUME_GUEST;
        return ret;
}
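
/*
 * The three functions below are the radix-mode backends used when KVM needs
 * to unmap, age or test-age the mapping for a single guest frame (gfn)
 * within a memslot, e.g. in response to MMU notifier events on the host.
 */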
/* Called with kvm->lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                    unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        unsigned long old;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep)) {
                old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
                                              gpa, shift);
                kvmppc_radix_tlbie_page(kvm, gpa, shift);
                if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
                        unsigned long psize = PAGE_SIZE;

                        if (shift)
                                psize = 1ul << shift;
                        kvmppc_update_dirty_map(memslot, gfn, psize);
                }
        }
        return 0;
}
/* Called with kvm->lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                  unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        int ref = 0;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
                                        gpa, shift);
                /* XXX need to flush tlb here? */
                ref = 1;
        }
        return ref;
}

/* Called with kvm->lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
                       unsigned long gfn)
{
        pte_t *ptep;
        unsigned long gpa = gfn << PAGE_SHIFT;
        unsigned int shift;
        int ref = 0;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep) && pte_young(*ptep))
                ref = 1;
        return ref;
}
/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
                                      struct kvm_memory_slot *memslot, int pagenum)
{
        unsigned long gfn = memslot->base_gfn + pagenum;
        unsigned long gpa = gfn << PAGE_SHIFT;
        pte_t *ptep;
        unsigned int shift;
        int ret = 0;

        ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
        if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
                ret = 1;
                if (shift)
                        ret = 1 << (shift - PAGE_SHIFT);
                kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
                                        gpa, shift);
                kvmppc_radix_tlbie_page(kvm, gpa, shift);
        }
        return ret;
}
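
/*
 * Harvest dirty state for a whole memslot into *map, which has one bit per
 * PAGE_SIZE page.  A dirty huge PTE marks all of the PAGE_SIZE pages it
 * covers, and the scan then skips ahead over the pages it accounted for.
 */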
long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot, unsigned long *map)
{
        unsigned long i, j;
        int npages;

        for (i = 0; i < memslot->npages; i = j) {
                npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

                /*
                 * Note that if npages > 0 then i must be a multiple of npages,
                 * since huge pages are only used to back the guest at guest
                 * real addresses that are a multiple of their size.
                 * Since we have at most one PTE covering any given guest
                 * real address, if npages > 1 we can skip to i + npages.
                 */
                j = i + 1;
                if (npages) {
                        set_dirty_bits(map, i, npages);
                        j = i + npages;
                }
        }
        return 0;
}
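
/*
 * Pack one supported page size into info->ap_encodings[]: the low bits hold
 * the page shift and bits 31:29 hold the AP (actual page size) encoding.
 */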
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
                                 int psize, int *indexp)
{
        if (!mmu_psize_defs[psize].shift)
                return;
        info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
                (mmu_psize_defs[psize].ap << 29);
        ++(*indexp);
}
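
/*
 * Report the host's radix MMU capabilities to userspace (this backs the
 * KVM_PPC_GET_RMMU_INFO ioctl): the supported tree geometries for 4k and
 * 64k base page sizes, plus AP encodings for 4k, 64k, 2M and 1G pages.
 */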
int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
        int i;

        if (!radix_enabled())
                return -EINVAL;
        memset(info, 0, sizeof(*info));

        /* 4k page size */
        info->geometries[0].page_shift = 12;
        info->geometries[0].level_bits[0] = 9;
        for (i = 1; i < 4; ++i)
                info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
        /* 64k page size */
        info->geometries[1].page_shift = 16;
        for (i = 0; i < 4; ++i)
                info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

        i = 0;
        add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
        add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
        return 0;
}
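
/*
 * VM-level and module-level setup: kvmppc_init_vm_radix() allocates the
 * root (PGD) of a VM's partition-scoped tree, the constructors below zero
 * newly allocated table pages, and kvmppc_radix_init()/kvmppc_radix_exit()
 * create and destroy the slab caches those pages come from.
 */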
int kvmppc_init_vm_radix(struct kvm *kvm)
{
        kvm->arch.pgtable = pgd_alloc(kvm->mm);
        if (!kvm->arch.pgtable)
                return -ENOMEM;
        return 0;
}

static void pte_ctor(void *addr)
{
        memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

int kvmppc_radix_init(void)
{
        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

        kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
        if (!kvm_pte_cache)
                return -ENOMEM;

        size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

        kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
        if (!kvm_pmd_cache) {
                kmem_cache_destroy(kvm_pte_cache);
                return -ENOMEM;
        }

        return 0;
}

void kvmppc_radix_exit(void)
{
        kmem_cache_destroy(kvm_pte_cache);
        kmem_cache_destroy(kvm_pmd_cache);
}