book3s_64_mmu_hv.c

  1. /*
  2. * This program is free software; you can redistribute it and/or modify
  3. * it under the terms of the GNU General Public License, version 2, as
  4. * published by the Free Software Foundation.
  5. *
  6. * This program is distributed in the hope that it will be useful,
  7. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. * GNU General Public License for more details.
  10. *
  11. * You should have received a copy of the GNU General Public License
  12. * along with this program; if not, write to the Free Software
  13. * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  14. *
  15. * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  16. */
  17. #include <linux/types.h>
  18. #include <linux/string.h>
  19. #include <linux/kvm.h>
  20. #include <linux/kvm_host.h>
  21. #include <linux/highmem.h>
  22. #include <linux/gfp.h>
  23. #include <linux/slab.h>
  24. #include <linux/hugetlb.h>
  25. #include <linux/vmalloc.h>
  26. #include <linux/srcu.h>
  27. #include <linux/anon_inodes.h>
  28. #include <linux/file.h>
  29. #include <linux/debugfs.h>
  30. #include <asm/kvm_ppc.h>
  31. #include <asm/kvm_book3s.h>
  32. #include <asm/book3s/64/mmu-hash.h>
  33. #include <asm/hvcall.h>
  34. #include <asm/synch.h>
  35. #include <asm/ppc-opcode.h>
  36. #include <asm/cputable.h>
  37. #include <asm/pte-walk.h>
  38. #include "trace_hv.h"
  39. //#define DEBUG_RESIZE_HPT 1
  40. #ifdef DEBUG_RESIZE_HPT
  41. #define resize_hpt_debug(resize, ...) \
  42. do { \
  43. printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \
  44. printk(__VA_ARGS__); \
  45. } while (0)
  46. #else
  47. #define resize_hpt_debug(resize, ...) \
  48. do { } while (0)
  49. #endif
  50. static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
  51. long pte_index, unsigned long pteh,
  52. unsigned long ptel, unsigned long *pte_idx_ret);
  53. struct kvm_resize_hpt {
  54. /* These fields are read-only after init */
  55. struct kvm *kvm;
  56. struct work_struct work;
  57. u32 order;
  58. /* These fields protected by kvm->lock */
  59. /* Possible values and their usage:
  60. * <0 an error occurred during allocation,
  61. * -EBUSY allocation is in progress,
  62. * 0 allocation made successfully.
  63. */
  64. int error;
  65. /* Private to the work thread, until error != -EBUSY,
  66. * then protected by kvm->lock.
  67. */
  68. struct kvm_hpt_info hpt;
  69. };
  70. int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
  71. {
  72. unsigned long hpt = 0;
  73. int cma = 0;
  74. struct page *page = NULL;
  75. struct revmap_entry *rev;
  76. unsigned long npte;
  77. if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
  78. return -EINVAL;
  79. page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
  80. if (page) {
  81. hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
  82. memset((void *)hpt, 0, (1ul << order));
  83. cma = 1;
  84. }
  85. if (!hpt)
  86. hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
  87. |__GFP_NOWARN, order - PAGE_SHIFT);
  88. if (!hpt)
  89. return -ENOMEM;
  90. /* HPTEs are 2**4 bytes long */
  91. npte = 1ul << (order - 4);
  92. /* Allocate reverse map array */
  93. rev = vmalloc(array_size(npte, sizeof(struct revmap_entry)));
  94. if (!rev) {
  95. if (cma)
  96. kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
  97. else
  98. free_pages(hpt, order - PAGE_SHIFT);
  99. return -ENOMEM;
  100. }
  101. info->order = order;
  102. info->virt = hpt;
  103. info->cma = cma;
  104. info->rev = rev;
  105. return 0;
  106. }
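/*
 * Worked sizing example for the allocator above (illustrative only, not
 * used by the code): with order = 26 the HPT occupies 1 << 26 = 64 MiB.
 * Since each HPTE is 16 bytes, that is 1 << (26 - 4) = 4M entries, or
 * 1 << (26 - 7) = 512K groups of HPTES_PER_GROUP (8) entries, plus a
 * 4M-entry reverse-map array allocated separately with vmalloc().
 */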
  107. void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
  108. {
  109. atomic64_set(&kvm->arch.mmio_update, 0);
  110. kvm->arch.hpt = *info;
  111. kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
  112. pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
  113. info->virt, (long)info->order, kvm->arch.lpid);
  114. }
  115. long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
  116. {
  117. long err = -EBUSY;
  118. struct kvm_hpt_info info;
  119. mutex_lock(&kvm->lock);
  120. if (kvm->arch.mmu_ready) {
  121. kvm->arch.mmu_ready = 0;
  122. /* order mmu_ready vs. vcpus_running */
  123. smp_mb();
  124. if (atomic_read(&kvm->arch.vcpus_running)) {
  125. kvm->arch.mmu_ready = 1;
  126. goto out;
  127. }
  128. }
  129. if (kvm_is_radix(kvm)) {
  130. err = kvmppc_switch_mmu_to_hpt(kvm);
  131. if (err)
  132. goto out;
  133. }
  134. if (kvm->arch.hpt.order == order) {
  135. /* We already have a suitable HPT */
  136. /* Set the entire HPT to 0, i.e. invalid HPTEs */
  137. memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
  138. /*
  139. * Reset all the reverse-mapping chains for all memslots
  140. */
  141. kvmppc_rmap_reset(kvm);
  142. err = 0;
  143. goto out;
  144. }
  145. if (kvm->arch.hpt.virt) {
  146. kvmppc_free_hpt(&kvm->arch.hpt);
  147. kvmppc_rmap_reset(kvm);
  148. }
  149. err = kvmppc_allocate_hpt(&info, order);
  150. if (err < 0)
  151. goto out;
  152. kvmppc_set_hpt(kvm, &info);
  153. out:
  154. if (err == 0)
  155. /* Ensure that each vcpu will flush its TLB on next entry. */
  156. cpumask_setall(&kvm->arch.need_tlb_flush);
  157. mutex_unlock(&kvm->lock);
  158. return err;
  159. }
  160. void kvmppc_free_hpt(struct kvm_hpt_info *info)
  161. {
  162. vfree(info->rev);
  163. info->rev = NULL;
  164. if (info->cma)
  165. kvm_free_hpt_cma(virt_to_page(info->virt),
  166. 1 << (info->order - PAGE_SHIFT));
  167. else if (info->virt)
  168. free_pages(info->virt, info->order - PAGE_SHIFT);
  169. info->virt = 0;
  170. info->order = 0;
  171. }
  172. /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
  173. static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
  174. {
  175. return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
  176. }
  177. /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
  178. static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
  179. {
  180. return (pgsize == 0x10000) ? 0x1000 : 0;
  181. }
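/*
 * Illustrative mapping of the two encoding helpers above for the base
 * page sizes handled here (the values follow directly from the code):
 *   4k  -> hpte0: 0,            hpte1: 0
 *   64k -> hpte0: HPTE_V_LARGE, hpte1: 0x1000
 *   16M -> hpte0: HPTE_V_LARGE, hpte1: 0
 */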
  182. void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
  183. unsigned long porder)
  184. {
  185. unsigned long i;
  186. unsigned long npages;
  187. unsigned long hp_v, hp_r;
  188. unsigned long addr, hash;
  189. unsigned long psize;
  190. unsigned long hp0, hp1;
  191. unsigned long idx_ret;
  192. long ret;
  193. struct kvm *kvm = vcpu->kvm;
  194. psize = 1ul << porder;
  195. npages = memslot->npages >> (porder - PAGE_SHIFT);
  196. /* VRMA can't be > 1TB */
  197. if (npages > 1ul << (40 - porder))
  198. npages = 1ul << (40 - porder);
  199. /* Can't use more than 1 HPTE per HPTEG */
  200. if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
  201. npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
  202. hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
  203. HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
  204. hp1 = hpte1_pgsize_encoding(psize) |
  205. HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
  206. for (i = 0; i < npages; ++i) {
  207. addr = i << porder;
  208. /* can't use hpt_hash since va > 64 bits */
  209. hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
  210. & kvmppc_hpt_mask(&kvm->arch.hpt);
  211. /*
  212. * We assume that the hash table is empty and no
  213. * vcpus are using it at this stage. Since we create
  214. * at most one HPTE per HPTEG, we just assume entry 7
  215. * is available and use it.
  216. */
  217. hash = (hash << 3) + 7;
  218. hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
  219. hp_r = hp1 | addr;
  220. ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
  221. &idx_ret);
  222. if (ret != H_SUCCESS) {
  223. pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
  224. addr, ret);
  225. break;
  226. }
  227. }
  228. }
  229. int kvmppc_mmu_hv_init(void)
  230. {
  231. unsigned long host_lpid, rsvd_lpid;
  232. if (!cpu_has_feature(CPU_FTR_HVMODE))
  233. return -EINVAL;
  234. if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
  235. return -EINVAL;
  236. /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
  237. host_lpid = mfspr(SPRN_LPID);
  238. rsvd_lpid = LPID_RSVD;
  239. kvmppc_init_lpid(rsvd_lpid + 1);
  240. kvmppc_claim_lpid(host_lpid);
  241. /* rsvd_lpid is reserved for use in partition switching */
  242. kvmppc_claim_lpid(rsvd_lpid);
  243. return 0;
  244. }
  245. static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
  246. {
  247. unsigned long msr = vcpu->arch.intr_msr;
  248. /* If transactional, change to suspend mode on IRQ delivery */
  249. if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
  250. msr |= MSR_TS_S;
  251. else
  252. msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
  253. kvmppc_set_msr(vcpu, msr);
  254. }
  255. static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
  256. long pte_index, unsigned long pteh,
  257. unsigned long ptel, unsigned long *pte_idx_ret)
  258. {
  259. long ret;
  260. /* Protect linux PTE lookup from page table destruction */
  261. rcu_read_lock_sched(); /* this disables preemption too */
  262. ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
  263. current->mm->pgd, false, pte_idx_ret);
  264. rcu_read_unlock_sched();
  265. if (ret == H_TOO_HARD) {
  266. /* this can't happen */
  267. pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
  268. ret = H_RESOURCE; /* or something */
  269. }
  270. return ret;
  271. }
  272. static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
  273. gva_t eaddr)
  274. {
  275. u64 mask;
  276. int i;
  277. for (i = 0; i < vcpu->arch.slb_nr; i++) {
  278. if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
  279. continue;
  280. if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
  281. mask = ESID_MASK_1T;
  282. else
  283. mask = ESID_MASK;
  284. if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
  285. return &vcpu->arch.slb[i];
  286. }
  287. return NULL;
  288. }
  289. static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
  290. unsigned long ea)
  291. {
  292. unsigned long ra_mask;
  293. ra_mask = kvmppc_actual_pgsz(v, r) - 1;
  294. return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
  295. }
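/*
 * Example of the masking above (illustrative): for a 64k HPTE,
 * kvmppc_actual_pgsz() returns 0x10000, so ra_mask = 0xffff and the
 * result combines the RPN bits of r above 64k with the low 16 bits of
 * the effective address.
 */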
  296. static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
  297. struct kvmppc_pte *gpte, bool data, bool iswrite)
  298. {
  299. struct kvm *kvm = vcpu->kvm;
  300. struct kvmppc_slb *slbe;
  301. unsigned long slb_v;
  302. unsigned long pp, key;
  303. unsigned long v, orig_v, gr;
  304. __be64 *hptep;
  305. long int index;
  306. int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
  307. if (kvm_is_radix(vcpu->kvm))
  308. return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
  309. /* Get SLB entry */
  310. if (virtmode) {
  311. slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
  312. if (!slbe)
  313. return -EINVAL;
  314. slb_v = slbe->origv;
  315. } else {
  316. /* real mode access */
  317. slb_v = vcpu->kvm->arch.vrma_slb_v;
  318. }
  319. preempt_disable();
  320. /* Find the HPTE in the hash table */
  321. index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
  322. HPTE_V_VALID | HPTE_V_ABSENT);
  323. if (index < 0) {
  324. preempt_enable();
  325. return -ENOENT;
  326. }
  327. hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
  328. v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
  329. if (cpu_has_feature(CPU_FTR_ARCH_300))
  330. v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
  331. gr = kvm->arch.hpt.rev[index].guest_rpte;
  332. unlock_hpte(hptep, orig_v);
  333. preempt_enable();
  334. gpte->eaddr = eaddr;
  335. gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
  336. /* Get PP bits and key for permission check */
  337. pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
  338. key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
  339. key &= slb_v;
  340. /* Calculate permissions */
  341. gpte->may_read = hpte_read_permission(pp, key);
  342. gpte->may_write = hpte_write_permission(pp, key);
  343. gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
  344. /* Storage key permission check for POWER7 */
  345. if (data && virtmode) {
  346. int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
  347. if (amrfield & 1)
  348. gpte->may_read = 0;
  349. if (amrfield & 2)
  350. gpte->may_write = 0;
  351. }
  352. /* Get the guest physical address */
  353. gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
  354. return 0;
  355. }
  356. /*
  357. * Quick test for whether an instruction is a load or a store.
  358. * If the instruction is a load or a store, then this will indicate
  359. * which it is, at least on server processors. (Embedded processors
  360. * have some external PID instructions that don't follow the rule
  361. * embodied here.) If the instruction isn't a load or store, then
  362. * this doesn't return anything useful.
  363. */
  364. static int instruction_is_store(unsigned int instr)
  365. {
  366. unsigned int mask;
  367. mask = 0x10000000;
  368. if ((instr & 0xfc000000) == 0x7c000000)
  369. mask = 0x100; /* major opcode 31 */
  370. return (instr & mask) != 0;
  371. }
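/*
 * A few worked encodings for the test above (illustrative; register
 * fields chosen as zero):
 *   stw  r0,0(r0)  = 0x90000000: opcode 36, 0x10000000 set   -> store
 *   lwz  r0,0(r0)  = 0x80000000: opcode 32, 0x10000000 clear -> load
 *   stwx r0,r0,r0  = 0x7c00012e: opcode 31, 0x100 set        -> store
 *   lwzx r0,r0,r0  = 0x7c00002e: opcode 31, 0x100 clear      -> load
 */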
  372. int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
  373. unsigned long gpa, gva_t ea, int is_store)
  374. {
  375. u32 last_inst;
  376. /*
  377. * If we fail, we just return to the guest and try executing it again.
  378. */
  379. if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
  380. EMULATE_DONE)
  381. return RESUME_GUEST;
  382. /*
  383. * WARNING: We do not know for sure whether the instruction we just
  384. * read from memory is the same that caused the fault in the first
  385. * place. If the instruction we read is neither a load nor a store,
  386. * then it can't access memory, so we don't need to worry about
  387. * enforcing access permissions. So, assuming it is a load or
  388. * store, we just check that its direction (load or store) is
  389. * consistent with the original fault, since that's what we
  390. * checked the access permissions against. If there is a mismatch
  391. * we just return and retry the instruction.
  392. */
  393. if (instruction_is_store(last_inst) != !!is_store)
  394. return RESUME_GUEST;
  395. /*
  396. * Emulated accesses are performed by looking at the hash for
  397. * translation once, then performing the access later. The
  398. * translation could be invalidated in the meantime, at which
  399. * point performing the subsequent memory access on the old
  400. * physical address could possibly be a security hole for the
  401. * guest (but not the host).
  402. *
  403. * This is less of an issue for MMIO stores since they aren't
  404. * globally visible. It could be an issue for MMIO loads to
  405. * a certain extent but we'll ignore it for now.
  406. */
  407. vcpu->arch.paddr_accessed = gpa;
  408. vcpu->arch.vaddr_accessed = ea;
  409. return kvmppc_emulate_mmio(run, vcpu);
  410. }
  411. int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
  412. unsigned long ea, unsigned long dsisr)
  413. {
  414. struct kvm *kvm = vcpu->kvm;
  415. unsigned long hpte[3], r;
  416. unsigned long hnow_v, hnow_r;
  417. __be64 *hptep;
  418. unsigned long mmu_seq, psize, pte_size;
  419. unsigned long gpa_base, gfn_base;
  420. unsigned long gpa, gfn, hva, pfn;
  421. struct kvm_memory_slot *memslot;
  422. unsigned long *rmap;
  423. struct revmap_entry *rev;
  424. struct page *page, *pages[1];
  425. long index, ret, npages;
  426. bool is_ci;
  427. unsigned int writing, write_ok;
  428. struct vm_area_struct *vma;
  429. unsigned long rcbits;
  430. long mmio_update;
  431. if (kvm_is_radix(kvm))
  432. return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
  433. /*
  434. * Real-mode code has already searched the HPT and found the
  435. * entry we're interested in. Lock the entry and check that
  436. * it hasn't changed. If it has, just return and re-execute the
  437. * instruction.
  438. */
  439. if (ea != vcpu->arch.pgfault_addr)
  440. return RESUME_GUEST;
  441. if (vcpu->arch.pgfault_cache) {
  442. mmio_update = atomic64_read(&kvm->arch.mmio_update);
  443. if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
  444. r = vcpu->arch.pgfault_cache->rpte;
  445. psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
  446. r);
  447. gpa_base = r & HPTE_R_RPN & ~(psize - 1);
  448. gfn_base = gpa_base >> PAGE_SHIFT;
  449. gpa = gpa_base | (ea & (psize - 1));
  450. return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
  451. dsisr & DSISR_ISSTORE);
  452. }
  453. }
  454. index = vcpu->arch.pgfault_index;
  455. hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
  456. rev = &kvm->arch.hpt.rev[index];
  457. preempt_disable();
  458. while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
  459. cpu_relax();
  460. hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
  461. hpte[1] = be64_to_cpu(hptep[1]);
  462. hpte[2] = r = rev->guest_rpte;
  463. unlock_hpte(hptep, hpte[0]);
  464. preempt_enable();
  465. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  466. hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
  467. hpte[1] = hpte_new_to_old_r(hpte[1]);
  468. }
  469. if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
  470. hpte[1] != vcpu->arch.pgfault_hpte[1])
  471. return RESUME_GUEST;
  472. /* Translate the logical address and get the page */
  473. psize = kvmppc_actual_pgsz(hpte[0], r);
  474. gpa_base = r & HPTE_R_RPN & ~(psize - 1);
  475. gfn_base = gpa_base >> PAGE_SHIFT;
  476. gpa = gpa_base | (ea & (psize - 1));
  477. gfn = gpa >> PAGE_SHIFT;
  478. memslot = gfn_to_memslot(kvm, gfn);
  479. trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
  480. /* No memslot means it's an emulated MMIO region */
  481. if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
  482. return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
  483. dsisr & DSISR_ISSTORE);
  484. /*
  485. * This should never happen, because of the slot_is_aligned()
  486. * check in kvmppc_do_h_enter().
  487. */
  488. if (gfn_base < memslot->base_gfn)
  489. return -EFAULT;
  490. /* used to check for invalidations in progress */
  491. mmu_seq = kvm->mmu_notifier_seq;
  492. smp_rmb();
  493. ret = -EFAULT;
  494. is_ci = false;
  495. pfn = 0;
  496. page = NULL;
  497. pte_size = PAGE_SIZE;
  498. writing = (dsisr & DSISR_ISSTORE) != 0;
  499. /* If writing != 0, then the HPTE must allow writing, if we get here */
  500. write_ok = writing;
  501. hva = gfn_to_hva_memslot(memslot, gfn);
  502. npages = get_user_pages_fast(hva, 1, writing, pages);
  503. if (npages < 1) {
  504. /* Check if it's an I/O mapping */
  505. down_read(&current->mm->mmap_sem);
  506. vma = find_vma(current->mm, hva);
  507. if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
  508. (vma->vm_flags & VM_PFNMAP)) {
  509. pfn = vma->vm_pgoff +
  510. ((hva - vma->vm_start) >> PAGE_SHIFT);
  511. pte_size = psize;
  512. is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
  513. write_ok = vma->vm_flags & VM_WRITE;
  514. }
  515. up_read(&current->mm->mmap_sem);
  516. if (!pfn)
  517. goto out_put;
  518. } else {
  519. page = pages[0];
  520. pfn = page_to_pfn(page);
  521. if (PageHuge(page)) {
  522. page = compound_head(page);
  523. pte_size <<= compound_order(page);
  524. }
  525. /* if the guest wants write access, see if that is OK */
  526. if (!writing && hpte_is_writable(r)) {
  527. pte_t *ptep, pte;
  528. unsigned long flags;
  529. /*
  530. * We need to protect against page table destruction as
  531. * well as hugepage split and collapse.
  532. */
  533. local_irq_save(flags);
  534. ptep = find_current_mm_pte(current->mm->pgd,
  535. hva, NULL, NULL);
  536. if (ptep) {
  537. pte = kvmppc_read_update_linux_pte(ptep, 1);
  538. if (__pte_write(pte))
  539. write_ok = 1;
  540. }
  541. local_irq_restore(flags);
  542. }
  543. }
  544. if (psize > pte_size)
  545. goto out_put;
  546. /* Check WIMG vs. the actual page we're accessing */
  547. if (!hpte_cache_flags_ok(r, is_ci)) {
  548. if (is_ci)
  549. goto out_put;
  550. /*
  551. * Allow guest to map emulated device memory as
  552. * uncacheable, but actually make it cacheable.
  553. */
  554. r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
  555. }
  556. /*
  557. * Set the HPTE to point to pfn.
  558. * Since the pfn is at PAGE_SIZE granularity, make sure we
  559. * don't mask out lower-order bits if psize < PAGE_SIZE.
  560. */
  561. if (psize < PAGE_SIZE)
  562. psize = PAGE_SIZE;
  563. r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
  564. ((pfn << PAGE_SHIFT) & ~(psize - 1));
  565. if (hpte_is_writable(r) && !write_ok)
  566. r = hpte_make_readonly(r);
  567. ret = RESUME_GUEST;
  568. preempt_disable();
  569. while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
  570. cpu_relax();
  571. hnow_v = be64_to_cpu(hptep[0]);
  572. hnow_r = be64_to_cpu(hptep[1]);
  573. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  574. hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
  575. hnow_r = hpte_new_to_old_r(hnow_r);
  576. }
  577. /*
  578. * If the HPT is being resized, don't update the HPTE,
  579. * instead let the guest retry after the resize operation is complete.
  580. * The synchronization for mmu_ready test vs. set is provided
  581. * by the HPTE lock.
  582. */
  583. if (!kvm->arch.mmu_ready)
  584. goto out_unlock;
  585. if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
  586. rev->guest_rpte != hpte[2])
  587. /* HPTE has been changed under us; let the guest retry */
  588. goto out_unlock;
  589. hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
  590. /* Always put the HPTE in the rmap chain for the page base address */
  591. rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
  592. lock_rmap(rmap);
  593. /* Check if we might have been invalidated; let the guest retry if so */
  594. ret = RESUME_GUEST;
  595. if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
  596. unlock_rmap(rmap);
  597. goto out_unlock;
  598. }
  599. /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
  600. rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
  601. r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
  602. if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
  603. /* HPTE was previously valid, so we need to invalidate it */
  604. unlock_rmap(rmap);
  605. hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
  606. kvmppc_invalidate_hpte(kvm, hptep, index);
  607. /* don't lose previous R and C bits */
  608. r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
  609. } else {
  610. kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
  611. }
  612. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  613. r = hpte_old_to_new_r(hpte[0], r);
  614. hpte[0] = hpte_old_to_new_v(hpte[0]);
  615. }
  616. hptep[1] = cpu_to_be64(r);
  617. eieio();
  618. __unlock_hpte(hptep, hpte[0]);
  619. asm volatile("ptesync" : : : "memory");
  620. preempt_enable();
  621. if (page && hpte_is_writable(r))
  622. SetPageDirty(page);
  623. out_put:
  624. trace_kvm_page_fault_exit(vcpu, hpte, ret);
  625. if (page) {
  626. /*
  627. * We drop pages[0] here, not page, because page might
  628. * have been set to the head page of a compound, but
  629. * we have to drop the reference on the correct tail
  630. * page to match the get inside gup()
  631. */
  632. put_page(pages[0]);
  633. }
  634. return ret;
  635. out_unlock:
  636. __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
  637. preempt_enable();
  638. goto out_put;
  639. }
  640. void kvmppc_rmap_reset(struct kvm *kvm)
  641. {
  642. struct kvm_memslots *slots;
  643. struct kvm_memory_slot *memslot;
  644. int srcu_idx;
  645. srcu_idx = srcu_read_lock(&kvm->srcu);
  646. slots = kvm_memslots(kvm);
  647. kvm_for_each_memslot(memslot, slots) {
  648. /* Mutual exclusion with kvm_unmap_hva_range etc. */
  649. spin_lock(&kvm->mmu_lock);
  650. /*
  651. * This assumes it is acceptable to lose reference and
  652. * change bits across a reset.
  653. */
  654. memset(memslot->arch.rmap, 0,
  655. memslot->npages * sizeof(*memslot->arch.rmap));
  656. spin_unlock(&kvm->mmu_lock);
  657. }
  658. srcu_read_unlock(&kvm->srcu, srcu_idx);
  659. }
  660. typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
  661. unsigned long gfn);
  662. static int kvm_handle_hva_range(struct kvm *kvm,
  663. unsigned long start,
  664. unsigned long end,
  665. hva_handler_fn handler)
  666. {
  667. int ret;
  668. int retval = 0;
  669. struct kvm_memslots *slots;
  670. struct kvm_memory_slot *memslot;
  671. slots = kvm_memslots(kvm);
  672. kvm_for_each_memslot(memslot, slots) {
  673. unsigned long hva_start, hva_end;
  674. gfn_t gfn, gfn_end;
  675. hva_start = max(start, memslot->userspace_addr);
  676. hva_end = min(end, memslot->userspace_addr +
  677. (memslot->npages << PAGE_SHIFT));
  678. if (hva_start >= hva_end)
  679. continue;
  680. /*
  681. * {gfn(page) | page intersects with [hva_start, hva_end)} =
  682. * {gfn, gfn+1, ..., gfn_end-1}.
  683. */
  684. gfn = hva_to_gfn_memslot(hva_start, memslot);
  685. gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
  686. for (; gfn < gfn_end; ++gfn) {
  687. ret = handler(kvm, memslot, gfn);
  688. retval |= ret;
  689. }
  690. }
  691. return retval;
  692. }
  693. static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
  694. hva_handler_fn handler)
  695. {
  696. return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
  697. }
  698. /* Must be called with both HPTE and rmap locked */
  699. static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
  700. struct kvm_memory_slot *memslot,
  701. unsigned long *rmapp, unsigned long gfn)
  702. {
  703. __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
  704. struct revmap_entry *rev = kvm->arch.hpt.rev;
  705. unsigned long j, h;
  706. unsigned long ptel, psize, rcbits;
  707. j = rev[i].forw;
  708. if (j == i) {
  709. /* chain is now empty */
  710. *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
  711. } else {
  712. /* remove i from chain */
  713. h = rev[i].back;
  714. rev[h].forw = j;
  715. rev[j].back = h;
  716. rev[i].forw = rev[i].back = i;
  717. *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
  718. }
  719. /* Now check and modify the HPTE */
  720. ptel = rev[i].guest_rpte;
  721. psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
  722. if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
  723. hpte_rpn(ptel, psize) == gfn) {
  724. hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
  725. kvmppc_invalidate_hpte(kvm, hptep, i);
  726. hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
  727. /* Harvest R and C */
  728. rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
  729. *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
  730. if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
  731. kvmppc_update_dirty_map(memslot, gfn, psize);
  732. if (rcbits & ~rev[i].guest_rpte) {
  733. rev[i].guest_rpte = ptel | rcbits;
  734. note_hpte_modification(kvm, &rev[i]);
  735. }
  736. }
  737. }
  738. static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
  739. unsigned long gfn)
  740. {
  741. unsigned long i;
  742. __be64 *hptep;
  743. unsigned long *rmapp;
  744. rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  745. for (;;) {
  746. lock_rmap(rmapp);
  747. if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
  748. unlock_rmap(rmapp);
  749. break;
  750. }
  751. /*
  752. * To avoid an ABBA deadlock with the HPTE lock bit,
  753. * we can't spin on the HPTE lock while holding the
  754. * rmap chain lock.
  755. */
  756. i = *rmapp & KVMPPC_RMAP_INDEX;
  757. hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
  758. if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
  759. /* unlock rmap before spinning on the HPTE lock */
  760. unlock_rmap(rmapp);
  761. while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
  762. cpu_relax();
  763. continue;
  764. }
  765. kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
  766. unlock_rmap(rmapp);
  767. __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
  768. }
  769. return 0;
  770. }
  771. int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
  772. {
  773. hva_handler_fn handler;
  774. handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
  775. kvm_handle_hva_range(kvm, start, end, handler);
  776. return 0;
  777. }
  778. void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
  779. struct kvm_memory_slot *memslot)
  780. {
  781. unsigned long gfn;
  782. unsigned long n;
  783. unsigned long *rmapp;
  784. gfn = memslot->base_gfn;
  785. rmapp = memslot->arch.rmap;
  786. for (n = memslot->npages; n; --n, ++gfn) {
  787. if (kvm_is_radix(kvm)) {
  788. kvm_unmap_radix(kvm, memslot, gfn);
  789. continue;
  790. }
  791. /*
  792. * Testing the present bit without locking is OK because
  793. * the memslot has been marked invalid already, and hence
  794. * no new HPTEs referencing this page can be created,
  795. * thus the present bit can't go from 0 to 1.
  796. */
  797. if (*rmapp & KVMPPC_RMAP_PRESENT)
  798. kvm_unmap_rmapp(kvm, memslot, gfn);
  799. ++rmapp;
  800. }
  801. }
  802. static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
  803. unsigned long gfn)
  804. {
  805. struct revmap_entry *rev = kvm->arch.hpt.rev;
  806. unsigned long head, i, j;
  807. __be64 *hptep;
  808. int ret = 0;
  809. unsigned long *rmapp;
  810. rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  811. retry:
  812. lock_rmap(rmapp);
  813. if (*rmapp & KVMPPC_RMAP_REFERENCED) {
  814. *rmapp &= ~KVMPPC_RMAP_REFERENCED;
  815. ret = 1;
  816. }
  817. if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
  818. unlock_rmap(rmapp);
  819. return ret;
  820. }
  821. i = head = *rmapp & KVMPPC_RMAP_INDEX;
  822. do {
  823. hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
  824. j = rev[i].forw;
  825. /* If this HPTE isn't referenced, ignore it */
  826. if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
  827. continue;
  828. if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
  829. /* unlock rmap before spinning on the HPTE lock */
  830. unlock_rmap(rmapp);
  831. while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
  832. cpu_relax();
  833. goto retry;
  834. }
  835. /* Now check and modify the HPTE */
  836. if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
  837. (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
  838. kvmppc_clear_ref_hpte(kvm, hptep, i);
  839. if (!(rev[i].guest_rpte & HPTE_R_R)) {
  840. rev[i].guest_rpte |= HPTE_R_R;
  841. note_hpte_modification(kvm, &rev[i]);
  842. }
  843. ret = 1;
  844. }
  845. __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
  846. } while ((i = j) != head);
  847. unlock_rmap(rmapp);
  848. return ret;
  849. }
  850. int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
  851. {
  852. hva_handler_fn handler;
  853. handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
  854. return kvm_handle_hva_range(kvm, start, end, handler);
  855. }
  856. static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
  857. unsigned long gfn)
  858. {
  859. struct revmap_entry *rev = kvm->arch.hpt.rev;
  860. unsigned long head, i, j;
  861. unsigned long *hp;
  862. int ret = 1;
  863. unsigned long *rmapp;
  864. rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  865. if (*rmapp & KVMPPC_RMAP_REFERENCED)
  866. return 1;
  867. lock_rmap(rmapp);
  868. if (*rmapp & KVMPPC_RMAP_REFERENCED)
  869. goto out;
  870. if (*rmapp & KVMPPC_RMAP_PRESENT) {
  871. i = head = *rmapp & KVMPPC_RMAP_INDEX;
  872. do {
  873. hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
  874. j = rev[i].forw;
  875. if (be64_to_cpu(hp[1]) & HPTE_R_R)
  876. goto out;
  877. } while ((i = j) != head);
  878. }
  879. ret = 0;
  880. out:
  881. unlock_rmap(rmapp);
  882. return ret;
  883. }
  884. int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
  885. {
  886. hva_handler_fn handler;
  887. handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
  888. return kvm_handle_hva(kvm, hva, handler);
  889. }
  890. void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
  891. {
  892. hva_handler_fn handler;
  893. handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
  894. kvm_handle_hva(kvm, hva, handler);
  895. }
  896. static int vcpus_running(struct kvm *kvm)
  897. {
  898. return atomic_read(&kvm->arch.vcpus_running) != 0;
  899. }
  900. /*
  901. * Returns the number of system pages that are dirty.
  902. * This can be more than 1 if we find a huge-page HPTE.
  903. */
  904. static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
  905. {
  906. struct revmap_entry *rev = kvm->arch.hpt.rev;
  907. unsigned long head, i, j;
  908. unsigned long n;
  909. unsigned long v, r;
  910. __be64 *hptep;
  911. int npages_dirty = 0;
  912. retry:
  913. lock_rmap(rmapp);
  914. if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
  915. unlock_rmap(rmapp);
  916. return npages_dirty;
  917. }
  918. i = head = *rmapp & KVMPPC_RMAP_INDEX;
  919. do {
  920. unsigned long hptep1;
  921. hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
  922. j = rev[i].forw;
  923. /*
  924. * Checking the C (changed) bit here is racy since there
  925. * is no guarantee about when the hardware writes it back.
  926. * If the HPTE is not writable then it is stable since the
  927. * page can't be written to, and we would have done a tlbie
  928. * (which forces the hardware to complete any writeback)
  929. * when making the HPTE read-only.
  930. * If vcpus are running then this call is racy anyway
  931. * since the page could get dirtied subsequently, so we
  932. * expect there to be a further call which would pick up
  933. * any delayed C bit writeback.
  934. * Otherwise we need to do the tlbie even if C==0 in
  935. * order to pick up any delayed writeback of C.
  936. */
  937. hptep1 = be64_to_cpu(hptep[1]);
  938. if (!(hptep1 & HPTE_R_C) &&
  939. (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
  940. continue;
  941. if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
  942. /* unlock rmap before spinning on the HPTE lock */
  943. unlock_rmap(rmapp);
  944. while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
  945. cpu_relax();
  946. goto retry;
  947. }
  948. /* Now check and modify the HPTE */
  949. if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
  950. __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
  951. continue;
  952. }
  953. /* need to make it temporarily absent so C is stable */
  954. hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
  955. kvmppc_invalidate_hpte(kvm, hptep, i);
  956. v = be64_to_cpu(hptep[0]);
  957. r = be64_to_cpu(hptep[1]);
  958. if (r & HPTE_R_C) {
  959. hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
  960. if (!(rev[i].guest_rpte & HPTE_R_C)) {
  961. rev[i].guest_rpte |= HPTE_R_C;
  962. note_hpte_modification(kvm, &rev[i]);
  963. }
  964. n = kvmppc_actual_pgsz(v, r);
  965. n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
  966. if (n > npages_dirty)
  967. npages_dirty = n;
  968. eieio();
  969. }
  970. v &= ~HPTE_V_ABSENT;
  971. v |= HPTE_V_VALID;
  972. __unlock_hpte(hptep, v);
  973. } while ((i = j) != head);
  974. unlock_rmap(rmapp);
  975. return npages_dirty;
  976. }
  977. void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
  978. struct kvm_memory_slot *memslot,
  979. unsigned long *map)
  980. {
  981. unsigned long gfn;
  982. if (!vpa->dirty || !vpa->pinned_addr)
  983. return;
  984. gfn = vpa->gpa >> PAGE_SHIFT;
  985. if (gfn < memslot->base_gfn ||
  986. gfn >= memslot->base_gfn + memslot->npages)
  987. return;
  988. vpa->dirty = false;
  989. if (map)
  990. __set_bit_le(gfn - memslot->base_gfn, map);
  991. }
  992. long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
  993. struct kvm_memory_slot *memslot, unsigned long *map)
  994. {
  995. unsigned long i;
  996. unsigned long *rmapp;
  997. preempt_disable();
  998. rmapp = memslot->arch.rmap;
  999. for (i = 0; i < memslot->npages; ++i) {
  1000. int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
  1001. /*
  1002. * Note that if npages > 0 then i must be a multiple of npages,
  1003. * since we always put huge-page HPTEs in the rmap chain
  1004. * corresponding to their page base address.
  1005. */
  1006. if (npages)
  1007. set_dirty_bits(map, i, npages);
  1008. ++rmapp;
  1009. }
  1010. preempt_enable();
  1011. return 0;
  1012. }
  1013. void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
  1014. unsigned long *nb_ret)
  1015. {
  1016. struct kvm_memory_slot *memslot;
  1017. unsigned long gfn = gpa >> PAGE_SHIFT;
  1018. struct page *page, *pages[1];
  1019. int npages;
  1020. unsigned long hva, offset;
  1021. int srcu_idx;
  1022. srcu_idx = srcu_read_lock(&kvm->srcu);
  1023. memslot = gfn_to_memslot(kvm, gfn);
  1024. if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
  1025. goto err;
  1026. hva = gfn_to_hva_memslot(memslot, gfn);
  1027. npages = get_user_pages_fast(hva, 1, 1, pages);
  1028. if (npages < 1)
  1029. goto err;
  1030. page = pages[0];
  1031. srcu_read_unlock(&kvm->srcu, srcu_idx);
  1032. offset = gpa & (PAGE_SIZE - 1);
  1033. if (nb_ret)
  1034. *nb_ret = PAGE_SIZE - offset;
  1035. return page_address(page) + offset;
  1036. err:
  1037. srcu_read_unlock(&kvm->srcu, srcu_idx);
  1038. return NULL;
  1039. }
  1040. void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
  1041. bool dirty)
  1042. {
  1043. struct page *page = virt_to_page(va);
  1044. struct kvm_memory_slot *memslot;
  1045. unsigned long gfn;
  1046. int srcu_idx;
  1047. put_page(page);
  1048. if (!dirty)
  1049. return;
  1050. /* We need to mark this page dirty in the memslot dirty_bitmap, if any */
  1051. gfn = gpa >> PAGE_SHIFT;
  1052. srcu_idx = srcu_read_lock(&kvm->srcu);
  1053. memslot = gfn_to_memslot(kvm, gfn);
  1054. if (memslot && memslot->dirty_bitmap)
  1055. set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
  1056. srcu_read_unlock(&kvm->srcu, srcu_idx);
  1057. }
  1058. /*
  1059. * HPT resizing
  1060. */
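/*
 * Overview of the resize flow implemented below: the prepare ioctl
 * (kvm_vm_ioctl_resize_hpt_prepare) queues resize_hpt_prepare_work(),
 * which allocates a tentative HPT via resize_hpt_allocate().  Once the
 * allocation has completed, the commit ioctl stops the vcpus, rehashes
 * every old entry into the tentative HPT (resize_hpt_rehash()) and
 * swaps it in as the active HPT (resize_hpt_pivot()).
 */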
  1061. static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
  1062. {
  1063. int rc;
  1064. rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
  1065. if (rc < 0)
  1066. return rc;
  1067. resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
  1068. resize->hpt.virt);
  1069. return 0;
  1070. }
  1071. static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
  1072. unsigned long idx)
  1073. {
  1074. struct kvm *kvm = resize->kvm;
  1075. struct kvm_hpt_info *old = &kvm->arch.hpt;
  1076. struct kvm_hpt_info *new = &resize->hpt;
  1077. unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
  1078. unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
  1079. __be64 *hptep, *new_hptep;
  1080. unsigned long vpte, rpte, guest_rpte;
  1081. int ret;
  1082. struct revmap_entry *rev;
  1083. unsigned long apsize, avpn, pteg, hash;
  1084. unsigned long new_idx, new_pteg, replace_vpte;
  1085. int pshift;
  1086. hptep = (__be64 *)(old->virt + (idx << 4));
  1087. /* Guest is stopped, so new HPTEs can't be added or faulted
  1088. * in, only unmapped or altered by host actions. So, it's
  1089. * safe to check this before we take the HPTE lock */
  1090. vpte = be64_to_cpu(hptep[0]);
  1091. if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
  1092. return 0; /* nothing to do */
  1093. while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
  1094. cpu_relax();
  1095. vpte = be64_to_cpu(hptep[0]);
  1096. ret = 0;
  1097. if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
  1098. /* Nothing to do */
  1099. goto out;
  1100. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  1101. rpte = be64_to_cpu(hptep[1]);
  1102. vpte = hpte_new_to_old_v(vpte, rpte);
  1103. }
  1104. /* Unmap */
  1105. rev = &old->rev[idx];
  1106. guest_rpte = rev->guest_rpte;
  1107. ret = -EIO;
  1108. apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
  1109. if (!apsize)
  1110. goto out;
  1111. if (vpte & HPTE_V_VALID) {
  1112. unsigned long gfn = hpte_rpn(guest_rpte, apsize);
  1113. int srcu_idx = srcu_read_lock(&kvm->srcu);
  1114. struct kvm_memory_slot *memslot =
  1115. __gfn_to_memslot(kvm_memslots(kvm), gfn);
  1116. if (memslot) {
  1117. unsigned long *rmapp;
  1118. rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
  1119. lock_rmap(rmapp);
  1120. kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
  1121. unlock_rmap(rmapp);
  1122. }
  1123. srcu_read_unlock(&kvm->srcu, srcu_idx);
  1124. }
  1125. /* Reload PTE after unmap */
  1126. vpte = be64_to_cpu(hptep[0]);
  1127. BUG_ON(vpte & HPTE_V_VALID);
  1128. BUG_ON(!(vpte & HPTE_V_ABSENT));
  1129. ret = 0;
  1130. if (!(vpte & HPTE_V_BOLTED))
  1131. goto out;
  1132. rpte = be64_to_cpu(hptep[1]);
  1133. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  1134. vpte = hpte_new_to_old_v(vpte, rpte);
  1135. rpte = hpte_new_to_old_r(rpte);
  1136. }
  1137. pshift = kvmppc_hpte_base_page_shift(vpte, rpte);
  1138. avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23);
  1139. pteg = idx / HPTES_PER_GROUP;
  1140. if (vpte & HPTE_V_SECONDARY)
  1141. pteg = ~pteg;
  1142. if (!(vpte & HPTE_V_1TB_SEG)) {
  1143. unsigned long offset, vsid;
  1144. /* We only have 28 - 23 bits of offset in avpn */
  1145. offset = (avpn & 0x1f) << 23;
  1146. vsid = avpn >> 5;
  1147. /* We can find more bits from the pteg value */
  1148. if (pshift < 23)
  1149. offset |= ((vsid ^ pteg) & old_hash_mask) << pshift;
  1150. hash = vsid ^ (offset >> pshift);
  1151. } else {
  1152. unsigned long offset, vsid;
  1153. /* We only have 40 - 23 bits of seg_off in avpn */
  1154. offset = (avpn & 0x1ffff) << 23;
  1155. vsid = avpn >> 17;
  1156. if (pshift < 23)
  1157. offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) << pshift;
  1158. hash = vsid ^ (vsid << 25) ^ (offset >> pshift);
  1159. }
  1160. new_pteg = hash & new_hash_mask;
  1161. if (vpte & HPTE_V_SECONDARY)
  1162. new_pteg = ~hash & new_hash_mask;
  1163. new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
  1164. new_hptep = (__be64 *)(new->virt + (new_idx << 4));
  1165. replace_vpte = be64_to_cpu(new_hptep[0]);
  1166. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  1167. unsigned long replace_rpte = be64_to_cpu(new_hptep[1]);
  1168. replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte);
  1169. }
  1170. if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
  1171. BUG_ON(new->order >= old->order);
  1172. if (replace_vpte & HPTE_V_BOLTED) {
  1173. if (vpte & HPTE_V_BOLTED)
  1174. /* Bolted collision, nothing we can do */
  1175. ret = -ENOSPC;
  1176. /* Discard the new HPTE */
  1177. goto out;
  1178. }
  1179. /* Discard the previous HPTE */
  1180. }
  1181. if (cpu_has_feature(CPU_FTR_ARCH_300)) {
  1182. rpte = hpte_old_to_new_r(vpte, rpte);
  1183. vpte = hpte_old_to_new_v(vpte);
  1184. }
  1185. new_hptep[1] = cpu_to_be64(rpte);
  1186. new->rev[new_idx].guest_rpte = guest_rpte;
  1187. /* No need for a barrier, since new HPT isn't active */
  1188. new_hptep[0] = cpu_to_be64(vpte);
  1189. unlock_hpte(new_hptep, vpte);
  1190. out:
  1191. unlock_hpte(hptep, vpte);
  1192. return ret;
  1193. }
  1194. static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
  1195. {
  1196. struct kvm *kvm = resize->kvm;
  1197. unsigned long i;
  1198. int rc;
  1199. for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
  1200. rc = resize_hpt_rehash_hpte(resize, i);
  1201. if (rc != 0)
  1202. return rc;
  1203. }
  1204. return 0;
  1205. }
  1206. static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
  1207. {
  1208. struct kvm *kvm = resize->kvm;
  1209. struct kvm_hpt_info hpt_tmp;
  1210. /* Exchange the pending tables in the resize structure with
  1211. * the active tables */
  1212. resize_hpt_debug(resize, "resize_hpt_pivot()\n");
  1213. spin_lock(&kvm->mmu_lock);
  1214. asm volatile("ptesync" : : : "memory");
  1215. hpt_tmp = kvm->arch.hpt;
  1216. kvmppc_set_hpt(kvm, &resize->hpt);
  1217. resize->hpt = hpt_tmp;
  1218. spin_unlock(&kvm->mmu_lock);
  1219. synchronize_srcu_expedited(&kvm->srcu);
  1220. if (cpu_has_feature(CPU_FTR_ARCH_300))
  1221. kvmppc_setup_partition_table(kvm);
  1222. resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
  1223. }
  1224. static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
  1225. {
  1226. if (WARN_ON(!mutex_is_locked(&kvm->lock)))
  1227. return;
  1228. if (!resize)
  1229. return;
  1230. if (resize->error != -EBUSY) {
  1231. if (resize->hpt.virt)
  1232. kvmppc_free_hpt(&resize->hpt);
  1233. kfree(resize);
  1234. }
  1235. if (kvm->arch.resize_hpt == resize)
  1236. kvm->arch.resize_hpt = NULL;
  1237. }
  1238. static void resize_hpt_prepare_work(struct work_struct *work)
  1239. {
  1240. struct kvm_resize_hpt *resize = container_of(work,
  1241. struct kvm_resize_hpt,
  1242. work);
  1243. struct kvm *kvm = resize->kvm;
  1244. int err = 0;
  1245. if (WARN_ON(resize->error != -EBUSY))
  1246. return;
  1247. mutex_lock(&kvm->lock);
  1248. /* Request is still current? */
  1249. if (kvm->arch.resize_hpt == resize) {
  1250. /* We may request large allocations here:
  1252. * so do not sleep for a long time with kvm->lock held.
  1252. */
  1253. mutex_unlock(&kvm->lock);
  1254. resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
  1255. resize->order);
  1256. err = resize_hpt_allocate(resize);
  1257. /* We have a strict assumption about -EBUSY
  1258. * when preparing for HPT resize.
  1259. */
  1260. if (WARN_ON(err == -EBUSY))
  1261. err = -EINPROGRESS;
  1262. mutex_lock(&kvm->lock);
  1263. /* It is possible that kvm->arch.resize_hpt != resize
  1264. * after we grab kvm->lock again.
  1265. */
  1266. }
  1267. resize->error = err;
  1268. if (kvm->arch.resize_hpt != resize)
  1269. resize_hpt_release(kvm, resize);
  1270. mutex_unlock(&kvm->lock);
  1271. }
  1272. long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
  1273. struct kvm_ppc_resize_hpt *rhpt)
  1274. {
  1275. unsigned long flags = rhpt->flags;
  1276. unsigned long shift = rhpt->shift;
  1277. struct kvm_resize_hpt *resize;
  1278. int ret;
  1279. if (flags != 0 || kvm_is_radix(kvm))
  1280. return -EINVAL;
  1281. if (shift && ((shift < 18) || (shift > 46)))
  1282. return -EINVAL;
  1283. mutex_lock(&kvm->lock);
  1284. resize = kvm->arch.resize_hpt;
  1285. if (resize) {
  1286. if (resize->order == shift) {
  1287. /* Suitable resize in progress? */
  1288. ret = resize->error;
  1289. if (ret == -EBUSY)
  1290. ret = 100; /* estimated time in ms */
  1291. else if (ret)
  1292. resize_hpt_release(kvm, resize);
  1293. goto out;
  1294. }
  1295. /* not suitable, cancel it */
  1296. resize_hpt_release(kvm, resize);
  1297. }
  1298. ret = 0;
  1299. if (!shift)
  1300. goto out; /* nothing to do */
  1301. /* start new resize */
  1302. resize = kzalloc(sizeof(*resize), GFP_KERNEL);
  1303. if (!resize) {
  1304. ret = -ENOMEM;
  1305. goto out;
  1306. }
  1307. resize->error = -EBUSY;
  1308. resize->order = shift;
  1309. resize->kvm = kvm;
  1310. INIT_WORK(&resize->work, resize_hpt_prepare_work);
  1311. kvm->arch.resize_hpt = resize;
  1312. schedule_work(&resize->work);
  1313. ret = 100; /* estimated time in ms */
  1314. out:
  1315. mutex_unlock(&kvm->lock);
  1316. return ret;
  1317. }
  1318. static void resize_hpt_boot_vcpu(void *opaque)
  1319. {
  1320. /* Nothing to do, just force a KVM exit */
  1321. }
  1322. long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
  1323. struct kvm_ppc_resize_hpt *rhpt)
  1324. {
  1325. unsigned long flags = rhpt->flags;
  1326. unsigned long shift = rhpt->shift;
  1327. struct kvm_resize_hpt *resize;
  1328. long ret;
  1329. if (flags != 0 || kvm_is_radix(kvm))
  1330. return -EINVAL;
  1331. if (shift && ((shift < 18) || (shift > 46)))
  1332. return -EINVAL;
  1333. mutex_lock(&kvm->lock);
  1334. resize = kvm->arch.resize_hpt;
  1335. /* This shouldn't be possible */
  1336. ret = -EIO;
  1337. if (WARN_ON(!kvm->arch.mmu_ready))
  1338. goto out_no_hpt;
  1339. /* Stop VCPUs from running while we mess with the HPT */
  1340. kvm->arch.mmu_ready = 0;
  1341. smp_mb();
  1342. /* Boot all CPUs out of the guest so they re-read
  1343. * mmu_ready */
  1344. on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
  1345. ret = -ENXIO;
  1346. if (!resize || (resize->order != shift))
  1347. goto out;
  1348. ret = resize->error;
  1349. if (ret)
  1350. goto out;
  1351. ret = resize_hpt_rehash(resize);
  1352. if (ret)
  1353. goto out;
  1354. resize_hpt_pivot(resize);
  1355. out:
  1356. /* Let VCPUs run again */
  1357. kvm->arch.mmu_ready = 1;
  1358. smp_mb();
  1359. out_no_hpt:
  1360. resize_hpt_release(kvm, resize);
  1361. mutex_unlock(&kvm->lock);
  1362. return ret;
  1363. }
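/*
 * Minimal userspace sketch of driving the two resize ioctls above (an
 * illustration only; it assumes the KVM_PPC_RESIZE_HPT_PREPARE/COMMIT
 * ioctl definitions and the struct kvm_ppc_resize_hpt layout from the
 * uapi headers, and omits error handling).  A positive return from the
 * prepare ioctl is the estimated time in ms until the allocation is
 * ready, so the caller sleeps and retries while it stays positive:
 *
 *	struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = 27 };
 *	long ret;
 *
 *	do {
 *		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
 *		if (ret > 0)
 *			usleep(ret * 1000);
 *	} while (ret > 0);
 *	if (ret == 0)
 *		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
 */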
/*
 * Functions for reading and writing the hash table via reads and
 * writes on a file descriptor.
 *
 * Reads return the guest view of the hash table, which has to be
 * pieced together from the real hash table and the guest_rpte
 * values in the revmap array.
 *
 * On writes, each HPTE written is considered in turn, and if it
 * is valid, it is written to the HPT as if an H_ENTER with the
 * exact flag set was done.  When the invalid count is non-zero
 * in the header written to the stream, the kernel will make
 * sure that that many HPTEs are invalid, and invalidate them
 * if not.
 */

struct kvm_htab_ctx {
	unsigned long index;
	unsigned long flags;
	struct kvm *kvm;
	int first_pass;
};

#define HPTE_SIZE	(2 * sizeof(unsigned long))
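/*
 * Illustrative helper, added here for exposition only and not used by the
 * code below: each record in the stream described above is a
 * struct kvm_get_htab_header followed by n_valid HPTE images of HPTE_SIZE
 * bytes each (n_invalid entries carry no data), so one record occupies:
 */
static inline unsigned long kvm_htab_record_bytes(const struct kvm_get_htab_header *hdr)
{
	/* header plus one HPTE_SIZE image per valid entry */
	return sizeof(*hdr) + (unsigned long)hdr->n_valid * HPTE_SIZE;
}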
/*
 * Returns 1 if this HPT entry has been modified or has pending
 * R/C bit changes.
 */
static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
{
	unsigned long rcbits_unset;

	if (revp->guest_rpte & HPTE_GR_MODIFIED)
		return 1;

	/* Also need to consider changes in reference and changed bits */
	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
	    (be64_to_cpu(hptp[1]) & rcbits_unset))
		return 1;

	return 0;
}

static long record_hpte(unsigned long flags, __be64 *hptp,
			unsigned long *hpte, struct revmap_entry *revp,
			int want_valid, int first_pass)
{
	unsigned long v, r, hr;
	unsigned long rcbits_unset;
	int ok = 1;
	int valid, dirty;

	/* Unmodified entries are uninteresting except on the first pass */
	dirty = hpte_dirty(revp, hptp);
	if (!first_pass && !dirty)
		return 0;

	valid = 0;
	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
		valid = 1;
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
			valid = 0;
	}
	if (valid != want_valid)
		return 0;

	v = r = 0;
	if (valid || dirty) {
		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]);
		hr = be64_to_cpu(hptp[1]);
		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			v = hpte_new_to_old_v(v, hr);
			hr = hpte_new_to_old_r(hr);
		}

		/* re-evaluate valid and dirty from synchronized HPTE value */
		valid = !!(v & HPTE_V_VALID);
		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);

		/* Harvest R and C into guest view if necessary */
		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
		if (valid && (rcbits_unset & hr)) {
			revp->guest_rpte |= (hr &
				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
			dirty = 1;
		}

		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
			valid = 1;
		}
		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
			valid = 0;

		r = revp->guest_rpte;
		/* only clear modified if this is the right sort of entry */
		if (valid == want_valid && dirty) {
			r &= ~HPTE_GR_MODIFIED;
			revp->guest_rpte = r;
		}
		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
		preempt_enable();
		if (!(valid == want_valid && (first_pass || dirty)))
			ok = 0;
	}
	hpte[0] = cpu_to_be64(v);
	hpte[1] = cpu_to_be64(r);
	return ok;
}
static ssize_t kvm_htab_read(struct file *file, char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	__be64 *hptp;
	struct revmap_entry *revp;
	unsigned long i, nb, nw;
	unsigned long __user *lbuf;
	struct kvm_get_htab_header __user *hptr;
	unsigned long flags;
	int first_pass;
	unsigned long hpte[2];

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;
	if (kvm_is_radix(kvm))
		return 0;

	first_pass = ctx->first_pass;
	flags = ctx->flags;

	i = ctx->index;
	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
	revp = kvm->arch.hpt.rev + i;
	lbuf = (unsigned long __user *)buf;

	nb = 0;
	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
		/* Initialize header */
		hptr = (struct kvm_get_htab_header __user *)buf;
		hdr.n_valid = 0;
		hdr.n_invalid = 0;
		nw = nb;
		nb += sizeof(hdr);
		lbuf = (unsigned long __user *)(buf + sizeof(hdr));

		/* Skip uninteresting entries, i.e. clean on not-first pass */
		if (!first_pass) {
			while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
			       !hpte_dirty(revp, hptp)) {
				++i;
				hptp += 2;
				++revp;
			}
		}
		hdr.index = i;

		/* Grab a series of valid entries */
		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
		       hdr.n_valid < 0xffff &&
		       nb + HPTE_SIZE < count &&
		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
			/* valid entry, write it out */
			++hdr.n_valid;
			if (__put_user(hpte[0], lbuf) ||
			    __put_user(hpte[1], lbuf + 1))
				return -EFAULT;
			nb += HPTE_SIZE;
			lbuf += 2;
			++i;
			hptp += 2;
			++revp;
		}
		/* Now skip invalid entries while we can */
		while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
		       hdr.n_invalid < 0xffff &&
		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
			/* found an invalid entry */
			++hdr.n_invalid;
			++i;
			hptp += 2;
			++revp;
		}

		if (hdr.n_valid || hdr.n_invalid) {
			/* write back the header */
			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
				return -EFAULT;
			nw = nb;
			buf = (char __user *)lbuf;
		} else {
			nb = nw;
		}

		/* Check if we've wrapped around the hash table */
		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
			i = 0;
			ctx->first_pass = 0;
			break;
		}
	}

	ctx->index = i;

	return nb;
}
static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct kvm_htab_ctx *ctx = file->private_data;
	struct kvm *kvm = ctx->kvm;
	struct kvm_get_htab_header hdr;
	unsigned long i, j;
	unsigned long v, r;
	unsigned long __user *lbuf;
	__be64 *hptp;
	unsigned long tmp[2];
	ssize_t nb;
	long int err, ret;
	int mmu_ready;
	int pshift;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;
	if (kvm_is_radix(kvm))
		return -EINVAL;

	/* lock out vcpus from running while we're doing this */
	mutex_lock(&kvm->lock);
	mmu_ready = kvm->arch.mmu_ready;
	if (mmu_ready) {
		kvm->arch.mmu_ready = 0;	/* temporarily */
		/* order mmu_ready vs. vcpus_running */
		smp_mb();
		if (atomic_read(&kvm->arch.vcpus_running)) {
			kvm->arch.mmu_ready = 1;
			mutex_unlock(&kvm->lock);
			return -EBUSY;
		}
	}

	err = 0;
	for (nb = 0; nb + sizeof(hdr) <= count; ) {
		err = -EFAULT;
		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
			break;

		err = 0;
		if (nb + hdr.n_valid * HPTE_SIZE > count)
			break;

		nb += sizeof(hdr);
		buf += sizeof(hdr);

		err = -EINVAL;
		i = hdr.index;
		if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
		    i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
			break;

		hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
		lbuf = (unsigned long __user *)buf;
		for (j = 0; j < hdr.n_valid; ++j) {
			__be64 hpte_v;
			__be64 hpte_r;

			err = -EFAULT;
			if (__get_user(hpte_v, lbuf) ||
			    __get_user(hpte_r, lbuf + 1))
				goto out;
			v = be64_to_cpu(hpte_v);
			r = be64_to_cpu(hpte_r);
			err = -EINVAL;
			if (!(v & HPTE_V_VALID))
				goto out;
			pshift = kvmppc_hpte_base_page_shift(v, r);
			if (pshift <= 0)
				goto out;
			lbuf += 2;
			nb += HPTE_SIZE;

			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			err = -EIO;
			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
							 tmp);
			if (ret != H_SUCCESS) {
				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
				       "r=%lx\n", ret, i, v, r);
				goto out;
			}
			if (!mmu_ready && is_vrma_hpte(v)) {
				unsigned long senc, lpcr;

				senc = slb_pgsize_encoding(1ul << pshift);
				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
					(VRMA_VSID << SLB_VSID_SHIFT_1T);
				if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
					lpcr = senc << (LPCR_VRMASD_SH - 4);
					kvmppc_update_lpcr(kvm, lpcr,
							   LPCR_VRMASD);
				} else {
					kvmppc_setup_partition_table(kvm);
				}
				mmu_ready = 1;
			}
			++i;
			hptp += 2;
		}

		for (j = 0; j < hdr.n_invalid; ++j) {
			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
			++i;
			hptp += 2;
		}
		err = 0;
	}

out:
	/* Order HPTE updates vs. mmu_ready */
	smp_wmb();
	kvm->arch.mmu_ready = mmu_ready;
	mutex_unlock(&kvm->lock);

	if (err)
		return err;
	return nb;
}
static int kvm_htab_release(struct inode *inode, struct file *filp)
{
	struct kvm_htab_ctx *ctx = filp->private_data;

	filp->private_data = NULL;
	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
	kvm_put_kvm(ctx->kvm);
	kfree(ctx);
	return 0;
}

static const struct file_operations kvm_htab_fops = {
	.read		= kvm_htab_read,
	.write		= kvm_htab_write,
	.llseek		= default_llseek,
	.release	= kvm_htab_release,
};
int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
{
	int ret;
	struct kvm_htab_ctx *ctx;
	int rwflag;

	/* reject flags we don't recognize */
	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
		return -EINVAL;
	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	kvm_get_kvm(kvm);
	ctx->kvm = kvm;
	ctx->index = ghf->start_index;
	ctx->flags = ghf->flags;
	ctx->first_pass = 1;

	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
	if (ret < 0) {
		kfree(ctx);
		kvm_put_kvm(kvm);
		return ret;
	}

	if (rwflag == O_RDONLY) {
		mutex_lock(&kvm->slots_lock);
		atomic_inc(&kvm->arch.hpte_mod_interest);
		/* make sure kvmppc_do_h_enter etc. see the increment */
		synchronize_srcu_expedited(&kvm->srcu);
		mutex_unlock(&kvm->slots_lock);
	}

	return ret;
}
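/*
 * Illustrative userspace sketch, not part of this file (vm_fd and
 * dump_hpte() are hypothetical): obtain the fd via the KVM_PPC_GET_HTAB_FD
 * ioctl and walk one buffer's worth of the record stream that
 * kvm_htab_read() produces, where each record is a kvm_get_htab_header
 * followed by n_valid 16-byte HPTE images.
 *
 *	struct kvm_get_htab_fd ghf = { .flags = 0, .start_index = 0 };
 *	int fd = ioctl(vm_fd, KVM_PPC_GET_HTAB_FD, &ghf);
 *	char buf[65536];
 *	ssize_t nb = read(fd, buf, sizeof(buf));
 *	char *p = buf;
 *
 *	while (nb > 0 && p + sizeof(struct kvm_get_htab_header) <= buf + nb) {
 *		struct kvm_get_htab_header *hdr = (void *)p;
 *		__u64 *hpte = (__u64 *)(p + sizeof(*hdr));
 *		int k;
 *
 *		for (k = 0; k < hdr->n_valid; ++k)
 *			dump_hpte(hdr->index + k, hpte[2 * k], hpte[2 * k + 1]);
 *		p += sizeof(*hdr) + hdr->n_valid * 16;
 *	}
 */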
struct debugfs_htab_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	hpt_index;
	int		chars_left;
	int		buf_index;
	char		buf[64];
};

static int debugfs_htab_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_htab_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_htab_release(struct inode *inode, struct file *file)
{
	struct debugfs_htab_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}
static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
				 size_t len, loff_t *ppos)
{
	struct debugfs_htab_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long i, n;
	unsigned long v, hr, gr;
	struct kvm *kvm;
	__be64 *hptp;

	kvm = p->kvm;
	if (kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	i = p->hpt_index;
	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
	for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
	     ++i, hptp += 2) {
		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		/* lock the HPTE so it's stable and read it */
		preempt_disable();
		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
			cpu_relax();
		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
		hr = be64_to_cpu(hptp[1]);
		gr = kvm->arch.hpt.rev[i].guest_rpte;
		unlock_hpte(hptp, v);
		preempt_enable();

		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
			continue;

		n = scnprintf(p->buf, sizeof(p->buf),
			      "%6lx %.16lx %.16lx %.16lx\n",
			      i, v, hr, gr);
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			goto out;
		}
	}
	p->hpt_index = i;

out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
				  size_t len, loff_t *ppos)
{
	return -EACCES;
}
static const struct file_operations debugfs_htab_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_htab_open,
	.release = debugfs_htab_release,
	.read	 = debugfs_htab_read,
	.write	 = debugfs_htab_write,
	.llseek	 = generic_file_llseek,
};

void kvmppc_mmu_debugfs_init(struct kvm *kvm)
{
	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
						    kvm->arch.debugfs_dir, kvm,
						    &debugfs_htab_fops);
}

void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;

	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */

	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;

	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}