gmap.c

// SPDX-License-Identifier: GPL-2.0
/*
 *  KVM guest address space mapping code
 *
 *    Copyright IBM Corp. 2007, 2020
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 *               David Hildenbrand <david@redhat.com>
 *               Janosch Frank <frankja@linux.vnet.ibm.com>
 */

#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/pgtable.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/page.h>
#include <asm/tlb.h>

#define GMAP_SHADOW_FAKE_TABLE 1ULL

static struct page *gmap_alloc_crst(void)
{
        struct page *page;

        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
        if (!page)
                return NULL;
        __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER);
        return page;
}

/**
 * gmap_alloc - allocate and initialize a guest address space
 * @limit: maximum address of the gmap address space
 *
 * Returns a guest address space structure.
 */
static struct gmap *gmap_alloc(unsigned long limit)
{
        struct gmap *gmap;
        struct page *page;
        unsigned long *table;
        unsigned long etype, atype;

        if (limit < _REGION3_SIZE) {
                limit = _REGION3_SIZE - 1;
                atype = _ASCE_TYPE_SEGMENT;
                etype = _SEGMENT_ENTRY_EMPTY;
        } else if (limit < _REGION2_SIZE) {
                limit = _REGION2_SIZE - 1;
                atype = _ASCE_TYPE_REGION3;
                etype = _REGION3_ENTRY_EMPTY;
        } else if (limit < _REGION1_SIZE) {
                limit = _REGION1_SIZE - 1;
                atype = _ASCE_TYPE_REGION2;
                etype = _REGION2_ENTRY_EMPTY;
        } else {
                limit = -1UL;
                atype = _ASCE_TYPE_REGION1;
                etype = _REGION1_ENTRY_EMPTY;
        }
        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
        INIT_LIST_HEAD(&gmap->children);
        INIT_LIST_HEAD(&gmap->pt_list);
        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
        INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
        spin_lock_init(&gmap->guest_table_lock);
        spin_lock_init(&gmap->shadow_lock);
        refcount_set(&gmap->ref_count, 1);
        page = gmap_alloc_crst();
        if (!page)
                goto out_free;
        page->index = 0;
        list_add(&page->lru, &gmap->crst_list);
        table = page_to_virt(page);
        crst_table_init(table, etype);
        gmap->table = table;
        gmap->asce = atype | _ASCE_TABLE_LENGTH |
                _ASCE_USER_BITS | __pa(table);
        gmap->asce_end = limit;
        return gmap;

out_free:
        kfree(gmap);
out:
        return NULL;
}

/**
 * gmap_create - create a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
{
        struct gmap *gmap;
        unsigned long gmap_asce;

        gmap = gmap_alloc(limit);
        if (!gmap)
                return NULL;
        gmap->mm = mm;
        spin_lock(&mm->context.lock);
        list_add_rcu(&gmap->list, &mm->context.gmap_list);
        if (list_is_singular(&mm->context.gmap_list))
                gmap_asce = gmap->asce;
        else
                gmap_asce = -1UL;
        WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
        spin_unlock(&mm->context.lock);
        return gmap;
}
EXPORT_SYMBOL_GPL(gmap_create);
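
/*
 * Example (illustrative sketch, not taken from any in-tree caller): a
 * hypervisor such as KVM typically creates a gmap for the current process
 * and tears it down again roughly like this. The 4 TB limit is an example
 * value only.
 *
 *	struct gmap *g;
 *
 *	g = gmap_create(current->mm, (1UL << 42) - 1);
 *	if (!g)
 *		return -ENOMEM;
 *	...
 *	gmap_remove(g);		(drops the creator's reference, see below)
 */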

static void gmap_flush_tlb(struct gmap *gmap)
{
        if (MACHINE_HAS_IDTE)
                __tlb_flush_idte(gmap->asce);
        else
                __tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
        struct radix_tree_iter iter;
        unsigned long indices[16];
        unsigned long index;
        void __rcu **slot;
        int i, nr;

        /* A radix tree is freed by deleting all of its entries */
        index = 0;
        do {
                nr = 0;
                radix_tree_for_each_slot(slot, root, &iter, index) {
                        indices[nr] = iter.index;
                        if (++nr == 16)
                                break;
                }
                for (i = 0; i < nr; i++) {
                        index = indices[i];
                        radix_tree_delete(root, index);
                }
        } while (nr > 0);
}

static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
{
        struct gmap_rmap *rmap, *rnext, *head;
        struct radix_tree_iter iter;
        unsigned long indices[16];
        unsigned long index;
        void __rcu **slot;
        int i, nr;

        /* A radix tree is freed by deleting all of its entries */
        index = 0;
        do {
                nr = 0;
                radix_tree_for_each_slot(slot, root, &iter, index) {
                        indices[nr] = iter.index;
                        if (++nr == 16)
                                break;
                }
                for (i = 0; i < nr; i++) {
                        index = indices[i];
                        head = radix_tree_delete(root, index);
                        gmap_for_each_rmap_safe(rmap, rnext, head)
                                kfree(rmap);
                }
        } while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 *
 * No locks required. There are no references to this gmap anymore.
 */
static void gmap_free(struct gmap *gmap)
{
        struct page *page, *next;

        /* Flush tlb of all gmaps (if not already done for shadows) */
        if (!(gmap_is_shadow(gmap) && gmap->removed))
                gmap_flush_tlb(gmap);
        /* Free all segment & region tables. */
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
                __free_pages(page, CRST_ALLOC_ORDER);
        gmap_radix_tree_free(&gmap->guest_to_host);
        gmap_radix_tree_free(&gmap->host_to_guest);

        /* Free additional data for a shadow gmap */
        if (gmap_is_shadow(gmap)) {
                struct ptdesc *ptdesc, *n;

                /* Free all page tables. */
                list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list)
                        page_table_free_pgste(ptdesc);
                gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
                /* Release reference to the parent */
                gmap_put(gmap->parent);
        }

        kfree(gmap);
}

/**
 * gmap_get - increase reference counter for guest address space
 * @gmap: pointer to the guest address space structure
 *
 * Returns the gmap pointer
 */
struct gmap *gmap_get(struct gmap *gmap)
{
        refcount_inc(&gmap->ref_count);
        return gmap;
}
EXPORT_SYMBOL_GPL(gmap_get);

/**
 * gmap_put - decrease reference counter for guest address space
 * @gmap: pointer to the guest address space structure
 *
 * If the reference counter reaches zero the guest address space is freed.
 */
void gmap_put(struct gmap *gmap)
{
        if (refcount_dec_and_test(&gmap->ref_count))
                gmap_free(gmap);
}
EXPORT_SYMBOL_GPL(gmap_put);

/**
 * gmap_remove - remove a guest address space but do not free it yet
 * @gmap: pointer to the guest address space structure
 */
void gmap_remove(struct gmap *gmap)
{
        struct gmap *sg, *next;
        unsigned long gmap_asce;

        /* Remove all shadow gmaps linked to this gmap */
        if (!list_empty(&gmap->children)) {
                spin_lock(&gmap->shadow_lock);
                list_for_each_entry_safe(sg, next, &gmap->children, list) {
                        list_del(&sg->list);
                        gmap_put(sg);
                }
                spin_unlock(&gmap->shadow_lock);
        }
        /* Remove gmap from the per-mm list */
        spin_lock(&gmap->mm->context.lock);
        list_del_rcu(&gmap->list);
        if (list_empty(&gmap->mm->context.gmap_list))
                gmap_asce = 0;
        else if (list_is_singular(&gmap->mm->context.gmap_list))
                gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
                                             struct gmap, list)->asce;
        else
                gmap_asce = -1UL;
        WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
        spin_unlock(&gmap->mm->context.lock);
        synchronize_rcu();
        /* Put reference */
        gmap_put(gmap);
}
EXPORT_SYMBOL_GPL(gmap_remove);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
        get_lowcore()->gmap = (unsigned long)gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
        get_lowcore()->gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/**
 * gmap_get_enabled - get a pointer to the currently enabled gmap
 *
 * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
 */
struct gmap *gmap_get_enabled(void)
{
        return (struct gmap *)get_lowcore()->gmap;
}
EXPORT_SYMBOL_GPL(gmap_get_enabled);

/*
 * gmap_alloc_table is assumed to be called with mmap_lock held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
                            unsigned long init, unsigned long gaddr)
{
        struct page *page;
        unsigned long *new;

        /* since we don't free the gmap table until gmap_free we can unlock */
        page = gmap_alloc_crst();
        if (!page)
                return -ENOMEM;
        new = page_to_virt(page);
        crst_table_init(new, init);
        spin_lock(&gmap->guest_table_lock);
        if (*table & _REGION_ENTRY_INVALID) {
                list_add(&page->lru, &gmap->crst_list);
                *table = __pa(new) | _REGION_ENTRY_LENGTH |
                        (*table & _REGION_ENTRY_TYPE_MASK);
                page->index = gaddr;
                page = NULL;
        }
        spin_unlock(&gmap->guest_table_lock);
        if (page)
                __free_pages(page, CRST_ALLOC_ORDER);
        return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
        struct page *page;
        unsigned long offset;

        offset = (unsigned long) entry / sizeof(unsigned long);
        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
        page = pmd_pgtable_page((pmd_t *) entry);
        return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
        unsigned long *entry;
        int flush = 0;

        BUG_ON(gmap_is_shadow(gmap));
        spin_lock(&gmap->guest_table_lock);
        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
        if (entry) {
                flush = (*entry != _SEGMENT_ENTRY_EMPTY);
                *entry = _SEGMENT_ENTRY_EMPTY;
        }
        spin_unlock(&gmap->guest_table_lock);
        return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
                                                   gaddr >> PMD_SHIFT);
        return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
        unsigned long off;
        int flush;

        BUG_ON(gmap_is_shadow(gmap));
        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
                return -EINVAL;

        flush = 0;
        mmap_write_lock(gmap->mm);
        for (off = 0; off < len; off += PMD_SIZE)
                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
        mmap_write_unlock(gmap->mm);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len)
{
        unsigned long off;
        int flush;

        BUG_ON(gmap_is_shadow(gmap));
        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len < from || to + len < to ||
            from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
                return -EINVAL;

        flush = 0;
        mmap_write_lock(gmap->mm);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Remove old translation */
                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
                /* Store new translation */
                if (radix_tree_insert(&gmap->guest_to_host,
                                      (to + off) >> PMD_SHIFT,
                                      (void *) from + off))
                        break;
        }
        mmap_write_unlock(gmap->mm);
        if (flush)
                gmap_flush_tlb(gmap);
        if (off >= len)
                return 0;
        gmap_unmap_segment(gmap, to, len);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
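
/*
 * Example (illustrative sketch): mapping 1 MB of the parent address space
 * at guest address 0 and removing the mapping again. "vm_start" is a
 * hypothetical, 1 MB aligned host address; all values are examples only.
 *
 *	if (gmap_map_segment(gmap, vm_start, 0x0UL, 0x100000UL))
 *		return -EFAULT;
 *	...
 *	gmap_unmap_segment(gmap, 0x0UL, 0x100000UL);
 */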

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 *
 * Note: Can also be called for shadow gmaps.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long vmaddr;

        vmaddr = (unsigned long)
                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
        /* Note: guest_to_host is empty for a shadow gmap */
        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
        unsigned long rc;

        mmap_read_lock(gmap->mm);
        rc = __gmap_translate(gmap, gaddr);
        mmap_read_unlock(gmap->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
void gmap_unlink(struct mm_struct *mm, unsigned long *table,
                 unsigned long vmaddr)
{
        struct gmap *gmap;
        int flush;

        rcu_read_lock();
        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
                if (flush)
                        gmap_flush_tlb(gmap);
        }
        rcu_read_unlock();
}

static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
                           unsigned long gaddr);

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_lock of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
        struct mm_struct *mm;
        unsigned long *table;
        spinlock_t *ptl;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        u64 unprot;
        int rc;

        BUG_ON(gmap_is_shadow(gmap));
        /* Create higher level tables in the gmap page table */
        table = gmap->table;
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
                                     gaddr & _REGION1_MASK))
                        return -ENOMEM;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
                                     gaddr & _REGION2_MASK))
                        return -ENOMEM;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                if ((*table & _REGION_ENTRY_INVALID) &&
                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
                                     gaddr & _REGION3_MASK))
                        return -ENOMEM;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
        }
        table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
        /* Walk the parent mm page table */
        mm = gmap->mm;
        pgd = pgd_offset(mm, vmaddr);
        VM_BUG_ON(pgd_none(*pgd));
        p4d = p4d_offset(pgd, vmaddr);
        VM_BUG_ON(p4d_none(*p4d));
        pud = pud_offset(p4d, vmaddr);
        VM_BUG_ON(pud_none(*pud));
        /* large puds cannot yet be handled */
        if (pud_leaf(*pud))
                return -EFAULT;
        pmd = pmd_offset(pud, vmaddr);
        VM_BUG_ON(pmd_none(*pmd));
        /* Are we allowed to use huge pages? */
        if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
                return -EFAULT;
        /* Link gmap segment table entry location to page table. */
        rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
        if (rc)
                return rc;
        ptl = pmd_lock(mm, pmd);
        spin_lock(&gmap->guest_table_lock);
        if (*table == _SEGMENT_ENTRY_EMPTY) {
                rc = radix_tree_insert(&gmap->host_to_guest,
                                       vmaddr >> PMD_SHIFT, table);
                if (!rc) {
                        if (pmd_leaf(*pmd)) {
                                *table = (pmd_val(*pmd) &
                                          _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
                                        | _SEGMENT_ENTRY_GMAP_UC;
                        } else
                                *table = pmd_val(*pmd) &
                                        _SEGMENT_ENTRY_HARDWARE_BITS;
                }
        } else if (*table & _SEGMENT_ENTRY_PROTECT &&
                   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
                unprot = (u64)*table;
                unprot &= ~_SEGMENT_ENTRY_PROTECT;
                unprot |= _SEGMENT_ENTRY_GMAP_UC;
                gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
        }
        spin_unlock(&gmap->guest_table_lock);
        spin_unlock(ptl);
        radix_tree_preload_end();
        return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
               unsigned int fault_flags)
{
        unsigned long vmaddr;
        int rc;
        bool unlocked;

        mmap_read_lock(gmap->mm);

retry:
        unlocked = false;
        vmaddr = __gmap_translate(gmap, gaddr);
        if (IS_ERR_VALUE(vmaddr)) {
                rc = vmaddr;
                goto out_up;
        }
        if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
                             &unlocked)) {
                rc = -EFAULT;
                goto out_up;
        }
        /*
         * In the case that fixup_user_fault unlocked the mmap_lock during
         * fault-in, redo __gmap_translate to avoid racing with a
         * map/unmap_segment.
         */
        if (unlocked)
                goto retry;

        rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
        mmap_read_unlock(gmap->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
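
/*
 * Example (illustrative sketch): resolving a guest write fault by faulting
 * the backing page in and linking it into the gmap. "gaddr" stands for a
 * guest address taken from the fault information.
 *
 *	rc = gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE);
 *	if (rc == -EFAULT)
 *		... no valid mapping, report an addressing exception ...
 *	else if (rc)
 *		... -ENOMEM, retry later or fail the request ...
 */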

/*
 * this function is assumed to be called with mmap_lock held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
        struct vm_area_struct *vma;
        unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;

        /* Find the vm address for the guest address */
        vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
                                                   gaddr >> PMD_SHIFT);
        if (vmaddr) {
                vmaddr |= gaddr & ~PMD_MASK;

                vma = vma_lookup(gmap->mm, vmaddr);
                if (!vma || is_vm_hugetlb_page(vma))
                        return;

                /* Get pointer to the page table entry */
                ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
                if (likely(ptep)) {
                        ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
                        pte_unmap_unlock(ptep, ptl);
                }
        }
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
        unsigned long gaddr, vmaddr, size;
        struct vm_area_struct *vma;

        mmap_read_lock(gmap->mm);
        for (gaddr = from; gaddr < to;
             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
                /* Find the vm address for the guest address */
                vmaddr = (unsigned long)
                        radix_tree_lookup(&gmap->guest_to_host,
                                          gaddr >> PMD_SHIFT);
                if (!vmaddr)
                        continue;
                vmaddr |= gaddr & ~PMD_MASK;
                /* Find vma in the parent mm */
                vma = find_vma(gmap->mm, vmaddr);
                if (!vma)
                        continue;
                /*
                 * We do not discard pages that are backed by
                 * hugetlbfs, so we don't have to refault them.
                 */
                if (is_vm_hugetlb_page(vma))
                        continue;
                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
                zap_page_range_single(vma, vmaddr, size, NULL);
        }
        mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);
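
/*
 * Example (illustrative sketch): dropping the host backing of one guest
 * segment, e.g. while handling a guest "release pages" request. The range
 * is an example value only.
 *
 *	gmap_discard(gmap, 0x100000UL, 0x200000UL);
 */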

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_pte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_add_rcu(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);

/**
 * gmap_unregister_pte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_del_rcu(&nb->list);
        spin_unlock(&gmap_notifier_lock);
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);

/**
 * gmap_call_notifier - call all registered invalidation callbacks
 * @gmap: pointer to guest mapping meta data structure
 * @start: start virtual address in the guest address space
 * @end: end virtual address in the guest address space
 */
static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
                               unsigned long end)
{
        struct gmap_notifier *nb;

        list_for_each_entry(nb, &gmap_notifier_list, list)
                nb->notifier_call(gmap, start, end);
}

/**
 * gmap_table_walk - walk the gmap page tables
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @level: page table level to stop at
 *
 * Returns a table entry pointer for the given guest address and @level
 * @level=0 : returns a pointer to a page table entry (or NULL)
 * @level=1 : returns a pointer to a segment table entry (or NULL)
 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
 *
 * Returns NULL if the gmap page tables could not be walked to the
 * requested level.
 *
 * Note: Can also be called for shadow gmaps.
 */
static inline unsigned long *gmap_table_walk(struct gmap *gmap,
                                             unsigned long gaddr, int level)
{
        const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
        unsigned long *table = gmap->table;

        if (gmap_is_shadow(gmap) && gmap->removed)
                return NULL;

        if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
                return NULL;

        if (asce_type != _ASCE_TYPE_REGION1 &&
            gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
                return NULL;

        switch (asce_type) {
        case _ASCE_TYPE_REGION1:
                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
                if (level == 4)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION2:
                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
                if (level == 3)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_REGION3:
                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
                if (level == 2)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _REGION_ENTRY_ORIGIN);
                fallthrough;
        case _ASCE_TYPE_SEGMENT:
                table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
                if (level == 1)
                        break;
                if (*table & _REGION_ENTRY_INVALID)
                        return NULL;
                table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
                table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
        }
        return table;
}

/**
 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
 *                    and return the pte pointer
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @ptl: pointer to the spinlock pointer
 *
 * Returns a pointer to the locked pte for a guest address, or NULL
 */
static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
                               spinlock_t **ptl)
{
        unsigned long *table;

        BUG_ON(gmap_is_shadow(gmap));
        /* Walk the gmap page table, lock and get pte pointer */
        table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
        if (!table || *table & _SEGMENT_ENTRY_INVALID)
                return NULL;
        return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
}

/**
 * gmap_pte_op_fixup - force a page in and connect the gmap page table
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @vmaddr: address in the host process address space
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 *
 * Returns 0 if the caller can retry __gmap_translate (might fail again),
 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
 * up or connecting the gmap page table.
 */
static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
                             unsigned long vmaddr, int prot)
{
        struct mm_struct *mm = gmap->mm;
        unsigned int fault_flags;
        bool unlocked = false;

        BUG_ON(gmap_is_shadow(gmap));
        fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
        if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
                return -EFAULT;
        if (unlocked)
                /* lost mmap_lock, caller has to retry __gmap_translate */
                return 0;
        /* Connect the page tables */
        return __gmap_link(gmap, gaddr, vmaddr);
}

/**
 * gmap_pte_op_end - release the page table lock
 * @ptep: pointer to the locked pte
 * @ptl: pointer to the page table spinlock
 */
static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl)
{
        pte_unmap_unlock(ptep, ptl);
}

/**
 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
 *                    and return the pmd pointer
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 *
 * Returns a pointer to the pmd for a guest address, or NULL
 */
static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
{
        pmd_t *pmdp;

        BUG_ON(gmap_is_shadow(gmap));
        pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
        if (!pmdp)
                return NULL;

        /* without huge pages, there is no need to take the table lock */
        if (!gmap->mm->context.allow_gmap_hpage_1m)
                return pmd_none(*pmdp) ? NULL : pmdp;

        spin_lock(&gmap->guest_table_lock);
        if (pmd_none(*pmdp)) {
                spin_unlock(&gmap->guest_table_lock);
                return NULL;
        }

        /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
        if (!pmd_leaf(*pmdp))
                spin_unlock(&gmap->guest_table_lock);
        return pmdp;
}

/**
 * gmap_pmd_op_end - release the guest_table_lock if needed
 * @gmap: pointer to the guest mapping meta data structure
 * @pmdp: pointer to the pmd
 */
static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
{
        if (pmd_leaf(*pmdp))
                spin_unlock(&gmap->guest_table_lock);
}

/*
 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 * @pmdp: pointer to the pmd to be protected
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
 *
 * Returns:
 * 0 if successfully protected
 * -EAGAIN if a fixup is needed
 * -EINVAL if unsupported notifier bits have been specified
 *
 * Expected to be called with sg->mm->mmap_lock in read and
 * guest_table_lock held.
 */
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
                            pmd_t *pmdp, int prot, unsigned long bits)
{
        int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
        int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
        pmd_t new = *pmdp;

        /* Fixup needed */
        if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
                return -EAGAIN;

        if (prot == PROT_NONE && !pmd_i) {
                new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
        }

        if (prot == PROT_READ && !pmd_p) {
                new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
                new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
        }

        if (bits & GMAP_NOTIFY_MPROT)
                set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));

        /* Shadow GMAP protection needs split PMDs */
        if (bits & GMAP_NOTIFY_SHADOW)
                return -EINVAL;

        return 0;
}

/*
 * gmap_protect_pte - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @pmdp: pointer to the pmd associated with the pte
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: notification bits to set
 *
 * Returns 0 if successfully protected, -ENOMEM if out of memory and
 * -EAGAIN if a fixup is needed.
 *
 * Expected to be called with sg->mm->mmap_lock in read
 */
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
                            pmd_t *pmdp, int prot, unsigned long bits)
{
        int rc;
        pte_t *ptep;
        spinlock_t *ptl;
        unsigned long pbits = 0;

        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
                return -EAGAIN;

        ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
        if (!ptep)
                return -ENOMEM;

        pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
        pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
        /* Protect and unlock. */
        rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
        gmap_pte_op_end(ptep, ptl);
        return rc;
}

/*
 * gmap_protect_range - remove access rights to memory and set pgste bits
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bits: pgste notification bits to set
 *
 * Returns 0 if successfully protected, -ENOMEM if out of memory and
 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
 *
 * Called with sg->mm->mmap_lock in read.
 */
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
                              unsigned long len, int prot, unsigned long bits)
{
        unsigned long vmaddr, dist;
        pmd_t *pmdp;
        int rc;

        BUG_ON(gmap_is_shadow(gmap));
        while (len) {
                rc = -EAGAIN;
                pmdp = gmap_pmd_op_walk(gmap, gaddr);
                if (pmdp) {
                        if (!pmd_leaf(*pmdp)) {
                                rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
                                                      bits);
                                if (!rc) {
                                        len -= PAGE_SIZE;
                                        gaddr += PAGE_SIZE;
                                }
                        } else {
                                rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
                                                      bits);
                                if (!rc) {
                                        dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
                                        len = len < dist ? 0 : len - dist;
                                        gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
                                }
                        }
                        gmap_pmd_op_end(gmap, pmdp);
                }
                if (rc) {
                        if (rc == -EINVAL)
                                return rc;

                        /* -EAGAIN, fixup of userspace mm and gmap */
                        vmaddr = __gmap_translate(gmap, gaddr);
                        if (IS_ERR_VALUE(vmaddr))
                                return vmaddr;
                        rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
                        if (rc)
                                return rc;
                }
        }
        return 0;
}

/**
 * gmap_mprotect_notify - change access rights for a range of ptes and
 *                        call the notifier if any pte changes again
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 *
 * Returns 0 if for each page in the given range a gmap mapping exists,
 * the new access rights could be set and the notifier could be armed.
 * If the gmap mapping is missing for one or more pages -EFAULT is
 * returned. If no memory could be allocated -ENOMEM is returned.
 * This function establishes missing page table entries.
 */
int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
                         unsigned long len, int prot)
{
        int rc;

        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
                return -EINVAL;
        if (!MACHINE_HAS_ESOP && prot == PROT_READ)
                return -EINVAL;
        mmap_read_lock(gmap->mm);
        rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
        mmap_read_unlock(gmap->mm);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
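
/*
 * Example (illustrative sketch): making one guest page read-only and arming
 * the invalidation notifier, so that registered callbacks fire once the
 * mapping is changed or invalidated. "gaddr" is an example guest address.
 *
 *	rc = gmap_mprotect_notify(gmap, gaddr, PAGE_SIZE, PROT_READ);
 *	if (rc == -EFAULT)
 *		... no gmap mapping for part of the range ...
 *	else if (rc == -ENOMEM)
 *		... allocation failure ...
 */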

/**
 * gmap_read_table - get an unsigned long value from a guest page table using
 *                   absolute addressing, without marking the page referenced.
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @val: pointer to the unsigned long value to return
 *
 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
 * if reading using the virtual address failed. -EINVAL if called on a gmap
 * shadow.
 *
 * Called with gmap->mm->mmap_lock in read.
 */
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
        unsigned long address, vmaddr;
        spinlock_t *ptl;
        pte_t *ptep, pte;
        int rc;

        if (gmap_is_shadow(gmap))
                return -EINVAL;

        while (1) {
                rc = -EAGAIN;
                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
                if (ptep) {
                        pte = *ptep;
                        if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
                                address = pte_val(pte) & PAGE_MASK;
                                address += gaddr & ~PAGE_MASK;
                                *val = *(unsigned long *)__va(address);
                                set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
                                /* Do *NOT* clear the _PAGE_INVALID bit! */
                                rc = 0;
                        }
                        gmap_pte_op_end(ptep, ptl);
                }
                if (!rc)
                        break;
                vmaddr = __gmap_translate(gmap, gaddr);
                if (IS_ERR_VALUE(vmaddr)) {
                        rc = vmaddr;
                        break;
                }
                rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
                if (rc)
                        break;
        }
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_read_table);
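
/*
 * Example (illustrative sketch): reading one entry of a guest page table
 * while building a shadow table, without marking the backing page as
 * referenced. "pgt_gaddr" and "i" are hypothetical names; the caller holds
 * gmap->mm->mmap_lock in read as documented above.
 *
 *	unsigned long entry;
 *
 *	rc = gmap_read_table(parent, pgt_gaddr + i * 8, &entry);
 *	if (rc)
 *		... translation failed, let the caller fault it in ...
 */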

/**
 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
 * @sg: pointer to the shadow guest address space structure
 * @vmaddr: vm address associated with the rmap
 * @rmap: pointer to the rmap structure
 *
 * Called with the sg->guest_table_lock
 */
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
                                    struct gmap_rmap *rmap)
{
        struct gmap_rmap *temp;
        void __rcu **slot;

        BUG_ON(!gmap_is_shadow(sg));
        slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
        if (slot) {
                rmap->next = radix_tree_deref_slot_protected(slot,
                                                        &sg->guest_table_lock);
                for (temp = rmap->next; temp; temp = temp->next) {
                        if (temp->raddr == rmap->raddr) {
                                kfree(rmap);
                                return;
                        }
                }
                radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
        } else {
                rmap->next = NULL;
                radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
                                  rmap);
        }
}

/**
 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow gmap
 * @paddr: address in the parent guest address space
 * @len: length of the memory area to protect
 *
 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
 * if out of memory and -EFAULT if paddr is invalid.
 */
static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
                             unsigned long paddr, unsigned long len)
{
        struct gmap *parent;
        struct gmap_rmap *rmap;
        unsigned long vmaddr;
        spinlock_t *ptl;
        pte_t *ptep;
        int rc;

        BUG_ON(!gmap_is_shadow(sg));
        parent = sg->parent;
        while (len) {
                vmaddr = __gmap_translate(parent, paddr);
                if (IS_ERR_VALUE(vmaddr))
                        return vmaddr;
                rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
                if (!rmap)
                        return -ENOMEM;
                rmap->raddr = raddr;
                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
                if (rc) {
                        kfree(rmap);
                        return rc;
                }
                rc = -EAGAIN;
                ptep = gmap_pte_op_walk(parent, paddr, &ptl);
                if (ptep) {
                        spin_lock(&sg->guest_table_lock);
                        rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
                                             PGSTE_VSIE_BIT);
                        if (!rc)
                                gmap_insert_rmap(sg, vmaddr, rmap);
                        spin_unlock(&sg->guest_table_lock);
                        gmap_pte_op_end(ptep, ptl);
                }
                radix_tree_preload_end();
                if (rc) {
                        kfree(rmap);
                        rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
                        if (rc)
                                return rc;
                        continue;
                }
                paddr += PAGE_SIZE;
                len -= PAGE_SIZE;
        }
        return 0;
}

#define _SHADOW_RMAP_MASK       0x7
#define _SHADOW_RMAP_REGION1    0x5
#define _SHADOW_RMAP_REGION2    0x4
#define _SHADOW_RMAP_REGION3    0x3
#define _SHADOW_RMAP_SEGMENT    0x2
#define _SHADOW_RMAP_PGTABLE    0x1

/**
 * gmap_idte_one - invalidate a single region or segment table entry
 * @asce: region or segment table *origin* + table-type bits
 * @vaddr: virtual address to identify the table entry to flush
 *
 * The invalid bit of a single region or segment table entry is set
 * and the associated TLB entries depending on the entry are flushed.
 * The table-type of the @asce identifies the portion of the @vaddr
 * that is used as the invalidation index.
 */
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
        asm volatile(
                "       idte    %0,0,%1"
                : : "a" (asce), "a" (vaddr) : "cc", "memory");
}

/**
 * gmap_unshadow_page - remove a page from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
{
        unsigned long *table;

        BUG_ON(!gmap_is_shadow(sg));
        table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
        if (!table || *table & _PAGE_INVALID)
                return;
        gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
        ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
}

/**
 * __gmap_unshadow_pgt - remove all entries from a shadow page table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @pgt: pointer to the start of a shadow page table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
                                unsigned long *pgt)
{
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
                pgt[i] = _PAGE_INVALID;
}

/**
 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
{
        unsigned long *ste;
        phys_addr_t sto, pgt;
        struct ptdesc *ptdesc;

        BUG_ON(!gmap_is_shadow(sg));
        ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
        if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
        sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
        gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
        pgt = *ste & _SEGMENT_ENTRY_ORIGIN;
        *ste = _SEGMENT_ENTRY_EMPTY;
        __gmap_unshadow_pgt(sg, raddr, __va(pgt));
        /* Free page table */
        ptdesc = page_ptdesc(phys_to_page(pgt));
        list_del(&ptdesc->pt_list);
        page_table_free_pgste(ptdesc);
}

/**
 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 * @sgt: pointer to the start of a shadow segment table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
                                unsigned long *sgt)
{
        struct ptdesc *ptdesc;
        phys_addr_t pgt;
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
                if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
                        continue;
                pgt = sgt[i] & _REGION_ENTRY_ORIGIN;
                sgt[i] = _SEGMENT_ENTRY_EMPTY;
                __gmap_unshadow_pgt(sg, raddr, __va(pgt));
                /* Free page table */
                ptdesc = page_ptdesc(phys_to_page(pgt));
                list_del(&ptdesc->pt_list);
                page_table_free_pgste(ptdesc);
        }
}

/**
 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the shadow->guest_table_lock
 */
static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
{
        unsigned long r3o, *r3e;
        phys_addr_t sgt;
        struct page *page;

        BUG_ON(!gmap_is_shadow(sg));
        r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
        if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
        r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
        gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr);
        sgt = *r3e & _REGION_ENTRY_ORIGIN;
        *r3e = _REGION3_ENTRY_EMPTY;
        __gmap_unshadow_sgt(sg, raddr, __va(sgt));
        /* Free segment table */
        page = phys_to_page(sgt);
        list_del(&page->lru);
        __free_pages(page, CRST_ALLOC_ORDER);
}

/**
 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
 * @sg: pointer to the shadow guest address space structure
 * @raddr: address in the shadow guest address space
 * @r3t: pointer to the start of a shadow region-3 table
 *
 * Called with the sg->guest_table_lock
 */
static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
                                unsigned long *r3t)
{
        struct page *page;
        phys_addr_t sgt;
        int i;

        BUG_ON(!gmap_is_shadow(sg));
        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
                if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
                        continue;
                sgt = r3t[i] & _REGION_ENTRY_ORIGIN;
                r3t[i] = _REGION3_ENTRY_EMPTY;
                __gmap_unshadow_sgt(sg, raddr, __va(sgt));
                /* Free segment table */
                page = phys_to_page(sgt);
                list_del(&page->lru);
                __free_pages(page, CRST_ALLOC_ORDER);
        }
}

/**
 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
 * @sg: pointer to the shadow guest address space structure
 * @raddr: rmap address in the shadow guest address space
 *
 * Called with the sg->guest_table_lock
 */
static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
{
        unsigned long r2o, *r2e;
        phys_addr_t r3t;
        struct page *page;

        BUG_ON(!gmap_is_shadow(sg));
        r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
        if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
                return;
        gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
  1357. r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
  1358. gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr);
  1359. r3t = *r2e & _REGION_ENTRY_ORIGIN;
  1360. *r2e = _REGION2_ENTRY_EMPTY;
  1361. __gmap_unshadow_r3t(sg, raddr, __va(r3t));
  1362. /* Free region 3 table */
  1363. page = phys_to_page(r3t);
  1364. list_del(&page->lru);
  1365. __free_pages(page, CRST_ALLOC_ORDER);
  1366. }
  1367. /**
  1368. * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
  1369. * @sg: pointer to the shadow guest address space structure
  1370. * @raddr: rmap address in the shadow guest address space
  1371. * @r2t: pointer to the start of a shadow region-2 table
  1372. *
  1373. * Called with the sg->guest_table_lock
  1374. */
  1375. static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
  1376. unsigned long *r2t)
  1377. {
  1378. phys_addr_t r3t;
  1379. struct page *page;
  1380. int i;
  1381. BUG_ON(!gmap_is_shadow(sg));
  1382. for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
  1383. if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
  1384. continue;
  1385. r3t = r2t[i] & _REGION_ENTRY_ORIGIN;
  1386. r2t[i] = _REGION2_ENTRY_EMPTY;
  1387. __gmap_unshadow_r3t(sg, raddr, __va(r3t));
  1388. /* Free region 3 table */
  1389. page = phys_to_page(r3t);
  1390. list_del(&page->lru);
  1391. __free_pages(page, CRST_ALLOC_ORDER);
  1392. }
  1393. }
  1394. /**
  1395. * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
  1396. * @sg: pointer to the shadow guest address space structure
  1397. * @raddr: rmap address in the shadow guest address space
  1398. *
  1399. * Called with the sg->guest_table_lock
  1400. */
  1401. static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
  1402. {
  1403. unsigned long r1o, *r1e;
  1404. struct page *page;
  1405. phys_addr_t r2t;
  1406. BUG_ON(!gmap_is_shadow(sg));
  1407. r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
  1408. if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
  1409. return;
  1410. gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
  1411. r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
  1412. gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr);
  1413. r2t = *r1e & _REGION_ENTRY_ORIGIN;
  1414. *r1e = _REGION1_ENTRY_EMPTY;
  1415. __gmap_unshadow_r2t(sg, raddr, __va(r2t));
  1416. /* Free region 2 table */
  1417. page = phys_to_page(r2t);
  1418. list_del(&page->lru);
  1419. __free_pages(page, CRST_ALLOC_ORDER);
  1420. }
  1421. /**
  1422. * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
  1423. * @sg: pointer to the shadow guest address space structure
  1424. * @raddr: rmap address in the shadow guest address space
  1425. * @r1t: pointer to the start of a shadow region-1 table
  1426. *
  1427. * Called with the shadow->guest_table_lock
  1428. */
  1429. static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
  1430. unsigned long *r1t)
  1431. {
  1432. unsigned long asce;
  1433. struct page *page;
  1434. phys_addr_t r2t;
  1435. int i;
  1436. BUG_ON(!gmap_is_shadow(sg));
  1437. asce = __pa(r1t) | _ASCE_TYPE_REGION1;
  1438. for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
  1439. if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
  1440. continue;
  1441. r2t = r1t[i] & _REGION_ENTRY_ORIGIN;
  1442. __gmap_unshadow_r2t(sg, raddr, __va(r2t));
  1443. /* Clear entry and flush translation r1t -> r2t */
  1444. gmap_idte_one(asce, raddr);
  1445. r1t[i] = _REGION1_ENTRY_EMPTY;
  1446. /* Free region 2 table */
  1447. page = phys_to_page(r2t);
  1448. list_del(&page->lru);
  1449. __free_pages(page, CRST_ALLOC_ORDER);
  1450. }
  1451. }
  1452. /**
  1453. * gmap_unshadow - remove a shadow page table completely
  1454. * @sg: pointer to the shadow guest address space structure
  1455. *
  1456. * Called with sg->guest_table_lock
  1457. */
  1458. static void gmap_unshadow(struct gmap *sg)
  1459. {
  1460. unsigned long *table;
  1461. BUG_ON(!gmap_is_shadow(sg));
  1462. if (sg->removed)
  1463. return;
  1464. sg->removed = 1;
  1465. gmap_call_notifier(sg, 0, -1UL);
  1466. gmap_flush_tlb(sg);
  1467. table = __va(sg->asce & _ASCE_ORIGIN);
  1468. switch (sg->asce & _ASCE_TYPE_MASK) {
  1469. case _ASCE_TYPE_REGION1:
  1470. __gmap_unshadow_r1t(sg, 0, table);
  1471. break;
  1472. case _ASCE_TYPE_REGION2:
  1473. __gmap_unshadow_r2t(sg, 0, table);
  1474. break;
  1475. case _ASCE_TYPE_REGION3:
  1476. __gmap_unshadow_r3t(sg, 0, table);
  1477. break;
  1478. case _ASCE_TYPE_SEGMENT:
  1479. __gmap_unshadow_sgt(sg, 0, table);
  1480. break;
  1481. }
  1482. }
  1483. /**
  1484. * gmap_find_shadow - find a specific asce in the list of shadow tables
  1485. * @parent: pointer to the parent gmap
  1486. * @asce: ASCE for which the shadow table is created
  1487. * @edat_level: edat level to be used for the shadow translation
  1488. *
  1489. * Returns the pointer to a gmap if a shadow table with the given asce is
  1490. * already available, ERR_PTR(-EAGAIN) if another one is just being created,
  1491. * otherwise NULL
  1492. */
  1493. static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
  1494. int edat_level)
  1495. {
  1496. struct gmap *sg;
  1497. list_for_each_entry(sg, &parent->children, list) {
  1498. if (sg->orig_asce != asce || sg->edat_level != edat_level ||
  1499. sg->removed)
  1500. continue;
  1501. if (!sg->initialized)
  1502. return ERR_PTR(-EAGAIN);
  1503. refcount_inc(&sg->ref_count);
  1504. return sg;
  1505. }
  1506. return NULL;
  1507. }
  1508. /**
  1509. * gmap_shadow_valid - check if a shadow guest address space matches the
  1510. * given properties and is still valid
  1511. * @sg: pointer to the shadow guest address space structure
  1512. * @asce: ASCE for which the shadow table is requested
  1513. * @edat_level: edat level to be used for the shadow translation
  1514. *
1515. * Returns 1 if the gmap shadow is still valid and matches the given
1516. * properties; the caller can continue using it. Returns 0 otherwise; the
1517. * caller has to request a new shadow gmap in this case.
  1518. *
  1519. */
  1520. int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
  1521. {
  1522. if (sg->removed)
  1523. return 0;
  1524. return sg->orig_asce == asce && sg->edat_level == edat_level;
  1525. }
  1526. EXPORT_SYMBOL_GPL(gmap_shadow_valid);
  1527. /**
  1528. * gmap_shadow - create/find a shadow guest address space
  1529. * @parent: pointer to the parent gmap
  1530. * @asce: ASCE for which the shadow table is created
  1531. * @edat_level: edat level to be used for the shadow translation
  1532. *
1533. * The pages of the top level page table referred to by the asce parameter
  1534. * will be set to read-only and marked in the PGSTEs of the kvm process.
  1535. * The shadow table will be removed automatically on any change to the
  1536. * PTE mapping for the source table.
  1537. *
  1538. * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
  1539. * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
  1540. * parent gmap table could not be protected.
  1541. */
  1542. struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
  1543. int edat_level)
  1544. {
  1545. struct gmap *sg, *new;
  1546. unsigned long limit;
  1547. int rc;
  1548. BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
  1549. BUG_ON(gmap_is_shadow(parent));
  1550. spin_lock(&parent->shadow_lock);
  1551. sg = gmap_find_shadow(parent, asce, edat_level);
  1552. spin_unlock(&parent->shadow_lock);
  1553. if (sg)
  1554. return sg;
  1555. /* Create a new shadow gmap */
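/*
 * The ASCE designation type (extracted by the >> 2 below) selects the
 * top-level table and thus the address range the shadow must cover:
 * segment -> 2 GB - 1, region-3 -> 4 TB - 1, region-2 -> 8 PB - 1,
 * region-1 -> 16 EB - 1 (each level adds 11 address bits).
 */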
  1556. limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
  1557. if (asce & _ASCE_REAL_SPACE)
  1558. limit = -1UL;
  1559. new = gmap_alloc(limit);
  1560. if (!new)
  1561. return ERR_PTR(-ENOMEM);
  1562. new->mm = parent->mm;
  1563. new->parent = gmap_get(parent);
  1564. new->private = parent->private;
  1565. new->orig_asce = asce;
  1566. new->edat_level = edat_level;
  1567. new->initialized = false;
  1568. spin_lock(&parent->shadow_lock);
  1569. /* Recheck if another CPU created the same shadow */
  1570. sg = gmap_find_shadow(parent, asce, edat_level);
  1571. if (sg) {
  1572. spin_unlock(&parent->shadow_lock);
  1573. gmap_free(new);
  1574. return sg;
  1575. }
  1576. if (asce & _ASCE_REAL_SPACE) {
  1577. /* only allow one real-space gmap shadow */
  1578. list_for_each_entry(sg, &parent->children, list) {
  1579. if (sg->orig_asce & _ASCE_REAL_SPACE) {
  1580. spin_lock(&sg->guest_table_lock);
  1581. gmap_unshadow(sg);
  1582. spin_unlock(&sg->guest_table_lock);
  1583. list_del(&sg->list);
  1584. gmap_put(sg);
  1585. break;
  1586. }
  1587. }
  1588. }
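/* One reference for the parent's children list, one for the caller. */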
  1589. refcount_set(&new->ref_count, 2);
  1590. list_add(&new->list, &parent->children);
  1591. if (asce & _ASCE_REAL_SPACE) {
  1592. /* nothing to protect, return right away */
  1593. new->initialized = true;
  1594. spin_unlock(&parent->shadow_lock);
  1595. return new;
  1596. }
  1597. spin_unlock(&parent->shadow_lock);
  1598. /* protect after insertion, so it will get properly invalidated */
  1599. mmap_read_lock(parent->mm);
  1600. rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
  1601. ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
  1602. PROT_READ, GMAP_NOTIFY_SHADOW);
  1603. mmap_read_unlock(parent->mm);
  1604. spin_lock(&parent->shadow_lock);
  1605. new->initialized = true;
  1606. if (rc) {
  1607. list_del(&new->list);
  1608. gmap_free(new);
  1609. new = ERR_PTR(rc);
  1610. }
  1611. spin_unlock(&parent->shadow_lock);
  1612. return new;
  1613. }
  1614. EXPORT_SYMBOL_GPL(gmap_shadow);
  1615. /**
  1616. * gmap_shadow_r2t - create an empty shadow region 2 table
  1617. * @sg: pointer to the shadow guest address space structure
  1618. * @saddr: faulting address in the shadow gmap
  1619. * @r2t: parent gmap address of the region 2 table to get shadowed
  1620. * @fake: r2t references contiguous guest memory block, not a r2t
  1621. *
  1622. * The r2t parameter specifies the address of the source table. The
  1623. * four pages of the source table are made read-only in the parent gmap
  1624. * address space. A write to the source table area @r2t will automatically
  1625. * remove the shadow r2 table and all of its descendants.
  1626. *
  1627. * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
  1628. * shadow table structure is incomplete, -ENOMEM if out of memory and
  1629. * -EFAULT if an address in the parent gmap could not be resolved.
  1630. *
  1631. * Called with sg->mm->mmap_lock in read.
  1632. */
  1633. int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
  1634. int fake)
  1635. {
  1636. unsigned long raddr, origin, offset, len;
  1637. unsigned long *table;
  1638. phys_addr_t s_r2t;
  1639. struct page *page;
  1640. int rc;
  1641. BUG_ON(!gmap_is_shadow(sg));
  1642. /* Allocate a shadow region second table */
  1643. page = gmap_alloc_crst();
  1644. if (!page)
  1645. return -ENOMEM;
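/*
 * Remember the parent address of the source table (plus the fake flag)
 * in page->index so later lookups can recover it.
 */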
  1646. page->index = r2t & _REGION_ENTRY_ORIGIN;
  1647. if (fake)
  1648. page->index |= GMAP_SHADOW_FAKE_TABLE;
  1649. s_r2t = page_to_phys(page);
  1650. /* Install shadow region second table */
  1651. spin_lock(&sg->guest_table_lock);
  1652. table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
  1653. if (!table) {
  1654. rc = -EAGAIN; /* Race with unshadow */
  1655. goto out_free;
  1656. }
  1657. if (!(*table & _REGION_ENTRY_INVALID)) {
  1658. rc = 0; /* Already established */
  1659. goto out_free;
  1660. } else if (*table & _REGION_ENTRY_ORIGIN) {
  1661. rc = -EAGAIN; /* Race with shadow */
  1662. goto out_free;
  1663. }
  1664. crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY);
  1665. /* mark as invalid as long as the parent table is not protected */
  1666. *table = s_r2t | _REGION_ENTRY_LENGTH |
  1667. _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
  1668. if (sg->edat_level >= 1)
  1669. *table |= (r2t & _REGION_ENTRY_PROTECT);
  1670. list_add(&page->lru, &sg->crst_list);
  1671. if (fake) {
  1672. /* nothing to protect for fake tables */
  1673. *table &= ~_REGION_ENTRY_INVALID;
  1674. spin_unlock(&sg->guest_table_lock);
  1675. return 0;
  1676. }
  1677. spin_unlock(&sg->guest_table_lock);
  1678. /* Make r2t read-only in parent gmap page table */
  1679. raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
  1680. origin = r2t & _REGION_ENTRY_ORIGIN;
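/*
 * A region table spans four pages; the table offset (TF) and table
 * length (TL) fields of the entry bound the valid part, so only that
 * sub-range of the source table needs to be write-protected.
 */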
  1681. offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
  1682. len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
  1683. rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
  1684. spin_lock(&sg->guest_table_lock);
  1685. if (!rc) {
  1686. table = gmap_table_walk(sg, saddr, 4);
  1687. if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t)
  1688. rc = -EAGAIN; /* Race with unshadow */
  1689. else
  1690. *table &= ~_REGION_ENTRY_INVALID;
  1691. } else {
  1692. gmap_unshadow_r2t(sg, raddr);
  1693. }
  1694. spin_unlock(&sg->guest_table_lock);
  1695. return rc;
  1696. out_free:
  1697. spin_unlock(&sg->guest_table_lock);
  1698. __free_pages(page, CRST_ALLOC_ORDER);
  1699. return rc;
  1700. }
  1701. EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
  1702. /**
  1703. * gmap_shadow_r3t - create a shadow region 3 table
  1704. * @sg: pointer to the shadow guest address space structure
  1705. * @saddr: faulting address in the shadow gmap
  1706. * @r3t: parent gmap address of the region 3 table to get shadowed
  1707. * @fake: r3t references contiguous guest memory block, not a r3t
  1708. *
  1709. * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
  1710. * shadow table structure is incomplete, -ENOMEM if out of memory and
  1711. * -EFAULT if an address in the parent gmap could not be resolved.
  1712. *
  1713. * Called with sg->mm->mmap_lock in read.
  1714. */
  1715. int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
  1716. int fake)
  1717. {
  1718. unsigned long raddr, origin, offset, len;
  1719. unsigned long *table;
  1720. phys_addr_t s_r3t;
  1721. struct page *page;
  1722. int rc;
  1723. BUG_ON(!gmap_is_shadow(sg));
1724. /* Allocate a shadow region third table */
  1725. page = gmap_alloc_crst();
  1726. if (!page)
  1727. return -ENOMEM;
  1728. page->index = r3t & _REGION_ENTRY_ORIGIN;
  1729. if (fake)
  1730. page->index |= GMAP_SHADOW_FAKE_TABLE;
  1731. s_r3t = page_to_phys(page);
1732. /* Install shadow region third table */
  1733. spin_lock(&sg->guest_table_lock);
  1734. table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
  1735. if (!table) {
  1736. rc = -EAGAIN; /* Race with unshadow */
  1737. goto out_free;
  1738. }
  1739. if (!(*table & _REGION_ENTRY_INVALID)) {
  1740. rc = 0; /* Already established */
  1741. goto out_free;
  1742. } else if (*table & _REGION_ENTRY_ORIGIN) {
  1743. rc = -EAGAIN; /* Race with shadow */
  1744. goto out_free;
  1745. }
  1746. crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY);
  1747. /* mark as invalid as long as the parent table is not protected */
  1748. *table = s_r3t | _REGION_ENTRY_LENGTH |
  1749. _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
  1750. if (sg->edat_level >= 1)
  1751. *table |= (r3t & _REGION_ENTRY_PROTECT);
  1752. list_add(&page->lru, &sg->crst_list);
  1753. if (fake) {
  1754. /* nothing to protect for fake tables */
  1755. *table &= ~_REGION_ENTRY_INVALID;
  1756. spin_unlock(&sg->guest_table_lock);
  1757. return 0;
  1758. }
  1759. spin_unlock(&sg->guest_table_lock);
  1760. /* Make r3t read-only in parent gmap page table */
  1761. raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
  1762. origin = r3t & _REGION_ENTRY_ORIGIN;
  1763. offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
  1764. len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
  1765. rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
  1766. spin_lock(&sg->guest_table_lock);
  1767. if (!rc) {
  1768. table = gmap_table_walk(sg, saddr, 3);
  1769. if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t)
  1770. rc = -EAGAIN; /* Race with unshadow */
  1771. else
  1772. *table &= ~_REGION_ENTRY_INVALID;
  1773. } else {
  1774. gmap_unshadow_r3t(sg, raddr);
  1775. }
  1776. spin_unlock(&sg->guest_table_lock);
  1777. return rc;
  1778. out_free:
  1779. spin_unlock(&sg->guest_table_lock);
  1780. __free_pages(page, CRST_ALLOC_ORDER);
  1781. return rc;
  1782. }
  1783. EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
  1784. /**
  1785. * gmap_shadow_sgt - create a shadow segment table
  1786. * @sg: pointer to the shadow guest address space structure
  1787. * @saddr: faulting address in the shadow gmap
  1788. * @sgt: parent gmap address of the segment table to get shadowed
  1789. * @fake: sgt references contiguous guest memory block, not a sgt
  1790. *
  1791. * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
  1792. * shadow table structure is incomplete, -ENOMEM if out of memory and
  1793. * -EFAULT if an address in the parent gmap could not be resolved.
  1794. *
  1795. * Called with sg->mm->mmap_lock in read.
  1796. */
  1797. int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
  1798. int fake)
  1799. {
  1800. unsigned long raddr, origin, offset, len;
  1801. unsigned long *table;
  1802. phys_addr_t s_sgt;
  1803. struct page *page;
  1804. int rc;
  1805. BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
  1806. /* Allocate a shadow segment table */
  1807. page = gmap_alloc_crst();
  1808. if (!page)
  1809. return -ENOMEM;
  1810. page->index = sgt & _REGION_ENTRY_ORIGIN;
  1811. if (fake)
  1812. page->index |= GMAP_SHADOW_FAKE_TABLE;
  1813. s_sgt = page_to_phys(page);
1814. /* Install shadow segment table */
  1815. spin_lock(&sg->guest_table_lock);
  1816. table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
  1817. if (!table) {
  1818. rc = -EAGAIN; /* Race with unshadow */
  1819. goto out_free;
  1820. }
  1821. if (!(*table & _REGION_ENTRY_INVALID)) {
  1822. rc = 0; /* Already established */
  1823. goto out_free;
  1824. } else if (*table & _REGION_ENTRY_ORIGIN) {
  1825. rc = -EAGAIN; /* Race with shadow */
  1826. goto out_free;
  1827. }
  1828. crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY);
  1829. /* mark as invalid as long as the parent table is not protected */
  1830. *table = s_sgt | _REGION_ENTRY_LENGTH |
  1831. _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
  1832. if (sg->edat_level >= 1)
  1833. *table |= sgt & _REGION_ENTRY_PROTECT;
  1834. list_add(&page->lru, &sg->crst_list);
  1835. if (fake) {
  1836. /* nothing to protect for fake tables */
  1837. *table &= ~_REGION_ENTRY_INVALID;
  1838. spin_unlock(&sg->guest_table_lock);
  1839. return 0;
  1840. }
  1841. spin_unlock(&sg->guest_table_lock);
  1842. /* Make sgt read-only in parent gmap page table */
  1843. raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
  1844. origin = sgt & _REGION_ENTRY_ORIGIN;
  1845. offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
  1846. len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
  1847. rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
  1848. spin_lock(&sg->guest_table_lock);
  1849. if (!rc) {
  1850. table = gmap_table_walk(sg, saddr, 2);
  1851. if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt)
  1852. rc = -EAGAIN; /* Race with unshadow */
  1853. else
  1854. *table &= ~_REGION_ENTRY_INVALID;
  1855. } else {
  1856. gmap_unshadow_sgt(sg, raddr);
  1857. }
  1858. spin_unlock(&sg->guest_table_lock);
  1859. return rc;
  1860. out_free:
  1861. spin_unlock(&sg->guest_table_lock);
  1862. __free_pages(page, CRST_ALLOC_ORDER);
  1863. return rc;
  1864. }
  1865. EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
  1866. /**
  1867. * gmap_shadow_pgt_lookup - find a shadow page table
  1868. * @sg: pointer to the shadow guest address space structure
1869. * @saddr: the address in the shadow guest address space
  1870. * @pgt: parent gmap address of the page table to get shadowed
  1871. * @dat_protection: if the pgtable is marked as protected by dat
  1872. * @fake: pgt references contiguous guest memory block, not a pgtable
  1873. *
  1874. * Returns 0 if the shadow page table was found and -EAGAIN if the page
  1875. * table was not found.
  1876. *
  1877. * Called with sg->mm->mmap_lock in read.
  1878. */
  1879. int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
  1880. unsigned long *pgt, int *dat_protection,
  1881. int *fake)
  1882. {
  1883. unsigned long *table;
  1884. struct page *page;
  1885. int rc;
  1886. BUG_ON(!gmap_is_shadow(sg));
  1887. spin_lock(&sg->guest_table_lock);
  1888. table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
  1889. if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
  1890. /* Shadow page tables are full pages (pte+pgste) */
  1891. page = pfn_to_page(*table >> PAGE_SHIFT);
  1892. *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
  1893. *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
  1894. *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
  1895. rc = 0;
  1896. } else {
  1897. rc = -EAGAIN;
  1898. }
  1899. spin_unlock(&sg->guest_table_lock);
  1900. return rc;
  1901. }
  1902. EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
  1903. /**
  1904. * gmap_shadow_pgt - instantiate a shadow page table
  1905. * @sg: pointer to the shadow guest address space structure
  1906. * @saddr: faulting address in the shadow gmap
  1907. * @pgt: parent gmap address of the page table to get shadowed
  1908. * @fake: pgt references contiguous guest memory block, not a pgtable
  1909. *
  1910. * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1911. * shadow table structure is incomplete, -ENOMEM if out of memory and
1912. * -EFAULT if an address in the parent gmap could not be resolved.
  1913. *
1914. * Called with sg->mm->mmap_lock in read.
  1915. */
  1916. int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
  1917. int fake)
  1918. {
  1919. unsigned long raddr, origin;
  1920. unsigned long *table;
  1921. struct ptdesc *ptdesc;
  1922. phys_addr_t s_pgt;
  1923. int rc;
  1924. BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
  1925. /* Allocate a shadow page table */
  1926. ptdesc = page_table_alloc_pgste(sg->mm);
  1927. if (!ptdesc)
  1928. return -ENOMEM;
  1929. ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN;
  1930. if (fake)
  1931. ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE;
  1932. s_pgt = page_to_phys(ptdesc_page(ptdesc));
  1933. /* Install shadow page table */
  1934. spin_lock(&sg->guest_table_lock);
  1935. table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
  1936. if (!table) {
  1937. rc = -EAGAIN; /* Race with unshadow */
  1938. goto out_free;
  1939. }
  1940. if (!(*table & _SEGMENT_ENTRY_INVALID)) {
  1941. rc = 0; /* Already established */
  1942. goto out_free;
  1943. } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
  1944. rc = -EAGAIN; /* Race with shadow */
  1945. goto out_free;
  1946. }
  1947. /* mark as invalid as long as the parent table is not protected */
  1948. *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
  1949. (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
  1950. list_add(&ptdesc->pt_list, &sg->pt_list);
  1951. if (fake) {
  1952. /* nothing to protect for fake tables */
  1953. *table &= ~_SEGMENT_ENTRY_INVALID;
  1954. spin_unlock(&sg->guest_table_lock);
  1955. return 0;
  1956. }
  1957. spin_unlock(&sg->guest_table_lock);
  1958. /* Make pgt read-only in parent gmap page table (not the pgste) */
  1959. raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
  1960. origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
  1961. rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
  1962. spin_lock(&sg->guest_table_lock);
  1963. if (!rc) {
  1964. table = gmap_table_walk(sg, saddr, 1);
  1965. if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt)
  1966. rc = -EAGAIN; /* Race with unshadow */
  1967. else
  1968. *table &= ~_SEGMENT_ENTRY_INVALID;
  1969. } else {
  1970. gmap_unshadow_pgt(sg, raddr);
  1971. }
  1972. spin_unlock(&sg->guest_table_lock);
  1973. return rc;
  1974. out_free:
  1975. spin_unlock(&sg->guest_table_lock);
  1976. page_table_free_pgste(ptdesc);
  1977. return rc;
  1978. }
  1979. EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
  1980. /**
  1981. * gmap_shadow_page - create a shadow page mapping
  1982. * @sg: pointer to the shadow guest address space structure
  1983. * @saddr: faulting address in the shadow gmap
  1984. * @pte: pte in parent gmap address space to get shadowed
  1985. *
  1986. * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
  1987. * shadow table structure is incomplete, -ENOMEM if out of memory and
  1988. * -EFAULT if an address in the parent gmap could not be resolved.
  1989. *
  1990. * Called with sg->mm->mmap_lock in read.
  1991. */
  1992. int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
  1993. {
  1994. struct gmap *parent;
  1995. struct gmap_rmap *rmap;
  1996. unsigned long vmaddr, paddr;
  1997. spinlock_t *ptl;
  1998. pte_t *sptep, *tptep;
  1999. int prot;
  2000. int rc;
  2001. BUG_ON(!gmap_is_shadow(sg));
  2002. parent = sg->parent;
  2003. prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
  2004. rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
  2005. if (!rmap)
  2006. return -ENOMEM;
  2007. rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
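/*
 * Retry loop: translate the parent address, then try to shadow the pte
 * while holding the parent pte lock and the shadow guest_table_lock.
 * If the parent pte is not usable yet, fix it up (fault it in or adjust
 * the protection) and try again.
 */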
  2008. while (1) {
  2009. paddr = pte_val(pte) & PAGE_MASK;
  2010. vmaddr = __gmap_translate(parent, paddr);
  2011. if (IS_ERR_VALUE(vmaddr)) {
  2012. rc = vmaddr;
  2013. break;
  2014. }
  2015. rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
  2016. if (rc)
  2017. break;
  2018. rc = -EAGAIN;
  2019. sptep = gmap_pte_op_walk(parent, paddr, &ptl);
  2020. if (sptep) {
  2021. spin_lock(&sg->guest_table_lock);
  2022. /* Get page table pointer */
  2023. tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
  2024. if (!tptep) {
  2025. spin_unlock(&sg->guest_table_lock);
  2026. gmap_pte_op_end(sptep, ptl);
  2027. radix_tree_preload_end();
  2028. break;
  2029. }
  2030. rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
  2031. if (rc > 0) {
  2032. /* Success and a new mapping */
  2033. gmap_insert_rmap(sg, vmaddr, rmap);
  2034. rmap = NULL;
  2035. rc = 0;
  2036. }
  2037. gmap_pte_op_end(sptep, ptl);
  2038. spin_unlock(&sg->guest_table_lock);
  2039. }
  2040. radix_tree_preload_end();
  2041. if (!rc)
  2042. break;
  2043. rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
  2044. if (rc)
  2045. break;
  2046. }
  2047. kfree(rmap);
  2048. return rc;
  2049. }
  2050. EXPORT_SYMBOL_GPL(gmap_shadow_page);
  2051. /*
  2052. * gmap_shadow_notify - handle notifications for shadow gmap
  2053. *
  2054. * Called with sg->parent->shadow_lock.
  2055. */
  2056. static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
  2057. unsigned long gaddr)
  2058. {
  2059. struct gmap_rmap *rmap, *rnext, *head;
  2060. unsigned long start, end, bits, raddr;
  2061. BUG_ON(!gmap_is_shadow(sg));
  2062. spin_lock(&sg->guest_table_lock);
  2063. if (sg->removed) {
  2064. spin_unlock(&sg->guest_table_lock);
  2065. return;
  2066. }
  2067. /* Check for top level table */
  2068. start = sg->orig_asce & _ASCE_ORIGIN;
  2069. end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
  2070. if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
  2071. gaddr < end) {
  2072. /* The complete shadow table has to go */
  2073. gmap_unshadow(sg);
  2074. spin_unlock(&sg->guest_table_lock);
  2075. list_del(&sg->list);
  2076. gmap_put(sg);
  2077. return;
  2078. }
2079. /* Remove the page table tree from one specific entry */
  2080. head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
  2081. gmap_for_each_rmap_safe(rmap, rnext, head) {
  2082. bits = rmap->raddr & _SHADOW_RMAP_MASK;
  2083. raddr = rmap->raddr ^ bits;
  2084. switch (bits) {
  2085. case _SHADOW_RMAP_REGION1:
  2086. gmap_unshadow_r2t(sg, raddr);
  2087. break;
  2088. case _SHADOW_RMAP_REGION2:
  2089. gmap_unshadow_r3t(sg, raddr);
  2090. break;
  2091. case _SHADOW_RMAP_REGION3:
  2092. gmap_unshadow_sgt(sg, raddr);
  2093. break;
  2094. case _SHADOW_RMAP_SEGMENT:
  2095. gmap_unshadow_pgt(sg, raddr);
  2096. break;
  2097. case _SHADOW_RMAP_PGTABLE:
  2098. gmap_unshadow_page(sg, raddr);
  2099. break;
  2100. }
  2101. kfree(rmap);
  2102. }
  2103. spin_unlock(&sg->guest_table_lock);
  2104. }
  2105. /**
  2106. * ptep_notify - call all invalidation callbacks for a specific pte.
  2107. * @mm: pointer to the process mm_struct
  2108. * @vmaddr: virtual address in the process address space
  2109. * @pte: pointer to the page table entry
  2110. * @bits: bits from the pgste that caused the notify call
  2111. *
  2112. * This function is assumed to be called with the page table lock held
  2113. * for the pte to notify.
  2114. */
  2115. void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
  2116. pte_t *pte, unsigned long bits)
  2117. {
  2118. unsigned long offset, gaddr = 0;
  2119. unsigned long *table;
  2120. struct gmap *gmap, *sg, *next;
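/*
 * Derive the page offset within the 1 MB segment from the pte pointer:
 * the byte offset of the pte inside its 2 KB page table, scaled by
 * PAGE_SIZE / sizeof(pte_t), equals pte index * PAGE_SIZE.
 */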
  2121. offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
  2122. offset = offset * (PAGE_SIZE / sizeof(pte_t));
  2123. rcu_read_lock();
  2124. list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
  2125. spin_lock(&gmap->guest_table_lock);
  2126. table = radix_tree_lookup(&gmap->host_to_guest,
  2127. vmaddr >> PMD_SHIFT);
  2128. if (table)
  2129. gaddr = __gmap_segment_gaddr(table) + offset;
  2130. spin_unlock(&gmap->guest_table_lock);
  2131. if (!table)
  2132. continue;
  2133. if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
  2134. spin_lock(&gmap->shadow_lock);
  2135. list_for_each_entry_safe(sg, next,
  2136. &gmap->children, list)
  2137. gmap_shadow_notify(sg, vmaddr, gaddr);
  2138. spin_unlock(&gmap->shadow_lock);
  2139. }
  2140. if (bits & PGSTE_IN_BIT)
  2141. gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
  2142. }
  2143. rcu_read_unlock();
  2144. }
  2145. EXPORT_SYMBOL_GPL(ptep_notify);
  2146. static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
  2147. unsigned long gaddr)
  2148. {
  2149. set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
  2150. gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
  2151. }
  2152. /**
  2153. * gmap_pmdp_xchg - exchange a gmap pmd with another
  2154. * @gmap: pointer to the guest address space structure
  2155. * @pmdp: pointer to the pmd entry
  2156. * @new: replacement entry
  2157. * @gaddr: the affected guest address
  2158. *
  2159. * This function is assumed to be called with the guest_table_lock
  2160. * held.
  2161. */
  2162. static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
  2163. unsigned long gaddr)
  2164. {
  2165. gaddr &= HPAGE_MASK;
  2166. pmdp_notify_gmap(gmap, pmdp, gaddr);
  2167. new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
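/*
 * Flush the old entry: use guest-ASCE IDTE when the machine supports
 * TLB guest handling, plain IDTE otherwise, and fall back to CSP on
 * machines without IDTE.
 */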
  2168. if (MACHINE_HAS_TLB_GUEST)
  2169. __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
  2170. IDTE_GLOBAL);
  2171. else if (MACHINE_HAS_IDTE)
  2172. __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
  2173. else
  2174. __pmdp_csp(pmdp);
  2175. set_pmd(pmdp, new);
  2176. }
  2177. static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
  2178. int purge)
  2179. {
  2180. pmd_t *pmdp;
  2181. struct gmap *gmap;
  2182. unsigned long gaddr;
  2183. rcu_read_lock();
  2184. list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
  2185. spin_lock(&gmap->guest_table_lock);
  2186. pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
  2187. vmaddr >> PMD_SHIFT);
  2188. if (pmdp) {
  2189. gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
  2190. pmdp_notify_gmap(gmap, pmdp, gaddr);
  2191. WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
  2192. _SEGMENT_ENTRY_GMAP_UC));
  2193. if (purge)
  2194. __pmdp_csp(pmdp);
  2195. set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
  2196. }
  2197. spin_unlock(&gmap->guest_table_lock);
  2198. }
  2199. rcu_read_unlock();
  2200. }
  2201. /**
  2202. * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
  2203. * flushing
  2204. * @mm: pointer to the process mm_struct
  2205. * @vmaddr: virtual address in the process address space
  2206. */
  2207. void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
  2208. {
  2209. gmap_pmdp_clear(mm, vmaddr, 0);
  2210. }
  2211. EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
  2212. /**
  2213. * gmap_pmdp_csp - csp all affected guest pmd entries
  2214. * @mm: pointer to the process mm_struct
  2215. * @vmaddr: virtual address in the process address space
  2216. */
  2217. void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
  2218. {
  2219. gmap_pmdp_clear(mm, vmaddr, 1);
  2220. }
  2221. EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
  2222. /**
  2223. * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
  2224. * @mm: pointer to the process mm_struct
  2225. * @vmaddr: virtual address in the process address space
  2226. */
  2227. void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
  2228. {
  2229. unsigned long *entry, gaddr;
  2230. struct gmap *gmap;
  2231. pmd_t *pmdp;
  2232. rcu_read_lock();
  2233. list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
  2234. spin_lock(&gmap->guest_table_lock);
  2235. entry = radix_tree_delete(&gmap->host_to_guest,
  2236. vmaddr >> PMD_SHIFT);
  2237. if (entry) {
  2238. pmdp = (pmd_t *)entry;
  2239. gaddr = __gmap_segment_gaddr(entry);
  2240. pmdp_notify_gmap(gmap, pmdp, gaddr);
  2241. WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
  2242. _SEGMENT_ENTRY_GMAP_UC));
  2243. if (MACHINE_HAS_TLB_GUEST)
  2244. __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
  2245. gmap->asce, IDTE_LOCAL);
  2246. else if (MACHINE_HAS_IDTE)
  2247. __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
  2248. *entry = _SEGMENT_ENTRY_EMPTY;
  2249. }
  2250. spin_unlock(&gmap->guest_table_lock);
  2251. }
  2252. rcu_read_unlock();
  2253. }
  2254. EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
  2255. /**
  2256. * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
  2257. * @mm: pointer to the process mm_struct
  2258. * @vmaddr: virtual address in the process address space
  2259. */
  2260. void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
  2261. {
  2262. unsigned long *entry, gaddr;
  2263. struct gmap *gmap;
  2264. pmd_t *pmdp;
  2265. rcu_read_lock();
  2266. list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
  2267. spin_lock(&gmap->guest_table_lock);
  2268. entry = radix_tree_delete(&gmap->host_to_guest,
  2269. vmaddr >> PMD_SHIFT);
  2270. if (entry) {
  2271. pmdp = (pmd_t *)entry;
  2272. gaddr = __gmap_segment_gaddr(entry);
  2273. pmdp_notify_gmap(gmap, pmdp, gaddr);
  2274. WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
  2275. _SEGMENT_ENTRY_GMAP_UC));
  2276. if (MACHINE_HAS_TLB_GUEST)
  2277. __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
  2278. gmap->asce, IDTE_GLOBAL);
  2279. else if (MACHINE_HAS_IDTE)
  2280. __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
  2281. else
  2282. __pmdp_csp(pmdp);
  2283. *entry = _SEGMENT_ENTRY_EMPTY;
  2284. }
  2285. spin_unlock(&gmap->guest_table_lock);
  2286. }
  2287. rcu_read_unlock();
  2288. }
  2289. EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
  2290. /**
  2291. * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
  2292. * @gmap: pointer to guest address space
  2293. * @pmdp: pointer to the pmd to be tested
  2294. * @gaddr: virtual address in the guest address space
  2295. *
  2296. * This function is assumed to be called with the guest_table_lock
  2297. * held.
  2298. */
  2299. static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
  2300. unsigned long gaddr)
  2301. {
  2302. if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
  2303. return false;
2304. /* Already-protected memory that did not change is clean */
  2305. if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
  2306. !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
  2307. return false;
  2308. /* Clear UC indication and reset protection */
  2309. set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
  2310. gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
  2311. return true;
  2312. }
  2313. /**
  2314. * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
  2315. * @gmap: pointer to guest address space
  2316. * @bitmap: dirty bitmap for this pmd
  2317. * @gaddr: virtual address in the guest address space
  2318. * @vmaddr: virtual address in the host address space
  2319. *
  2320. * This function is assumed to be called with the guest_table_lock
  2321. * held.
  2322. */
  2323. void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
  2324. unsigned long gaddr, unsigned long vmaddr)
  2325. {
  2326. int i;
  2327. pmd_t *pmdp;
  2328. pte_t *ptep;
  2329. spinlock_t *ptl;
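/* A 1 MB segment maps _PAGE_ENTRIES (256) pages; bitmap[4] holds one dirty bit per page. */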
  2330. pmdp = gmap_pmd_op_walk(gmap, gaddr);
  2331. if (!pmdp)
  2332. return;
  2333. if (pmd_leaf(*pmdp)) {
  2334. if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
  2335. bitmap_fill(bitmap, _PAGE_ENTRIES);
  2336. } else {
  2337. for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
  2338. ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
  2339. if (!ptep)
  2340. continue;
  2341. if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
  2342. set_bit(i, bitmap);
  2343. pte_unmap_unlock(ptep, ptl);
  2344. }
  2345. }
  2346. gmap_pmd_op_end(gmap, pmdp);
  2347. }
  2348. EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
  2349. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  2350. static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
  2351. unsigned long end, struct mm_walk *walk)
  2352. {
  2353. struct vm_area_struct *vma = walk->vma;
  2354. split_huge_pmd(vma, pmd, addr);
  2355. return 0;
  2356. }
  2357. static const struct mm_walk_ops thp_split_walk_ops = {
  2358. .pmd_entry = thp_split_walk_pmd_entry,
  2359. .walk_lock = PGWALK_WRLOCK_VERIFY,
  2360. };
  2361. static inline void thp_split_mm(struct mm_struct *mm)
  2362. {
  2363. struct vm_area_struct *vma;
  2364. VMA_ITERATOR(vmi, mm, 0);
  2365. for_each_vma(vmi, vma) {
  2366. vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE);
  2367. walk_page_vma(vma, &thp_split_walk_ops, NULL);
  2368. }
  2369. mm->def_flags |= VM_NOHUGEPAGE;
  2370. }
  2371. #else
  2372. static inline void thp_split_mm(struct mm_struct *mm)
  2373. {
  2374. }
  2375. #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  2376. /*
  2377. * switch on pgstes for its userspace process (for kvm)
  2378. */
  2379. int s390_enable_sie(void)
  2380. {
  2381. struct mm_struct *mm = current->mm;
  2382. /* Do we have pgstes? if yes, we are done */
  2383. if (mm_has_pgste(mm))
  2384. return 0;
  2385. /* Fail if the page tables are 2K */
  2386. if (!mm_alloc_pgste(mm))
  2387. return -EINVAL;
  2388. mmap_write_lock(mm);
  2389. mm->context.has_pgste = 1;
  2390. /* split thp mappings and disable thp for future mappings */
  2391. thp_split_mm(mm);
  2392. mmap_write_unlock(mm);
  2393. return 0;
  2394. }
  2395. EXPORT_SYMBOL_GPL(s390_enable_sie);
  2396. static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
  2397. unsigned long end, struct mm_walk *walk)
  2398. {
  2399. unsigned long *found_addr = walk->private;
2400. /* Return 1 if the page is a zeropage. */
  2401. if (is_zero_pfn(pte_pfn(*pte))) {
  2402. /*
  2403. * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
  2404. * right thing and likely don't care: FAULT_FLAG_UNSHARE
  2405. * currently only works in COW mappings, which is also where
  2406. * mm_forbids_zeropage() is checked.
  2407. */
  2408. if (!is_cow_mapping(walk->vma->vm_flags))
  2409. return -EFAULT;
  2410. *found_addr = addr;
  2411. return 1;
  2412. }
  2413. return 0;
  2414. }
  2415. static const struct mm_walk_ops find_zeropage_ops = {
  2416. .pte_entry = find_zeropage_pte_entry,
  2417. .walk_lock = PGWALK_WRLOCK,
  2418. };
  2419. /*
  2420. * Unshare all shared zeropages, replacing them by anonymous pages. Note that
  2421. * we cannot simply zap all shared zeropages, because this could later
  2422. * trigger unexpected userfaultfd missing events.
  2423. *
  2424. * This must be called after mm->context.allow_cow_sharing was
  2425. * set to 0, to avoid future mappings of shared zeropages.
  2426. *
2427. * The core mm guarantees to s390 that, even if it were to remove a page
2428. * table (so that a racing walk_page_range_vma() calling
2429. * pte_offset_map_lock() would fail), it will never insert a page table
2430. * containing empty zero pages once mm_forbids_zeropage(mm), i.e.
2431. * mm->context.allow_cow_sharing, is set to 0.
  2432. */
  2433. static int __s390_unshare_zeropages(struct mm_struct *mm)
  2434. {
  2435. struct vm_area_struct *vma;
  2436. VMA_ITERATOR(vmi, mm, 0);
  2437. unsigned long addr;
  2438. vm_fault_t fault;
  2439. int rc;
  2440. for_each_vma(vmi, vma) {
  2441. /*
  2442. * We could only look at COW mappings, but it's more future
  2443. * proof to catch unexpected zeropages in other mappings and
  2444. * fail.
  2445. */
  2446. if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
  2447. continue;
  2448. addr = vma->vm_start;
  2449. retry:
  2450. rc = walk_page_range_vma(vma, addr, vma->vm_end,
  2451. &find_zeropage_ops, &addr);
  2452. if (rc < 0)
  2453. return rc;
  2454. else if (!rc)
  2455. continue;
  2456. /* addr was updated by find_zeropage_pte_entry() */
  2457. fault = handle_mm_fault(vma, addr,
  2458. FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
  2459. NULL);
  2460. if (fault & VM_FAULT_OOM)
  2461. return -ENOMEM;
  2462. /*
  2463. * See break_ksm(): even after handle_mm_fault() returned 0, we
  2464. * must start the lookup from the current address, because
  2465. * handle_mm_fault() may back out if there's any difficulty.
  2466. *
  2467. * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
  2468. * maybe they could trigger in the future on concurrent
  2469. * truncation. In that case, the shared zeropage would be gone
  2470. * and we can simply retry and make progress.
  2471. */
  2472. cond_resched();
  2473. goto retry;
  2474. }
  2475. return 0;
  2476. }
  2477. static int __s390_disable_cow_sharing(struct mm_struct *mm)
  2478. {
  2479. int rc;
  2480. if (!mm->context.allow_cow_sharing)
  2481. return 0;
  2482. mm->context.allow_cow_sharing = 0;
  2483. /* Replace all shared zeropages by anonymous pages. */
  2484. rc = __s390_unshare_zeropages(mm);
  2485. /*
  2486. * Make sure to disable KSM (if enabled for the whole process or
  2487. * individual VMAs). Note that nothing currently hinders user space
  2488. * from re-enabling it.
  2489. */
  2490. if (!rc)
  2491. rc = ksm_disable(mm);
  2492. if (rc)
  2493. mm->context.allow_cow_sharing = 1;
  2494. return rc;
  2495. }
  2496. /*
  2497. * Disable most COW-sharing of memory pages for the whole process:
  2498. * (1) Disable KSM and unmerge/unshare any KSM pages.
2499. * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
2500. *
2501. * Note that we currently don't bother with COW-shared pages that are shared
2502. * with parent/child processes due to fork().
  2503. */
  2504. int s390_disable_cow_sharing(void)
  2505. {
  2506. int rc;
  2507. mmap_write_lock(current->mm);
  2508. rc = __s390_disable_cow_sharing(current->mm);
  2509. mmap_write_unlock(current->mm);
  2510. return rc;
  2511. }
  2512. EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
  2513. /*
  2514. * Enable storage key handling from now on and initialize the storage
  2515. * keys with the default key.
  2516. */
  2517. static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
  2518. unsigned long next, struct mm_walk *walk)
  2519. {
  2520. /* Clear storage key */
  2521. ptep_zap_key(walk->mm, addr, pte);
  2522. return 0;
  2523. }
  2524. /*
  2525. * Give a chance to schedule after setting a key to 256 pages.
  2526. * We only hold the mm lock, which is a rwsem and the kvm srcu.
  2527. * Both can sleep.
  2528. */
  2529. static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
  2530. unsigned long next, struct mm_walk *walk)
  2531. {
  2532. cond_resched();
  2533. return 0;
  2534. }
  2535. static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
  2536. unsigned long hmask, unsigned long next,
  2537. struct mm_walk *walk)
  2538. {
  2539. pmd_t *pmd = (pmd_t *)pte;
  2540. unsigned long start, end;
  2541. struct folio *folio = page_folio(pmd_page(*pmd));
  2542. /*
  2543. * The write check makes sure we do not set a key on shared
  2544. * memory. This is needed as the walker does not differentiate
  2545. * between actual guest memory and the process executable or
  2546. * shared libraries.
  2547. */
  2548. if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
  2549. !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
  2550. return 0;
  2551. start = pmd_val(*pmd) & HPAGE_MASK;
  2552. end = start + HPAGE_SIZE;
  2553. __storage_key_init_range(start, end);
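/* Mark the folio so its storage keys are not initialized again (PG_arch_1 tracks storage-key initialization). */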
  2554. set_bit(PG_arch_1, &folio->flags);
  2555. cond_resched();
  2556. return 0;
  2557. }
  2558. static const struct mm_walk_ops enable_skey_walk_ops = {
  2559. .hugetlb_entry = __s390_enable_skey_hugetlb,
  2560. .pte_entry = __s390_enable_skey_pte,
  2561. .pmd_entry = __s390_enable_skey_pmd,
  2562. .walk_lock = PGWALK_WRLOCK,
  2563. };
  2564. int s390_enable_skey(void)
  2565. {
  2566. struct mm_struct *mm = current->mm;
  2567. int rc = 0;
  2568. mmap_write_lock(mm);
  2569. if (mm_uses_skeys(mm))
  2570. goto out_up;
  2571. mm->context.uses_skeys = 1;
  2572. rc = __s390_disable_cow_sharing(mm);
  2573. if (rc) {
  2574. mm->context.uses_skeys = 0;
  2575. goto out_up;
  2576. }
  2577. walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
  2578. out_up:
  2579. mmap_write_unlock(mm);
  2580. return rc;
  2581. }
  2582. EXPORT_SYMBOL_GPL(s390_enable_skey);
  2583. /*
  2584. * Reset CMMA state, make all pages stable again.
  2585. */
  2586. static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
  2587. unsigned long next, struct mm_walk *walk)
  2588. {
  2589. ptep_zap_unused(walk->mm, addr, pte, 1);
  2590. return 0;
  2591. }
  2592. static const struct mm_walk_ops reset_cmma_walk_ops = {
  2593. .pte_entry = __s390_reset_cmma,
  2594. .walk_lock = PGWALK_WRLOCK,
  2595. };
  2596. void s390_reset_cmma(struct mm_struct *mm)
  2597. {
  2598. mmap_write_lock(mm);
  2599. walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
  2600. mmap_write_unlock(mm);
  2601. }
  2602. EXPORT_SYMBOL_GPL(s390_reset_cmma);
  2603. #define GATHER_GET_PAGES 32
  2604. struct reset_walk_state {
  2605. unsigned long next;
  2606. unsigned long count;
  2607. unsigned long pfns[GATHER_GET_PAGES];
  2608. };
  2609. static int s390_gather_pages(pte_t *ptep, unsigned long addr,
  2610. unsigned long next, struct mm_walk *walk)
  2611. {
  2612. struct reset_walk_state *p = walk->private;
  2613. pte_t pte = READ_ONCE(*ptep);
  2614. if (pte_present(pte)) {
  2615. /* we have a reference from the mapping, take an extra one */
  2616. get_page(phys_to_page(pte_val(pte)));
  2617. p->pfns[p->count] = phys_to_pfn(pte_val(pte));
  2618. p->next = next;
  2619. p->count++;
  2620. }
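/* A non-zero return value stops walk_page_range() once the batch is full. */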
  2621. return p->count >= GATHER_GET_PAGES;
  2622. }
  2623. static const struct mm_walk_ops gather_pages_ops = {
  2624. .pte_entry = s390_gather_pages,
  2625. .walk_lock = PGWALK_RDLOCK,
  2626. };
  2627. /*
  2628. * Call the Destroy secure page UVC on each page in the given array of PFNs.
  2629. * Each page needs to have an extra reference, which will be released here.
  2630. */
  2631. void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
  2632. {
  2633. struct folio *folio;
  2634. unsigned long i;
  2635. for (i = 0; i < count; i++) {
  2636. folio = pfn_folio(pfns[i]);
  2637. /* we always have an extra reference */
  2638. uv_destroy_folio(folio);
  2639. /* get rid of the extra reference */
  2640. folio_put(folio);
  2641. cond_resched();
  2642. }
  2643. }
  2644. EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
  2645. /**
  2646. * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
  2647. * in the given range of the given address space.
  2648. * @mm: the mm to operate on
  2649. * @start: the start of the range
  2650. * @end: the end of the range
  2651. * @interruptible: if not 0, stop when a fatal signal is received
  2652. *
  2653. * Walk the given range of the given address space and call the destroy
  2654. * secure page UVC on each page. Optionally exit early if a fatal signal is
  2655. * pending.
  2656. *
  2657. * Return: 0 on success, -EINTR if the function stopped before completing
  2658. */
  2659. int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
  2660. unsigned long end, bool interruptible)
  2661. {
  2662. struct reset_walk_state state = { .next = start };
  2663. int r = 1;
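/*
 * walk_page_range() returns > 0 as long as s390_gather_pages() stops it
 * early with a full batch; 0 means the whole range has been processed.
 */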
  2664. while (r > 0) {
  2665. state.count = 0;
  2666. mmap_read_lock(mm);
  2667. r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
  2668. mmap_read_unlock(mm);
  2669. cond_resched();
  2670. s390_uv_destroy_pfns(state.count, state.pfns);
  2671. if (interruptible && fatal_signal_pending(current))
  2672. return -EINTR;
  2673. }
  2674. return 0;
  2675. }
  2676. EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
  2677. /**
  2678. * s390_unlist_old_asce - Remove the topmost level of page tables from the
  2679. * list of page tables of the gmap.
  2680. * @gmap: the gmap whose table is to be removed
  2681. *
  2682. * On s390x, KVM keeps a list of all pages containing the page tables of the
  2683. * gmap (the CRST list). This list is used at tear down time to free all
  2684. * pages that are now not needed anymore.
  2685. *
  2686. * This function removes the topmost page of the tree (the one pointed to by
  2687. * the ASCE) from the CRST list.
  2688. *
  2689. * This means that it will not be freed when the VM is torn down, and needs
  2690. * to be handled separately by the caller, unless a leak is actually
2691. * intended. Notice that this function will only remove the page from the
2692. * list; the page will still be used as a top level page table (and ASCE).
  2693. */
  2694. void s390_unlist_old_asce(struct gmap *gmap)
  2695. {
  2696. struct page *old;
  2697. old = virt_to_page(gmap->table);
  2698. spin_lock(&gmap->guest_table_lock);
  2699. list_del(&old->lru);
  2700. /*
  2701. * Sometimes the topmost page might need to be "removed" multiple
  2702. * times, for example if the VM is rebooted into secure mode several
  2703. * times concurrently, or if s390_replace_asce fails after calling
  2704. * s390_remove_old_asce and is attempted again later. In that case
  2705. * the old asce has been removed from the list, and therefore it
  2706. * will not be freed when the VM terminates, but the ASCE is still
  2707. * in use and still pointed to.
  2708. * A subsequent call to replace_asce will follow the pointer and try
  2709. * to remove the same page from the list again.
  2710. * Therefore it's necessary that the page of the ASCE has valid
  2711. * pointers, so list_del can work (and do nothing) without
  2712. * dereferencing stale or invalid pointers.
  2713. */
  2714. INIT_LIST_HEAD(&old->lru);
  2715. spin_unlock(&gmap->guest_table_lock);
  2716. }
  2717. EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
  2718. /**
  2719. * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
  2720. * @gmap: the gmap whose ASCE needs to be replaced
  2721. *
2722. * If the ASCE is a SEGMENT type then this function will return -EINVAL;
2723. * replacing it would leave the pointers in the host_to_guest radix tree
2724. * pointing to the wrong pages, causing use-after-free and memory corruption.
  2725. * If the allocation of the new top level page table fails, the ASCE is not
  2726. * replaced.
  2727. * In any case, the old ASCE is always removed from the gmap CRST list.
  2728. * Therefore the caller has to make sure to save a pointer to it
  2729. * beforehand, unless a leak is actually intended.
  2730. */
  2731. int s390_replace_asce(struct gmap *gmap)
  2732. {
  2733. unsigned long asce;
  2734. struct page *page;
  2735. void *table;
  2736. s390_unlist_old_asce(gmap);
  2737. /* Replacing segment type ASCEs would cause serious issues */
  2738. if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
  2739. return -EINVAL;
  2740. page = gmap_alloc_crst();
  2741. if (!page)
  2742. return -ENOMEM;
  2743. page->index = 0;
  2744. table = page_to_virt(page);
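/* Copy the complete top-level table; a CRST table spans 1 << CRST_ALLOC_ORDER pages. */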
  2745. memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
  2746. /*
  2747. * The caller has to deal with the old ASCE, but here we make sure
  2748. * the new one is properly added to the CRST list, so that
  2749. * it will be freed when the VM is torn down.
  2750. */
  2751. spin_lock(&gmap->guest_table_lock);
  2752. list_add(&page->lru, &gmap->crst_list);
  2753. spin_unlock(&gmap->guest_table_lock);
  2754. /* Set new table origin while preserving existing ASCE control bits */
  2755. asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
  2756. WRITE_ONCE(gmap->asce, asce);
  2757. WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
  2758. WRITE_ONCE(gmap->table, table);
  2759. return 0;
  2760. }
  2761. EXPORT_SYMBOL_GPL(s390_replace_asce);