#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "smm.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/reboot.h>
#include <asm/fpu/api.h>

#include <trace/events/ipi.h>

#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to a higher value when updated Revision Guides
 * are published and we know what the new status bits are.
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);

#define X2APIC_MSR(x)	(APIC_BASE_MSR + (x >> 4))

static const struct svm_direct_access_msrs {
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR, .always = true },
	{ .index = MSR_IA32_SYSENTER_CS, .always = true },
	{ .index = MSR_IA32_SYSENTER_EIP, .always = false },
	{ .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE, .always = true },
	{ .index = MSR_FS_BASE, .always = true },
	{ .index = MSR_KERNEL_GS_BASE, .always = true },
	{ .index = MSR_LSTAR, .always = true },
	{ .index = MSR_CSTAR, .always = true },
	{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
	{ .index = MSR_IA32_SPEC_CTRL, .always = false },
	{ .index = MSR_IA32_PRED_CMD, .always = false },
	{ .index = MSR_IA32_FLUSH_CMD, .always = false },
	{ .index = MSR_IA32_DEBUGCTLMSR, .always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
	{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
	{ .index = MSR_IA32_LASTINTTOIP, .always = false },
	{ .index = MSR_IA32_XSS, .always = false },
	{ .index = MSR_EFER, .always = false },
	{ .index = MSR_IA32_CR_PAT, .always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
	{ .index = MSR_TSC_AUX, .always = false },
	{ .index = X2APIC_MSR(APIC_ID), .always = false },
	{ .index = X2APIC_MSR(APIC_LVR), .always = false },
	{ .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
	{ .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
	{ .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
	{ .index = X2APIC_MSR(APIC_EOI), .always = false },
	{ .index = X2APIC_MSR(APIC_RRR), .always = false },
	{ .index = X2APIC_MSR(APIC_LDR), .always = false },
	{ .index = X2APIC_MSR(APIC_DFR), .always = false },
	{ .index = X2APIC_MSR(APIC_SPIV), .always = false },
	{ .index = X2APIC_MSR(APIC_ISR), .always = false },
	{ .index = X2APIC_MSR(APIC_TMR), .always = false },
	{ .index = X2APIC_MSR(APIC_IRR), .always = false },
	{ .index = X2APIC_MSR(APIC_ESR), .always = false },
	{ .index = X2APIC_MSR(APIC_ICR), .always = false },
	{ .index = X2APIC_MSR(APIC_ICR2), .always = false },

	/*
	 * Note:
	 * AMD does not virtualize APIC TSC-deadline timer mode, but it is
	 * emulated by KVM. When setting the APIC LVTT (0x832) register bit 18,
	 * the AVIC hardware would generate a #GP fault. Therefore, always
	 * intercept the MSR 0x832, and do not set up direct_access_msr.
	 */
	{ .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
	{ .index = X2APIC_MSR(APIC_LVTPC), .always = false },
	{ .index = X2APIC_MSR(APIC_LVT0), .always = false },
	{ .index = X2APIC_MSR(APIC_LVT1), .always = false },
	{ .index = X2APIC_MSR(APIC_LVTERR), .always = false },
	{ .index = X2APIC_MSR(APIC_TMICT), .always = false },
	{ .index = X2APIC_MSR(APIC_TMCCT), .always = false },
	{ .index = X2APIC_MSR(APIC_TDCR), .always = false },
	{ .index = MSR_INVALID, .always = false },
};

/*
 * These two parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero, at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *	upper bound on the amount of time a guest is allowed to execute in a
 *	pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from the VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and the pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count-only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/*
 * Use nested page tables by default. Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, 0444);

/* enable/disable Next RIP Save */
int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
int lbrv = true;
module_param(lbrv, int, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
 * enable / disable AVIC.  Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);

bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

bool vnmi = true;
module_param(vnmi, bool, 0444);

static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

DEFINE_PER_CPU(struct svm_cpu_data, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8   */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}
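
/*
 * Worked example (illustrative note, not part of the original source):
 * MSR 0xc0000082 (MSR_LSTAR) falls in the second range (base 0xc0000000),
 * so its byte offset in the permission map is (0x82 / 4) + 1 * MSRS_RANGE_SIZE
 * = 32 + 2048 = 2080, and the returned u32 offset is 2080 / 4 = 520.  The
 * read/write permission bits for that MSR therefore live in msrpm[520]; see
 * set_msr_interception_bitmap() below for how the bit positions are chosen.
 */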
static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);

static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;

	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(vcpu);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				vcpu->arch.efer = old_efer;
				return ret;
			}

			/*
			 * Never intercept #GP for SEV guests, KVM can't
			 * decrypt guest memory to workaround the erratum.
			 */
			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}

static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
					   bool commit_side_effects)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long old_rflags;

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (unlikely(!commit_side_effects))
			old_rflags = svm->vmcb->save.rflags;

		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;

		if (unlikely(!commit_side_effects))
			svm->vmcb->save.rflags = old_rflags;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	if (likely(commit_side_effects))
		svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	return __svm_skip_emulated_instruction(vcpu, true);
}

static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
{
	unsigned long rip, old_rip = kvm_rip_read(vcpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Due to architectural shortcomings, the CPU doesn't always provide
	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
	 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
	 * the instruction even if NextRIP is supported to acquire the next
	 * RIP so that it can be shoved into the NextRIP field, otherwise
	 * hardware will fail to advance guest RIP during event injection.
	 * Drop the exception/interrupt if emulation fails and effectively
	 * retry the instruction, it's the least awful option.  If NRIPS is
	 * in use, the skip must not commit any side effects such as clearing
	 * the interrupt shadow or RFLAGS.RF.
	 */
	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
		return -EIO;

	rip = kvm_rip_read(vcpu);

	/*
	 * Save the injection information, even when using next_rip, as the
	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
	 * doesn't complete due to a VM-Exit occurring while the CPU is
	 * vectoring the event.  Decoding the instruction isn't guaranteed to
	 * work as there may be no backing instruction, e.g. if the event is
	 * being injected by L1 for L2, or if the guest is patching INT3 into
	 * a different instruction.
	 */
	svm->soft_int_injected = true;
	svm->soft_int_csbase = svm->vmcb->save.cs.base;
	svm->soft_int_old_rip = old_rip;
	svm->soft_int_next_rip = rip;

	if (nrips)
		kvm_rip_write(vcpu, old_rip);

	if (static_cpu_has(X86_FEATURE_NRIPS))
		svm->vmcb->control.next_rip = rip;

	return 0;
}

static void svm_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	struct vcpu_svm *svm = to_svm(vcpu);

	kvm_deliver_exception_payload(vcpu, ex);

	if (kvm_exception_is_soft(ex->vector) &&
	    svm_update_soft_interrupt_rip(vcpu))
		return;

	svm->vmcb->control.event_inj = ex->vector
		| SVM_EVTINJ_VALID
		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = ex->error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}
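
/*
 * Added note for clarity: the ~(6ULL) mask above clears bits 1 and 2 of the
 * OSVW status, which is what reports errata 400 and 415 as fixed to the
 * guest per the comment in svm_init_osvw(); bit 0 (erratum 298) is handled
 * separately by the family 0x10 check at the end of that function.
 */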
static bool __kvm_is_svm_supported(void)
{
	int cpu = smp_processor_id();
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON) {
		pr_err("CPU %d isn't AMD or Hygon\n", cpu);
		return false;
	}

	if (!cpu_has(c, X86_FEATURE_SVM)) {
		pr_err("SVM not supported by CPU %d\n", cpu);
		return false;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return false;
	}

	return true;
}

static bool kvm_is_svm_supported(void)
{
	bool supported;

	migrate_disable();
	supported = __kvm_is_svm_supported();
	migrate_enable();

	return supported;
}

static int svm_check_processor_compat(void)
{
	if (!__kvm_is_svm_supported())
		return -EIO;

	return 0;
}

static void __svm_write_tsc_multiplier(u64 multiplier)
{
	if (multiplier == __this_cpu_read(current_tsc_ratio))
		return;

	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
	__this_cpu_write(current_tsc_ratio, multiplier);
}

static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
	return &sd->save_area->host_sev_es_save;
}

static inline void kvm_cpu_svm_disable(void)
{
	uint64_t efer;

	wrmsrl(MSR_VM_HSAVE_PA, 0);
	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME) {
		/*
		 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
		 * NMI aren't blocked.
		 */
		stgi();
		wrmsrl(MSR_EFER, efer & ~EFER_SVME);
	}
}

static void svm_emergency_disable_virtualization_cpu(void)
{
	kvm_rebooting = true;

	kvm_cpu_svm_disable();
}

static void svm_disable_virtualization_cpu(void)
{
	/* Make sure we clean up behind us */
	if (tsc_scaling)
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);

	kvm_cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_enable_virtualization_cpu(void)
{
	struct svm_cpu_data *sd;
	uint64_t efer;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	sd = per_cpu_ptr(&svm_data, me);
	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		/*
		 * Set the default value, even if we don't use TSC scaling,
		 * to avoid leaving a stale value in the MSR.
		 */
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
	}

	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	/*
	 * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type
	 * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests.
	 * Since Linux does not change the value of TSC_AUX once set, prime the
	 * TSC_AUX field now to avoid a RDMSR on every vCPU run.
	 */
	if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
		u32 __maybe_unused msr_hi;

		rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi);
	}

	return 0;
}
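
/*
 * Added note on the per-CPU ASID bookkeeping set up in
 * svm_enable_virtualization_cpu() above: ASIDs up to max_sev_asid are
 * reserved for SEV guests (hence min_asid = max_sev_asid + 1), so ordinary
 * guests on this CPU use ASIDs from min_asid through max_asid.  Priming
 * next_asid past max_asid means the first allocation wraps and starts a
 * fresh, generation-tracked ASID cycle with a TLB flush.
 */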
static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);

	if (!sd->save_area)
		return;

	kfree(sd->sev_vmcbs);
	__free_page(__sme_pa_to_page(sd->save_area_pa));
	sd->save_area_pa = 0;
	sd->save_area = NULL;
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
	struct page *save_area_page;
	int ret = -ENOMEM;

	memset(sd, 0, sizeof(struct svm_cpu_data));
	save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
	if (!save_area_page)
		return ret;

	ret = sev_cpu_init(sd);
	if (ret)
		goto free_save_area;

	sd->save_area = page_address(save_area_page);
	sd->save_area_pa = __sme_page_pa(save_area_page);
	return 0;

free_save_area:
	__free_page(save_area_page);
	return ret;
}

static void set_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);

	recalc_intercepts(svm);
}

static void clr_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.intercepts[INTERCEPT_DR] = 0;

	recalc_intercepts(svm);
}

static int direct_access_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == msr)
			return i;

	return -ENOENT;
}

static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
				     int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int slot = direct_access_msr_slot(msr);

	if (slot == -ENOENT)
		return;

	/* Set the shadow bitmaps to the desired intercept states */
	if (read)
		set_bit(slot, svm->shadow_msr_intercept.read);
	else
		clear_bit(slot, svm->shadow_msr_intercept.read);

	if (write)
		set_bit(slot, svm->shadow_msr_intercept.write);
	else
		clear_bit(slot, svm->shadow_msr_intercept.write);
}

static bool valid_msr_intercept(u32 index)
{
	return direct_access_msr_slot(index) != -ENOENT;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	/*
	 * For the non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For the nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return test_bit(bit_write, &tmp);
}

static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
					u32 msr, int read, int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers, extend the direct_access_msrs list at the
	 * beginning of the file.
	 */
	WARN_ON(!valid_msr_intercept(msr));

	/* Force non-allowed MSRs to trap */
	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
		read = 0;

	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
		write = 0;

	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}
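
/*
 * Illustrative note (not part of the original source): each MSR owns two
 * adjacent bits in the permission map, read then write, and a single u32
 * covers 16 MSRs.  For MSR 0xc0000082 the function above computes
 * bit_read = 2 * (0x82 & 0x0f) = 4 and bit_write = 5 within
 * msrpm[svm_msrpm_offset(0xc0000082)]; a set bit means the access is
 * intercepted, a cleared bit means direct (pass-through) access.
 */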
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
			  int read, int write)
{
	set_shadow_msr_intercept(vcpu, msr, read, write);
	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}

u32 *svm_vcpu_alloc_msrpm(void)
{
	unsigned int order = get_order(MSRPM_SIZE);
	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
	u32 *msrpm;

	if (!pages)
		return NULL;

	msrpm = page_address(pages);
	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));

	return msrpm;
}
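
/*
 * Added note for clarity: the freshly allocated bitmap is filled with 0xff,
 * i.e. every MSR access is intercepted by default.  svm_vcpu_init_msrpm()
 * below then clears the intercepts (read and write) for the MSRs marked
 * .always in direct_access_msrs, giving the guest direct access to them.
 */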
  714. void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
  715. {
  716. int i;
  717. for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
  718. if (!direct_access_msrs[i].always)
  719. continue;
  720. set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
  721. }
  722. }
  723. void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
  724. {
  725. int i;
  726. if (intercept == svm->x2avic_msrs_intercepted)
  727. return;
  728. if (!x2avic_enabled)
  729. return;
  730. for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
  731. int index = direct_access_msrs[i].index;
  732. if ((index < APIC_BASE_MSR) ||
  733. (index > APIC_BASE_MSR + 0xff))
  734. continue;
  735. set_msr_interception(&svm->vcpu, svm->msrpm, index,
  736. !intercept, !intercept);
  737. }
  738. svm->x2avic_msrs_intercepted = intercept;
  739. }
  740. void svm_vcpu_free_msrpm(u32 *msrpm)
  741. {
  742. __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
  743. }
  744. static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
  745. {
  746. struct vcpu_svm *svm = to_svm(vcpu);
  747. u32 i;
  748. /*
  749. * Set intercept permissions for all direct access MSRs again. They
  750. * will automatically get filtered through the MSR filter, so we are
  751. * back in sync after this.
  752. */
  753. for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
  754. u32 msr = direct_access_msrs[i].index;
  755. u32 read = test_bit(i, svm->shadow_msr_intercept.read);
  756. u32 write = test_bit(i, svm->shadow_msr_intercept.write);
  757. set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
  758. }
  759. }
  760. static void add_msr_offset(u32 offset)
  761. {
  762. int i;
  763. for (i = 0; i < MSRPM_OFFSETS; ++i) {
  764. /* Offset already in list? */
  765. if (msrpm_offsets[i] == offset)
  766. return;
  767. /* Slot used by another offset? */
  768. if (msrpm_offsets[i] != MSR_INVALID)
  769. continue;
  770. /* Add offset to list */
  771. msrpm_offsets[i] = offset;
  772. return;
  773. }
  774. /*
  775. * If this BUG triggers the msrpm_offsets table has an overflow. Just
  776. * increase MSRPM_OFFSETS in this case.
  777. */
  778. BUG();
  779. }
  780. static void init_msrpm_offsets(void)
  781. {
  782. int i;
  783. memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
  784. for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
  785. u32 offset;
  786. offset = svm_msrpm_offset(direct_access_msrs[i].index);
  787. BUG_ON(offset == MSR_INVALID);
  788. add_msr_offset(offset);
  789. }
  790. }
  791. void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
  792. {
  793. to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
  794. to_vmcb->save.br_from = from_vmcb->save.br_from;
  795. to_vmcb->save.br_to = from_vmcb->save.br_to;
  796. to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
  797. to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
  798. vmcb_mark_dirty(to_vmcb, VMCB_LBR);
  799. }
  800. void svm_enable_lbrv(struct kvm_vcpu *vcpu)
  801. {
  802. struct vcpu_svm *svm = to_svm(vcpu);
  803. svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
  804. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
  805. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
  806. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
  807. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
  808. if (sev_es_guest(vcpu->kvm))
  809. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1);
  810. /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
  811. if (is_guest_mode(vcpu))
  812. svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
  813. }
  814. static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
  815. {
  816. struct vcpu_svm *svm = to_svm(vcpu);
  817. KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
  818. svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
  819. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
  820. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
  821. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
  822. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
  823. /*
  824. * Move the LBR msrs back to the vmcb01 to avoid copying them
  825. * on nested guest entries.
  826. */
  827. if (is_guest_mode(vcpu))
  828. svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
  829. }
  830. static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
  831. {
  832. /*
  833. * If LBR virtualization is disabled, the LBR MSRs are always kept in
  834. * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
  835. * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
  836. */
  837. return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
  838. svm->vmcb01.ptr;
  839. }
  840. void svm_update_lbrv(struct kvm_vcpu *vcpu)
  841. {
  842. struct vcpu_svm *svm = to_svm(vcpu);
  843. bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
  844. bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
  845. (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
  846. (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
  847. if (enable_lbrv == current_enable_lbrv)
  848. return;
  849. if (enable_lbrv)
  850. svm_enable_lbrv(vcpu);
  851. else
  852. svm_disable_lbrv(vcpu);
  853. }
  854. void disable_nmi_singlestep(struct vcpu_svm *svm)
  855. {
  856. svm->nmi_singlestep = false;
  857. if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
  858. /* Clear our flags if they were not set by the guest */
  859. if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
  860. svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
  861. if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
  862. svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
  863. }
  864. }
  865. static void grow_ple_window(struct kvm_vcpu *vcpu)
  866. {
  867. struct vcpu_svm *svm = to_svm(vcpu);
  868. struct vmcb_control_area *control = &svm->vmcb->control;
  869. int old = control->pause_filter_count;
  870. if (kvm_pause_in_guest(vcpu->kvm))
  871. return;
  872. control->pause_filter_count = __grow_ple_window(old,
  873. pause_filter_count,
  874. pause_filter_count_grow,
  875. pause_filter_count_max);
  876. if (control->pause_filter_count != old) {
  877. vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
  878. trace_kvm_ple_window_update(vcpu->vcpu_id,
  879. control->pause_filter_count, old);
  880. }
  881. }
  882. static void shrink_ple_window(struct kvm_vcpu *vcpu)
  883. {
  884. struct vcpu_svm *svm = to_svm(vcpu);
  885. struct vmcb_control_area *control = &svm->vmcb->control;
  886. int old = control->pause_filter_count;
  887. if (kvm_pause_in_guest(vcpu->kvm))
  888. return;
  889. control->pause_filter_count =
  890. __shrink_ple_window(old,
  891. pause_filter_count,
  892. pause_filter_count_shrink,
  893. pause_filter_count);
  894. if (control->pause_filter_count != old) {
  895. vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
  896. trace_kvm_ple_window_update(vcpu->vcpu_id,
  897. control->pause_filter_count, old);
  898. }
  899. }
  900. static void svm_hardware_unsetup(void)
  901. {
  902. int cpu;
  903. sev_hardware_unsetup();
  904. for_each_possible_cpu(cpu)
  905. svm_cpu_uninit(cpu);
  906. __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
  907. iopm_base = 0;
  908. }
  909. static void init_seg(struct vmcb_seg *seg)
  910. {
  911. seg->selector = 0;
  912. seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
  913. SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
  914. seg->limit = 0xffff;
  915. seg->base = 0;
  916. }
  917. static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
  918. {
  919. seg->selector = 0;
  920. seg->attrib = SVM_SELECTOR_P_MASK | type;
  921. seg->limit = 0xffff;
  922. seg->base = 0;
  923. }
  924. static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
  925. {
  926. struct vcpu_svm *svm = to_svm(vcpu);
  927. return svm->nested.ctl.tsc_offset;
  928. }
  929. static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
  930. {
  931. struct vcpu_svm *svm = to_svm(vcpu);
  932. return svm->tsc_ratio_msr;
  933. }
  934. static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
  935. {
  936. struct vcpu_svm *svm = to_svm(vcpu);
  937. svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
  938. svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
  939. vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
  940. }
  941. void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
  942. {
  943. preempt_disable();
  944. if (to_svm(vcpu)->guest_state_loaded)
  945. __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
  946. preempt_enable();
  947. }
  948. /* Evaluate instruction intercepts that depend on guest CPUID features. */
  949. static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
  950. struct vcpu_svm *svm)
  951. {
  952. /*
  953. * Intercept INVPCID if shadow paging is enabled to sync/free shadow
  954. * roots, or if INVPCID is disabled in the guest to inject #UD.
  955. */
  956. if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
  957. if (!npt_enabled ||
  958. !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
  959. svm_set_intercept(svm, INTERCEPT_INVPCID);
  960. else
  961. svm_clr_intercept(svm, INTERCEPT_INVPCID);
  962. }
  963. if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
  964. if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
  965. svm_clr_intercept(svm, INTERCEPT_RDTSCP);
  966. else
  967. svm_set_intercept(svm, INTERCEPT_RDTSCP);
  968. }
  969. }
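/*
 * Reconfigure the parts of the VMCB that depend on the guest's CPUID model,
 * i.e. SYSENTER MSR handling and virtual VMLOAD/VMSAVE for Intel-compatible
 * vs. AMD-compatible guests.
 */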
  970. static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
  971. {
  972. struct vcpu_svm *svm = to_svm(vcpu);
  973. if (guest_cpuid_is_intel_compatible(vcpu)) {
  974. /*
  975. * We must intercept SYSENTER_EIP and SYSENTER_ESP
  976. * accesses because the processor only stores 32 bits.
  977. * For the same reason we cannot use virtual VMLOAD/VMSAVE.
  978. */
  979. svm_set_intercept(svm, INTERCEPT_VMLOAD);
  980. svm_set_intercept(svm, INTERCEPT_VMSAVE);
  981. svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
  982. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
  983. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
  984. } else {
  985. /*
  986. * If hardware supports Virtual VMLOAD VMSAVE then enable it
  987. * in VMCB and clear intercepts to avoid #VMEXIT.
  988. */
  989. if (vls) {
  990. svm_clr_intercept(svm, INTERCEPT_VMLOAD);
  991. svm_clr_intercept(svm, INTERCEPT_VMSAVE);
  992. svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
  993. }
  994. /* No need to intercept these MSRs */
  995. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
  996. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
  997. }
  998. }
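/*
 * Establish the baseline vmcb01 state for a vCPU: default instruction, CR/DR
 * and exception intercepts, reset values for the segment and descriptor-table
 * registers, and optional features (NPT, AVIC, vNMI, vGIF, SEV) as configured
 * for this VM.
 */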
  999. static void init_vmcb(struct kvm_vcpu *vcpu)
  1000. {
  1001. struct vcpu_svm *svm = to_svm(vcpu);
  1002. struct vmcb *vmcb = svm->vmcb01.ptr;
  1003. struct vmcb_control_area *control = &vmcb->control;
  1004. struct vmcb_save_area *save = &vmcb->save;
  1005. svm_set_intercept(svm, INTERCEPT_CR0_READ);
  1006. svm_set_intercept(svm, INTERCEPT_CR3_READ);
  1007. svm_set_intercept(svm, INTERCEPT_CR4_READ);
  1008. svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
  1009. svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
  1010. svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
  1011. if (!kvm_vcpu_apicv_active(vcpu))
  1012. svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
  1013. set_dr_intercepts(svm);
  1014. set_exception_intercept(svm, PF_VECTOR);
  1015. set_exception_intercept(svm, UD_VECTOR);
  1016. set_exception_intercept(svm, MC_VECTOR);
  1017. set_exception_intercept(svm, AC_VECTOR);
  1018. set_exception_intercept(svm, DB_VECTOR);
  1019. /*
  1020. * Guest access to VMware backdoor ports could legitimately
  1021. * trigger #GP because of TSS I/O permission bitmap.
  1022. * We intercept those #GP and allow access to them anyway
  1023. * as VMware does.
  1024. */
  1025. if (enable_vmware_backdoor)
  1026. set_exception_intercept(svm, GP_VECTOR);
  1027. svm_set_intercept(svm, INTERCEPT_INTR);
  1028. svm_set_intercept(svm, INTERCEPT_NMI);
  1029. if (intercept_smi)
  1030. svm_set_intercept(svm, INTERCEPT_SMI);
  1031. svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
  1032. svm_set_intercept(svm, INTERCEPT_RDPMC);
  1033. svm_set_intercept(svm, INTERCEPT_CPUID);
  1034. svm_set_intercept(svm, INTERCEPT_INVD);
  1035. svm_set_intercept(svm, INTERCEPT_INVLPG);
  1036. svm_set_intercept(svm, INTERCEPT_INVLPGA);
  1037. svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
  1038. svm_set_intercept(svm, INTERCEPT_MSR_PROT);
  1039. svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
  1040. svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
  1041. svm_set_intercept(svm, INTERCEPT_VMRUN);
  1042. svm_set_intercept(svm, INTERCEPT_VMMCALL);
  1043. svm_set_intercept(svm, INTERCEPT_VMLOAD);
  1044. svm_set_intercept(svm, INTERCEPT_VMSAVE);
  1045. svm_set_intercept(svm, INTERCEPT_STGI);
  1046. svm_set_intercept(svm, INTERCEPT_CLGI);
  1047. svm_set_intercept(svm, INTERCEPT_SKINIT);
  1048. svm_set_intercept(svm, INTERCEPT_WBINVD);
  1049. svm_set_intercept(svm, INTERCEPT_XSETBV);
  1050. svm_set_intercept(svm, INTERCEPT_RDPRU);
  1051. svm_set_intercept(svm, INTERCEPT_RSM);
  1052. if (!kvm_mwait_in_guest(vcpu->kvm)) {
  1053. svm_set_intercept(svm, INTERCEPT_MONITOR);
  1054. svm_set_intercept(svm, INTERCEPT_MWAIT);
  1055. }
  1056. if (!kvm_hlt_in_guest(vcpu->kvm))
  1057. svm_set_intercept(svm, INTERCEPT_HLT);
  1058. control->iopm_base_pa = iopm_base;
  1059. control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
  1060. control->int_ctl = V_INTR_MASKING_MASK;
  1061. init_seg(&save->es);
  1062. init_seg(&save->ss);
  1063. init_seg(&save->ds);
  1064. init_seg(&save->fs);
  1065. init_seg(&save->gs);
  1066. save->cs.selector = 0xf000;
  1067. save->cs.base = 0xffff0000;
  1068. /* Executable/Readable Code Segment */
  1069. save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
  1070. SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
  1071. save->cs.limit = 0xffff;
  1072. save->gdtr.base = 0;
  1073. save->gdtr.limit = 0xffff;
  1074. save->idtr.base = 0;
  1075. save->idtr.limit = 0xffff;
  1076. init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
  1077. init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
  1078. if (npt_enabled) {
  1079. /* Setup VMCB for Nested Paging */
  1080. control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
  1081. svm_clr_intercept(svm, INTERCEPT_INVLPG);
  1082. clr_exception_intercept(svm, PF_VECTOR);
  1083. svm_clr_intercept(svm, INTERCEPT_CR3_READ);
  1084. svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
  1085. save->g_pat = vcpu->arch.pat;
  1086. save->cr3 = 0;
  1087. }
  1088. svm->current_vmcb->asid_generation = 0;
  1089. svm->asid = 0;
  1090. svm->nested.vmcb12_gpa = INVALID_GPA;
  1091. svm->nested.last_vmcb12_gpa = INVALID_GPA;
  1092. if (!kvm_pause_in_guest(vcpu->kvm)) {
  1093. control->pause_filter_count = pause_filter_count;
  1094. if (pause_filter_thresh)
  1095. control->pause_filter_thresh = pause_filter_thresh;
  1096. svm_set_intercept(svm, INTERCEPT_PAUSE);
  1097. } else {
  1098. svm_clr_intercept(svm, INTERCEPT_PAUSE);
  1099. }
  1100. svm_recalc_instruction_intercepts(vcpu, svm);
  1101. /*
  1102. * If the host supports V_SPEC_CTRL then disable the interception
  1103. * of MSR_IA32_SPEC_CTRL.
  1104. */
  1105. if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
  1106. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
  1107. if (kvm_vcpu_apicv_active(vcpu))
  1108. avic_init_vmcb(svm, vmcb);
  1109. if (vnmi)
  1110. svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
  1111. if (vgif) {
  1112. svm_clr_intercept(svm, INTERCEPT_STGI);
  1113. svm_clr_intercept(svm, INTERCEPT_CLGI);
  1114. svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
  1115. }
  1116. if (sev_guest(vcpu->kvm))
  1117. sev_init_vmcb(svm);
  1118. svm_hv_init_vmcb(vmcb);
  1119. init_vmcb_after_set_cpuid(vcpu);
  1120. vmcb_mark_all_dirty(vmcb);
  1121. enable_gif(svm);
  1122. }
  1123. static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
  1124. {
  1125. struct vcpu_svm *svm = to_svm(vcpu);
  1126. svm_vcpu_init_msrpm(vcpu, svm->msrpm);
  1127. svm_init_osvw(vcpu);
  1128. vcpu->arch.microcode_version = 0x01000065;
  1129. svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
  1130. svm->nmi_masked = false;
  1131. svm->awaiting_iret_completion = false;
  1132. if (sev_es_guest(vcpu->kvm))
  1133. sev_es_vcpu_reset(svm);
  1134. }
  1135. static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
  1136. {
  1137. struct vcpu_svm *svm = to_svm(vcpu);
  1138. svm->spec_ctrl = 0;
  1139. svm->virt_spec_ctrl = 0;
  1140. if (init_event)
  1141. sev_snp_init_protected_guest_state(vcpu);
  1142. init_vmcb(vcpu);
  1143. if (!init_event)
  1144. __svm_vcpu_reset(vcpu);
  1145. }
  1146. void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
  1147. {
  1148. svm->current_vmcb = target_vmcb;
  1149. svm->vmcb = target_vmcb->ptr;
  1150. }
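/*
 * Allocate per-vCPU SVM state: the vmcb01 page, the MSR permission bitmap
 * and, for SEV-ES guests, a separate VMSA page that will hold the encrypted
 * register state.
 */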
  1151. static int svm_vcpu_create(struct kvm_vcpu *vcpu)
  1152. {
  1153. struct vcpu_svm *svm;
  1154. struct page *vmcb01_page;
  1155. struct page *vmsa_page = NULL;
  1156. int err;
  1157. BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
  1158. svm = to_svm(vcpu);
  1159. err = -ENOMEM;
  1160. vmcb01_page = snp_safe_alloc_page();
  1161. if (!vmcb01_page)
  1162. goto out;
  1163. if (sev_es_guest(vcpu->kvm)) {
  1164. /*
  1165. * SEV-ES guests require a separate VMSA page used to contain
  1166. * the encrypted register state of the guest.
  1167. */
  1168. vmsa_page = snp_safe_alloc_page();
  1169. if (!vmsa_page)
  1170. goto error_free_vmcb_page;
  1171. }
  1172. err = avic_init_vcpu(svm);
  1173. if (err)
  1174. goto error_free_vmsa_page;
  1175. svm->msrpm = svm_vcpu_alloc_msrpm();
  1176. if (!svm->msrpm) {
  1177. err = -ENOMEM;
  1178. goto error_free_vmsa_page;
  1179. }
  1180. svm->x2avic_msrs_intercepted = true;
  1181. svm->vmcb01.ptr = page_address(vmcb01_page);
  1182. svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
  1183. svm_switch_vmcb(svm, &svm->vmcb01);
  1184. if (vmsa_page)
  1185. svm->sev_es.vmsa = page_address(vmsa_page);
  1186. svm->guest_state_loaded = false;
  1187. return 0;
  1188. error_free_vmsa_page:
  1189. if (vmsa_page)
  1190. __free_page(vmsa_page);
  1191. error_free_vmcb_page:
  1192. __free_page(vmcb01_page);
  1193. out:
  1194. return err;
  1195. }
  1196. static void svm_clear_current_vmcb(struct vmcb *vmcb)
  1197. {
  1198. int i;
  1199. for_each_online_cpu(i)
  1200. cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
  1201. }
  1202. static void svm_vcpu_free(struct kvm_vcpu *vcpu)
  1203. {
  1204. struct vcpu_svm *svm = to_svm(vcpu);
  1205. /*
  1206. * The vmcb page can be recycled, causing a false negative in
  1207. * svm_vcpu_load(). So, ensure that no logical CPU has this
  1208. * vmcb page recorded as its current vmcb.
  1209. */
  1210. svm_clear_current_vmcb(svm->vmcb);
  1211. svm_leave_nested(vcpu);
  1212. svm_free_nested(svm);
  1213. sev_free_vcpu(vcpu);
  1214. __free_page(__sme_pa_to_page(svm->vmcb01.pa));
  1215. __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
  1216. }
  1217. static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
  1218. {
  1219. struct vcpu_svm *svm = to_svm(vcpu);
  1220. struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
  1221. if (sev_es_guest(vcpu->kvm))
  1222. sev_es_unmap_ghcb(svm);
  1223. if (svm->guest_state_loaded)
  1224. return;
  1225. /*
  1226. * Save additional host state that will be restored on VMEXIT (sev-es)
  1227. * or subsequent vmload of host save area.
  1228. */
  1229. vmsave(sd->save_area_pa);
  1230. if (sev_es_guest(vcpu->kvm))
  1231. sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));
  1232. if (tsc_scaling)
  1233. __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
  1234. /*
  1235. * TSC_AUX is always virtualized for SEV-ES guests when the feature is
  1236. * available. The user return MSR support is not required in this case
  1237. * because TSC_AUX is restored on #VMEXIT from the host save area
  1238. * (which has been initialized in svm_enable_virtualization_cpu()).
  1239. */
  1240. if (likely(tsc_aux_uret_slot >= 0) &&
  1241. (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
  1242. kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
  1243. svm->guest_state_loaded = true;
  1244. }
  1245. static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
  1246. {
  1247. to_svm(vcpu)->guest_state_loaded = false;
  1248. }
  1249. static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  1250. {
  1251. struct vcpu_svm *svm = to_svm(vcpu);
  1252. struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
  1253. if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
  1254. shrink_ple_window(vcpu);
  1255. if (sd->current_vmcb != svm->vmcb) {
  1256. sd->current_vmcb = svm->vmcb;
  1257. if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT))
  1258. indirect_branch_prediction_barrier();
  1259. }
  1260. if (kvm_vcpu_apicv_active(vcpu))
  1261. avic_vcpu_load(vcpu, cpu);
  1262. }
  1263. static void svm_vcpu_put(struct kvm_vcpu *vcpu)
  1264. {
  1265. if (kvm_vcpu_apicv_active(vcpu))
  1266. avic_vcpu_put(vcpu);
  1267. svm_prepare_host_switch(vcpu);
  1268. ++vcpu->stat.host_state_reload;
  1269. }
  1270. static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
  1271. {
  1272. struct vcpu_svm *svm = to_svm(vcpu);
  1273. unsigned long rflags = svm->vmcb->save.rflags;
  1274. if (svm->nmi_singlestep) {
  1275. /* Hide our flags if they were not set by the guest */
  1276. if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
  1277. rflags &= ~X86_EFLAGS_TF;
  1278. if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
  1279. rflags &= ~X86_EFLAGS_RF;
  1280. }
  1281. return rflags;
  1282. }
  1283. static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
  1284. {
  1285. if (to_svm(vcpu)->nmi_singlestep)
  1286. rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
  1287. /*
  1288. * Any change of EFLAGS.VM is accompanied by a reload of SS
  1289. * (caused by either a task switch or an inter-privilege IRET),
  1290. * so we do not need to update the CPL here.
  1291. */
  1292. to_svm(vcpu)->vmcb->save.rflags = rflags;
  1293. }
  1294. static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
  1295. {
  1296. struct vmcb *vmcb = to_svm(vcpu)->vmcb;
  1297. return sev_es_guest(vcpu->kvm)
  1298. ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
  1299. : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
  1300. }
  1301. static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
  1302. {
  1303. kvm_register_mark_available(vcpu, reg);
  1304. switch (reg) {
  1305. case VCPU_EXREG_PDPTR:
  1306. /*
  1307. * When !npt_enabled, mmu->pdptrs[] is already available since
  1308. * it is always updated per SDM when moving to CRs.
  1309. */
  1310. if (npt_enabled)
  1311. load_pdptrs(vcpu, kvm_read_cr3(vcpu));
  1312. break;
  1313. default:
  1314. KVM_BUG_ON(1, vcpu->kvm);
  1315. }
  1316. }
  1317. static void svm_set_vintr(struct vcpu_svm *svm)
  1318. {
  1319. struct vmcb_control_area *control;
  1320. /*
  1321. * The following fields are ignored when AVIC is enabled
  1322. */
  1323. WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
  1324. svm_set_intercept(svm, INTERCEPT_VINTR);
  1325. /*
  1326. * Recalculating intercepts may have cleared the VINTR intercept. If
  1327. * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
  1328. * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
  1329. * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
  1330. * interrupts will never be unblocked while L2 is running.
  1331. */
  1332. if (!svm_is_intercept(svm, INTERCEPT_VINTR))
  1333. return;
  1334. /*
  1335. * This is just a dummy VINTR to actually cause a vmexit to happen.
  1336. * Actual injection of virtual interrupts happens through EVENTINJ.
  1337. */
  1338. control = &svm->vmcb->control;
  1339. control->int_vector = 0x0;
  1340. control->int_ctl &= ~V_INTR_PRIO_MASK;
  1341. control->int_ctl |= V_IRQ_MASK |
  1342. ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
  1343. vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
  1344. }
  1345. static void svm_clear_vintr(struct vcpu_svm *svm)
  1346. {
  1347. svm_clr_intercept(svm, INTERCEPT_VINTR);
  1348. /* Drop int_ctl fields related to VINTR injection. */
  1349. svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
  1350. if (is_guest_mode(&svm->vcpu)) {
  1351. svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
  1352. WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
  1353. (svm->nested.ctl.int_ctl & V_TPR_MASK));
  1354. svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
  1355. V_IRQ_INJECTION_BITS_MASK;
  1356. svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
  1357. }
  1358. vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
  1359. }
  1360. static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
  1361. {
  1362. struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
  1363. struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
  1364. switch (seg) {
  1365. case VCPU_SREG_CS: return &save->cs;
  1366. case VCPU_SREG_DS: return &save->ds;
  1367. case VCPU_SREG_ES: return &save->es;
  1368. case VCPU_SREG_FS: return &save01->fs;
  1369. case VCPU_SREG_GS: return &save01->gs;
  1370. case VCPU_SREG_SS: return &save->ss;
  1371. case VCPU_SREG_TR: return &save01->tr;
  1372. case VCPU_SREG_LDTR: return &save01->ldtr;
  1373. }
  1374. BUG();
  1375. return NULL;
  1376. }
  1377. static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
  1378. {
  1379. struct vmcb_seg *s = svm_seg(vcpu, seg);
  1380. return s->base;
  1381. }
  1382. static void svm_get_segment(struct kvm_vcpu *vcpu,
  1383. struct kvm_segment *var, int seg)
  1384. {
  1385. struct vmcb_seg *s = svm_seg(vcpu, seg);
  1386. var->base = s->base;
  1387. var->limit = s->limit;
  1388. var->selector = s->selector;
  1389. var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
  1390. var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
  1391. var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
  1392. var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
  1393. var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
  1394. var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
  1395. var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
  1396. /*
  1397. * AMD CPUs circa 2014 track the G bit for all segments except CS.
  1398. * However, the SVM spec states that the G bit is not observed by the
  1399. * CPU, and some VMware virtual CPUs drop the G bit for all segments.
  1400. * So let's synthesize a legal G bit for all segments, this helps
  1401. * running KVM nested. It also helps cross-vendor migration, because
  1402. * Intel's vmentry has a check on the 'G' bit.
  1403. */
  1404. var->g = s->limit > 0xfffff;
	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
	 * for cross-vendor migration purposes by deriving it from "not present".
	 */
  1409. var->unusable = !var->present;
  1410. switch (seg) {
  1411. case VCPU_SREG_TR:
  1412. /*
  1413. * Work around a bug where the busy flag in the tr selector
  1414. * isn't exposed
  1415. */
  1416. var->type |= 0x2;
  1417. break;
  1418. case VCPU_SREG_DS:
  1419. case VCPU_SREG_ES:
  1420. case VCPU_SREG_FS:
  1421. case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache; although it can be cleared in the
		 * descriptor itself, the cached bit always remains 1.
		 * Since Intel checks this on VM-entry, set it here to
		 * support cross-vendor migration.
		 */
  1429. if (!var->unusable)
  1430. var->type |= 0x1;
  1431. break;
  1432. case VCPU_SREG_SS:
  1433. /*
  1434. * On AMD CPUs sometimes the DB bit in the segment
  1435. * descriptor is left as 1, although the whole segment has
  1436. * been made unusable. Clear it here to pass an Intel VMX
  1437. * entry check when cross vendor migrating.
  1438. */
  1439. if (var->unusable)
  1440. var->db = 0;
  1441. /* This is symmetric with svm_set_segment() */
  1442. var->dpl = to_svm(vcpu)->vmcb->save.cpl;
  1443. break;
  1444. }
  1445. }
  1446. static int svm_get_cpl(struct kvm_vcpu *vcpu)
  1447. {
  1448. struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
  1449. return save->cpl;
  1450. }
  1451. static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
  1452. {
  1453. struct kvm_segment cs;
  1454. svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
  1455. *db = cs.db;
  1456. *l = cs.l;
  1457. }
  1458. static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
  1459. {
  1460. struct vcpu_svm *svm = to_svm(vcpu);
  1461. dt->size = svm->vmcb->save.idtr.limit;
  1462. dt->address = svm->vmcb->save.idtr.base;
  1463. }
static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.idtr.limit = dt->size;
	svm->vmcb->save.idtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
  1471. static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
  1472. {
  1473. struct vcpu_svm *svm = to_svm(vcpu);
  1474. dt->size = svm->vmcb->save.gdtr.limit;
  1475. dt->address = svm->vmcb->save.gdtr.base;
  1476. }
static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.gdtr.limit = dt->size;
	svm->vmcb->save.gdtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
  1484. static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
  1485. {
  1486. struct vcpu_svm *svm = to_svm(vcpu);
  1487. /*
  1488. * For guests that don't set guest_state_protected, the cr3 update is
  1489. * handled via kvm_mmu_load() while entering the guest. For guests
  1490. * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
  1491. * VMCB save area now, since the save area will become the initial
  1492. * contents of the VMSA, and future VMCB save area updates won't be
  1493. * seen.
  1494. */
  1495. if (sev_es_guest(vcpu->kvm)) {
  1496. svm->vmcb->save.cr3 = cr3;
  1497. vmcb_mark_dirty(svm->vmcb, VMCB_CR);
  1498. }
  1499. }
  1500. static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
  1501. {
  1502. return true;
  1503. }
  1504. void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
  1505. {
  1506. struct vcpu_svm *svm = to_svm(vcpu);
  1507. u64 hcr0 = cr0;
  1508. bool old_paging = is_paging(vcpu);
  1509. #ifdef CONFIG_X86_64
  1510. if (vcpu->arch.efer & EFER_LME) {
  1511. if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
  1512. vcpu->arch.efer |= EFER_LMA;
  1513. if (!vcpu->arch.guest_state_protected)
  1514. svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
  1515. }
  1516. if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
  1517. vcpu->arch.efer &= ~EFER_LMA;
  1518. if (!vcpu->arch.guest_state_protected)
  1519. svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
  1520. }
  1521. }
  1522. #endif
  1523. vcpu->arch.cr0 = cr0;
  1524. if (!npt_enabled) {
  1525. hcr0 |= X86_CR0_PG | X86_CR0_WP;
  1526. if (old_paging != is_paging(vcpu))
  1527. svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
  1528. }
	/*
	 * Re-enable caching here because the QEMU BIOS does not do it;
	 * leaving CD/NW set would noticeably delay reboot.
	 */
  1534. if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
  1535. hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
  1536. svm->vmcb->save.cr0 = hcr0;
  1537. vmcb_mark_dirty(svm->vmcb, VMCB_CR);
  1538. /*
  1539. * SEV-ES guests must always keep the CR intercepts cleared. CR
  1540. * tracking is done using the CR write traps.
  1541. */
  1542. if (sev_es_guest(vcpu->kvm))
  1543. return;
  1544. if (hcr0 == cr0) {
  1545. /* Selective CR0 write remains on. */
  1546. svm_clr_intercept(svm, INTERCEPT_CR0_READ);
  1547. svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
  1548. } else {
  1549. svm_set_intercept(svm, INTERCEPT_CR0_READ);
  1550. svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
  1551. }
  1552. }
  1553. static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  1554. {
  1555. return true;
  1556. }
  1557. void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
  1558. {
  1559. unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
  1560. unsigned long old_cr4 = vcpu->arch.cr4;
  1561. if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
  1562. svm_flush_tlb_current(vcpu);
  1563. vcpu->arch.cr4 = cr4;
  1564. if (!npt_enabled) {
  1565. cr4 |= X86_CR4_PAE;
  1566. if (!is_paging(vcpu))
  1567. cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
  1568. }
  1569. cr4 |= host_cr4_mce;
  1570. to_svm(vcpu)->vmcb->save.cr4 = cr4;
  1571. vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
  1572. if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
  1573. kvm_update_cpuid_runtime(vcpu);
  1574. }
  1575. static void svm_set_segment(struct kvm_vcpu *vcpu,
  1576. struct kvm_segment *var, int seg)
  1577. {
  1578. struct vcpu_svm *svm = to_svm(vcpu);
  1579. struct vmcb_seg *s = svm_seg(vcpu, seg);
  1580. s->base = var->base;
  1581. s->limit = var->limit;
  1582. s->selector = var->selector;
  1583. s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
  1584. s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
  1585. s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
  1586. s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
  1587. s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
  1588. s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
  1589. s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
  1590. s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
  1591. /*
  1592. * This is always accurate, except if SYSRET returned to a segment
  1593. * with SS.DPL != 3. Intel does not have this quirk, and always
  1594. * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
  1595. * would entail passing the CPL to userspace and back.
  1596. */
  1597. if (seg == VCPU_SREG_SS)
  1598. /* This is symmetric with svm_get_segment() */
  1599. svm->vmcb->save.cpl = (var->dpl & 3);
  1600. vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
  1601. }
  1602. static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
  1603. {
  1604. struct vcpu_svm *svm = to_svm(vcpu);
  1605. clr_exception_intercept(svm, BP_VECTOR);
  1606. if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
  1607. if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
  1608. set_exception_intercept(svm, BP_VECTOR);
  1609. }
  1610. }
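/*
 * Assign the next free ASID on this physical CPU. When the ASID space is
 * exhausted, bump the generation, restart from min_asid and request a flush
 * of all ASIDs so stale translations cannot be reused.
 */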
  1611. static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
  1612. {
  1613. if (sd->next_asid > sd->max_asid) {
  1614. ++sd->asid_generation;
  1615. sd->next_asid = sd->min_asid;
  1616. svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
  1617. vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
  1618. }
  1619. svm->current_vmcb->asid_generation = sd->asid_generation;
  1620. svm->asid = sd->next_asid++;
  1621. }
  1622. static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
  1623. {
  1624. struct vmcb *vmcb = to_svm(vcpu)->vmcb;
  1625. if (vcpu->arch.guest_state_protected)
  1626. return;
  1627. if (unlikely(value != vmcb->save.dr6)) {
  1628. vmcb->save.dr6 = value;
  1629. vmcb_mark_dirty(vmcb, VMCB_DR);
  1630. }
  1631. }
  1632. static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
  1633. {
  1634. struct vcpu_svm *svm = to_svm(vcpu);
  1635. if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm)))
  1636. return;
  1637. get_debugreg(vcpu->arch.db[0], 0);
  1638. get_debugreg(vcpu->arch.db[1], 1);
  1639. get_debugreg(vcpu->arch.db[2], 2);
  1640. get_debugreg(vcpu->arch.db[3], 3);
  1641. /*
  1642. * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
  1643. * because db_interception might need it. We can do it before vmentry.
  1644. */
  1645. vcpu->arch.dr6 = svm->vmcb->save.dr6;
  1646. vcpu->arch.dr7 = svm->vmcb->save.dr7;
  1647. vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
  1648. set_dr_intercepts(svm);
  1649. }
  1650. static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
  1651. {
  1652. struct vcpu_svm *svm = to_svm(vcpu);
  1653. if (vcpu->arch.guest_state_protected)
  1654. return;
  1655. svm->vmcb->save.dr7 = value;
  1656. vmcb_mark_dirty(svm->vmcb, VMCB_DR);
  1657. }
  1658. static int pf_interception(struct kvm_vcpu *vcpu)
  1659. {
  1660. struct vcpu_svm *svm = to_svm(vcpu);
  1661. u64 fault_address = svm->vmcb->control.exit_info_2;
  1662. u64 error_code = svm->vmcb->control.exit_info_1;
  1663. return kvm_handle_page_fault(vcpu, error_code, fault_address,
  1664. static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
  1665. svm->vmcb->control.insn_bytes : NULL,
  1666. svm->vmcb->control.insn_len);
  1667. }
  1668. static int npf_interception(struct kvm_vcpu *vcpu)
  1669. {
  1670. struct vcpu_svm *svm = to_svm(vcpu);
  1671. int rc;
  1672. u64 fault_address = svm->vmcb->control.exit_info_2;
  1673. u64 error_code = svm->vmcb->control.exit_info_1;
	/*
	 * WARN if hardware generates a fault with an error code that collides
	 * with KVM-defined synthetic flags. Clear the flags and continue on,
	 * i.e. don't terminate the VM, as KVM can't possibly be relying on a
	 * flag that KVM doesn't know about.
	 */
  1680. if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
  1681. error_code &= ~PFERR_SYNTHETIC_MASK;
  1682. if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK))
  1683. error_code |= PFERR_PRIVATE_ACCESS;
  1684. trace_kvm_page_fault(vcpu, fault_address, error_code);
  1685. rc = kvm_mmu_page_fault(vcpu, fault_address, error_code,
  1686. static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
  1687. svm->vmcb->control.insn_bytes : NULL,
  1688. svm->vmcb->control.insn_len);
  1689. if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK)
  1690. sev_handle_rmp_fault(vcpu, fault_address, error_code);
  1691. return rc;
  1692. }
  1693. static int db_interception(struct kvm_vcpu *vcpu)
  1694. {
  1695. struct kvm_run *kvm_run = vcpu->run;
  1696. struct vcpu_svm *svm = to_svm(vcpu);
  1697. if (!(vcpu->guest_debug &
  1698. (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
  1699. !svm->nmi_singlestep) {
  1700. u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
  1701. kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
  1702. return 1;
  1703. }
  1704. if (svm->nmi_singlestep) {
  1705. disable_nmi_singlestep(svm);
  1706. /* Make sure we check for pending NMIs upon entry */
  1707. kvm_make_request(KVM_REQ_EVENT, vcpu);
  1708. }
  1709. if (vcpu->guest_debug &
  1710. (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
  1711. kvm_run->exit_reason = KVM_EXIT_DEBUG;
  1712. kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
  1713. kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
  1714. kvm_run->debug.arch.pc =
  1715. svm->vmcb->save.cs.base + svm->vmcb->save.rip;
  1716. kvm_run->debug.arch.exception = DB_VECTOR;
  1717. return 0;
  1718. }
  1719. return 1;
  1720. }
  1721. static int bp_interception(struct kvm_vcpu *vcpu)
  1722. {
  1723. struct vcpu_svm *svm = to_svm(vcpu);
  1724. struct kvm_run *kvm_run = vcpu->run;
  1725. kvm_run->exit_reason = KVM_EXIT_DEBUG;
  1726. kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
  1727. kvm_run->debug.arch.exception = BP_VECTOR;
  1728. return 0;
  1729. }
  1730. static int ud_interception(struct kvm_vcpu *vcpu)
  1731. {
  1732. return handle_ud(vcpu);
  1733. }
  1734. static int ac_interception(struct kvm_vcpu *vcpu)
  1735. {
  1736. kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
  1737. return 1;
  1738. }
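/*
 * Check whether a machine check matches the signature of AMD erratum 383
 * (MC0_STATUS 0xb600000000010015, ignoring bit 62). If it does, scrub the
 * MCi_STATUS banks and flush the TLB so the host can continue; the caller
 * then terminates the guest.
 */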
  1739. static bool is_erratum_383(void)
  1740. {
  1741. int err, i;
  1742. u64 value;
  1743. if (!erratum_383_found)
  1744. return false;
  1745. value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
  1746. if (err)
  1747. return false;
  1748. /* Bit 62 may or may not be set for this mce */
  1749. value &= ~(1ULL << 62);
  1750. if (value != 0xb600000000010015ULL)
  1751. return false;
  1752. /* Clear MCi_STATUS registers */
  1753. for (i = 0; i < 6; ++i)
  1754. native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
  1755. value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
  1756. if (!err) {
  1757. u32 low, high;
  1758. value &= ~(1ULL << 2);
  1759. low = lower_32_bits(value);
  1760. high = upper_32_bits(value);
  1761. native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
  1762. }
  1763. /* Flush tlb to evict multi-match entries */
  1764. __flush_tlb_all();
  1765. return true;
  1766. }
  1767. static void svm_handle_mce(struct kvm_vcpu *vcpu)
  1768. {
  1769. if (is_erratum_383()) {
  1770. /*
  1771. * Erratum 383 triggered. Guest state is corrupt so kill the
  1772. * guest.
  1773. */
  1774. pr_err("Guest triggered AMD Erratum 383\n");
  1775. kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
  1776. return;
  1777. }
  1778. /*
  1779. * On an #MC intercept the MCE handler is not called automatically in
  1780. * the host. So do it by hand here.
  1781. */
  1782. kvm_machine_check();
  1783. }
  1784. static int mc_interception(struct kvm_vcpu *vcpu)
  1785. {
  1786. return 1;
  1787. }
  1788. static int shutdown_interception(struct kvm_vcpu *vcpu)
  1789. {
  1790. struct kvm_run *kvm_run = vcpu->run;
  1791. struct vcpu_svm *svm = to_svm(vcpu);
	/*
	 * The VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to
	 * put the VMCB in a known good state. Unfortunately, KVM doesn't have
	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
	 * userspace. From a platform point of view, INIT is acceptable
	 * behavior as there exist bare metal platforms that automatically
	 * INIT the CPU in response to shutdown.
	 *
	 * The VM save area for SEV-ES guests has already been encrypted so it
	 * cannot be reinitialized, i.e. synthesizing INIT is futile.
	 */
  1803. if (!sev_es_guest(vcpu->kvm)) {
  1804. clear_page(svm->vmcb);
  1805. kvm_vcpu_reset(vcpu, true);
  1806. }
  1807. kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
  1808. return 0;
  1809. }
  1810. static int io_interception(struct kvm_vcpu *vcpu)
  1811. {
  1812. struct vcpu_svm *svm = to_svm(vcpu);
  1813. u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
  1814. int size, in, string;
  1815. unsigned port;
  1816. ++vcpu->stat.io_exits;
  1817. string = (io_info & SVM_IOIO_STR_MASK) != 0;
  1818. in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
  1819. port = io_info >> 16;
  1820. size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
  1821. if (string) {
  1822. if (sev_es_guest(vcpu->kvm))
  1823. return sev_es_string_io(svm, size, port, in);
  1824. else
  1825. return kvm_emulate_instruction(vcpu, 0);
  1826. }
  1827. svm->next_rip = svm->vmcb->control.exit_info_2;
  1828. return kvm_fast_pio(vcpu, size, port, in);
  1829. }
  1830. static int nmi_interception(struct kvm_vcpu *vcpu)
  1831. {
  1832. return 1;
  1833. }
  1834. static int smi_interception(struct kvm_vcpu *vcpu)
  1835. {
  1836. return 1;
  1837. }
  1838. static int intr_interception(struct kvm_vcpu *vcpu)
  1839. {
  1840. ++vcpu->stat.irq_exits;
  1841. return 1;
  1842. }
  1843. static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
  1844. {
  1845. struct vcpu_svm *svm = to_svm(vcpu);
  1846. struct vmcb *vmcb12;
  1847. struct kvm_host_map map;
  1848. int ret;
  1849. if (nested_svm_check_permissions(vcpu))
  1850. return 1;
  1851. ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
  1852. if (ret) {
  1853. if (ret == -EINVAL)
  1854. kvm_inject_gp(vcpu, 0);
  1855. return 1;
  1856. }
  1857. vmcb12 = map.hva;
  1858. ret = kvm_skip_emulated_instruction(vcpu);
  1859. if (vmload) {
  1860. svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
  1861. svm->sysenter_eip_hi = 0;
  1862. svm->sysenter_esp_hi = 0;
  1863. } else {
  1864. svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
  1865. }
  1866. kvm_vcpu_unmap(vcpu, &map, true);
  1867. return ret;
  1868. }
  1869. static int vmload_interception(struct kvm_vcpu *vcpu)
  1870. {
  1871. return vmload_vmsave_interception(vcpu, true);
  1872. }
  1873. static int vmsave_interception(struct kvm_vcpu *vcpu)
  1874. {
  1875. return vmload_vmsave_interception(vcpu, false);
  1876. }
  1877. static int vmrun_interception(struct kvm_vcpu *vcpu)
  1878. {
  1879. if (nested_svm_check_permissions(vcpu))
  1880. return 1;
  1881. return nested_svm_vmrun(vcpu);
  1882. }
  1883. enum {
  1884. NONE_SVM_INSTR,
  1885. SVM_INSTR_VMRUN,
  1886. SVM_INSTR_VMLOAD,
  1887. SVM_INSTR_VMSAVE,
  1888. };
  1889. /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
  1890. static int svm_instr_opcode(struct kvm_vcpu *vcpu)
  1891. {
  1892. struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
  1893. if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
  1894. return NONE_SVM_INSTR;
  1895. switch (ctxt->modrm) {
  1896. case 0xd8: /* VMRUN */
  1897. return SVM_INSTR_VMRUN;
  1898. case 0xda: /* VMLOAD */
  1899. return SVM_INSTR_VMLOAD;
  1900. case 0xdb: /* VMSAVE */
  1901. return SVM_INSTR_VMSAVE;
  1902. default:
  1903. break;
  1904. }
  1905. return NONE_SVM_INSTR;
  1906. }
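/*
 * Emulate a VMRUN/VMLOAD/VMSAVE that arrived via #GP interception. If L2
 * executed the instruction, reflect the corresponding VMEXIT to L1 instead
 * of emulating it directly.
 */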
  1907. static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
  1908. {
  1909. const int guest_mode_exit_codes[] = {
  1910. [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
  1911. [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
  1912. [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
  1913. };
  1914. int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
  1915. [SVM_INSTR_VMRUN] = vmrun_interception,
  1916. [SVM_INSTR_VMLOAD] = vmload_interception,
  1917. [SVM_INSTR_VMSAVE] = vmsave_interception,
  1918. };
  1919. struct vcpu_svm *svm = to_svm(vcpu);
  1920. int ret;
  1921. if (is_guest_mode(vcpu)) {
  1922. /* Returns '1' or -errno on failure, '0' on success. */
  1923. ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
  1924. if (ret)
  1925. return ret;
  1926. return 1;
  1927. }
  1928. return svm_instr_handlers[opcode](vcpu);
  1929. }
/*
 * #GP handling code. Note that #GP can be triggered under the following two
 * cases:
 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
 *    some AMD CPUs when the EAX of these instructions is in a reserved memory
 *    region (e.g. SMM memory on the host).
 * 2) VMware backdoor
 */
  1938. static int gp_interception(struct kvm_vcpu *vcpu)
  1939. {
  1940. struct vcpu_svm *svm = to_svm(vcpu);
  1941. u32 error_code = svm->vmcb->control.exit_info_1;
  1942. int opcode;
  1943. /* Both #GP cases have zero error_code */
  1944. if (error_code)
  1945. goto reinject;
  1946. /* Decode the instruction for usage later */
  1947. if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
  1948. goto reinject;
  1949. opcode = svm_instr_opcode(vcpu);
  1950. if (opcode == NONE_SVM_INSTR) {
  1951. if (!enable_vmware_backdoor)
  1952. goto reinject;
  1953. /*
  1954. * VMware backdoor emulation on #GP interception only handles
  1955. * IN{S}, OUT{S}, and RDPMC.
  1956. */
  1957. if (!is_guest_mode(vcpu))
  1958. return kvm_emulate_instruction(vcpu,
  1959. EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
  1960. } else {
  1961. /* All SVM instructions expect page aligned RAX */
  1962. if (svm->vmcb->save.rax & ~PAGE_MASK)
  1963. goto reinject;
  1964. return emulate_svm_instr(vcpu, opcode);
  1965. }
  1966. reinject:
  1967. kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
  1968. return 1;
  1969. }
  1970. void svm_set_gif(struct vcpu_svm *svm, bool value)
  1971. {
  1972. if (value) {
  1973. /*
  1974. * If VGIF is enabled, the STGI intercept is only added to
  1975. * detect the opening of the SMI/NMI window; remove it now.
  1976. * Likewise, clear the VINTR intercept, we will set it
  1977. * again while processing KVM_REQ_EVENT if needed.
  1978. */
  1979. if (vgif)
  1980. svm_clr_intercept(svm, INTERCEPT_STGI);
  1981. if (svm_is_intercept(svm, INTERCEPT_VINTR))
  1982. svm_clear_vintr(svm);
  1983. enable_gif(svm);
  1984. if (svm->vcpu.arch.smi_pending ||
  1985. svm->vcpu.arch.nmi_pending ||
  1986. kvm_cpu_has_injectable_intr(&svm->vcpu) ||
  1987. kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
  1988. kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
  1989. } else {
  1990. disable_gif(svm);
  1991. /*
  1992. * After a CLGI no interrupts should come. But if vGIF is
  1993. * in use, we still rely on the VINTR intercept (rather than
  1994. * STGI) to detect an open interrupt window.
  1995. */
  1996. if (!vgif)
  1997. svm_clear_vintr(svm);
  1998. }
  1999. }
  2000. static int stgi_interception(struct kvm_vcpu *vcpu)
  2001. {
  2002. int ret;
  2003. if (nested_svm_check_permissions(vcpu))
  2004. return 1;
  2005. ret = kvm_skip_emulated_instruction(vcpu);
  2006. svm_set_gif(to_svm(vcpu), true);
  2007. return ret;
  2008. }
  2009. static int clgi_interception(struct kvm_vcpu *vcpu)
  2010. {
  2011. int ret;
  2012. if (nested_svm_check_permissions(vcpu))
  2013. return 1;
  2014. ret = kvm_skip_emulated_instruction(vcpu);
  2015. svm_set_gif(to_svm(vcpu), false);
  2016. return ret;
  2017. }
  2018. static int invlpga_interception(struct kvm_vcpu *vcpu)
  2019. {
  2020. gva_t gva = kvm_rax_read(vcpu);
  2021. u32 asid = kvm_rcx_read(vcpu);
  2022. /* FIXME: Handle an address size prefix. */
  2023. if (!is_long_mode(vcpu))
  2024. gva = (u32)gva;
  2025. trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
  2026. /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
  2027. kvm_mmu_invlpg(vcpu, gva);
  2028. return kvm_skip_emulated_instruction(vcpu);
  2029. }
  2030. static int skinit_interception(struct kvm_vcpu *vcpu)
  2031. {
  2032. trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
  2033. kvm_queue_exception(vcpu, UD_VECTOR);
  2034. return 1;
  2035. }
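/*
 * Emulate an intercepted task switch: derive the switch reason (IRET, far
 * JMP, IDT gate or CALL) and any pending event/error code from exit_int_info
 * and exit_info, then hand off to the common task switch emulation.
 */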
  2036. static int task_switch_interception(struct kvm_vcpu *vcpu)
  2037. {
  2038. struct vcpu_svm *svm = to_svm(vcpu);
  2039. u16 tss_selector;
  2040. int reason;
  2041. int int_type = svm->vmcb->control.exit_int_info &
  2042. SVM_EXITINTINFO_TYPE_MASK;
  2043. int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
  2044. uint32_t type =
  2045. svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
  2046. uint32_t idt_v =
  2047. svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
  2048. bool has_error_code = false;
  2049. u32 error_code = 0;
  2050. tss_selector = (u16)svm->vmcb->control.exit_info_1;
  2051. if (svm->vmcb->control.exit_info_2 &
  2052. (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
  2053. reason = TASK_SWITCH_IRET;
  2054. else if (svm->vmcb->control.exit_info_2 &
  2055. (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
  2056. reason = TASK_SWITCH_JMP;
  2057. else if (idt_v)
  2058. reason = TASK_SWITCH_GATE;
  2059. else
  2060. reason = TASK_SWITCH_CALL;
  2061. if (reason == TASK_SWITCH_GATE) {
  2062. switch (type) {
  2063. case SVM_EXITINTINFO_TYPE_NMI:
  2064. vcpu->arch.nmi_injected = false;
  2065. break;
  2066. case SVM_EXITINTINFO_TYPE_EXEPT:
  2067. if (svm->vmcb->control.exit_info_2 &
  2068. (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
  2069. has_error_code = true;
  2070. error_code =
  2071. (u32)svm->vmcb->control.exit_info_2;
  2072. }
  2073. kvm_clear_exception_queue(vcpu);
  2074. break;
  2075. case SVM_EXITINTINFO_TYPE_INTR:
  2076. case SVM_EXITINTINFO_TYPE_SOFT:
  2077. kvm_clear_interrupt_queue(vcpu);
  2078. break;
  2079. default:
  2080. break;
  2081. }
  2082. }
  2083. if (reason != TASK_SWITCH_GATE ||
  2084. int_type == SVM_EXITINTINFO_TYPE_SOFT ||
  2085. (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
  2086. (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
  2087. if (!svm_skip_emulated_instruction(vcpu))
  2088. return 0;
  2089. }
  2090. if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
  2091. int_vec = -1;
  2092. return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
  2093. has_error_code, error_code);
  2094. }
  2095. static void svm_clr_iret_intercept(struct vcpu_svm *svm)
  2096. {
  2097. if (!sev_es_guest(svm->vcpu.kvm))
  2098. svm_clr_intercept(svm, INTERCEPT_IRET);
  2099. }
  2100. static void svm_set_iret_intercept(struct vcpu_svm *svm)
  2101. {
  2102. if (!sev_es_guest(svm->vcpu.kvm))
  2103. svm_set_intercept(svm, INTERCEPT_IRET);
  2104. }
  2105. static int iret_interception(struct kvm_vcpu *vcpu)
  2106. {
  2107. struct vcpu_svm *svm = to_svm(vcpu);
  2108. WARN_ON_ONCE(sev_es_guest(vcpu->kvm));
  2109. ++vcpu->stat.nmi_window_exits;
  2110. svm->awaiting_iret_completion = true;
  2111. svm_clr_iret_intercept(svm);
  2112. svm->nmi_iret_rip = kvm_rip_read(vcpu);
  2113. kvm_make_request(KVM_REQ_EVENT, vcpu);
  2114. return 1;
  2115. }
  2116. static int invlpg_interception(struct kvm_vcpu *vcpu)
  2117. {
  2118. if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
  2119. return kvm_emulate_instruction(vcpu, 0);
  2120. kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
  2121. return kvm_skip_emulated_instruction(vcpu);
  2122. }
  2123. static int emulate_on_interception(struct kvm_vcpu *vcpu)
  2124. {
  2125. return kvm_emulate_instruction(vcpu, 0);
  2126. }
  2127. static int rsm_interception(struct kvm_vcpu *vcpu)
  2128. {
  2129. return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
  2130. }
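/*
 * For a CR0 write in guest mode, check whether bits outside
 * SVM_CR0_SELECTIVE_MASK change while L1 has enabled the selective CR0
 * intercept; if so, reflect an SVM_EXIT_CR0_SEL_WRITE exit to L1 instead of
 * handling the write locally.
 */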
  2131. static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
  2132. unsigned long val)
  2133. {
  2134. struct vcpu_svm *svm = to_svm(vcpu);
  2135. unsigned long cr0 = vcpu->arch.cr0;
  2136. bool ret = false;
  2137. if (!is_guest_mode(vcpu) ||
  2138. (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
  2139. return false;
  2140. cr0 &= ~SVM_CR0_SELECTIVE_MASK;
  2141. val &= ~SVM_CR0_SELECTIVE_MASK;
  2142. if (cr0 ^ val) {
  2143. svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
  2144. ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
  2145. }
  2146. return ret;
  2147. }
  2148. #define CR_VALID (1ULL << 63)
  2149. static int cr_interception(struct kvm_vcpu *vcpu)
  2150. {
  2151. struct vcpu_svm *svm = to_svm(vcpu);
  2152. int reg, cr;
  2153. unsigned long val;
  2154. int err;
  2155. if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
  2156. return emulate_on_interception(vcpu);
  2157. if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
  2158. return emulate_on_interception(vcpu);
  2159. reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
  2160. if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
  2161. cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
  2162. else
  2163. cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
  2164. err = 0;
  2165. if (cr >= 16) { /* mov to cr */
  2166. cr -= 16;
  2167. val = kvm_register_read(vcpu, reg);
  2168. trace_kvm_cr_write(cr, val);
  2169. switch (cr) {
  2170. case 0:
  2171. if (!check_selective_cr0_intercepted(vcpu, val))
  2172. err = kvm_set_cr0(vcpu, val);
  2173. else
  2174. return 1;
  2175. break;
  2176. case 3:
  2177. err = kvm_set_cr3(vcpu, val);
  2178. break;
  2179. case 4:
  2180. err = kvm_set_cr4(vcpu, val);
  2181. break;
  2182. case 8:
  2183. err = kvm_set_cr8(vcpu, val);
  2184. break;
  2185. default:
  2186. WARN(1, "unhandled write to CR%d", cr);
  2187. kvm_queue_exception(vcpu, UD_VECTOR);
  2188. return 1;
  2189. }
  2190. } else { /* mov from cr */
  2191. switch (cr) {
  2192. case 0:
  2193. val = kvm_read_cr0(vcpu);
  2194. break;
  2195. case 2:
  2196. val = vcpu->arch.cr2;
  2197. break;
  2198. case 3:
  2199. val = kvm_read_cr3(vcpu);
  2200. break;
  2201. case 4:
  2202. val = kvm_read_cr4(vcpu);
  2203. break;
  2204. case 8:
  2205. val = kvm_get_cr8(vcpu);
  2206. break;
  2207. default:
  2208. WARN(1, "unhandled read from CR%d", cr);
  2209. kvm_queue_exception(vcpu, UD_VECTOR);
  2210. return 1;
  2211. }
  2212. kvm_register_write(vcpu, reg, val);
  2213. trace_kvm_cr_read(cr, val);
  2214. }
  2215. return kvm_complete_insn_gp(vcpu, err);
  2216. }
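/*
 * Handle CR0/CR4/CR8 write traps. Unlike the legacy CR intercepts, the trap
 * fires after the register has already been updated, so the new value from
 * exit_info_1 only needs to be propagated into KVM's tracked state.
 */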
  2217. static int cr_trap(struct kvm_vcpu *vcpu)
  2218. {
  2219. struct vcpu_svm *svm = to_svm(vcpu);
  2220. unsigned long old_value, new_value;
  2221. unsigned int cr;
  2222. int ret = 0;
  2223. new_value = (unsigned long)svm->vmcb->control.exit_info_1;
  2224. cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
  2225. switch (cr) {
  2226. case 0:
  2227. old_value = kvm_read_cr0(vcpu);
  2228. svm_set_cr0(vcpu, new_value);
  2229. kvm_post_set_cr0(vcpu, old_value, new_value);
  2230. break;
  2231. case 4:
  2232. old_value = kvm_read_cr4(vcpu);
  2233. svm_set_cr4(vcpu, new_value);
  2234. kvm_post_set_cr4(vcpu, old_value, new_value);
  2235. break;
  2236. case 8:
  2237. ret = kvm_set_cr8(vcpu, new_value);
  2238. break;
  2239. default:
  2240. WARN(1, "unhandled CR%d write trap", cr);
  2241. kvm_queue_exception(vcpu, UD_VECTOR);
  2242. return 1;
  2243. }
  2244. return kvm_complete_insn_gp(vcpu, ret);
  2245. }
  2246. static int dr_interception(struct kvm_vcpu *vcpu)
  2247. {
  2248. struct vcpu_svm *svm = to_svm(vcpu);
  2249. int reg, dr;
  2250. int err = 0;
  2251. /*
  2252. * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT
  2253. * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early.
  2254. */
  2255. if (sev_es_guest(vcpu->kvm))
  2256. return 1;
  2257. if (vcpu->guest_debug == 0) {
  2258. /*
  2259. * No more DR vmexits; force a reload of the debug registers
  2260. * and reenter on this instruction. The next vmexit will
  2261. * retrieve the full state of the debug registers.
  2262. */
  2263. clr_dr_intercepts(svm);
  2264. vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
  2265. return 1;
  2266. }
  2267. if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
  2268. return emulate_on_interception(vcpu);
  2269. reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
  2270. dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
  2271. if (dr >= 16) { /* mov to DRn */
  2272. dr -= 16;
  2273. err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
  2274. } else {
  2275. kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
  2276. }
  2277. return kvm_complete_insn_gp(vcpu, err);
  2278. }
  2279. static int cr8_write_interception(struct kvm_vcpu *vcpu)
  2280. {
  2281. int r;
  2282. u8 cr8_prev = kvm_get_cr8(vcpu);
  2283. /* instruction emulation calls kvm_set_cr8() */
  2284. r = cr_interception(vcpu);
  2285. if (lapic_in_kernel(vcpu))
  2286. return r;
  2287. if (cr8_prev <= kvm_get_cr8(vcpu))
  2288. return r;
  2289. vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
  2290. return 0;
  2291. }
  2292. static int efer_trap(struct kvm_vcpu *vcpu)
  2293. {
  2294. struct msr_data msr_info;
  2295. int ret;
  2296. /*
  2297. * Clear the EFER_SVME bit from EFER. The SVM code always sets this
  2298. * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
  2299. * whether the guest has X86_FEATURE_SVM - this avoids a failure if
  2300. * the guest doesn't have X86_FEATURE_SVM.
  2301. */
  2302. msr_info.host_initiated = false;
  2303. msr_info.index = MSR_EFER;
  2304. msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
  2305. ret = kvm_set_msr_common(vcpu, &msr_info);
  2306. return kvm_complete_insn_gp(vcpu, ret);
  2307. }
  2308. static int svm_get_feature_msr(u32 msr, u64 *data)
  2309. {
  2310. *data = 0;
  2311. switch (msr) {
  2312. case MSR_AMD64_DE_CFG:
  2313. if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
  2314. *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
  2315. break;
  2316. default:
  2317. return KVM_MSR_RET_UNSUPPORTED;
  2318. }
  2319. return 0;
  2320. }
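/*
 * For SEV-ES guests with protected state, MSRs that are not intercepted are
 * loaded from and saved to the encrypted VMSA, so the host cannot emulate
 * accesses to them; such requests must be rejected.
 */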
  2321. static bool
  2322. sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  2323. {
  2324. return sev_es_guest(vcpu->kvm) &&
  2325. vcpu->arch.guest_state_protected &&
  2326. svm_msrpm_offset(msr_info->index) != MSR_INVALID &&
  2327. !msr_write_intercepted(vcpu, msr_info->index);
  2328. }
  2329. static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  2330. {
  2331. struct vcpu_svm *svm = to_svm(vcpu);
  2332. if (sev_es_prevent_msr_access(vcpu, msr_info)) {
  2333. msr_info->data = 0;
  2334. return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
  2335. }
  2336. switch (msr_info->index) {
  2337. case MSR_AMD64_TSC_RATIO:
  2338. if (!msr_info->host_initiated &&
  2339. !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR))
  2340. return 1;
  2341. msr_info->data = svm->tsc_ratio_msr;
  2342. break;
  2343. case MSR_STAR:
  2344. msr_info->data = svm->vmcb01.ptr->save.star;
  2345. break;
  2346. #ifdef CONFIG_X86_64
  2347. case MSR_LSTAR:
  2348. msr_info->data = svm->vmcb01.ptr->save.lstar;
  2349. break;
  2350. case MSR_CSTAR:
  2351. msr_info->data = svm->vmcb01.ptr->save.cstar;
  2352. break;
  2353. case MSR_GS_BASE:
  2354. msr_info->data = svm->vmcb01.ptr->save.gs.base;
  2355. break;
  2356. case MSR_FS_BASE:
  2357. msr_info->data = svm->vmcb01.ptr->save.fs.base;
  2358. break;
  2359. case MSR_KERNEL_GS_BASE:
  2360. msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
  2361. break;
  2362. case MSR_SYSCALL_MASK:
  2363. msr_info->data = svm->vmcb01.ptr->save.sfmask;
  2364. break;
  2365. #endif
  2366. case MSR_IA32_SYSENTER_CS:
  2367. msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
  2368. break;
  2369. case MSR_IA32_SYSENTER_EIP:
  2370. msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
  2371. if (guest_cpuid_is_intel_compatible(vcpu))
  2372. msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
  2373. break;
  2374. case MSR_IA32_SYSENTER_ESP:
  2375. msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
  2376. if (guest_cpuid_is_intel_compatible(vcpu))
  2377. msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
  2378. break;
  2379. case MSR_TSC_AUX:
  2380. msr_info->data = svm->tsc_aux;
  2381. break;
  2382. case MSR_IA32_DEBUGCTLMSR:
  2383. msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
  2384. break;
  2385. case MSR_IA32_LASTBRANCHFROMIP:
  2386. msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
  2387. break;
  2388. case MSR_IA32_LASTBRANCHTOIP:
  2389. msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
  2390. break;
  2391. case MSR_IA32_LASTINTFROMIP:
  2392. msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
  2393. break;
  2394. case MSR_IA32_LASTINTTOIP:
  2395. msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
  2396. break;
  2397. case MSR_VM_HSAVE_PA:
  2398. msr_info->data = svm->nested.hsave_msr;
  2399. break;
  2400. case MSR_VM_CR:
  2401. msr_info->data = svm->nested.vm_cr_msr;
  2402. break;
  2403. case MSR_IA32_SPEC_CTRL:
  2404. if (!msr_info->host_initiated &&
  2405. !guest_has_spec_ctrl_msr(vcpu))
  2406. return 1;
  2407. if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
  2408. msr_info->data = svm->vmcb->save.spec_ctrl;
  2409. else
  2410. msr_info->data = svm->spec_ctrl;
  2411. break;
  2412. case MSR_AMD64_VIRT_SPEC_CTRL:
  2413. if (!msr_info->host_initiated &&
  2414. !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
  2415. return 1;
  2416. msr_info->data = svm->virt_spec_ctrl;
  2417. break;
  2418. case MSR_F15H_IC_CFG: {
  2419. int family, model;
  2420. family = guest_cpuid_family(vcpu);
  2421. model = guest_cpuid_model(vcpu);
  2422. if (family < 0 || model < 0)
  2423. return kvm_get_msr_common(vcpu, msr_info);
  2424. msr_info->data = 0;
  2425. if (family == 0x15 &&
  2426. (model >= 0x2 && model < 0x20))
  2427. msr_info->data = 0x1E;
  2428. }
  2429. break;
  2430. case MSR_AMD64_DE_CFG:
  2431. msr_info->data = svm->msr_decfg;
  2432. break;
  2433. default:
  2434. return kvm_get_msr_common(vcpu, msr_info);
  2435. }
  2436. return 0;
  2437. }
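/*
 * Completion callback for emulated MSR accesses. For SEV-ES guests a failure
 * can't be reported by injecting a fault directly, so the error is returned
 * through the GHCB: sw_exit_info_1 = 1 flags the failure and sw_exit_info_2
 * carries a #GP event for the guest.
 */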
  2438. static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
  2439. {
  2440. struct vcpu_svm *svm = to_svm(vcpu);
  2441. if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
  2442. return kvm_complete_insn_gp(vcpu, err);
  2443. ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
  2444. ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
  2445. X86_TRAP_GP |
  2446. SVM_EVTINJ_TYPE_EXEPT |
  2447. SVM_EVTINJ_VALID);
  2448. return 1;
  2449. }
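/*
 * Emulate writes to MSR_VM_CR. Only bits in SVM_VM_CR_VALID_MASK may change,
 * and once SVM_DIS is set the LOCK and DIS bits become read-only. Attempting
 * to set SVM_DIS while the guest already has EFER.SVME=1 fails (returns 1).
 */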
  2450. static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
  2451. {
  2452. struct vcpu_svm *svm = to_svm(vcpu);
  2453. int svm_dis, chg_mask;
  2454. if (data & ~SVM_VM_CR_VALID_MASK)
  2455. return 1;
  2456. chg_mask = SVM_VM_CR_VALID_MASK;
  2457. if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
  2458. chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
  2459. svm->nested.vm_cr_msr &= ~chg_mask;
  2460. svm->nested.vm_cr_msr |= (data & chg_mask);
  2461. svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
  2462. /* check for svm_disable while efer.svme is set */
  2463. if (svm_dis && (vcpu->arch.efer & EFER_SVME))
  2464. return 1;
  2465. return 0;
  2466. }
  2467. static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
  2468. {
  2469. struct vcpu_svm *svm = to_svm(vcpu);
  2470. int ret = 0;
  2471. u32 ecx = msr->index;
  2472. u64 data = msr->data;
  2473. if (sev_es_prevent_msr_access(vcpu, msr))
  2474. return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
  2475. switch (ecx) {
  2476. case MSR_AMD64_TSC_RATIO:
  2477. if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) {
  2478. if (!msr->host_initiated)
  2479. return 1;
  2480. /*
  2481. * In case TSC scaling is not enabled, always
  2482. * leave this MSR at the default value.
  2483. *
2484. * Due to a bug, QEMU 6.2.0 tries to set this MSR
2485. * to 0 even when TSC scaling is not enabled.
2486. * Ignore that value as well.
  2487. */
  2488. if (data != 0 && data != svm->tsc_ratio_msr)
  2489. return 1;
  2490. break;
  2491. }
  2492. if (data & SVM_TSC_RATIO_RSVD)
  2493. return 1;
  2494. svm->tsc_ratio_msr = data;
  2495. if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) &&
  2496. is_guest_mode(vcpu))
  2497. nested_svm_update_tsc_ratio_msr(vcpu);
  2498. break;
  2499. case MSR_IA32_CR_PAT:
  2500. ret = kvm_set_msr_common(vcpu, msr);
  2501. if (ret)
  2502. break;
  2503. svm->vmcb01.ptr->save.g_pat = data;
  2504. if (is_guest_mode(vcpu))
  2505. nested_vmcb02_compute_g_pat(svm);
  2506. vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
  2507. break;
  2508. case MSR_IA32_SPEC_CTRL:
  2509. if (!msr->host_initiated &&
  2510. !guest_has_spec_ctrl_msr(vcpu))
  2511. return 1;
  2512. if (kvm_spec_ctrl_test_value(data))
  2513. return 1;
  2514. if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
  2515. svm->vmcb->save.spec_ctrl = data;
  2516. else
  2517. svm->spec_ctrl = data;
  2518. if (!data)
  2519. break;
  2520. /*
  2521. * For non-nested:
  2522. * When it's written (to non-zero) for the first time, pass
  2523. * it through.
  2524. *
  2525. * For nested:
  2526. * The handling of the MSR bitmap for L2 guests is done in
  2527. * nested_svm_vmrun_msrpm.
  2528. * We update the L1 MSR bit as well since it will end up
  2529. * touching the MSR anyway now.
  2530. */
  2531. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
  2532. break;
  2533. case MSR_AMD64_VIRT_SPEC_CTRL:
  2534. if (!msr->host_initiated &&
  2535. !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
  2536. return 1;
  2537. if (data & ~SPEC_CTRL_SSBD)
  2538. return 1;
  2539. svm->virt_spec_ctrl = data;
  2540. break;
  2541. case MSR_STAR:
  2542. svm->vmcb01.ptr->save.star = data;
  2543. break;
  2544. #ifdef CONFIG_X86_64
  2545. case MSR_LSTAR:
  2546. svm->vmcb01.ptr->save.lstar = data;
  2547. break;
  2548. case MSR_CSTAR:
  2549. svm->vmcb01.ptr->save.cstar = data;
  2550. break;
  2551. case MSR_GS_BASE:
  2552. svm->vmcb01.ptr->save.gs.base = data;
  2553. break;
  2554. case MSR_FS_BASE:
  2555. svm->vmcb01.ptr->save.fs.base = data;
  2556. break;
  2557. case MSR_KERNEL_GS_BASE:
  2558. svm->vmcb01.ptr->save.kernel_gs_base = data;
  2559. break;
  2560. case MSR_SYSCALL_MASK:
  2561. svm->vmcb01.ptr->save.sfmask = data;
  2562. break;
  2563. #endif
  2564. case MSR_IA32_SYSENTER_CS:
  2565. svm->vmcb01.ptr->save.sysenter_cs = data;
  2566. break;
  2567. case MSR_IA32_SYSENTER_EIP:
  2568. svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
  2569. /*
2570. * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
2571. * when we spoof an Intel vendor ID (for cross-vendor migration).
2572. * In that case we use this intercept to track the high
2573. * 32-bit part of these MSRs in order to support Intel's
2574. * implementation of SYSENTER/SYSEXIT.
  2575. */
  2576. svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
  2577. break;
  2578. case MSR_IA32_SYSENTER_ESP:
  2579. svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
  2580. svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0;
  2581. break;
  2582. case MSR_TSC_AUX:
  2583. /*
  2584. * TSC_AUX is always virtualized for SEV-ES guests when the
  2585. * feature is available. The user return MSR support is not
  2586. * required in this case because TSC_AUX is restored on #VMEXIT
  2587. * from the host save area (which has been initialized in
  2588. * svm_enable_virtualization_cpu()).
  2589. */
  2590. if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
  2591. break;
  2592. /*
  2593. * TSC_AUX is usually changed only during boot and never read
  2594. * directly. Intercept TSC_AUX instead of exposing it to the
  2595. * guest via direct_access_msrs, and switch it via user return.
  2596. */
  2597. preempt_disable();
  2598. ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
  2599. preempt_enable();
  2600. if (ret)
  2601. break;
  2602. svm->tsc_aux = data;
  2603. break;
  2604. case MSR_IA32_DEBUGCTLMSR:
  2605. if (!lbrv) {
  2606. kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
  2607. break;
  2608. }
  2609. if (data & DEBUGCTL_RESERVED_BITS)
  2610. return 1;
  2611. svm_get_lbr_vmcb(svm)->save.dbgctl = data;
  2612. svm_update_lbrv(vcpu);
  2613. break;
  2614. case MSR_VM_HSAVE_PA:
  2615. /*
  2616. * Old kernels did not validate the value written to
  2617. * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid
  2618. * value to allow live migrating buggy or malicious guests
  2619. * originating from those kernels.
  2620. */
  2621. if (!msr->host_initiated && !page_address_valid(vcpu, data))
  2622. return 1;
  2623. svm->nested.hsave_msr = data & PAGE_MASK;
  2624. break;
  2625. case MSR_VM_CR:
  2626. return svm_set_vm_cr(vcpu, data);
  2627. case MSR_VM_IGNNE:
  2628. kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
  2629. break;
  2630. case MSR_AMD64_DE_CFG: {
  2631. u64 supported_de_cfg;
  2632. if (svm_get_feature_msr(ecx, &supported_de_cfg))
  2633. return 1;
  2634. if (data & ~supported_de_cfg)
  2635. return 1;
  2636. svm->msr_decfg = data;
  2637. break;
  2638. }
  2639. default:
  2640. return kvm_set_msr_common(vcpu, msr);
  2641. }
  2642. return ret;
  2643. }
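/* For an MSR intercept, exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */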
  2644. static int msr_interception(struct kvm_vcpu *vcpu)
  2645. {
  2646. if (to_svm(vcpu)->vmcb->control.exit_info_1)
  2647. return kvm_emulate_wrmsr(vcpu);
  2648. else
  2649. return kvm_emulate_rdmsr(vcpu);
  2650. }
  2651. static int interrupt_window_interception(struct kvm_vcpu *vcpu)
  2652. {
  2653. kvm_make_request(KVM_REQ_EVENT, vcpu);
  2654. svm_clear_vintr(to_svm(vcpu));
  2655. /*
2656. * If not running nested, the only reason to end up here with AVIC is an ExtINT.
2657. * In that case AVIC was temporarily disabled in order to
2658. * request the IRQ window, and it has to be re-enabled.
2659. *
2660. * If running nested, still remove the VM-wide AVIC inhibit to
2661. * support the case in which the interrupt window was requested while the
2662. * vCPU was not running nested.
2663. * Any vCPUs that are still running nested will keep their
2664. * AVIC inhibited due to the per-vCPU AVIC inhibition.
  2665. */
  2666. kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
  2667. ++vcpu->stat.irq_window_exits;
  2668. return 1;
  2669. }
  2670. static int pause_interception(struct kvm_vcpu *vcpu)
  2671. {
  2672. bool in_kernel;
  2673. /*
  2674. * CPL is not made available for an SEV-ES guest, therefore
  2675. * vcpu->arch.preempted_in_kernel can never be true. Just
  2676. * set in_kernel to false as well.
  2677. */
  2678. in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
  2679. grow_ple_window(vcpu);
  2680. kvm_vcpu_on_spin(vcpu, in_kernel);
  2681. return kvm_skip_emulated_instruction(vcpu);
  2682. }
  2683. static int invpcid_interception(struct kvm_vcpu *vcpu)
  2684. {
  2685. struct vcpu_svm *svm = to_svm(vcpu);
  2686. unsigned long type;
  2687. gva_t gva;
  2688. if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
  2689. kvm_queue_exception(vcpu, UD_VECTOR);
  2690. return 1;
  2691. }
  2692. /*
  2693. * For an INVPCID intercept:
  2694. * EXITINFO1 provides the linear address of the memory operand.
  2695. * EXITINFO2 provides the contents of the register operand.
  2696. */
  2697. type = svm->vmcb->control.exit_info_2;
  2698. gva = svm->vmcb->control.exit_info_1;
  2699. return kvm_handle_invpcid(vcpu, type, gva);
  2700. }
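/*
 * Exit handler dispatch table, indexed by SVM exit code. Entries left NULL
 * are treated as unexpected exits by svm_invoke_exit_handler().
 */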
  2701. static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
  2702. [SVM_EXIT_READ_CR0] = cr_interception,
  2703. [SVM_EXIT_READ_CR3] = cr_interception,
  2704. [SVM_EXIT_READ_CR4] = cr_interception,
  2705. [SVM_EXIT_READ_CR8] = cr_interception,
  2706. [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
  2707. [SVM_EXIT_WRITE_CR0] = cr_interception,
  2708. [SVM_EXIT_WRITE_CR3] = cr_interception,
  2709. [SVM_EXIT_WRITE_CR4] = cr_interception,
  2710. [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
  2711. [SVM_EXIT_READ_DR0] = dr_interception,
  2712. [SVM_EXIT_READ_DR1] = dr_interception,
  2713. [SVM_EXIT_READ_DR2] = dr_interception,
  2714. [SVM_EXIT_READ_DR3] = dr_interception,
  2715. [SVM_EXIT_READ_DR4] = dr_interception,
  2716. [SVM_EXIT_READ_DR5] = dr_interception,
  2717. [SVM_EXIT_READ_DR6] = dr_interception,
  2718. [SVM_EXIT_READ_DR7] = dr_interception,
  2719. [SVM_EXIT_WRITE_DR0] = dr_interception,
  2720. [SVM_EXIT_WRITE_DR1] = dr_interception,
  2721. [SVM_EXIT_WRITE_DR2] = dr_interception,
  2722. [SVM_EXIT_WRITE_DR3] = dr_interception,
  2723. [SVM_EXIT_WRITE_DR4] = dr_interception,
  2724. [SVM_EXIT_WRITE_DR5] = dr_interception,
  2725. [SVM_EXIT_WRITE_DR6] = dr_interception,
  2726. [SVM_EXIT_WRITE_DR7] = dr_interception,
  2727. [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
  2728. [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
  2729. [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
  2730. [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
  2731. [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
  2732. [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
  2733. [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
  2734. [SVM_EXIT_INTR] = intr_interception,
  2735. [SVM_EXIT_NMI] = nmi_interception,
  2736. [SVM_EXIT_SMI] = smi_interception,
  2737. [SVM_EXIT_VINTR] = interrupt_window_interception,
  2738. [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc,
  2739. [SVM_EXIT_CPUID] = kvm_emulate_cpuid,
  2740. [SVM_EXIT_IRET] = iret_interception,
  2741. [SVM_EXIT_INVD] = kvm_emulate_invd,
  2742. [SVM_EXIT_PAUSE] = pause_interception,
  2743. [SVM_EXIT_HLT] = kvm_emulate_halt,
  2744. [SVM_EXIT_INVLPG] = invlpg_interception,
  2745. [SVM_EXIT_INVLPGA] = invlpga_interception,
  2746. [SVM_EXIT_IOIO] = io_interception,
  2747. [SVM_EXIT_MSR] = msr_interception,
  2748. [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
  2749. [SVM_EXIT_SHUTDOWN] = shutdown_interception,
  2750. [SVM_EXIT_VMRUN] = vmrun_interception,
  2751. [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall,
  2752. [SVM_EXIT_VMLOAD] = vmload_interception,
  2753. [SVM_EXIT_VMSAVE] = vmsave_interception,
  2754. [SVM_EXIT_STGI] = stgi_interception,
  2755. [SVM_EXIT_CLGI] = clgi_interception,
  2756. [SVM_EXIT_SKINIT] = skinit_interception,
  2757. [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op,
  2758. [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd,
  2759. [SVM_EXIT_MONITOR] = kvm_emulate_monitor,
  2760. [SVM_EXIT_MWAIT] = kvm_emulate_mwait,
  2761. [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv,
  2762. [SVM_EXIT_RDPRU] = kvm_handle_invalid_op,
  2763. [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap,
  2764. [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap,
  2765. [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap,
  2766. [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap,
  2767. [SVM_EXIT_INVPCID] = invpcid_interception,
  2768. [SVM_EXIT_NPF] = npf_interception,
  2769. [SVM_EXIT_RSM] = rsm_interception,
  2770. [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
  2771. [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
  2772. #ifdef CONFIG_KVM_AMD_SEV
  2773. [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
  2774. #endif
  2775. };
  2776. static void dump_vmcb(struct kvm_vcpu *vcpu)
  2777. {
  2778. struct vcpu_svm *svm = to_svm(vcpu);
  2779. struct vmcb_control_area *control = &svm->vmcb->control;
  2780. struct vmcb_save_area *save = &svm->vmcb->save;
  2781. struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
  2782. if (!dump_invalid_vmcb) {
  2783. pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
  2784. return;
  2785. }
  2786. pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
  2787. svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
  2788. pr_err("VMCB Control Area:\n");
  2789. pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
  2790. pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
  2791. pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
  2792. pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
  2793. pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
  2794. pr_err("%-20s%08x %08x\n", "intercepts:",
  2795. control->intercepts[INTERCEPT_WORD3],
  2796. control->intercepts[INTERCEPT_WORD4]);
  2797. pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
  2798. pr_err("%-20s%d\n", "pause filter threshold:",
  2799. control->pause_filter_thresh);
  2800. pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
  2801. pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
  2802. pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
  2803. pr_err("%-20s%d\n", "asid:", control->asid);
  2804. pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
  2805. pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
  2806. pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
  2807. pr_err("%-20s%08x\n", "int_state:", control->int_state);
  2808. pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
  2809. pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
  2810. pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
  2811. pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
  2812. pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
  2813. pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
  2814. pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
  2815. pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
  2816. pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
  2817. pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
  2818. pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
  2819. pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
  2820. pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
  2821. pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
  2822. pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
  2823. pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
  2824. pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
  2825. pr_err("VMCB State Save Area:\n");
  2826. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2827. "es:",
  2828. save->es.selector, save->es.attrib,
  2829. save->es.limit, save->es.base);
  2830. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2831. "cs:",
  2832. save->cs.selector, save->cs.attrib,
  2833. save->cs.limit, save->cs.base);
  2834. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2835. "ss:",
  2836. save->ss.selector, save->ss.attrib,
  2837. save->ss.limit, save->ss.base);
  2838. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2839. "ds:",
  2840. save->ds.selector, save->ds.attrib,
  2841. save->ds.limit, save->ds.base);
  2842. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2843. "fs:",
  2844. save01->fs.selector, save01->fs.attrib,
  2845. save01->fs.limit, save01->fs.base);
  2846. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2847. "gs:",
  2848. save01->gs.selector, save01->gs.attrib,
  2849. save01->gs.limit, save01->gs.base);
  2850. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2851. "gdtr:",
  2852. save->gdtr.selector, save->gdtr.attrib,
  2853. save->gdtr.limit, save->gdtr.base);
  2854. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2855. "ldtr:",
  2856. save01->ldtr.selector, save01->ldtr.attrib,
  2857. save01->ldtr.limit, save01->ldtr.base);
  2858. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2859. "idtr:",
  2860. save->idtr.selector, save->idtr.attrib,
  2861. save->idtr.limit, save->idtr.base);
  2862. pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
  2863. "tr:",
  2864. save01->tr.selector, save01->tr.attrib,
  2865. save01->tr.limit, save01->tr.base);
  2866. pr_err("vmpl: %d cpl: %d efer: %016llx\n",
  2867. save->vmpl, save->cpl, save->efer);
  2868. pr_err("%-15s %016llx %-13s %016llx\n",
  2869. "cr0:", save->cr0, "cr2:", save->cr2);
  2870. pr_err("%-15s %016llx %-13s %016llx\n",
  2871. "cr3:", save->cr3, "cr4:", save->cr4);
  2872. pr_err("%-15s %016llx %-13s %016llx\n",
  2873. "dr6:", save->dr6, "dr7:", save->dr7);
  2874. pr_err("%-15s %016llx %-13s %016llx\n",
  2875. "rip:", save->rip, "rflags:", save->rflags);
  2876. pr_err("%-15s %016llx %-13s %016llx\n",
  2877. "rsp:", save->rsp, "rax:", save->rax);
  2878. pr_err("%-15s %016llx %-13s %016llx\n",
  2879. "star:", save01->star, "lstar:", save01->lstar);
  2880. pr_err("%-15s %016llx %-13s %016llx\n",
  2881. "cstar:", save01->cstar, "sfmask:", save01->sfmask);
  2882. pr_err("%-15s %016llx %-13s %016llx\n",
  2883. "kernel_gs_base:", save01->kernel_gs_base,
  2884. "sysenter_cs:", save01->sysenter_cs);
  2885. pr_err("%-15s %016llx %-13s %016llx\n",
  2886. "sysenter_esp:", save01->sysenter_esp,
  2887. "sysenter_eip:", save01->sysenter_eip);
  2888. pr_err("%-15s %016llx %-13s %016llx\n",
  2889. "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
  2890. pr_err("%-15s %016llx %-13s %016llx\n",
  2891. "br_from:", save->br_from, "br_to:", save->br_to);
  2892. pr_err("%-15s %016llx %-13s %016llx\n",
  2893. "excp_from:", save->last_excp_from,
  2894. "excp_to:", save->last_excp_to);
  2895. }
  2896. static bool svm_check_exit_valid(u64 exit_code)
  2897. {
  2898. return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
  2899. svm_exit_handlers[exit_code]);
  2900. }
  2901. static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
  2902. {
  2903. vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
  2904. dump_vmcb(vcpu);
  2905. vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
  2906. vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
  2907. vcpu->run->internal.ndata = 2;
  2908. vcpu->run->internal.data[0] = exit_code;
  2909. vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
  2910. return 0;
  2911. }
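/*
 * When retpolines are enabled, dispatch the hottest exit reasons via direct
 * calls to avoid the retpoline overhead of the indirect call through
 * svm_exit_handlers[].
 */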
  2912. int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
  2913. {
  2914. if (!svm_check_exit_valid(exit_code))
  2915. return svm_handle_invalid_exit(vcpu, exit_code);
  2916. #ifdef CONFIG_MITIGATION_RETPOLINE
  2917. if (exit_code == SVM_EXIT_MSR)
  2918. return msr_interception(vcpu);
  2919. else if (exit_code == SVM_EXIT_VINTR)
  2920. return interrupt_window_interception(vcpu);
  2921. else if (exit_code == SVM_EXIT_INTR)
  2922. return intr_interception(vcpu);
  2923. else if (exit_code == SVM_EXIT_HLT)
  2924. return kvm_emulate_halt(vcpu);
  2925. else if (exit_code == SVM_EXIT_NPF)
  2926. return npf_interception(vcpu);
  2927. #endif
  2928. return svm_exit_handlers[exit_code](vcpu);
  2929. }
  2930. static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
  2931. u64 *info1, u64 *info2,
  2932. u32 *intr_info, u32 *error_code)
  2933. {
  2934. struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
  2935. *reason = control->exit_code;
  2936. *info1 = control->exit_info_1;
  2937. *info2 = control->exit_info_2;
  2938. *intr_info = control->exit_int_info;
  2939. if ((*intr_info & SVM_EXITINTINFO_VALID) &&
  2940. (*intr_info & SVM_EXITINTINFO_VALID_ERR))
  2941. *error_code = control->exit_int_info_err;
  2942. else
  2943. *error_code = 0;
  2944. }
  2945. static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
  2946. {
  2947. struct vcpu_svm *svm = to_svm(vcpu);
  2948. struct kvm_run *kvm_run = vcpu->run;
  2949. u32 exit_code = svm->vmcb->control.exit_code;
  2950. /* SEV-ES guests must use the CR write traps to track CR registers. */
  2951. if (!sev_es_guest(vcpu->kvm)) {
  2952. if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
  2953. vcpu->arch.cr0 = svm->vmcb->save.cr0;
  2954. if (npt_enabled)
  2955. vcpu->arch.cr3 = svm->vmcb->save.cr3;
  2956. }
  2957. if (is_guest_mode(vcpu)) {
  2958. int vmexit;
  2959. trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
  2960. vmexit = nested_svm_exit_special(svm);
  2961. if (vmexit == NESTED_EXIT_CONTINUE)
  2962. vmexit = nested_svm_exit_handled(svm);
  2963. if (vmexit == NESTED_EXIT_DONE)
  2964. return 1;
  2965. }
  2966. if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
  2967. kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
  2968. kvm_run->fail_entry.hardware_entry_failure_reason
  2969. = svm->vmcb->control.exit_code;
  2970. kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
  2971. dump_vmcb(vcpu);
  2972. return 0;
  2973. }
  2974. if (exit_fastpath != EXIT_FASTPATH_NONE)
  2975. return 1;
  2976. return svm_invoke_exit_handler(vcpu, exit_code);
  2977. }
  2978. static void pre_svm_run(struct kvm_vcpu *vcpu)
  2979. {
  2980. struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
  2981. struct vcpu_svm *svm = to_svm(vcpu);
  2982. /*
  2983. * If the previous vmrun of the vmcb occurred on a different physical
  2984. * cpu, then mark the vmcb dirty and assign a new asid. Hardware's
  2985. * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
  2986. */
  2987. if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
  2988. svm->current_vmcb->asid_generation = 0;
  2989. vmcb_mark_all_dirty(svm->vmcb);
  2990. svm->current_vmcb->cpu = vcpu->cpu;
  2991. }
  2992. if (sev_guest(vcpu->kvm))
  2993. return pre_sev_run(svm, vcpu->cpu);
  2994. /* FIXME: handle wraparound of asid_generation */
  2995. if (svm->current_vmcb->asid_generation != sd->asid_generation)
  2996. new_asid(svm, sd);
  2997. }
  2998. static void svm_inject_nmi(struct kvm_vcpu *vcpu)
  2999. {
  3000. struct vcpu_svm *svm = to_svm(vcpu);
  3001. svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
  3002. if (svm->nmi_l1_to_l2)
  3003. return;
  3004. /*
  3005. * No need to manually track NMI masking when vNMI is enabled, hardware
  3006. * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the
  3007. * case where software directly injects an NMI.
  3008. */
  3009. if (!is_vnmi_enabled(svm)) {
  3010. svm->nmi_masked = true;
  3011. svm_set_iret_intercept(svm);
  3012. }
  3013. ++vcpu->stat.nmi_injections;
  3014. }
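/*
 * With vNMI, a pending NMI is handed to hardware by setting V_NMI_PENDING in
 * int_ctl; hardware injects it and sets V_NMI_BLOCKING on its own, so KVM
 * neither tracks nmi_masked nor intercepts IRET in that case.
 */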
  3015. static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
  3016. {
  3017. struct vcpu_svm *svm = to_svm(vcpu);
  3018. if (!is_vnmi_enabled(svm))
  3019. return false;
  3020. return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
  3021. }
  3022. static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
  3023. {
  3024. struct vcpu_svm *svm = to_svm(vcpu);
  3025. if (!is_vnmi_enabled(svm))
  3026. return false;
  3027. if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
  3028. return false;
  3029. svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
  3030. vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
  3031. /*
  3032. * Because the pending NMI is serviced by hardware, KVM can't know when
  3033. * the NMI is "injected", but for all intents and purposes, passing the
  3034. * NMI off to hardware counts as injection.
  3035. */
  3036. ++vcpu->stat.nmi_injections;
  3037. return true;
  3038. }
  3039. static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
  3040. {
  3041. struct vcpu_svm *svm = to_svm(vcpu);
  3042. u32 type;
  3043. if (vcpu->arch.interrupt.soft) {
  3044. if (svm_update_soft_interrupt_rip(vcpu))
  3045. return;
  3046. type = SVM_EVTINJ_TYPE_SOFT;
  3047. } else {
  3048. type = SVM_EVTINJ_TYPE_INTR;
  3049. }
  3050. trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
  3051. vcpu->arch.interrupt.soft, reinjected);
  3052. ++vcpu->stat.irq_injections;
  3053. svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
  3054. SVM_EVTINJ_VALID | type;
  3055. }
  3056. void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
  3057. int trig_mode, int vector)
  3058. {
  3059. /*
  3060. * apic->apicv_active must be read after vcpu->mode.
  3061. * Pairs with smp_store_release in vcpu_enter_guest.
  3062. */
  3063. bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
  3064. /* Note, this is called iff the local APIC is in-kernel. */
  3065. if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
  3066. /* Process the interrupt via kvm_check_and_inject_events(). */
  3067. kvm_make_request(KVM_REQ_EVENT, vcpu);
  3068. kvm_vcpu_kick(vcpu);
  3069. return;
  3070. }
  3071. trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
  3072. if (in_guest_mode) {
  3073. /*
  3074. * Signal the doorbell to tell hardware to inject the IRQ. If
  3075. * the vCPU exits the guest before the doorbell chimes, hardware
  3076. * will automatically process AVIC interrupts at the next VMRUN.
  3077. */
  3078. avic_ring_doorbell(vcpu);
  3079. } else {
  3080. /*
  3081. * Wake the vCPU if it was blocking. KVM will then detect the
  3082. * pending IRQ when checking if the vCPU has a wake event.
  3083. */
  3084. kvm_vcpu_wake_up(vcpu);
  3085. }
  3086. }
  3087. static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
  3088. int trig_mode, int vector)
  3089. {
  3090. kvm_lapic_set_irr(vector, apic);
  3091. /*
  3092. * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
  3093. * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
  3094. * the read of guest_mode. This guarantees that either VMRUN will see
  3095. * and process the new vIRR entry, or that svm_complete_interrupt_delivery
  3096. * will signal the doorbell if the CPU has already entered the guest.
  3097. */
  3098. smp_mb__after_atomic();
  3099. svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
  3100. }
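/*
 * Intercept CR8 writes only while the highest pending interrupt (irr) is
 * blocked by the current TPR (tpr >= irr), so that KVM gets to re-evaluate
 * injection when the guest lowers its TPR.
 */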
  3101. static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
  3102. {
  3103. struct vcpu_svm *svm = to_svm(vcpu);
  3104. /*
  3105. * SEV-ES guests must always keep the CR intercepts cleared. CR
  3106. * tracking is done using the CR write traps.
  3107. */
  3108. if (sev_es_guest(vcpu->kvm))
  3109. return;
  3110. if (nested_svm_virtualize_tpr(vcpu))
  3111. return;
  3112. svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
  3113. if (irr == -1)
  3114. return;
  3115. if (tpr >= irr)
  3116. svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
  3117. }
  3118. static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
  3119. {
  3120. struct vcpu_svm *svm = to_svm(vcpu);
  3121. if (is_vnmi_enabled(svm))
  3122. return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
  3123. else
  3124. return svm->nmi_masked;
  3125. }
  3126. static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
  3127. {
  3128. struct vcpu_svm *svm = to_svm(vcpu);
  3129. if (is_vnmi_enabled(svm)) {
  3130. if (masked)
  3131. svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
  3132. else
  3133. svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
  3134. } else {
  3135. svm->nmi_masked = masked;
  3136. if (masked)
  3137. svm_set_iret_intercept(svm);
  3138. else
  3139. svm_clr_iret_intercept(svm);
  3140. }
  3141. }
  3142. bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
  3143. {
  3144. struct vcpu_svm *svm = to_svm(vcpu);
  3145. struct vmcb *vmcb = svm->vmcb;
  3146. if (!gif_set(svm))
  3147. return true;
  3148. if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
  3149. return false;
  3150. if (svm_get_nmi_mask(vcpu))
  3151. return true;
  3152. return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
  3153. }
  3154. static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
  3155. {
  3156. struct vcpu_svm *svm = to_svm(vcpu);
  3157. if (svm->nested.nested_run_pending)
  3158. return -EBUSY;
  3159. if (svm_nmi_blocked(vcpu))
  3160. return 0;
  3161. /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
  3162. if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
  3163. return -EBUSY;
  3164. return 1;
  3165. }
  3166. bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
  3167. {
  3168. struct vcpu_svm *svm = to_svm(vcpu);
  3169. struct vmcb *vmcb = svm->vmcb;
  3170. if (!gif_set(svm))
  3171. return true;
  3172. if (is_guest_mode(vcpu)) {
  3173. /* As long as interrupts are being delivered... */
  3174. if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
  3175. ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
  3176. : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
  3177. return true;
  3178. /* ... vmexits aren't blocked by the interrupt shadow */
  3179. if (nested_exit_on_intr(svm))
  3180. return false;
  3181. } else {
  3182. if (!svm_get_if_flag(vcpu))
  3183. return true;
  3184. }
  3185. return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
  3186. }
  3187. static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
  3188. {
  3189. struct vcpu_svm *svm = to_svm(vcpu);
  3190. if (svm->nested.nested_run_pending)
  3191. return -EBUSY;
  3192. if (svm_interrupt_blocked(vcpu))
  3193. return 0;
  3194. /*
  3195. * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
  3196. * e.g. if the IRQ arrived asynchronously after checking nested events.
  3197. */
  3198. if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
  3199. return -EBUSY;
  3200. return 1;
  3201. }
  3202. static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
  3203. {
  3204. struct vcpu_svm *svm = to_svm(vcpu);
  3205. /*
  3206. * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
  3207. * 1, because that's a separate STGI/VMRUN intercept. The next time we
  3208. * get that intercept, this function will be called again though and
  3209. * we'll get the vintr intercept. However, if the vGIF feature is
  3210. * enabled, the STGI interception will not occur. Enable the irq
  3211. * window under the assumption that the hardware will set the GIF.
  3212. */
  3213. if (vgif || gif_set(svm)) {
  3214. /*
3215. * An IRQ window is not needed when AVIC is enabled,
3216. * unless there is a pending ExtINT, which cannot be injected
3217. * via AVIC. In that case, KVM needs to temporarily disable AVIC
3218. * and fall back to injecting the IRQ via V_IRQ.
3219. *
3220. * If running nested, AVIC is already locally inhibited
3221. * on this vCPU, therefore there is no need to request
3222. * the VM-wide AVIC inhibition.
  3223. */
  3224. if (!is_guest_mode(vcpu))
  3225. kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
  3226. svm_set_vintr(svm);
  3227. }
  3228. }
  3229. static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
  3230. {
  3231. struct vcpu_svm *svm = to_svm(vcpu);
  3232. /*
  3233. * If NMIs are outright masked, i.e. the vCPU is already handling an
  3234. * NMI, and KVM has not yet intercepted an IRET, then there is nothing
  3235. * more to do at this time as KVM has already enabled IRET intercepts.
  3236. * If KVM has already intercepted IRET, then single-step over the IRET,
  3237. * as NMIs aren't architecturally unmasked until the IRET completes.
  3238. *
  3239. * If vNMI is enabled, KVM should never request an NMI window if NMIs
  3240. * are masked, as KVM allows at most one to-be-injected NMI and one
  3241. * pending NMI. If two NMIs arrive simultaneously, KVM will inject one
  3242. * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are
  3243. * unmasked. KVM _will_ request an NMI window in some situations, e.g.
  3244. * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately
  3245. * inject the NMI. In those situations, KVM needs to single-step over
  3246. * the STI shadow or intercept STGI.
  3247. */
  3248. if (svm_get_nmi_mask(vcpu)) {
  3249. WARN_ON_ONCE(is_vnmi_enabled(svm));
  3250. if (!svm->awaiting_iret_completion)
  3251. return; /* IRET will cause a vm exit */
  3252. }
  3253. /*
  3254. * SEV-ES guests are responsible for signaling when a vCPU is ready to
  3255. * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e.
  3256. * KVM can't intercept and single-step IRET to detect when NMIs are
  3257. * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE.
  3258. *
  3259. * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware
  3260. * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not
  3261. * supported NAEs in the GHCB protocol.
  3262. */
  3263. if (sev_es_guest(vcpu->kvm))
  3264. return;
  3265. if (!gif_set(svm)) {
  3266. if (vgif)
  3267. svm_set_intercept(svm, INTERCEPT_STGI);
  3268. return; /* STGI will cause a vm exit */
  3269. }
  3270. /*
3271. * Something prevents the NMI from being injected. Single-step over the
3272. * blocking condition (IRET, exception injection, or interrupt shadow).
  3273. */
  3274. svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
  3275. svm->nmi_singlestep = true;
  3276. svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
  3277. }
  3278. static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
  3279. {
  3280. struct vcpu_svm *svm = to_svm(vcpu);
  3281. /*
  3282. * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
  3283. * A TLB flush for the current ASID flushes both "host" and "guest" TLB
  3284. * entries, and thus is a superset of Hyper-V's fine grained flushing.
  3285. */
  3286. kvm_hv_vcpu_purge_flush_tlb(vcpu);
  3287. /*
  3288. * Flush only the current ASID even if the TLB flush was invoked via
  3289. * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
  3290. * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
  3291. * unconditionally does a TLB flush on both nested VM-Enter and nested
  3292. * VM-Exit (via kvm_mmu_reset_context()).
  3293. */
  3294. if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
  3295. svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
  3296. else
  3297. svm->current_vmcb->asid_generation--;
  3298. }
  3299. static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
  3300. {
  3301. hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
  3302. /*
  3303. * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
  3304. * flush the NPT mappings via hypercall as flushing the ASID only
  3305. * affects virtual to physical mappings, it does not invalidate guest
  3306. * physical to host physical mappings.
  3307. */
  3308. if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
  3309. hyperv_flush_guest_mapping(root_tdp);
  3310. svm_flush_tlb_asid(vcpu);
  3311. }
  3312. static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
  3313. {
  3314. /*
  3315. * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
  3316. * flushes should be routed to hv_flush_remote_tlbs() without requesting
  3317. * a "regular" remote flush. Reaching this point means either there's
  3318. * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
  3319. * which might be fatal to the guest. Yell, but try to recover.
  3320. */
  3321. if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
  3322. hv_flush_remote_tlbs(vcpu->kvm);
  3323. svm_flush_tlb_asid(vcpu);
  3324. }
  3325. static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
  3326. {
  3327. struct vcpu_svm *svm = to_svm(vcpu);
  3328. invlpga(gva, svm->vmcb->control.asid);
  3329. }
  3330. static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
  3331. {
  3332. struct vcpu_svm *svm = to_svm(vcpu);
  3333. if (nested_svm_virtualize_tpr(vcpu))
  3334. return;
  3335. if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
  3336. int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
  3337. kvm_set_cr8(vcpu, cr8);
  3338. }
  3339. }
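/*
 * Mirror of sync_cr8_to_lapic(): propagate the local APIC's TPR into V_TPR so
 * the guest observes an up-to-date CR8 without exiting. Skipped when the TPR
 * is virtualized by L1 or when APICv/AVIC keeps it in sync in hardware.
 */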
  3340. static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
  3341. {
  3342. struct vcpu_svm *svm = to_svm(vcpu);
  3343. u64 cr8;
  3344. if (nested_svm_virtualize_tpr(vcpu) ||
  3345. kvm_vcpu_apicv_active(vcpu))
  3346. return;
  3347. cr8 = kvm_get_cr8(vcpu);
  3348. svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
  3349. svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
  3350. }
  3351. static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
  3352. int type)
  3353. {
  3354. bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
  3355. bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
  3356. struct vcpu_svm *svm = to_svm(vcpu);
  3357. /*
  3358. * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
  3359. * associated with the original soft exception/interrupt. next_rip is
  3360. * cleared on all exits that can occur while vectoring an event, so KVM
  3361. * needs to manually set next_rip for re-injection. Unlike the !nrips
  3362. * case below, this needs to be done if and only if KVM is re-injecting
  3363. * the same event, i.e. if the event is a soft exception/interrupt,
  3364. * otherwise next_rip is unused on VMRUN.
  3365. */
  3366. if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
  3367. kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
  3368. svm->vmcb->control.next_rip = svm->soft_int_next_rip;
  3369. /*
  3370. * If NRIPS isn't enabled, KVM must manually advance RIP prior to
  3371. * injecting the soft exception/interrupt. That advancement needs to
  3372. * be unwound if vectoring didn't complete. Note, the new event may
  3373. * not be the injected event, e.g. if KVM injected an INTn, the INTn
  3374. * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
  3375. * be the reported vectored event, but RIP still needs to be unwound.
  3376. */
  3377. else if (!nrips && (is_soft || is_exception) &&
  3378. kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
  3379. kvm_rip_write(vcpu, svm->soft_int_old_rip);
  3380. }
  3381. static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
  3382. {
  3383. struct vcpu_svm *svm = to_svm(vcpu);
  3384. u8 vector;
  3385. int type;
  3386. u32 exitintinfo = svm->vmcb->control.exit_int_info;
  3387. bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
  3388. bool soft_int_injected = svm->soft_int_injected;
  3389. svm->nmi_l1_to_l2 = false;
  3390. svm->soft_int_injected = false;
  3391. /*
  3392. * If we've made progress since setting awaiting_iret_completion, we've
  3393. * executed an IRET and can allow NMI injection.
  3394. */
  3395. if (svm->awaiting_iret_completion &&
  3396. kvm_rip_read(vcpu) != svm->nmi_iret_rip) {
  3397. svm->awaiting_iret_completion = false;
  3398. svm->nmi_masked = false;
  3399. kvm_make_request(KVM_REQ_EVENT, vcpu);
  3400. }
  3401. vcpu->arch.nmi_injected = false;
  3402. kvm_clear_exception_queue(vcpu);
  3403. kvm_clear_interrupt_queue(vcpu);
  3404. if (!(exitintinfo & SVM_EXITINTINFO_VALID))
  3405. return;
  3406. kvm_make_request(KVM_REQ_EVENT, vcpu);
  3407. vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
  3408. type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
  3409. if (soft_int_injected)
  3410. svm_complete_soft_interrupt(vcpu, vector, type);
  3411. switch (type) {
  3412. case SVM_EXITINTINFO_TYPE_NMI:
  3413. vcpu->arch.nmi_injected = true;
  3414. svm->nmi_l1_to_l2 = nmi_l1_to_l2;
  3415. break;
  3416. case SVM_EXITINTINFO_TYPE_EXEPT:
  3417. /*
  3418. * Never re-inject a #VC exception.
  3419. */
  3420. if (vector == X86_TRAP_VC)
  3421. break;
  3422. if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
  3423. u32 err = svm->vmcb->control.exit_int_info_err;
  3424. kvm_requeue_exception_e(vcpu, vector, err);
  3425. } else
  3426. kvm_requeue_exception(vcpu, vector);
  3427. break;
  3428. case SVM_EXITINTINFO_TYPE_INTR:
  3429. kvm_queue_interrupt(vcpu, vector, false);
  3430. break;
  3431. case SVM_EXITINTINFO_TYPE_SOFT:
  3432. kvm_queue_interrupt(vcpu, vector, true);
  3433. break;
  3434. default:
  3435. break;
  3436. }
  3437. }
  3438. static void svm_cancel_injection(struct kvm_vcpu *vcpu)
  3439. {
  3440. struct vcpu_svm *svm = to_svm(vcpu);
  3441. struct vmcb_control_area *control = &svm->vmcb->control;
  3442. control->exit_int_info = control->event_inj;
  3443. control->exit_int_info_err = control->event_inj_err;
  3444. control->event_inj = 0;
  3445. svm_complete_interrupts(vcpu);
  3446. }
  3447. static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
  3448. {
  3449. if (to_kvm_sev_info(vcpu->kvm)->need_init)
  3450. return -EINVAL;
  3451. return 1;
  3452. }
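/*
 * Fastpath exit handling, called with IRQs still disabled before the full
 * exit processing: only WRMSR (exit_info_1 != 0) and HLT are candidates, and
 * never while running a nested guest.
 */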
  3453. static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
  3454. {
  3455. struct vcpu_svm *svm = to_svm(vcpu);
  3456. if (is_guest_mode(vcpu))
  3457. return EXIT_FASTPATH_NONE;
  3458. switch (svm->vmcb->control.exit_code) {
  3459. case SVM_EXIT_MSR:
  3460. if (!svm->vmcb->control.exit_info_1)
  3461. break;
  3462. return handle_fastpath_set_msr_irqoff(vcpu);
  3463. case SVM_EXIT_HLT:
  3464. return handle_fastpath_hlt(vcpu);
  3465. default:
  3466. break;
  3467. }
  3468. return EXIT_FASTPATH_NONE;
  3469. }
  3470. static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
  3471. {
  3472. struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
  3473. struct vcpu_svm *svm = to_svm(vcpu);
  3474. guest_state_enter_irqoff();
  3475. amd_clear_divider();
  3476. if (sev_es_guest(vcpu->kvm))
  3477. __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
  3478. sev_es_host_save_area(sd));
  3479. else
  3480. __svm_vcpu_run(svm, spec_ctrl_intercepted);
  3481. guest_state_exit_irqoff();
  3482. }
  3483. static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
  3484. bool force_immediate_exit)
  3485. {
  3486. struct vcpu_svm *svm = to_svm(vcpu);
  3487. bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
  3488. trace_kvm_entry(vcpu, force_immediate_exit);
  3489. svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
  3490. svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
  3491. svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
  3492. /*
  3493. * Disable singlestep if we're injecting an interrupt/exception.
  3494. * We don't want our modified rflags to be pushed on the stack where
  3495. * we might not be able to easily reset them if we disabled NMI
  3496. * singlestep later.
  3497. */
  3498. if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
  3499. /*
  3500. * Event injection happens before external interrupts cause a
  3501. * vmexit and interrupts are disabled here, so smp_send_reschedule
  3502. * is enough to force an immediate vmexit.
  3503. */
  3504. disable_nmi_singlestep(svm);
  3505. force_immediate_exit = true;
  3506. }
  3507. if (force_immediate_exit)
  3508. smp_send_reschedule(vcpu->cpu);
  3509. pre_svm_run(vcpu);
  3510. sync_lapic_to_cr8(vcpu);
  3511. if (unlikely(svm->asid != svm->vmcb->control.asid)) {
  3512. svm->vmcb->control.asid = svm->asid;
  3513. vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
  3514. }
  3515. svm->vmcb->save.cr2 = vcpu->arch.cr2;
  3516. svm_hv_update_vp_id(svm->vmcb, vcpu);
  3517. /*
  3518. * Run with all-zero DR6 unless needed, so that we can get the exact cause
  3519. * of a #DB.
  3520. */
  3521. if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
  3522. svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
  3523. clgi();
  3524. kvm_load_guest_xsave_state(vcpu);
  3525. kvm_wait_lapic_expire(vcpu);
  3526. /*
  3527. * If this vCPU has touched SPEC_CTRL, restore the guest's value if
  3528. * it's non-zero. Since vmentry is serialising on affected CPUs, there
  3529. * is no need to worry about the conditional branch over the wrmsr
  3530. * being speculatively taken.
  3531. */
  3532. if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
  3533. x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
  3534. svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
  3535. if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
  3536. x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
  3537. if (!sev_es_guest(vcpu->kvm)) {
  3538. vcpu->arch.cr2 = svm->vmcb->save.cr2;
  3539. vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
  3540. vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
  3541. vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
  3542. }
  3543. vcpu->arch.regs_dirty = 0;
  3544. if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
  3545. kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
  3546. kvm_load_host_xsave_state(vcpu);
  3547. stgi();
  3548. /* Any pending NMI will happen here */
  3549. if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
  3550. kvm_after_interrupt(vcpu);
  3551. sync_cr8_to_lapic(vcpu);
  3552. svm->next_rip = 0;
  3553. if (is_guest_mode(vcpu)) {
  3554. nested_sync_control_from_vmcb02(svm);
3555. /* Track VMRUNs that have made it past consistency checking */
  3556. if (svm->nested.nested_run_pending &&
  3557. svm->vmcb->control.exit_code != SVM_EXIT_ERR)
  3558. ++vcpu->stat.nested_run;
  3559. svm->nested.nested_run_pending = 0;
  3560. }
  3561. svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
  3562. vmcb_mark_all_clean(svm->vmcb);
3563. /* If the exit was due to a #PF, check for an async #PF. */
  3564. if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
  3565. vcpu->arch.apf.host_apf_flags =
  3566. kvm_read_and_reset_apf_flags();
  3567. vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
  3568. /*
  3569. * We need to handle MC intercepts here before the vcpu has a chance to
  3570. * change the physical cpu
  3571. */
  3572. if (unlikely(svm->vmcb->control.exit_code ==
  3573. SVM_EXIT_EXCP_BASE + MC_VECTOR))
  3574. svm_handle_mce(vcpu);
  3575. trace_kvm_exit(vcpu, KVM_ISA_SVM);
  3576. svm_complete_interrupts(vcpu);
  3577. return svm_exit_handlers_fastpath(vcpu);
  3578. }
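/*
 * Load a new MMU root: with NPT the root goes into nested_cr3 while save.cr3
 * keeps the guest's own CR3; without NPT the shadow page-table root (plus the
 * active PCID, if any) is loaded directly into save.cr3.
 */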
  3579. static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
  3580. int root_level)
  3581. {
  3582. struct vcpu_svm *svm = to_svm(vcpu);
  3583. unsigned long cr3;
  3584. if (npt_enabled) {
  3585. svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
  3586. vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
  3587. hv_track_root_tdp(vcpu, root_hpa);
  3588. cr3 = vcpu->arch.cr3;
  3589. } else if (root_level >= PT64_ROOT_4LEVEL) {
  3590. cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
  3591. } else {
  3592. /* PCID in the guest should be impossible with a 32-bit MMU. */
  3593. WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
  3594. cr3 = root_hpa;
  3595. }
  3596. svm->vmcb->save.cr3 = cr3;
  3597. vmcb_mark_dirty(svm->vmcb, VMCB_CR);
  3598. }
  3599. static void
  3600. svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
  3601. {
  3602. /*
  3603. * Patch in the VMMCALL instruction:
  3604. */
  3605. hypercall[0] = 0x0f;
  3606. hypercall[1] = 0x01;
  3607. hypercall[2] = 0xd9;
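/* 0f 01 d9 encodes VMMCALL (replacing, e.g., Intel's VMCALL, 0f 01 c1). */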
  3608. }
  3609. /*
  3610. * The kvm parameter can be NULL (module initialization, or invocation before
  3611. * VM creation). Be sure to check the kvm parameter before using it.
  3612. */
  3613. static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
  3614. {
  3615. switch (index) {
  3616. case MSR_IA32_MCG_EXT_CTL:
  3617. case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
  3618. return false;
  3619. case MSR_IA32_SMBASE:
  3620. if (!IS_ENABLED(CONFIG_KVM_SMM))
  3621. return false;
  3622. /* SEV-ES guests do not support SMM, so report false */
  3623. if (kvm && sev_es_guest(kvm))
  3624. return false;
  3625. break;
  3626. default:
  3627. break;
  3628. }
  3629. return true;
  3630. }
  3631. static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
  3632. {
  3633. struct vcpu_svm *svm = to_svm(vcpu);
  3634. /*
  3635. * SVM doesn't provide a way to disable just XSAVES in the guest, KVM
3636. * can only disable all variants of XSAVE by disallowing CR4.OSXSAVE from
  3637. * being set. As a result, if the host has XSAVE and XSAVES, and the
  3638. * guest has XSAVE enabled, the guest can execute XSAVES without
  3639. * faulting. Treat XSAVES as enabled in this case regardless of
  3640. * whether it's advertised to the guest so that KVM context switches
  3641. * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give
  3642. * the guest read/write access to the host's XSS.
  3643. */
  3644. if (boot_cpu_has(X86_FEATURE_XSAVE) &&
  3645. boot_cpu_has(X86_FEATURE_XSAVES) &&
  3646. guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
  3647. kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES);
  3648. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS);
  3649. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);
  3650. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);
  3651. /*
  3652. * Intercept VMLOAD if the vCPU model is Intel in order to emulate that
  3653. * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing
  3654. * SVM on Intel is bonkers and extremely unlikely to work).
  3655. */
  3656. if (!guest_cpuid_is_intel_compatible(vcpu))
  3657. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
  3658. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);
  3659. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD);
  3660. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF);
  3661. kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI);
  3662. svm_recalc_instruction_intercepts(vcpu, svm);
  3663. if (boot_cpu_has(X86_FEATURE_IBPB))
  3664. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
  3665. !!guest_has_pred_cmd_msr(vcpu));
  3666. if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
  3667. set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
  3668. !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
  3669. if (sev_guest(vcpu->kvm))
  3670. sev_vcpu_after_set_cpuid(svm);
  3671. init_vmcb_after_set_cpuid(vcpu);
  3672. }
  3673. static bool svm_has_wbinvd_exit(void)
  3674. {
  3675. return true;
  3676. }
  3677. #define PRE_EX(exit) { .exit_code = (exit), \
  3678. .stage = X86_ICPT_PRE_EXCEPT, }
  3679. #define POST_EX(exit) { .exit_code = (exit), \
  3680. .stage = X86_ICPT_POST_EXCEPT, }
  3681. #define POST_MEM(exit) { .exit_code = (exit), \
  3682. .stage = X86_ICPT_POST_MEMACCESS, }
  3683. static const struct __x86_intercept {
  3684. u32 exit_code;
  3685. enum x86_intercept_stage stage;
  3686. } x86_intercept_map[] = {
  3687. [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
  3688. [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
  3689. [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
  3690. [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
  3691. [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
  3692. [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
  3693. [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
  3694. [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
  3695. [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
  3696. [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
  3697. [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
  3698. [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
  3699. [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
  3700. [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
  3701. [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
  3702. [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
  3703. [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
  3704. [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
  3705. [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
  3706. [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
  3707. [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
  3708. [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
  3709. [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
  3710. [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
  3711. [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
  3712. [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
  3713. [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
  3714. [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
  3715. [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
  3716. [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
  3717. [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
  3718. [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
  3719. [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
  3720. [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
  3721. [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
  3722. [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
  3723. [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
  3724. [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
  3725. [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
  3726. [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
  3727. [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
  3728. [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
  3729. [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
  3730. [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
  3731. [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
  3732. [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
  3733. [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
  3734. };
  3735. #undef PRE_EX
  3736. #undef POST_EX
  3737. #undef POST_MEM
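/*
 * Check whether an instruction being emulated on behalf of L2 is intercepted
 * by L1. The emulator's intercept ID is mapped to an SVM exit code, synthetic
 * exit info is built where needed (CR0/DR/MSR/IOIO), and
 * nested_svm_exit_handled() decides whether to reflect a #VMEXIT to L1.
 */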
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;

		if (info->intercept == x86_intercept_cr_write)
			icpt_info.exit_code += info->modrm_reg;

		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
		    info->intercept == x86_intercept_clts)
			break;

		if (!(vmcb12_is_intercept(&svm->nested.ctl,
					  INTERCEPT_SELECTIVE_CR0)))
			break;

		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;

		if (info->intercept == x86_intercept_lmsw) {
			cr0 &= 0xfUL;
			val &= 0xfUL;
			/* lmsw can't clear PE - catch this here */
			if (cr0 & X86_CR0_PE)
				val |= X86_CR0_PE;
		}

		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;

		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * We only get here for NOP, but PAUSE is "rep nop";
		 * check for the REP prefix here.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}
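
/*
 * INTR exits always occur on an instruction boundary; record that fact so
 * that common x86 code can distinguish such exits when accounting preemption.
 */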
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
		vcpu->arch.at_instruction_boundary = true;
}

static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}

#ifdef CONFIG_KVM_SMM
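
/*
 * Per the APM ("Response to SMI"), SMIs are blocked while GIF=0 or while the
 * vCPU is already in SMM.  svm_smi_allowed() additionally refuses immediate
 * injection while a nested VMRUN is pending, or when the SMI should instead
 * trigger a VM-Exit from L2 to L1.
 */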
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Per APM Vol.2 15.22.2 "Response to SMI" */
	if (!gif_set(svm))
		return true;

	return is_smm(vcpu);
}

static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	if (svm_smi_blocked(vcpu))
		return 0;

	/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
		return -EBUSY;

	return 1;
}
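
/*
 * On SMI delivery while L2 is active: record the VMCB12 GPA in SMRAM, emulate
 * a #VMEXIT back to L1, and stash L1 host state in the HSAVE area so that
 * VMCB01 can be reused while in SMM.
 */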
static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map_save;
	int ret;

	if (!is_guest_mode(vcpu))
		return 0;

	/*
	 * 32-bit SMRAM format doesn't preserve EFER and SVM state.  Userspace is
	 * responsible for ensuring nested SVM and SMIs are mutually exclusive.
	 */
	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
		return 1;

	smram->smram64.svm_guest_flag = 1;
	smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;

	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
	if (ret)
		return ret;

	/*
	 * KVM uses VMCB01 to store L1 host state while L2 runs, but
	 * VMCB01 is going to be used during SMM and thus the state will
	 * be lost.  Temporarily save non-VMLOAD/VMSAVE state to the host save
	 * area pointed to by MSR_VM_HSAVE_PA.  The APM guarantees that the
	 * format of the area is identical to the guest save area, offset
	 * by 0x400 (which matches the offset of 'struct vmcb_save_area'
	 * within 'struct vmcb').  Note: the HSAVE area may also be used by
	 * the L1 hypervisor to save additional host context (e.g. KVM does
	 * that, see svm_prepare_switch_to_guest()), which must be preserved.
	 */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		return 1;

	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

	svm_copy_vmrun_state(map_save.hva + 0x400,
			     &svm->vmcb01.ptr->save);

	kvm_vcpu_unmap(vcpu, &map_save, true);
	return 0;
}
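
/*
 * On RSM, if the SMI interrupted L2 (svm_guest_flag set in SMRAM), restore
 * L1 host state from the HSAVE area and re-enter the nested guest from the
 * VMCB12 recorded by svm_enter_smm().
 */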
static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map, map_save;
	struct vmcb *vmcb12;
	int ret;

	const struct kvm_smram_state_64 *smram64 = &smram->smram64;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
		return 0;

	/* Non-zero if SMI arrived while vCPU was in guest mode. */
	if (!smram64->svm_guest_flag)
		return 0;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
		return 1;

	if (!(smram64->efer & EFER_SVME))
		return 1;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
		return 1;

	ret = 1;
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
		goto unmap_map;

	if (svm_allocate_nested(svm))
		goto unmap_save;

	/*
	 * Restore L1 host state from L1 HSAVE area as VMCB01 was
	 * used during SMM (see svm_enter_smm())
	 */
	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);

	/*
	 * Enter the nested guest now
	 */
	vmcb_mark_all_dirty(svm->vmcb01.ptr);

	vmcb12 = map.hva;
	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
	ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);

	if (ret)
		goto unmap_save;

	svm->nested.nested_run_pending = 1;

unmap_save:
	kvm_vcpu_unmap(vcpu, &map_save, true);
unmap_map:
	kvm_vcpu_unmap(vcpu, &map, true);
	return ret;
}

static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!gif_set(svm)) {
		if (vgif)
			svm_set_intercept(svm, INTERCEPT_STGI);
		/* STGI will cause a vm exit */
	} else {
		/* We must be in SMM; RSM will cause a vmexit anyway. */
	}
}
#endif
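
/*
 * Decide whether KVM's emulator may (or must not) handle the current
 * instruction.  Emulation is restricted for SEV guests because guest memory
 * and, for SEV-ES, guest register state are not accessible to KVM.
 */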
static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					 void *insn, int insn_len)
{
	bool smep, smap, is_user;
	u64 error_code;

	/* Emulation is always possible when KVM has access to all guest state. */
	if (!sev_guest(vcpu->kvm))
		return X86EMUL_CONTINUE;

	/* #UD and #GP should never be intercepted for SEV guests. */
	WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
				  EMULTYPE_TRAP_UD_FORCED |
				  EMULTYPE_VMWARE_GP));

	/*
	 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
	 * to guest register state.
	 */
	if (sev_es_guest(vcpu->kvm))
		return X86EMUL_RETRY_INSTR;

	/*
	 * Emulation is possible if the instruction is already decoded, e.g.
	 * when completing I/O after returning from userspace.
	 */
	if (emul_type & EMULTYPE_NO_DECODE)
		return X86EMUL_CONTINUE;

	/*
	 * Emulation is possible for SEV guests if and only if a prefilled
	 * buffer containing the bytes of the intercepted instruction is
	 * available.  SEV guest memory is encrypted with a guest specific key
	 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
	 * decode garbage.
	 *
	 * If KVM is NOT trying to simply skip an instruction, inject #UD if
	 * KVM reached this point without an instruction buffer.  In practice,
	 * this path should never be hit by a well-behaved guest, e.g. KVM
	 * doesn't intercept #UD or #GP for SEV guests, but this path is still
	 * theoretically reachable, e.g. via unaccelerated fault-like AVIC
	 * access, and needs to be handled by KVM to avoid putting the guest
	 * into an infinite loop.  Injecting #UD is somewhat arbitrary, but
	 * it's the least awful option given the lack of insight into the guest.
	 *
	 * If KVM is trying to skip an instruction, simply resume the guest.
	 * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM
	 * will attempt to re-inject the INT3/INTO and skip the instruction.
	 * In that scenario, retrying the INT3/INTO and hoping the guest will
	 * make forward progress is the only option that has a chance of
	 * success (and in practice it will work the vast majority of the time).
	 */
	if (unlikely(!insn)) {
		if (emul_type & EMULTYPE_SKIP)
			return X86EMUL_UNHANDLEABLE;

		kvm_queue_exception(vcpu, UD_VECTOR);
		return X86EMUL_PROPAGATE_FAULT;
	}

	/*
	 * Emulate for SEV guests if the insn buffer is not empty.  The buffer
	 * will be empty if the DecodeAssist microcode cannot fetch bytes for
	 * the faulting instruction because the code fetch itself faulted, e.g.
	 * the guest attempted to fetch from emulated MMIO or a guest page
	 * table used to translate CS:RIP resides in emulated MMIO.
	 */
	if (likely(insn_len))
		return X86EMUL_CONTINUE;

	/*
	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
	 *
	 * Errata:
	 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
	 * possible that CPU microcode implementing DecodeAssist will fail to
	 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
	 * be '0'.  This happens because microcode reads CS:RIP using a _data_
	 * load uop with CPL=0 privileges.  If the load hits a SMAP #PF, ucode
	 * gives up and does not fill the instruction bytes buffer.
	 *
	 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
	 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
	 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
	 * GuestIntrBytes field of the VMCB.
	 *
	 * This does _not_ mean that the erratum has been encountered, as the
	 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
	 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
	 * encountered a reserved/not-present #PF.
	 *
	 * To hit the erratum, the following conditions must be true:
	 *	1. CR4.SMAP=1 (obviously).
	 *	2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
	 *	   have been hit as the guest would have encountered a SMEP
	 *	   violation #PF, not a #NPF.
	 *	3. The #NPF is not due to a code fetch, in which case failure to
	 *	   retrieve the instruction bytes is legitimate (see above).
	 *
	 * In addition, don't apply the erratum workaround if the #NPF occurred
	 * while translating guest page tables (see below).
	 */
	error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
	if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
		goto resume_guest;

	smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
	smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
	is_user = svm_get_cpl(vcpu) == 3;
	if (smap && (!smep || is_user)) {
		pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");

		/*
		 * If the fault occurred in userspace, arbitrarily inject #GP
		 * to avoid killing the guest and to hopefully avoid confusing
		 * the guest kernel too much, e.g. injecting #PF would not be
		 * coherent with respect to the guest's page tables.  Request
		 * triple fault if the fault occurred in the kernel as there's
		 * no fault that KVM can inject without confusing the guest.
		 * In practice, the triple fault is moot as no sane SEV kernel
		 * will execute from user memory while also running with SMAP=1.
		 */
		if (is_user)
			kvm_inject_gp(vcpu, 0);
		else
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return X86EMUL_PROPAGATE_FAULT;
	}

resume_guest:
	/*
	 * If the erratum was not hit, simply resume the guest and let it fault
	 * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
	 * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
	 * userspace will kill the guest, and letting the emulator read garbage
	 * will yield random behavior and potentially corrupt the guest.
	 *
	 * Simply resuming the guest is technically not a violation of the SEV
	 * architecture.  AMD's APM states that all code fetches and page table
	 * accesses for SEV guests are encrypted, regardless of the C-Bit.  The
	 * APM also states that encrypted accesses to MMIO are "ignored", but
	 * doesn't explicitly define "ignored", i.e. doing nothing and letting
	 * the guest spin is technically "ignoring" the access.
	 */
	return X86EMUL_RETRY_INSTR;
}
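
/* INIT signals are held pending while GIF is clear (APM "Global Interrupt Flag"). */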
static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return !gif_set(svm);
}
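
/*
 * For SEV-ES guests KVM can't emulate a SIPI by writing CS:IP directly, as
 * guest register state is encrypted; hand the vector to the SEV-specific
 * handler instead.
 */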
static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
	if (!sev_es_guest(vcpu->kvm))
		return kvm_vcpu_deliver_sipi_vector(vcpu, vector);

	sev_vcpu_deliver_sipi_vector(vcpu, vector);
}

static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);
}
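
/*
 * Per-VM setup: mark SEV-ES/SNP VM types as having protected state (and SNP
 * as having private memory), let PAUSE run in the guest when pause filtering
 * isn't configured, and initialize AVIC state if APICv is enabled.
 */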
static int svm_vm_init(struct kvm *kvm)
{
	int type = kvm->arch.vm_type;

	if (type != KVM_X86_DEFAULT_VM &&
	    type != KVM_X86_SW_PROTECTED_VM) {
		kvm->arch.has_protected_state =
			(type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM);
		to_kvm_sev_info(kvm)->need_init = true;

		kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM);
		kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem;
	}

	if (!pause_filter_count || !pause_filter_thresh)
		kvm->arch.pause_in_guest = true;

	if (enable_apicv) {
		int ret = avic_vm_init(kvm);
		if (ret)
			return ret;
	}

	return 0;
}
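
/*
 * Allocate the per-vCPU APIC backing page via the SNP-safe allocator so the
 * page remains usable when SNP is active.
 */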
static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
{
	struct page *page = snp_safe_alloc_page();

	if (!page)
		return NULL;

	return page_address(page);
}
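
/* Table of SVM callbacks hooked into the generic kvm_x86_ops interface. */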
static struct kvm_x86_ops svm_x86_ops __initdata = {
	.name = KBUILD_MODNAME,

	.check_processor_compatibility = svm_check_processor_compat,

	.hardware_unsetup = svm_hardware_unsetup,
	.enable_virtualization_cpu = svm_enable_virtualization_cpu,
	.disable_virtualization_cpu = svm_disable_virtualization_cpu,
	.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
	.has_emulated_msr = svm_has_emulated_msr,

	.vcpu_create = svm_vcpu_create,
	.vcpu_free = svm_vcpu_free,
	.vcpu_reset = svm_vcpu_reset,

	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = avic_vcpu_blocking,
	.vcpu_unblocking = avic_vcpu_unblocking,

	.update_exception_bitmap = svm_update_exception_bitmap,
	.get_feature_msr = svm_get_feature_msr,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cs_db_l_bits = svm_get_cs_db_l_bits,
	.is_valid_cr0 = svm_is_valid_cr0,
	.set_cr0 = svm_set_cr0,
	.post_set_cr3 = sev_post_set_cr3,
	.is_valid_cr4 = svm_is_valid_cr4,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr6 = svm_set_dr6,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,
	.get_if_flag = svm_get_if_flag,

	.flush_tlb_all = svm_flush_tlb_all,
	.flush_tlb_current = svm_flush_tlb_current,
	.flush_tlb_gva = svm_flush_tlb_gva,
	.flush_tlb_guest = svm_flush_tlb_asid,

	.vcpu_pre_run = svm_vcpu_pre_run,
	.vcpu_run = svm_vcpu_run,
	.handle_exit = svm_handle_exit,
	.skip_emulated_instruction = svm_skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.inject_irq = svm_inject_irq,
	.inject_nmi = svm_inject_nmi,
	.is_vnmi_pending = svm_is_vnmi_pending,
	.set_vnmi_pending = svm_set_vnmi_pending,
	.inject_exception = svm_inject_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = svm_enable_nmi_window,
	.enable_irq_window = svm_enable_irq_window,
	.update_cr8_intercept = svm_update_cr8_intercept,

	.x2apic_icr_is_split = true,
	.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
	.apicv_post_state_restore = avic_apicv_post_state_restore,
	.required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,

	.get_exit_info = svm_get_exit_info,

	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
	.write_tsc_offset = svm_write_tsc_offset,
	.write_tsc_multiplier = svm_write_tsc_multiplier,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.nested_ops = &svm_nested_ops,

	.deliver_interrupt = svm_deliver_interrupt,
	.pi_update_irte = avic_pi_update_irte,
	.setup_mce = svm_setup_mce,

#ifdef CONFIG_KVM_SMM
	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
	.leave_smm = svm_leave_smm,
	.enable_smi_window = svm_enable_smi_window,
#endif

#ifdef CONFIG_KVM_AMD_SEV
	.dev_get_attr = sev_dev_get_attr,
	.mem_enc_ioctl = sev_mem_enc_ioctl,
	.mem_enc_register_region = sev_mem_enc_register_region,
	.mem_enc_unregister_region = sev_mem_enc_unregister_region,
	.guest_memory_reclaimed = sev_guest_memory_reclaimed,

	.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
	.vm_move_enc_context_from = sev_vm_move_enc_context_from,
#endif

	.check_emulate_instruction = svm_check_emulate_instruction,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,

	.msr_filter_changed = svm_msr_filter_changed,
	.complete_emulated_msr = svm_complete_emulated_msr,

	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
	.alloc_apic_backing_page = svm_alloc_apic_backing_page,

	.gmem_prepare = sev_gmem_prepare,
	.gmem_invalidate = sev_gmem_invalidate,
	.private_max_mapping_level = sev_private_max_mapping_level,
};

/*
 * The default MMIO mask is a single bit (excluding the present bit),
 * which could conflict with the memory encryption bit.  Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
	unsigned int enc_bit, mask_bit;
	u64 msr, mask;

	/* If there is no memory encryption support, use existing mask */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return;

	/* If memory encryption is not enabled, use existing mask */
	rdmsrl(MSR_AMD64_SYSCFG, msr);
	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
		return;

	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
	mask_bit = boot_cpu_data.x86_phys_bits;

	/* Increment the mask bit if it is the same as the encryption bit */
	if (enc_bit == mask_bit)
		mask_bit++;

	/*
	 * If the mask bit location is below 52, then some bits above the
	 * physical addressing limit will always be reserved, so use the
	 * rsvd_bits() function to generate the mask.  This mask, along with
	 * the present bit, will be used to generate a page fault with
	 * PFERR.RSVD = 1.
	 *
	 * If the mask bit location is 52 (or above), then clear the mask.
	 */
	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
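
/*
 * Adjust the KVM-supported CPUID capabilities for SVM: advertise the nested
 * SVM features that KVM actually virtualizes, SSBD virtualization, vPMU
 * limits, and the SEV feature leaf.
 */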
static __init void svm_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

	kvm_caps.supported_perf_cap = 0;
	kvm_caps.supported_xss = 0;

	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
		kvm_cpu_cap_set(X86_FEATURE_SVM);
		kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);

		/*
		 * KVM currently flushes TLBs on *every* nested SVM transition,
		 * and so for all intents and purposes KVM supports flushing by
		 * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush.
		 */
		kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID);

		if (nrips)
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);

		if (tsc_scaling)
			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);

		if (vls)
			kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);

		if (lbrv)
			kvm_cpu_cap_set(X86_FEATURE_LBRV);

		if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
			kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);

		if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
			kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);

		if (vgif)
			kvm_cpu_cap_set(X86_FEATURE_VGIF);

		if (vnmi)
			kvm_cpu_cap_set(X86_FEATURE_VNMI);

		/* Nested VM can receive #VMEXIT instead of triggering #GP */
		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
	}

	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	if (enable_pmu) {
		/*
		 * Enumerate support for PERFCTR_CORE if and only if KVM has
		 * access to enough counters to virtualize "core" support,
		 * otherwise limit vPMU support to the legacy number of counters.
		 */
		if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
			kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
							  kvm_pmu_cap.num_counters_gp);
		else
			kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);

		if (kvm_pmu_cap.version != 2 ||
		    !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
			kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
	}

	/* CPUID 0x8000001F (SME/SEV features) */
	sev_set_cpu_caps();

	/* Don't advertise Bus Lock Detect to guest if SVM support is absent */
	kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
}
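
/*
 * One-time module setup: allocate the IOPM, probe optional hardware features
 * (TSC scaling, NPT, AVIC, LBRV, VGIF, vNMI, pause filtering, ...), configure
 * the MMU for nested paging, and initialize per-CPU and SEV state.
 */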
static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
	void *iopm_va;
	int r;
	unsigned int order = get_order(IOPM_SIZE);

	/*
	 * NX is required for shadow paging and for NPT if the NX huge pages
	 * mitigation is enabled.
	 */
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}
	kvm_enable_efer_bits(EFER_NX);

	iopm_pages = alloc_pages(GFP_KERNEL, order);

	if (!iopm_pages)
		return -ENOMEM;

	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
	iopm_base = __sme_page_pa(iopm_pages);

	init_msrpm_offsets();

	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
				     XFEATURE_MASK_BNDCSR);

	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

	if (tsc_scaling) {
		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
			tsc_scaling = false;
		} else {
			pr_info("TSC scaling supported\n");
			kvm_caps.has_tsc_control = true;
		}
	}
	kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
	kvm_caps.tsc_scaling_ratio_frac_bits = 32;

	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

	if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
		kvm_enable_efer_bits(EFER_AUTOIBRS);

	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

	if (nested) {
		pr_info("Nested Virtualization enabled\n");
		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
	}

	/*
	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
	 * NPT isn't supported if the host is using 2-level paging since host
	 * CR4 is unchanged on VMRUN.
	 */
	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
		npt_enabled = false;

	if (!boot_cpu_has(X86_FEATURE_NPT))
		npt_enabled = false;

	/* Force VM NPT level equal to the host's paging level */
	kvm_configure_mmu(npt_enabled, get_npt_level(),
			  get_npt_level(), PG_LEVEL_1G);
	pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");

	/* Setup shadow_me_value and shadow_me_mask */
	kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);

	svm_adjust_mmio_mask();

	nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS);

	if (lbrv) {
		if (!boot_cpu_has(X86_FEATURE_LBRV))
			lbrv = false;
		else
			pr_info("LBR virtualization supported\n");
	}

	/*
	 * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which
	 * may be modified by svm_adjust_mmio_mask()), as well as nrips.
	 */
	sev_hardware_setup();

	svm_hv_hardware_setup();

	for_each_possible_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err;
	}

	enable_apicv = avic = avic && avic_hardware_setup();

	if (!enable_apicv) {
		svm_x86_ops.vcpu_blocking = NULL;
		svm_x86_ops.vcpu_unblocking = NULL;
		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
	} else if (!x2avic_enabled) {
		svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
	}

	if (vls) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
		svm_gp_erratum_intercept = false;

	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

	vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
	if (vnmi)
		pr_info("Virtual NMI enabled\n");

	if (!vnmi) {
		svm_x86_ops.is_vnmi_pending = NULL;
		svm_x86_ops.set_vnmi_pending = NULL;
	}

	if (!enable_pmu)
		pr_info("PMU virtualization is disabled\n");

	svm_set_cpu_caps();

	/*
	 * It seems that on AMD processors the PTE's accessed bit is set by
	 * the CPU hardware before the NPF vmexit.  This is not expected
	 * behaviour and our tests fail because of it.
	 * A workaround here is to disable support for
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
	 * In this case userspace can query support via the
	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle it.
	 * If future AMD CPU models change the behaviour described above,
	 * this variable can be changed accordingly.
	 */
	allow_smaller_maxphyaddr = !npt_enabled;

	return 0;

err:
	svm_hardware_unsetup();
	return r;
}

static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.hardware_setup = svm_hardware_setup,

	.runtime_ops = &svm_x86_ops,
	.pmu_ops = &amd_pmu_ops,
};

static void __svm_exit(void)
{
	kvm_x86_vendor_exit();
}
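
/*
 * Module entry point: register the SVM vendor ops with common x86 KVM code,
 * then perform common KVM initialization, which exposes /dev/kvm.
 */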
static int __init svm_init(void)
{
	int r;

	__unused_size_checks();

	if (!kvm_is_svm_supported())
		return -EOPNOTSUPP;

	r = kvm_x86_vendor_init(&svm_init_ops);
	if (r)
		return r;

	/*
	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
	 * exposed to userspace!
	 */
	r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
		     THIS_MODULE);
	if (r)
		goto err_kvm_init;

	return 0;

err_kvm_init:
	__svm_exit();
	return r;
}

static void __exit svm_exit(void)
{
	kvm_exit();
	__svm_exit();
}

module_init(svm_init)
module_exit(svm_exit)