kvm_main.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine (KVM) Hypervisor
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default halves per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
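
/*
 * Rough summary of how the four knobs above interact, assuming the
 * grow/shrink helpers further down this file (outside this excerpt) behave
 * as in mainline KVM: a successful poll multiplies the per-vCPU window by
 * halt_poll_ns_grow and never lets it sit below halt_poll_ns_grow_start,
 * a wasted poll divides it by halt_poll_ns_shrink (or zeroes it when shrink
 * is 0), and halt_poll_ns acts as the upper bound.
 */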

/*
 * Ordering of locks:
 *
 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */
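
/*
 * Illustrative nesting sketch (not taken from this file): a path that needs
 * both the VM lock and the memslots lock must acquire them outermost-first,
 * e.g.
 *
 *	mutex_lock(&kvm->lock);
 *	mutex_lock(&kvm->slots_lock);
 *	...update memslot-related state...
 *	mutex_unlock(&kvm->slots_lock);
 *	mutex_unlock(&kvm->lock);
 *
 * Taking kvm->lock while already holding kvm->slots_lock inverts the
 * documented order and risks deadlock.
 */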

DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

static struct dentry *kvm_debugfs_dir;

static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int kvm_enable_virtualization(void);
static void kvm_disable_virtualization(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
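
/*
 * cpu_kick_mask (above) is a per-CPU scratch cpumask used by
 * kvm_make_vcpus_request_mask() and kvm_make_all_cpus_request() below to
 * batch the set of remote pCPUs that need an IPI for a single request pass,
 * so kicking many vCPUs costs one smp_call_function_many() instead of one
 * IPI per vCPU.
 */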

__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

bool kvm_is_zone_device_page(struct page *page)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages(). WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (WARN_ON_ONCE(!page_count(page)))
		return false;

	return is_zone_device_page(page);
}

/*
 * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
 * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
 * is likely incomplete, it has been compiled purely through people wanting to
 * back guest memory with a certain type of memory and encountering issues.
 */
struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return NULL;

	page = pfn_to_page(pfn);
	if (!PageReserved(page))
		return page;

	/* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */
	if (is_zero_pfn(pfn))
		return page;

	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (kvm_is_zone_device_page(page))
		return page;

	return NULL;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_kick(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_kick, NULL, wait);
	return true;
}

static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU. But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
		vcpu = kvm_get_vcpu(kvm, i);
		if (!vcpu)
			continue;

		kvm_make_vcpu_request(vcpu, req, cpus, me);
	}

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	unsigned long i;
	bool called;
	int me;

	me = get_cpu();

	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
	cpumask_clear(cpus);

	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_make_vcpu_request(vcpu, req, cpus, me);

	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
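
/*
 * Usage sketch (illustrative, not part of this file): a caller typically
 * posts a request to every vCPU and relies on the IPI batching above, e.g.
 *
 *	kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);
 *
 * while the vCPU side consumes it in its run loop via
 * kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu) before reentering the guest.
 * kvm_flush_remote_tlbs() below is the canonical in-tree example.
 */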

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	++kvm->stat.generic.remote_tlb_flush_requests;

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode. Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlbs(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);

void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
		return;

	/*
	 * Fall back to flushing the entire TLB if the architecture's
	 * range-based TLB invalidation is unsupported or can't be performed
	 * for whatever reason.
	 */
	kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/*
	 * All current use cases for flushing the TLBs for a specific memslot
	 * are related to dirty logging, and many do the TLB flush out of
	 * mmu_lock. The interaction between the various operations on the
	 * memslot must be serialized by slots_lock to ensure the TLB flush
	 * from one operation is observed by any other operation on the same
	 * memslot.
	 */
	lockdep_assert_held(&kvm->slots_lock);
	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}

static void kvm_flush_shadow_all(struct kvm *kvm)
{
	kvm_arch_flush_shadow_all(kvm);
	kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	void *page;

	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);

	page = (void *)__get_free_page(gfp_flags);
	if (page && mc->init_value)
		memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
	return page;
}

int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		/*
		 * Custom init values can be used only for page allocations,
		 * and obviously conflict with __GFP_ZERO.
		 */
		if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
			return -EIO;

		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}

	kvfree(mc->objects);

	mc->objects = NULL;
	mc->capacity = 0;
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif
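
/*
 * Usage-pattern sketch for the caches above (illustrative; the field name
 * vcpu->arch.mmu_page_cache is an assumption borrowed from arch code outside
 * this excerpt): top up while sleeping is still allowed, then allocate under
 * mmu_lock where blocking and failure are not an option.
 *
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache, min);
 *	if (r)
 *		return r;
 *
 *	write_lock(&kvm->mmu_lock);
 *	new_table = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 *	...
 *	write_unlock(&kvm->mmu_lock);
 */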

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
#ifndef __KVM_HAVE_ARCH_WQP
	rcuwait_init(&vcpu->wait);
#endif
	kvm_async_pf_vcpu_init(vcpu);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
	vcpu->last_used_slot = NULL;

	/* Fill the stats id string for the vcpu */
	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
		 task_pid_nr(current), id);
}

static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);
	kvm_dirty_ring_free(&vcpu->dirty_ring);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}

void kvm_destroy_vcpus(struct kvm *kvm)
{
	unsigned long i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_vcpu_destroy(vcpu);
		xa_erase(&kvm->vcpu_array, i);
	}

	atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

typedef void (*on_lock_fn_t)(struct kvm *kvm);

struct kvm_mmu_notifier_range {
	/*
	 * 64-bit addresses, as KVM notifiers can operate on host virtual
	 * addresses (unsigned long) and guest physical addresses (64-bit).
	 */
	u64 start;
	u64 end;
	union kvm_mmu_notifier_arg arg;
	gfn_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
};

/*
 * The inner-most helper returns a tuple containing the return value from the
 * arch- and action-specific handler, plus a flag indicating whether or not at
 * least one memslot was found, i.e. if the handler found guest memory.
 *
 * Note, most notifiers are averse to booleans, so even though KVM tracks the
 * return from arch code as a bool, outer helpers will cast it to an int. :-(
 */
typedef struct kvm_mmu_notifier_return {
	bool ret;
	bool found_memslot;
} kvm_mn_ret_t;

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler. The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{
}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/* Iterate over each memslot intersecting [start, last] (inclusive) range */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
	     node;							     \
	     node = interval_tree_iter_next(node, start, last))

static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm,
							    const struct kvm_mmu_notifier_range *range)
{
	struct kvm_mmu_notifier_return r = {
		.ret = false,
		.found_memslot = false,
	};
	struct kvm_gfn_range gfn_range;
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int i, idx;

	if (WARN_ON_ONCE(range->end <= range->start))
		return r;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return r;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		struct interval_tree_node *node;

		slots = __kvm_memslots(kvm, i);
		kvm_for_each_memslot_in_hva_range(node, slots,
						  range->start, range->end - 1) {
			unsigned long hva_start, hva_end;

			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
			hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
			hva_end = min_t(unsigned long, range->end,
					slot->userspace_addr + (slot->npages << PAGE_SHIFT));

			/*
			 * To optimize for the likely case where the address
			 * range is covered by zero or one memslots, don't
			 * bother making these conditional (to avoid writes on
			 * the second or later invocation of the handler).
			 */
			gfn_range.arg = range->arg;
			gfn_range.may_block = range->may_block;

			/*
			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
			 */
			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
			gfn_range.slot = slot;

			if (!r.found_memslot) {
				r.found_memslot = true;
				KVM_MMU_LOCK(kvm);
				if (!IS_KVM_NULL_FN(range->on_lock))
					range->on_lock(kvm);

				if (IS_KVM_NULL_FN(range->handler))
					goto mmu_unlock;
			}
			r.ret |= range->handler(kvm, &gfn_range);
		}
	}

	if (range->flush_on_ret && r.ret)
		kvm_flush_remote_tlbs(kvm);

mmu_unlock:
	if (r.found_memslot)
		KVM_MMU_UNLOCK(kvm);

	srcu_read_unlock(&kvm->srcu, idx);

	return r;
}

static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
						unsigned long start,
						unsigned long end,
						gfn_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
		.start		= start,
		.end		= end,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= true,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range).ret;
}

static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
							 unsigned long start,
							 unsigned long end,
							 gfn_handler_t handler)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range range = {
		.start		= start,
		.end		= end,
		.handler	= handler,
		.on_lock	= (void *)kvm_null_fn,
		.flush_on_ret	= false,
		.may_block	= false,
	};

	return __kvm_handle_hva_range(kvm, &range).ret;
}
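
/*
 * The only difference between the two wrappers above is flush_on_ret: the
 * plain variant flushes remote TLBs whenever a handler reports that it
 * zapped something, while the _no_flush variant is used by the aging
 * notifiers below (clear_young, test_young), which deliberately leave any
 * stale TLB entries to be cleaned up by the normal flush cadence.
 */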

void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_invalidate_in_progress++;

	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
		kvm->mmu_invalidate_range_start = INVALID_GPA;
		kvm->mmu_invalidate_range_end = INVALID_GPA;
	}
}

void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);

	if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
		kvm->mmu_invalidate_range_start = start;
		kvm->mmu_invalidate_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns. Keep things simple and just find the minimal range
		 * which includes the current and new ranges. As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
		kvm->mmu_invalidate_range_start =
			min(kvm->mmu_invalidate_range_start, start);
		kvm->mmu_invalidate_range_end =
			max(kvm->mmu_invalidate_range_end, end);
	}
}

bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
	return kvm_unmap_gfn_range(kvm, range);
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						   const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= kvm_mmu_unmap_gfn_range,
		.on_lock	= kvm_mmu_invalidate_begin,
		.flush_on_ret	= true,
		.may_block	= mmu_notifier_range_blockable(range),
	};

	trace_kvm_unmap_hva_range(range->start, range->end);

	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions. Without that guarantee, the mmu_invalidate_in_progress
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
	spin_lock(&kvm->mn_invalidate_lock);
	kvm->mn_active_invalidate_count++;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
	 * each cache's lock. There are relatively few caches in existence at
	 * any given time, and the caches themselves can check for hva overlap,
	 * i.e. don't need to rely on memslot overlap checks for performance.
	 * Because this runs without holding mmu_lock, the pfn caches must use
	 * mn_active_invalidate_count (see above) instead of
	 * mmu_invalidate_in_progress.
	 */
	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);

	/*
	 * If one or more memslots were found and thus zapped, notify arch code
	 * that guest memory has been reclaimed. This needs to be done *after*
	 * dropping mmu_lock, as x86's reclaim path is slooooow.
	 */
	if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot)
		kvm_arch_guest_memory_reclaimed(kvm);

	return 0;
}

void kvm_mmu_invalidate_end(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_invalidate_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
	 */
	kvm->mmu_invalidate_in_progress--;
	KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);

	/*
	 * Assert that at least one range was added between start() and end().
	 * Not adding a range isn't fatal, but it is a KVM bug.
	 */
	WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}
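
/*
 * Consumer-side sketch (illustrative; mmu_invalidate_retry() is declared in
 * kvm_host.h, outside this excerpt): page-fault handlers pair with the
 * begin/end bookkeeping above roughly as follows.
 *
 *	mmu_seq = kvm->mmu_invalidate_seq;
 *	smp_rmb();
 *	pfn = ...resolve the fault without holding mmu_lock...;
 *
 *	write_lock(&kvm->mmu_lock);
 *	if (mmu_invalidate_retry(kvm, mmu_seq))
 *		goto retry;	(an invalidation raced with the fault)
 *	...install the mapping...
 *	write_unlock(&kvm->mmu_lock);
 */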

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	const struct kvm_mmu_notifier_range hva_range = {
		.start		= range->start,
		.end		= range->end,
		.handler	= (void *)kvm_null_fn,
		.on_lock	= kvm_mmu_invalidate_end,
		.flush_on_ret	= false,
		.may_block	= mmu_notifier_range_blockable(range),
	};
	bool wake;

	__kvm_handle_hva_range(kvm, &hva_range);

	/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
		--kvm->mn_active_invalidate_count;
	wake = !kvm->mn_active_invalidate_count;
	spin_unlock(&kvm->mn_invalidate_lock);

	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	trace_kvm_age_hva(start, end);

	return kvm_handle_hva_range(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	trace_kvm_age_hva(start, end);

	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	trace_kvm_test_age_hva(address);

	return kvm_handle_hva_range_no_flush(mn, address, address + 1,
					     kvm_test_age_gfn);
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
static int kvm_pm_notifier_call(struct notifier_block *bl,
				unsigned long state,
				void *unused)
{
	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);

	return kvm_arch_pm_notifier(kvm, state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
	/* Suspend KVM before we suspend ftrace, RCU, etc. */
	kvm->pm_notifier.priority = INT_MAX;
	register_pm_notifier(&kvm->pm_notifier);
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
	unregister_pm_notifier(&kvm->pm_notifier);
}
#else  /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	vfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}
  819. /* This does not remove the slot from struct kvm_memslots data structures */
  820. static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
  821. {
  822. if (slot->flags & KVM_MEM_GUEST_MEMFD)
  823. kvm_gmem_unbind(slot);
  824. kvm_destroy_dirty_bitmap(slot);
  825. kvm_arch_free_memslot(kvm, slot);
  826. kfree(slot);
  827. }
  828. static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
  829. {
  830. struct hlist_node *idnode;
  831. struct kvm_memory_slot *memslot;
  832. int bkt;
  833. /*
834. * The same memslot objects live in both active and inactive sets;
835. * arbitrarily free using index '1' so that the second invocation of this
  836. * function isn't operating over a structure with dangling pointers
  837. * (even though this function isn't actually touching them).
  838. */
  839. if (!slots->node_idx)
  840. return;
  841. hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
  842. kvm_free_memslot(kvm, memslot);
  843. }
  844. static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
  845. {
  846. switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
  847. case KVM_STATS_TYPE_INSTANT:
  848. return 0444;
  849. case KVM_STATS_TYPE_CUMULATIVE:
  850. case KVM_STATS_TYPE_PEAK:
  851. default:
  852. return 0644;
  853. }
  854. }
  855. static void kvm_destroy_vm_debugfs(struct kvm *kvm)
  856. {
  857. int i;
  858. int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
  859. kvm_vcpu_stats_header.num_desc;
  860. if (IS_ERR(kvm->debugfs_dentry))
  861. return;
  862. debugfs_remove_recursive(kvm->debugfs_dentry);
  863. if (kvm->debugfs_stat_data) {
  864. for (i = 0; i < kvm_debugfs_num_entries; i++)
  865. kfree(kvm->debugfs_stat_data[i]);
  866. kfree(kvm->debugfs_stat_data);
  867. }
  868. }
  869. static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
  870. {
  871. static DEFINE_MUTEX(kvm_debugfs_lock);
  872. struct dentry *dent;
  873. char dir_name[ITOA_MAX_LEN * 2];
  874. struct kvm_stat_data *stat_data;
  875. const struct _kvm_stats_desc *pdesc;
  876. int i, ret = -ENOMEM;
  877. int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
  878. kvm_vcpu_stats_header.num_desc;
  879. if (!debugfs_initialized())
  880. return 0;
  881. snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
  882. mutex_lock(&kvm_debugfs_lock);
  883. dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
  884. if (dent) {
  885. pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
  886. dput(dent);
  887. mutex_unlock(&kvm_debugfs_lock);
  888. return 0;
  889. }
  890. dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
  891. mutex_unlock(&kvm_debugfs_lock);
  892. if (IS_ERR(dent))
  893. return 0;
  894. kvm->debugfs_dentry = dent;
  895. kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
  896. sizeof(*kvm->debugfs_stat_data),
  897. GFP_KERNEL_ACCOUNT);
  898. if (!kvm->debugfs_stat_data)
  899. goto out_err;
  900. for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
  901. pdesc = &kvm_vm_stats_desc[i];
  902. stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
  903. if (!stat_data)
  904. goto out_err;
  905. stat_data->kvm = kvm;
  906. stat_data->desc = pdesc;
  907. stat_data->kind = KVM_STAT_VM;
  908. kvm->debugfs_stat_data[i] = stat_data;
  909. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  910. kvm->debugfs_dentry, stat_data,
  911. &stat_fops_per_vm);
  912. }
  913. for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
  914. pdesc = &kvm_vcpu_stats_desc[i];
  915. stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
  916. if (!stat_data)
  917. goto out_err;
  918. stat_data->kvm = kvm;
  919. stat_data->desc = pdesc;
  920. stat_data->kind = KVM_STAT_VCPU;
  921. kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
  922. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  923. kvm->debugfs_dentry, stat_data,
  924. &stat_fops_per_vm);
  925. }
  926. kvm_arch_create_vm_debugfs(kvm);
  927. return 0;
  928. out_err:
  929. kvm_destroy_vm_debugfs(kvm);
  930. return ret;
  931. }
  932. /*
  933. * Called after the VM is otherwise initialized, but just before adding it to
  934. * the vm_list.
  935. */
  936. int __weak kvm_arch_post_init_vm(struct kvm *kvm)
  937. {
  938. return 0;
  939. }
  940. /*
  941. * Called just after removing the VM from the vm_list, but before doing any
  942. * other destruction.
  943. */
  944. void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
  945. {
  946. }
  947. /*
948. * Called after the per-VM debugfs directory is created. At that point
949. * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
950. * be created under it. Cleanup is done automatically and recursively by
951. * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
  952. */
  953. void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
  954. {
  955. }
  956. static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
  957. {
  958. struct kvm *kvm = kvm_arch_alloc_vm();
  959. struct kvm_memslots *slots;
  960. int r, i, j;
  961. if (!kvm)
  962. return ERR_PTR(-ENOMEM);
  963. KVM_MMU_LOCK_INIT(kvm);
  964. mmgrab(current->mm);
  965. kvm->mm = current->mm;
  966. kvm_eventfd_init(kvm);
  967. mutex_init(&kvm->lock);
  968. mutex_init(&kvm->irq_lock);
  969. mutex_init(&kvm->slots_lock);
  970. mutex_init(&kvm->slots_arch_lock);
  971. spin_lock_init(&kvm->mn_invalidate_lock);
  972. rcuwait_init(&kvm->mn_memslots_update_rcuwait);
  973. xa_init(&kvm->vcpu_array);
  974. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  975. xa_init(&kvm->mem_attr_array);
  976. #endif
  977. INIT_LIST_HEAD(&kvm->gpc_list);
  978. spin_lock_init(&kvm->gpc_lock);
  979. INIT_LIST_HEAD(&kvm->devices);
  980. kvm->max_vcpus = KVM_MAX_VCPUS;
  981. BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
  982. /*
  983. * Force subsequent debugfs file creations to fail if the VM directory
  984. * is not created (by kvm_create_vm_debugfs()).
  985. */
  986. kvm->debugfs_dentry = ERR_PTR(-ENOENT);
  987. snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
  988. task_pid_nr(current));
  989. r = -ENOMEM;
  990. if (init_srcu_struct(&kvm->srcu))
  991. goto out_err_no_srcu;
  992. if (init_srcu_struct(&kvm->irq_srcu))
  993. goto out_err_no_irq_srcu;
  994. r = kvm_init_irq_routing(kvm);
  995. if (r)
  996. goto out_err_no_irq_routing;
  997. refcount_set(&kvm->users_count, 1);
  998. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  999. for (j = 0; j < 2; j++) {
  1000. slots = &kvm->__memslots[i][j];
  1001. atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
  1002. slots->hva_tree = RB_ROOT_CACHED;
  1003. slots->gfn_tree = RB_ROOT;
  1004. hash_init(slots->id_hash);
  1005. slots->node_idx = j;
  1006. /* Generations must be different for each address space. */
  1007. slots->generation = i;
  1008. }
  1009. rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
  1010. }
  1011. r = -ENOMEM;
  1012. for (i = 0; i < KVM_NR_BUSES; i++) {
  1013. rcu_assign_pointer(kvm->buses[i],
  1014. kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
  1015. if (!kvm->buses[i])
  1016. goto out_err_no_arch_destroy_vm;
  1017. }
  1018. r = kvm_arch_init_vm(kvm, type);
  1019. if (r)
  1020. goto out_err_no_arch_destroy_vm;
  1021. r = kvm_enable_virtualization();
  1022. if (r)
  1023. goto out_err_no_disable;
  1024. #ifdef CONFIG_HAVE_KVM_IRQCHIP
  1025. INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
  1026. #endif
  1027. r = kvm_init_mmu_notifier(kvm);
  1028. if (r)
  1029. goto out_err_no_mmu_notifier;
  1030. r = kvm_coalesced_mmio_init(kvm);
  1031. if (r < 0)
  1032. goto out_no_coalesced_mmio;
  1033. r = kvm_create_vm_debugfs(kvm, fdname);
  1034. if (r)
  1035. goto out_err_no_debugfs;
  1036. r = kvm_arch_post_init_vm(kvm);
  1037. if (r)
  1038. goto out_err;
  1039. mutex_lock(&kvm_lock);
  1040. list_add(&kvm->vm_list, &vm_list);
  1041. mutex_unlock(&kvm_lock);
  1042. preempt_notifier_inc();
  1043. kvm_init_pm_notifier(kvm);
  1044. return kvm;
  1045. out_err:
  1046. kvm_destroy_vm_debugfs(kvm);
  1047. out_err_no_debugfs:
  1048. kvm_coalesced_mmio_free(kvm);
  1049. out_no_coalesced_mmio:
  1050. #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
  1051. if (kvm->mmu_notifier.ops)
  1052. mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
  1053. #endif
  1054. out_err_no_mmu_notifier:
  1055. kvm_disable_virtualization();
  1056. out_err_no_disable:
  1057. kvm_arch_destroy_vm(kvm);
  1058. out_err_no_arch_destroy_vm:
  1059. WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
  1060. for (i = 0; i < KVM_NR_BUSES; i++)
  1061. kfree(kvm_get_bus(kvm, i));
  1062. kvm_free_irq_routing(kvm);
  1063. out_err_no_irq_routing:
  1064. cleanup_srcu_struct(&kvm->irq_srcu);
  1065. out_err_no_irq_srcu:
  1066. cleanup_srcu_struct(&kvm->srcu);
  1067. out_err_no_srcu:
  1068. kvm_arch_free_vm(kvm);
  1069. mmdrop(current->mm);
  1070. return ERR_PTR(r);
  1071. }
  1072. static void kvm_destroy_devices(struct kvm *kvm)
  1073. {
  1074. struct kvm_device *dev, *tmp;
  1075. /*
  1076. * We do not need to take the kvm->lock here, because nobody else
  1077. * has a reference to the struct kvm at this point and therefore
  1078. * cannot access the devices list anyhow.
  1079. *
  1080. * The device list is generally managed as an rculist, but list_del()
  1081. * is used intentionally here. If a bug in KVM introduced a reader that
  1082. * was not backed by a reference on the kvm struct, the hope is that
  1083. * it'd consume the poisoned forward pointer instead of suffering a
  1084. * use-after-free, even though this cannot be guaranteed.
  1085. */
  1086. list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
  1087. list_del(&dev->vm_node);
  1088. dev->ops->destroy(dev);
  1089. }
  1090. }
  1091. static void kvm_destroy_vm(struct kvm *kvm)
  1092. {
  1093. int i;
  1094. struct mm_struct *mm = kvm->mm;
  1095. kvm_destroy_pm_notifier(kvm);
  1096. kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
  1097. kvm_destroy_vm_debugfs(kvm);
  1098. kvm_arch_sync_events(kvm);
  1099. mutex_lock(&kvm_lock);
  1100. list_del(&kvm->vm_list);
  1101. mutex_unlock(&kvm_lock);
  1102. kvm_arch_pre_destroy_vm(kvm);
  1103. kvm_free_irq_routing(kvm);
  1104. for (i = 0; i < KVM_NR_BUSES; i++) {
  1105. struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
  1106. if (bus)
  1107. kvm_io_bus_destroy(bus);
  1108. kvm->buses[i] = NULL;
  1109. }
  1110. kvm_coalesced_mmio_free(kvm);
  1111. #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
  1112. mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
  1113. /*
  1114. * At this point, pending calls to invalidate_range_start()
  1115. * have completed but no more MMU notifiers will run, so
  1116. * mn_active_invalidate_count may remain unbalanced.
  1117. * No threads can be waiting in kvm_swap_active_memslots() as the
  1118. * last reference on KVM has been dropped, but freeing
  1119. * memslots would deadlock without this manual intervention.
  1120. *
  1121. * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
  1122. * notifier between a start() and end(), then there shouldn't be any
  1123. * in-progress invalidations.
  1124. */
  1125. WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
  1126. if (kvm->mn_active_invalidate_count)
  1127. kvm->mn_active_invalidate_count = 0;
  1128. else
  1129. WARN_ON(kvm->mmu_invalidate_in_progress);
  1130. #else
  1131. kvm_flush_shadow_all(kvm);
  1132. #endif
  1133. kvm_arch_destroy_vm(kvm);
  1134. kvm_destroy_devices(kvm);
  1135. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  1136. kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
  1137. kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
  1138. }
  1139. cleanup_srcu_struct(&kvm->irq_srcu);
  1140. cleanup_srcu_struct(&kvm->srcu);
  1141. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  1142. xa_destroy(&kvm->mem_attr_array);
  1143. #endif
  1144. kvm_arch_free_vm(kvm);
  1145. preempt_notifier_dec();
  1146. kvm_disable_virtualization();
  1147. mmdrop(mm);
  1148. }
  1149. void kvm_get_kvm(struct kvm *kvm)
  1150. {
  1151. refcount_inc(&kvm->users_count);
  1152. }
  1153. EXPORT_SYMBOL_GPL(kvm_get_kvm);
  1154. /*
1155. * Make sure the VM is not in the middle of destruction; this is a safe version
1156. * of kvm_get_kvm(). Returns true if kvm was referenced successfully, false otherwise.
  1157. */
  1158. bool kvm_get_kvm_safe(struct kvm *kvm)
  1159. {
  1160. return refcount_inc_not_zero(&kvm->users_count);
  1161. }
  1162. EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
  1163. void kvm_put_kvm(struct kvm *kvm)
  1164. {
  1165. if (refcount_dec_and_test(&kvm->users_count))
  1166. kvm_destroy_vm(kvm);
  1167. }
  1168. EXPORT_SYMBOL_GPL(kvm_put_kvm);
  1169. /*
  1170. * Used to put a reference that was taken on behalf of an object associated
  1171. * with a user-visible file descriptor, e.g. a vcpu or device, if installation
  1172. * of the new file descriptor fails and the reference cannot be transferred to
  1173. * its final owner. In such cases, the caller is still actively using @kvm and
  1174. * will fail miserably if the refcount unexpectedly hits zero.
  1175. */
  1176. void kvm_put_kvm_no_destroy(struct kvm *kvm)
  1177. {
  1178. WARN_ON(refcount_dec_and_test(&kvm->users_count));
  1179. }
  1180. EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
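/*
 * Illustrative sketch, not part of kvm_main.c: the usual pairing for code
 * that holds a kvm pointer without owning a reference, e.g. deferred work.
 * The function below is hypothetical.
 */
static void example_deferred_work(struct kvm *kvm)
{
	/* Bail if the VM is already on its way into kvm_destroy_vm(). */
	if (!kvm_get_kvm_safe(kvm))
		return;

	/* ... it is now safe to dereference @kvm ... */

	/* Drop the reference; this tears the VM down if it was the last one. */
	kvm_put_kvm(kvm);
}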
  1181. static int kvm_vm_release(struct inode *inode, struct file *filp)
  1182. {
  1183. struct kvm *kvm = filp->private_data;
  1184. kvm_irqfd_release(kvm);
  1185. kvm_put_kvm(kvm);
  1186. return 0;
  1187. }
  1188. /*
  1189. * Allocation size is twice as large as the actual dirty bitmap size.
1190. * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
  1191. */
  1192. static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
  1193. {
  1194. unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
  1195. memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
  1196. if (!memslot->dirty_bitmap)
  1197. return -ENOMEM;
  1198. return 0;
  1199. }
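/*
 * Illustrative sketch, not part of kvm_main.c: the "second" bitmap lives in
 * the upper half of the 2x allocation above; this is roughly what the
 * kvm_second_dirty_bitmap() helper used later in this file amounts to.
 */
static unsigned long *second_dirty_bitmap_sketch(struct kvm_memory_slot *memslot)
{
	unsigned long len = kvm_dirty_bitmap_bytes(memslot);

	return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
}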
  1200. static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
  1201. {
  1202. struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
  1203. int node_idx_inactive = active->node_idx ^ 1;
  1204. return &kvm->__memslots[as_id][node_idx_inactive];
  1205. }
  1206. /*
  1207. * Helper to get the address space ID when one of memslot pointers may be NULL.
1208. * This also serves as a sanity check that at least one of the pointers is non-NULL,
  1209. * and that their address space IDs don't diverge.
  1210. */
  1211. static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
  1212. struct kvm_memory_slot *b)
  1213. {
  1214. if (WARN_ON_ONCE(!a && !b))
  1215. return 0;
  1216. if (!a)
  1217. return b->as_id;
  1218. if (!b)
  1219. return a->as_id;
  1220. WARN_ON_ONCE(a->as_id != b->as_id);
  1221. return a->as_id;
  1222. }
  1223. static void kvm_insert_gfn_node(struct kvm_memslots *slots,
  1224. struct kvm_memory_slot *slot)
  1225. {
  1226. struct rb_root *gfn_tree = &slots->gfn_tree;
  1227. struct rb_node **node, *parent;
  1228. int idx = slots->node_idx;
  1229. parent = NULL;
  1230. for (node = &gfn_tree->rb_node; *node; ) {
  1231. struct kvm_memory_slot *tmp;
  1232. tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
  1233. parent = *node;
  1234. if (slot->base_gfn < tmp->base_gfn)
  1235. node = &(*node)->rb_left;
  1236. else if (slot->base_gfn > tmp->base_gfn)
  1237. node = &(*node)->rb_right;
  1238. else
  1239. BUG();
  1240. }
  1241. rb_link_node(&slot->gfn_node[idx], parent, node);
  1242. rb_insert_color(&slot->gfn_node[idx], gfn_tree);
  1243. }
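/*
 * Illustrative sketch, not part of kvm_main.c: a lookup over the same gfn
 * rb-tree layout, mirroring the traversal in kvm_insert_gfn_node() above.
 * KVM's real lookups go through search_memslots()/gfn_to_memslot(); this
 * only shows the idea.  Memslots never overlap in gfn space, so descending
 * by base_gfn and checking the candidate's range is sufficient.
 */
static struct kvm_memory_slot *
gfn_tree_lookup_sketch(struct kvm_memslots *slots, gfn_t gfn)
{
	struct rb_node *node = slots->gfn_tree.rb_node;
	int idx = slots->node_idx;

	while (node) {
		struct kvm_memory_slot *tmp =
			container_of(node, struct kvm_memory_slot, gfn_node[idx]);

		if (gfn < tmp->base_gfn)
			node = node->rb_left;
		else if (gfn >= tmp->base_gfn + tmp->npages)
			node = node->rb_right;
		else
			return tmp;
	}
	return NULL;
}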
  1244. static void kvm_erase_gfn_node(struct kvm_memslots *slots,
  1245. struct kvm_memory_slot *slot)
  1246. {
  1247. rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
  1248. }
  1249. static void kvm_replace_gfn_node(struct kvm_memslots *slots,
  1250. struct kvm_memory_slot *old,
  1251. struct kvm_memory_slot *new)
  1252. {
  1253. int idx = slots->node_idx;
  1254. WARN_ON_ONCE(old->base_gfn != new->base_gfn);
  1255. rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
  1256. &slots->gfn_tree);
  1257. }
  1258. /*
  1259. * Replace @old with @new in the inactive memslots.
  1260. *
  1261. * With NULL @old this simply adds @new.
  1262. * With NULL @new this simply removes @old.
  1263. *
  1264. * If @new is non-NULL its hva_node[slots_idx] range has to be set
  1265. * appropriately.
  1266. */
  1267. static void kvm_replace_memslot(struct kvm *kvm,
  1268. struct kvm_memory_slot *old,
  1269. struct kvm_memory_slot *new)
  1270. {
  1271. int as_id = kvm_memslots_get_as_id(old, new);
  1272. struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
  1273. int idx = slots->node_idx;
  1274. if (old) {
  1275. hash_del(&old->id_node[idx]);
  1276. interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
  1277. if ((long)old == atomic_long_read(&slots->last_used_slot))
  1278. atomic_long_set(&slots->last_used_slot, (long)new);
  1279. if (!new) {
  1280. kvm_erase_gfn_node(slots, old);
  1281. return;
  1282. }
  1283. }
  1284. /*
  1285. * Initialize @new's hva range. Do this even when replacing an @old
1286. * slot; kvm_copy_memslot() deliberately does not touch node data.
  1287. */
  1288. new->hva_node[idx].start = new->userspace_addr;
  1289. new->hva_node[idx].last = new->userspace_addr +
  1290. (new->npages << PAGE_SHIFT) - 1;
  1291. /*
1292. * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), so
1293. * hva_node needs to be swapped with remove+insert even though the hva can't
1294. * change when replacing an existing slot.
  1295. */
  1296. hash_add(slots->id_hash, &new->id_node[idx], new->id);
  1297. interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
  1298. /*
  1299. * If the memslot gfn is unchanged, rb_replace_node() can be used to
  1300. * switch the node in the gfn tree instead of removing the old and
  1301. * inserting the new as two separate operations. Replacement is a
  1302. * single O(1) operation versus two O(log(n)) operations for
  1303. * remove+insert.
  1304. */
  1305. if (old && old->base_gfn == new->base_gfn) {
  1306. kvm_replace_gfn_node(slots, old, new);
  1307. } else {
  1308. if (old)
  1309. kvm_erase_gfn_node(slots, old);
  1310. kvm_insert_gfn_node(slots, new);
  1311. }
  1312. }
  1313. /*
  1314. * Flags that do not access any of the extra space of struct
  1315. * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS
  1316. * only allows these.
  1317. */
  1318. #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
  1319. (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
  1320. static int check_memory_region_flags(struct kvm *kvm,
  1321. const struct kvm_userspace_memory_region2 *mem)
  1322. {
  1323. u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
  1324. if (kvm_arch_has_private_mem(kvm))
  1325. valid_flags |= KVM_MEM_GUEST_MEMFD;
1326. /* Dirty logging of private memory is not currently supported. */
  1327. if (mem->flags & KVM_MEM_GUEST_MEMFD)
  1328. valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
  1329. /*
  1330. * GUEST_MEMFD is incompatible with read-only memslots, as writes to
  1331. * read-only memslots have emulated MMIO, not page fault, semantics,
  1332. * and KVM doesn't allow emulated MMIO for private memory.
  1333. */
  1334. if (kvm_arch_has_readonly_mem(kvm) &&
  1335. !(mem->flags & KVM_MEM_GUEST_MEMFD))
  1336. valid_flags |= KVM_MEM_READONLY;
  1337. if (mem->flags & ~valid_flags)
  1338. return -EINVAL;
  1339. return 0;
  1340. }
  1341. static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
  1342. {
  1343. struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1344. /* Grab the generation from the active memslots. */
  1345. u64 gen = __kvm_memslots(kvm, as_id)->generation;
  1346. WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
  1347. slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
  1348. /*
  1349. * Do not store the new memslots while there are invalidations in
  1350. * progress, otherwise the locking in invalidate_range_start and
  1351. * invalidate_range_end will be unbalanced.
  1352. */
  1353. spin_lock(&kvm->mn_invalidate_lock);
  1354. prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
  1355. while (kvm->mn_active_invalidate_count) {
  1356. set_current_state(TASK_UNINTERRUPTIBLE);
  1357. spin_unlock(&kvm->mn_invalidate_lock);
  1358. schedule();
  1359. spin_lock(&kvm->mn_invalidate_lock);
  1360. }
  1361. finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
  1362. rcu_assign_pointer(kvm->memslots[as_id], slots);
  1363. spin_unlock(&kvm->mn_invalidate_lock);
  1364. /*
1365. * Acquired in kvm_set_memslot(). Must be released before the SRCU
1366. * synchronization below in order to avoid a deadlock with another thread
1367. * acquiring the slots_arch_lock in an SRCU read-side critical section.
  1368. */
  1369. mutex_unlock(&kvm->slots_arch_lock);
  1370. synchronize_srcu_expedited(&kvm->srcu);
  1371. /*
  1372. * Increment the new memslot generation a second time, dropping the
  1373. * update in-progress flag and incrementing the generation based on
  1374. * the number of address spaces. This provides a unique and easily
  1375. * identifiable generation number while the memslots are in flux.
  1376. */
  1377. gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
  1378. /*
  1379. * Generations must be unique even across address spaces. We do not need
1380. * a global counter for that; instead, the generation space is evenly split
  1381. * across address spaces. For example, with two address spaces, address
  1382. * space 0 will use generations 0, 2, 4, ... while address space 1 will
  1383. * use generations 1, 3, 5, ...
  1384. */
  1385. gen += kvm_arch_nr_memslot_as_ids(kvm);
  1386. kvm_arch_memslots_updated(kvm, gen);
  1387. slots->generation = gen;
  1388. }
  1389. static int kvm_prepare_memory_region(struct kvm *kvm,
  1390. const struct kvm_memory_slot *old,
  1391. struct kvm_memory_slot *new,
  1392. enum kvm_mr_change change)
  1393. {
  1394. int r;
  1395. /*
  1396. * If dirty logging is disabled, nullify the bitmap; the old bitmap
  1397. * will be freed on "commit". If logging is enabled in both old and
  1398. * new, reuse the existing bitmap. If logging is enabled only in the
  1399. * new and KVM isn't using a ring buffer, allocate and initialize a
  1400. * new bitmap.
  1401. */
  1402. if (change != KVM_MR_DELETE) {
  1403. if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
  1404. new->dirty_bitmap = NULL;
  1405. else if (old && old->dirty_bitmap)
  1406. new->dirty_bitmap = old->dirty_bitmap;
  1407. else if (kvm_use_dirty_bitmap(kvm)) {
  1408. r = kvm_alloc_dirty_bitmap(new);
  1409. if (r)
  1410. return r;
  1411. if (kvm_dirty_log_manual_protect_and_init_set(kvm))
  1412. bitmap_set(new->dirty_bitmap, 0, new->npages);
  1413. }
  1414. }
  1415. r = kvm_arch_prepare_memory_region(kvm, old, new, change);
  1416. /* Free the bitmap on failure if it was allocated above. */
  1417. if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
  1418. kvm_destroy_dirty_bitmap(new);
  1419. return r;
  1420. }
  1421. static void kvm_commit_memory_region(struct kvm *kvm,
  1422. struct kvm_memory_slot *old,
  1423. const struct kvm_memory_slot *new,
  1424. enum kvm_mr_change change)
  1425. {
  1426. int old_flags = old ? old->flags : 0;
  1427. int new_flags = new ? new->flags : 0;
  1428. /*
  1429. * Update the total number of memslot pages before calling the arch
  1430. * hook so that architectures can consume the result directly.
  1431. */
  1432. if (change == KVM_MR_DELETE)
  1433. kvm->nr_memslot_pages -= old->npages;
  1434. else if (change == KVM_MR_CREATE)
  1435. kvm->nr_memslot_pages += new->npages;
  1436. if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
  1437. int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
  1438. atomic_set(&kvm->nr_memslots_dirty_logging,
  1439. atomic_read(&kvm->nr_memslots_dirty_logging) + change);
  1440. }
  1441. kvm_arch_commit_memory_region(kvm, old, new, change);
  1442. switch (change) {
  1443. case KVM_MR_CREATE:
  1444. /* Nothing more to do. */
  1445. break;
  1446. case KVM_MR_DELETE:
  1447. /* Free the old memslot and all its metadata. */
  1448. kvm_free_memslot(kvm, old);
  1449. break;
  1450. case KVM_MR_MOVE:
  1451. case KVM_MR_FLAGS_ONLY:
  1452. /*
  1453. * Free the dirty bitmap as needed; the below check encompasses
1454. * both the flags and whether a ring buffer is being used.
  1455. */
  1456. if (old->dirty_bitmap && !new->dirty_bitmap)
  1457. kvm_destroy_dirty_bitmap(old);
  1458. /*
  1459. * The final quirk. Free the detached, old slot, but only its
  1460. * memory, not any metadata. Metadata, including arch specific
  1461. * data, may be reused by @new.
  1462. */
  1463. kfree(old);
  1464. break;
  1465. default:
  1466. BUG();
  1467. }
  1468. }
  1469. /*
  1470. * Activate @new, which must be installed in the inactive slots by the caller,
  1471. * by swapping the active slots and then propagating @new to @old once @old is
  1472. * unreachable and can be safely modified.
  1473. *
  1474. * With NULL @old this simply adds @new to @active (while swapping the sets).
  1475. * With NULL @new this simply removes @old from @active and frees it
  1476. * (while also swapping the sets).
  1477. */
  1478. static void kvm_activate_memslot(struct kvm *kvm,
  1479. struct kvm_memory_slot *old,
  1480. struct kvm_memory_slot *new)
  1481. {
  1482. int as_id = kvm_memslots_get_as_id(old, new);
  1483. kvm_swap_active_memslots(kvm, as_id);
  1484. /* Propagate the new memslot to the now inactive memslots. */
  1485. kvm_replace_memslot(kvm, old, new);
  1486. }
  1487. static void kvm_copy_memslot(struct kvm_memory_slot *dest,
  1488. const struct kvm_memory_slot *src)
  1489. {
  1490. dest->base_gfn = src->base_gfn;
  1491. dest->npages = src->npages;
  1492. dest->dirty_bitmap = src->dirty_bitmap;
  1493. dest->arch = src->arch;
  1494. dest->userspace_addr = src->userspace_addr;
  1495. dest->flags = src->flags;
  1496. dest->id = src->id;
  1497. dest->as_id = src->as_id;
  1498. }
  1499. static void kvm_invalidate_memslot(struct kvm *kvm,
  1500. struct kvm_memory_slot *old,
  1501. struct kvm_memory_slot *invalid_slot)
  1502. {
  1503. /*
  1504. * Mark the current slot INVALID. As with all memslot modifications,
  1505. * this must be done on an unreachable slot to avoid modifying the
  1506. * current slot in the active tree.
  1507. */
  1508. kvm_copy_memslot(invalid_slot, old);
  1509. invalid_slot->flags |= KVM_MEMSLOT_INVALID;
  1510. kvm_replace_memslot(kvm, old, invalid_slot);
  1511. /*
  1512. * Activate the slot that is now marked INVALID, but don't propagate
  1513. * the slot to the now inactive slots. The slot is either going to be
  1514. * deleted or recreated as a new slot.
  1515. */
  1516. kvm_swap_active_memslots(kvm, old->as_id);
  1517. /*
  1518. * From this point no new shadow pages pointing to a deleted, or moved,
  1519. * memslot will be created. Validation of sp->gfn happens in:
  1520. * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
  1521. * - kvm_is_visible_gfn (mmu_check_root)
  1522. */
  1523. kvm_arch_flush_shadow_memslot(kvm, old);
  1524. kvm_arch_guest_memory_reclaimed(kvm);
  1525. /* Was released by kvm_swap_active_memslots(), reacquire. */
  1526. mutex_lock(&kvm->slots_arch_lock);
  1527. /*
  1528. * Copy the arch-specific field of the newly-installed slot back to the
  1529. * old slot as the arch data could have changed between releasing
  1530. * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
  1531. * above. Writers are required to retrieve memslots *after* acquiring
  1532. * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
  1533. */
  1534. old->arch = invalid_slot->arch;
  1535. }
  1536. static void kvm_create_memslot(struct kvm *kvm,
  1537. struct kvm_memory_slot *new)
  1538. {
  1539. /* Add the new memslot to the inactive set and activate. */
  1540. kvm_replace_memslot(kvm, NULL, new);
  1541. kvm_activate_memslot(kvm, NULL, new);
  1542. }
  1543. static void kvm_delete_memslot(struct kvm *kvm,
  1544. struct kvm_memory_slot *old,
  1545. struct kvm_memory_slot *invalid_slot)
  1546. {
  1547. /*
  1548. * Remove the old memslot (in the inactive memslots) by passing NULL as
  1549. * the "new" slot, and for the invalid version in the active slots.
  1550. */
  1551. kvm_replace_memslot(kvm, old, NULL);
  1552. kvm_activate_memslot(kvm, invalid_slot, NULL);
  1553. }
  1554. static void kvm_move_memslot(struct kvm *kvm,
  1555. struct kvm_memory_slot *old,
  1556. struct kvm_memory_slot *new,
  1557. struct kvm_memory_slot *invalid_slot)
  1558. {
  1559. /*
  1560. * Replace the old memslot in the inactive slots, and then swap slots
  1561. * and replace the current INVALID with the new as well.
  1562. */
  1563. kvm_replace_memslot(kvm, old, new);
  1564. kvm_activate_memslot(kvm, invalid_slot, new);
  1565. }
  1566. static void kvm_update_flags_memslot(struct kvm *kvm,
  1567. struct kvm_memory_slot *old,
  1568. struct kvm_memory_slot *new)
  1569. {
  1570. /*
  1571. * Similar to the MOVE case, but the slot doesn't need to be zapped as
  1572. * an intermediate step. Instead, the old memslot is simply replaced
  1573. * with a new, updated copy in both memslot sets.
  1574. */
  1575. kvm_replace_memslot(kvm, old, new);
  1576. kvm_activate_memslot(kvm, old, new);
  1577. }
  1578. static int kvm_set_memslot(struct kvm *kvm,
  1579. struct kvm_memory_slot *old,
  1580. struct kvm_memory_slot *new,
  1581. enum kvm_mr_change change)
  1582. {
  1583. struct kvm_memory_slot *invalid_slot;
  1584. int r;
  1585. /*
  1586. * Released in kvm_swap_active_memslots().
  1587. *
  1588. * Must be held from before the current memslots are copied until after
  1589. * the new memslots are installed with rcu_assign_pointer, then
  1590. * released before the synchronize srcu in kvm_swap_active_memslots().
  1591. *
  1592. * When modifying memslots outside of the slots_lock, must be held
  1593. * before reading the pointer to the current memslots until after all
  1594. * changes to those memslots are complete.
  1595. *
  1596. * These rules ensure that installing new memslots does not lose
  1597. * changes made to the previous memslots.
  1598. */
  1599. mutex_lock(&kvm->slots_arch_lock);
  1600. /*
  1601. * Invalidate the old slot if it's being deleted or moved. This is
  1602. * done prior to actually deleting/moving the memslot to allow vCPUs to
  1603. * continue running by ensuring there are no mappings or shadow pages
  1604. * for the memslot when it is deleted/moved. Without pre-invalidation
  1605. * (and without a lock), a window would exist between effecting the
  1606. * delete/move and committing the changes in arch code where KVM or a
  1607. * guest could access a non-existent memslot.
  1608. *
  1609. * Modifications are done on a temporary, unreachable slot. The old
  1610. * slot needs to be preserved in case a later step fails and the
  1611. * invalidation needs to be reverted.
  1612. */
  1613. if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
  1614. invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
  1615. if (!invalid_slot) {
  1616. mutex_unlock(&kvm->slots_arch_lock);
  1617. return -ENOMEM;
  1618. }
  1619. kvm_invalidate_memslot(kvm, old, invalid_slot);
  1620. }
  1621. r = kvm_prepare_memory_region(kvm, old, new, change);
  1622. if (r) {
  1623. /*
  1624. * For DELETE/MOVE, revert the above INVALID change. No
  1625. * modifications required since the original slot was preserved
  1626. * in the inactive slots. Changing the active memslots also
1627. * releases slots_arch_lock.
  1628. */
  1629. if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
  1630. kvm_activate_memslot(kvm, invalid_slot, old);
  1631. kfree(invalid_slot);
  1632. } else {
  1633. mutex_unlock(&kvm->slots_arch_lock);
  1634. }
  1635. return r;
  1636. }
  1637. /*
1638. * For DELETE and MOVE, the working slot (invalid_slot) is now active as
1639. * the INVALID version of the old slot. MOVE is particularly special as
1640. * it reuses the old slot and keeps a copy of the old slot in invalid_slot.
  1641. * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
  1642. * old slot is detached but otherwise preserved.
  1643. */
  1644. if (change == KVM_MR_CREATE)
  1645. kvm_create_memslot(kvm, new);
  1646. else if (change == KVM_MR_DELETE)
  1647. kvm_delete_memslot(kvm, old, invalid_slot);
  1648. else if (change == KVM_MR_MOVE)
  1649. kvm_move_memslot(kvm, old, new, invalid_slot);
  1650. else if (change == KVM_MR_FLAGS_ONLY)
  1651. kvm_update_flags_memslot(kvm, old, new);
  1652. else
  1653. BUG();
  1654. /* Free the temporary INVALID slot used for DELETE and MOVE. */
  1655. if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
  1656. kfree(invalid_slot);
  1657. /*
  1658. * No need to refresh new->arch, changes after dropping slots_arch_lock
  1659. * will directly hit the final, active memslot. Architectures are
  1660. * responsible for knowing that new->arch may be stale.
  1661. */
  1662. kvm_commit_memory_region(kvm, old, new, change);
  1663. return 0;
  1664. }
  1665. static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
  1666. gfn_t start, gfn_t end)
  1667. {
  1668. struct kvm_memslot_iter iter;
  1669. kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
  1670. if (iter.slot->id != id)
  1671. return true;
  1672. }
  1673. return false;
  1674. }
  1675. /*
  1676. * Allocate some memory and give it an address in the guest physical address
  1677. * space.
  1678. *
  1679. * Discontiguous memory is allowed, mostly for framebuffers.
  1680. *
  1681. * Must be called holding kvm->slots_lock for write.
  1682. */
  1683. int __kvm_set_memory_region(struct kvm *kvm,
  1684. const struct kvm_userspace_memory_region2 *mem)
  1685. {
  1686. struct kvm_memory_slot *old, *new;
  1687. struct kvm_memslots *slots;
  1688. enum kvm_mr_change change;
  1689. unsigned long npages;
  1690. gfn_t base_gfn;
  1691. int as_id, id;
  1692. int r;
  1693. r = check_memory_region_flags(kvm, mem);
  1694. if (r)
  1695. return r;
  1696. as_id = mem->slot >> 16;
  1697. id = (u16)mem->slot;
  1698. /* General sanity checks */
  1699. if ((mem->memory_size & (PAGE_SIZE - 1)) ||
  1700. (mem->memory_size != (unsigned long)mem->memory_size))
  1701. return -EINVAL;
  1702. if (mem->guest_phys_addr & (PAGE_SIZE - 1))
  1703. return -EINVAL;
  1704. /* We can read the guest memory with __xxx_user() later on. */
  1705. if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
  1706. (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
  1707. !access_ok((void __user *)(unsigned long)mem->userspace_addr,
  1708. mem->memory_size))
  1709. return -EINVAL;
  1710. if (mem->flags & KVM_MEM_GUEST_MEMFD &&
  1711. (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
  1712. mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
  1713. return -EINVAL;
  1714. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
  1715. return -EINVAL;
  1716. if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
  1717. return -EINVAL;
  1718. if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
  1719. return -EINVAL;
  1720. slots = __kvm_memslots(kvm, as_id);
  1721. /*
  1722. * Note, the old memslot (and the pointer itself!) may be invalidated
  1723. * and/or destroyed by kvm_set_memslot().
  1724. */
  1725. old = id_to_memslot(slots, id);
  1726. if (!mem->memory_size) {
  1727. if (!old || !old->npages)
  1728. return -EINVAL;
  1729. if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
  1730. return -EIO;
  1731. return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
  1732. }
  1733. base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
  1734. npages = (mem->memory_size >> PAGE_SHIFT);
  1735. if (!old || !old->npages) {
  1736. change = KVM_MR_CREATE;
  1737. /*
  1738. * To simplify KVM internals, the total number of pages across
  1739. * all memslots must fit in an unsigned long.
  1740. */
  1741. if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
  1742. return -EINVAL;
  1743. } else { /* Modify an existing slot. */
  1744. /* Private memslots are immutable, they can only be deleted. */
  1745. if (mem->flags & KVM_MEM_GUEST_MEMFD)
  1746. return -EINVAL;
  1747. if ((mem->userspace_addr != old->userspace_addr) ||
  1748. (npages != old->npages) ||
  1749. ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
  1750. return -EINVAL;
  1751. if (base_gfn != old->base_gfn)
  1752. change = KVM_MR_MOVE;
  1753. else if (mem->flags != old->flags)
  1754. change = KVM_MR_FLAGS_ONLY;
  1755. else /* Nothing to change. */
  1756. return 0;
  1757. }
  1758. if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
  1759. kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
  1760. return -EEXIST;
  1761. /* Allocate a slot that will persist in the memslot. */
  1762. new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
  1763. if (!new)
  1764. return -ENOMEM;
  1765. new->as_id = as_id;
  1766. new->id = id;
  1767. new->base_gfn = base_gfn;
  1768. new->npages = npages;
  1769. new->flags = mem->flags;
  1770. new->userspace_addr = mem->userspace_addr;
  1771. if (mem->flags & KVM_MEM_GUEST_MEMFD) {
  1772. r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
  1773. if (r)
  1774. goto out;
  1775. }
  1776. r = kvm_set_memslot(kvm, old, new, change);
  1777. if (r)
  1778. goto out_unbind;
  1779. return 0;
  1780. out_unbind:
  1781. if (mem->flags & KVM_MEM_GUEST_MEMFD)
  1782. kvm_gmem_unbind(new);
  1783. out:
  1784. kfree(new);
  1785. return r;
  1786. }
  1787. EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
  1788. int kvm_set_memory_region(struct kvm *kvm,
  1789. const struct kvm_userspace_memory_region2 *mem)
  1790. {
  1791. int r;
  1792. mutex_lock(&kvm->slots_lock);
  1793. r = __kvm_set_memory_region(kvm, mem);
  1794. mutex_unlock(&kvm->slots_lock);
  1795. return r;
  1796. }
  1797. EXPORT_SYMBOL_GPL(kvm_set_memory_region);
  1798. static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
  1799. struct kvm_userspace_memory_region2 *mem)
  1800. {
  1801. if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
  1802. return -EINVAL;
  1803. return kvm_set_memory_region(kvm, mem);
  1804. }
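/*
 * Illustrative sketch, not part of kvm_main.c: the user-space side of the
 * ioctl path above.  A VMM mmap()s anonymous memory and registers it as
 * guest physical memory with KVM_SET_USER_MEMORY_REGION on the VM fd (the
 * "2" variant adds the guest_memfd fields handled in this file).  Assumes
 * <linux/kvm.h>, <sys/ioctl.h> and <sys/mman.h>; vm_fd comes from
 * KVM_CREATE_VM, the slot number 0 is arbitrary, and the helper name is
 * hypothetical.
 */
static int add_memslot_sketch(int vm_fd, __u64 gpa, __u64 size)
{
	void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.flags = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = (unsigned long)host,
	};

	if (host == MAP_FAILED)
		return -1;
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}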
  1805. #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  1806. /**
  1807. * kvm_get_dirty_log - get a snapshot of dirty pages
  1808. * @kvm: pointer to kvm instance
  1809. * @log: slot id and address to which we copy the log
  1810. * @is_dirty: set to '1' if any dirty pages were found
  1811. * @memslot: set to the associated memslot, always valid on success
  1812. */
  1813. int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
  1814. int *is_dirty, struct kvm_memory_slot **memslot)
  1815. {
  1816. struct kvm_memslots *slots;
  1817. int i, as_id, id;
  1818. unsigned long n;
  1819. unsigned long any = 0;
  1820. /* Dirty ring tracking may be exclusive to dirty log tracking */
  1821. if (!kvm_use_dirty_bitmap(kvm))
  1822. return -ENXIO;
  1823. *memslot = NULL;
  1824. *is_dirty = 0;
  1825. as_id = log->slot >> 16;
  1826. id = (u16)log->slot;
  1827. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  1828. return -EINVAL;
  1829. slots = __kvm_memslots(kvm, as_id);
  1830. *memslot = id_to_memslot(slots, id);
  1831. if (!(*memslot) || !(*memslot)->dirty_bitmap)
  1832. return -ENOENT;
  1833. kvm_arch_sync_dirty_log(kvm, *memslot);
  1834. n = kvm_dirty_bitmap_bytes(*memslot);
  1835. for (i = 0; !any && i < n/sizeof(long); ++i)
  1836. any = (*memslot)->dirty_bitmap[i];
  1837. if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
  1838. return -EFAULT;
  1839. if (any)
  1840. *is_dirty = 1;
  1841. return 0;
  1842. }
  1843. EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
  1844. #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
  1845. /**
  1846. * kvm_get_dirty_log_protect - get a snapshot of dirty pages
  1847. * and reenable dirty page tracking for the corresponding pages.
  1848. * @kvm: pointer to kvm instance
  1849. * @log: slot id and address to which we copy the log
  1850. *
1851. * Keep in mind that vCPU threads can write to the bitmap
1852. * concurrently. So, to avoid losing track of dirty pages, we keep the
  1853. * following order:
  1854. *
  1855. * 1. Take a snapshot of the bit and clear it if needed.
  1856. * 2. Write protect the corresponding page.
  1857. * 3. Copy the snapshot to the userspace.
  1858. * 4. Upon return caller flushes TLB's if needed.
  1859. *
  1860. * Between 2 and 4, the guest may write to the page using the remaining TLB
  1861. * entry. This is not a problem because the page is reported dirty using
  1862. * the snapshot taken before and step 4 ensures that writes done after
  1863. * exiting to userspace will be logged for the next call.
  1864. *
  1865. */
  1866. static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
  1867. {
  1868. struct kvm_memslots *slots;
  1869. struct kvm_memory_slot *memslot;
  1870. int i, as_id, id;
  1871. unsigned long n;
  1872. unsigned long *dirty_bitmap;
  1873. unsigned long *dirty_bitmap_buffer;
  1874. bool flush;
  1875. /* Dirty ring tracking may be exclusive to dirty log tracking */
  1876. if (!kvm_use_dirty_bitmap(kvm))
  1877. return -ENXIO;
  1878. as_id = log->slot >> 16;
  1879. id = (u16)log->slot;
  1880. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  1881. return -EINVAL;
  1882. slots = __kvm_memslots(kvm, as_id);
  1883. memslot = id_to_memslot(slots, id);
  1884. if (!memslot || !memslot->dirty_bitmap)
  1885. return -ENOENT;
  1886. dirty_bitmap = memslot->dirty_bitmap;
  1887. kvm_arch_sync_dirty_log(kvm, memslot);
  1888. n = kvm_dirty_bitmap_bytes(memslot);
  1889. flush = false;
  1890. if (kvm->manual_dirty_log_protect) {
  1891. /*
1892. * Unlike kvm_get_dirty_log, we always leave flush set to false here,
1893. * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
1894. * is some code duplication between this function and
1895. * kvm_get_dirty_log, but hopefully all architectures
1896. * transition to kvm_get_dirty_log_protect so that kvm_get_dirty_log
1897. * can be eliminated.
  1898. */
  1899. dirty_bitmap_buffer = dirty_bitmap;
  1900. } else {
  1901. dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
  1902. memset(dirty_bitmap_buffer, 0, n);
  1903. KVM_MMU_LOCK(kvm);
  1904. for (i = 0; i < n / sizeof(long); i++) {
  1905. unsigned long mask;
  1906. gfn_t offset;
  1907. if (!dirty_bitmap[i])
  1908. continue;
  1909. flush = true;
  1910. mask = xchg(&dirty_bitmap[i], 0);
  1911. dirty_bitmap_buffer[i] = mask;
  1912. offset = i * BITS_PER_LONG;
  1913. kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
  1914. offset, mask);
  1915. }
  1916. KVM_MMU_UNLOCK(kvm);
  1917. }
  1918. if (flush)
  1919. kvm_flush_remote_tlbs_memslot(kvm, memslot);
  1920. if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
  1921. return -EFAULT;
  1922. return 0;
  1923. }
  1924. /**
  1925. * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
  1926. * @kvm: kvm instance
  1927. * @log: slot id and address to which we copy the log
  1928. *
1929. * Steps 1-4 below provide a general overview of dirty page logging. See
1930. * the kvm_get_dirty_log_protect() function description for additional details.
1931. *
1932. * We call kvm_get_dirty_log_protect() to handle steps 1-3; upon return we
1933. * always flush the TLB (step 4) even if a previous step failed and the dirty
1934. * bitmap may be corrupt. Regardless of the previous outcome, the KVM logging
1935. * API does not preclude a subsequent dirty log read by user space. Flushing
1936. * the TLB ensures writes will be marked dirty for the next log read.
  1937. *
  1938. * 1. Take a snapshot of the bit and clear it if needed.
  1939. * 2. Write protect the corresponding page.
  1940. * 3. Copy the snapshot to the userspace.
  1941. * 4. Flush TLB's if needed.
  1942. */
  1943. static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
  1944. struct kvm_dirty_log *log)
  1945. {
  1946. int r;
  1947. mutex_lock(&kvm->slots_lock);
  1948. r = kvm_get_dirty_log_protect(kvm, log);
  1949. mutex_unlock(&kvm->slots_lock);
  1950. return r;
  1951. }
  1952. /**
  1953. * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
  1954. * and reenable dirty page tracking for the corresponding pages.
  1955. * @kvm: pointer to kvm instance
  1956. * @log: slot id and address from which to fetch the bitmap of dirty pages
  1957. */
  1958. static int kvm_clear_dirty_log_protect(struct kvm *kvm,
  1959. struct kvm_clear_dirty_log *log)
  1960. {
  1961. struct kvm_memslots *slots;
  1962. struct kvm_memory_slot *memslot;
  1963. int as_id, id;
  1964. gfn_t offset;
  1965. unsigned long i, n;
  1966. unsigned long *dirty_bitmap;
  1967. unsigned long *dirty_bitmap_buffer;
  1968. bool flush;
  1969. /* Dirty ring tracking may be exclusive to dirty log tracking */
  1970. if (!kvm_use_dirty_bitmap(kvm))
  1971. return -ENXIO;
  1972. as_id = log->slot >> 16;
  1973. id = (u16)log->slot;
  1974. if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
  1975. return -EINVAL;
  1976. if (log->first_page & 63)
  1977. return -EINVAL;
  1978. slots = __kvm_memslots(kvm, as_id);
  1979. memslot = id_to_memslot(slots, id);
  1980. if (!memslot || !memslot->dirty_bitmap)
  1981. return -ENOENT;
  1982. dirty_bitmap = memslot->dirty_bitmap;
  1983. n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
  1984. if (log->first_page > memslot->npages ||
  1985. log->num_pages > memslot->npages - log->first_page ||
  1986. (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
  1987. return -EINVAL;
  1988. kvm_arch_sync_dirty_log(kvm, memslot);
  1989. flush = false;
  1990. dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
  1991. if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
  1992. return -EFAULT;
  1993. KVM_MMU_LOCK(kvm);
  1994. for (offset = log->first_page, i = offset / BITS_PER_LONG,
  1995. n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
  1996. i++, offset += BITS_PER_LONG) {
  1997. unsigned long mask = *dirty_bitmap_buffer++;
  1998. atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
  1999. if (!mask)
  2000. continue;
  2001. mask &= atomic_long_fetch_andnot(mask, p);
  2002. /*
  2003. * mask contains the bits that really have been cleared. This
  2004. * never includes any bits beyond the length of the memslot (if
  2005. * the length is not aligned to 64 pages), therefore it is not
  2006. * a problem if userspace sets them in log->dirty_bitmap.
  2007. */
  2008. if (mask) {
  2009. flush = true;
  2010. kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
  2011. offset, mask);
  2012. }
  2013. }
  2014. KVM_MMU_UNLOCK(kvm);
  2015. if (flush)
  2016. kvm_flush_remote_tlbs_memslot(kvm, memslot);
  2017. return 0;
  2018. }
  2019. static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
  2020. struct kvm_clear_dirty_log *log)
  2021. {
  2022. int r;
  2023. mutex_lock(&kvm->slots_lock);
  2024. r = kvm_clear_dirty_log_protect(kvm, log);
  2025. mutex_unlock(&kvm->slots_lock);
  2026. return r;
  2027. }
  2028. #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
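/*
 * Illustrative sketch, not part of kvm_main.c: the user-space side of the
 * dirty-log ioctls above.  KVM_GET_DIRTY_LOG snapshots the bitmap for one
 * slot; with the manual-protect capability enabled, KVM_CLEAR_DIRTY_LOG then
 * re-protects only the pages the VMM has processed.  Per the checks above,
 * first_page must be 64-aligned and num_pages a multiple of 64 unless the
 * range runs to the end of the slot.  Assumes <linux/kvm.h> and
 * <sys/ioctl.h>; the helper name is hypothetical.
 */
static int harvest_dirty_pages_sketch(int vm_fd, __u32 slot, void *bitmap,
				      __u64 first_page, __u32 num_pages)
{
	struct kvm_dirty_log get = {
		.slot = slot,
		.dirty_bitmap = bitmap,
	};
	struct kvm_clear_dirty_log clear = {
		.slot = slot,
		.first_page = first_page,
		.num_pages = num_pages,
		.dirty_bitmap = bitmap,
	};

	if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &get))
		return -1;

	/* ... walk the bitmap and copy/transmit the dirty pages ... */

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}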
  2029. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  2030. static u64 kvm_supported_mem_attributes(struct kvm *kvm)
  2031. {
  2032. if (!kvm || kvm_arch_has_private_mem(kvm))
  2033. return KVM_MEMORY_ATTRIBUTE_PRIVATE;
  2034. return 0;
  2035. }
  2036. /*
  2037. * Returns true if _all_ gfns in the range [@start, @end) have attributes
  2038. * such that the bits in @mask match @attrs.
  2039. */
  2040. bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
  2041. unsigned long mask, unsigned long attrs)
  2042. {
  2043. XA_STATE(xas, &kvm->mem_attr_array, start);
  2044. unsigned long index;
  2045. void *entry;
  2046. mask &= kvm_supported_mem_attributes(kvm);
  2047. if (attrs & ~mask)
  2048. return false;
  2049. if (end == start + 1)
  2050. return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
  2051. guard(rcu)();
  2052. if (!attrs)
  2053. return !xas_find(&xas, end - 1);
  2054. for (index = start; index < end; index++) {
  2055. do {
  2056. entry = xas_next(&xas);
  2057. } while (xas_retry(&xas, entry));
  2058. if (xas.xa_index != index ||
  2059. (xa_to_value(entry) & mask) != attrs)
  2060. return false;
  2061. }
  2062. return true;
  2063. }
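/*
 * Illustrative sketch, not part of kvm_main.c: the single-gfn lookup used by
 * the fast path above, roughly what kvm_get_memory_attributes() amounts to.
 * Attributes are stored as xarray values keyed by gfn; a missing entry reads
 * back as 0, i.e. no attributes set.
 */
static unsigned long get_memory_attributes_sketch(struct kvm *kvm, gfn_t gfn)
{
	return xa_to_value(xa_load(&kvm->mem_attr_array, gfn));
}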
  2064. static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
  2065. struct kvm_mmu_notifier_range *range)
  2066. {
  2067. struct kvm_gfn_range gfn_range;
  2068. struct kvm_memory_slot *slot;
  2069. struct kvm_memslots *slots;
  2070. struct kvm_memslot_iter iter;
  2071. bool found_memslot = false;
  2072. bool ret = false;
  2073. int i;
  2074. gfn_range.arg = range->arg;
  2075. gfn_range.may_block = range->may_block;
  2076. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  2077. slots = __kvm_memslots(kvm, i);
  2078. kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
  2079. slot = iter.slot;
  2080. gfn_range.slot = slot;
  2081. gfn_range.start = max(range->start, slot->base_gfn);
  2082. gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
  2083. if (gfn_range.start >= gfn_range.end)
  2084. continue;
  2085. if (!found_memslot) {
  2086. found_memslot = true;
  2087. KVM_MMU_LOCK(kvm);
  2088. if (!IS_KVM_NULL_FN(range->on_lock))
  2089. range->on_lock(kvm);
  2090. }
  2091. ret |= range->handler(kvm, &gfn_range);
  2092. }
  2093. }
  2094. if (range->flush_on_ret && ret)
  2095. kvm_flush_remote_tlbs(kvm);
  2096. if (found_memslot)
  2097. KVM_MMU_UNLOCK(kvm);
  2098. }
  2099. static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
  2100. struct kvm_gfn_range *range)
  2101. {
  2102. /*
  2103. * Unconditionally add the range to the invalidation set, regardless of
  2104. * whether or not the arch callback actually needs to zap SPTEs. E.g.
  2105. * if KVM supports RWX attributes in the future and the attributes are
  2106. * going from R=>RW, zapping isn't strictly necessary. Unconditionally
  2107. * adding the range allows KVM to require that MMU invalidations add at
  2108. * least one range between begin() and end(), e.g. allows KVM to detect
  2109. * bugs where the add() is missed. Relaxing the rule *might* be safe,
  2110. * but it's not obvious that allowing new mappings while the attributes
  2111. * are in flux is desirable or worth the complexity.
  2112. */
  2113. kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
  2114. return kvm_arch_pre_set_memory_attributes(kvm, range);
  2115. }
  2116. /* Set @attributes for the gfn range [@start, @end). */
  2117. static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
  2118. unsigned long attributes)
  2119. {
  2120. struct kvm_mmu_notifier_range pre_set_range = {
  2121. .start = start,
  2122. .end = end,
  2123. .handler = kvm_pre_set_memory_attributes,
  2124. .on_lock = kvm_mmu_invalidate_begin,
  2125. .flush_on_ret = true,
  2126. .may_block = true,
  2127. };
  2128. struct kvm_mmu_notifier_range post_set_range = {
  2129. .start = start,
  2130. .end = end,
  2131. .arg.attributes = attributes,
  2132. .handler = kvm_arch_post_set_memory_attributes,
  2133. .on_lock = kvm_mmu_invalidate_end,
  2134. .may_block = true,
  2135. };
  2136. unsigned long i;
  2137. void *entry;
  2138. int r = 0;
  2139. entry = attributes ? xa_mk_value(attributes) : NULL;
  2140. mutex_lock(&kvm->slots_lock);
2141. /* Nothing to do if the entire range already has the desired attributes. */
  2142. if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
  2143. goto out_unlock;
  2144. /*
  2145. * Reserve memory ahead of time to avoid having to deal with failures
  2146. * partway through setting the new attributes.
  2147. */
  2148. for (i = start; i < end; i++) {
  2149. r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
  2150. if (r)
  2151. goto out_unlock;
  2152. }
  2153. kvm_handle_gfn_range(kvm, &pre_set_range);
  2154. for (i = start; i < end; i++) {
  2155. r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
  2156. GFP_KERNEL_ACCOUNT));
  2157. KVM_BUG_ON(r, kvm);
  2158. }
  2159. kvm_handle_gfn_range(kvm, &post_set_range);
  2160. out_unlock:
  2161. mutex_unlock(&kvm->slots_lock);
  2162. return r;
  2163. }
  2164. static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
  2165. struct kvm_memory_attributes *attrs)
  2166. {
  2167. gfn_t start, end;
  2168. /* flags is currently not used. */
  2169. if (attrs->flags)
  2170. return -EINVAL;
  2171. if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
  2172. return -EINVAL;
  2173. if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
  2174. return -EINVAL;
  2175. if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
  2176. return -EINVAL;
  2177. start = attrs->address >> PAGE_SHIFT;
  2178. end = (attrs->address + attrs->size) >> PAGE_SHIFT;
  2179. /*
  2180. * xarray tracks data using "unsigned long", and as a result so does
2181. * KVM. For simplicity, generic attributes are supported only on 64-bit
  2182. * architectures.
  2183. */
  2184. BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
  2185. return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
  2186. }
  2187. #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
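/*
 * Illustrative sketch, not part of kvm_main.c: how user space drives
 * kvm_vm_ioctl_set_mem_attributes() above.  For a VM type with private
 * memory, marking a page-aligned GPA range private is a single ioctl on the
 * VM fd.  Assumes <linux/kvm.h> and <sys/ioctl.h>; the helper name is
 * hypothetical.
 */
static int set_range_private_sketch(int vm_fd, __u64 gpa, __u64 size)
{
	struct kvm_memory_attributes attrs = {
		.address = gpa,		/* must be page-aligned */
		.size = size,		/* must be page-aligned and non-zero */
		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
		.flags = 0,		/* must be zero, see the check above */
	};

	return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
}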
  2188. struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
  2189. {
  2190. return __gfn_to_memslot(kvm_memslots(kvm), gfn);
  2191. }
  2192. EXPORT_SYMBOL_GPL(gfn_to_memslot);
  2193. struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
  2194. {
  2195. struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
  2196. u64 gen = slots->generation;
  2197. struct kvm_memory_slot *slot;
  2198. /*
  2199. * This also protects against using a memslot from a different address space,
  2200. * since different address spaces have different generation numbers.
  2201. */
  2202. if (unlikely(gen != vcpu->last_used_slot_gen)) {
  2203. vcpu->last_used_slot = NULL;
  2204. vcpu->last_used_slot_gen = gen;
  2205. }
  2206. slot = try_get_memslot(vcpu->last_used_slot, gfn);
  2207. if (slot)
  2208. return slot;
  2209. /*
  2210. * Fall back to searching all memslots. We purposely use
  2211. * search_memslots() instead of __gfn_to_memslot() to avoid
  2212. * thrashing the VM-wide last_used_slot in kvm_memslots.
  2213. */
  2214. slot = search_memslots(slots, gfn, false);
  2215. if (slot) {
  2216. vcpu->last_used_slot = slot;
  2217. return slot;
  2218. }
  2219. return NULL;
  2220. }
  2221. bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
  2222. {
  2223. struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
  2224. return kvm_is_visible_memslot(memslot);
  2225. }
  2226. EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
  2227. bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
  2228. {
  2229. struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2230. return kvm_is_visible_memslot(memslot);
  2231. }
  2232. EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
  2233. unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
  2234. {
  2235. struct vm_area_struct *vma;
  2236. unsigned long addr, size;
  2237. size = PAGE_SIZE;
  2238. addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
  2239. if (kvm_is_error_hva(addr))
  2240. return PAGE_SIZE;
  2241. mmap_read_lock(current->mm);
  2242. vma = find_vma(current->mm, addr);
  2243. if (!vma)
  2244. goto out;
  2245. size = vma_kernel_pagesize(vma);
  2246. out:
  2247. mmap_read_unlock(current->mm);
  2248. return size;
  2249. }
  2250. static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
  2251. {
  2252. return slot->flags & KVM_MEM_READONLY;
  2253. }
  2254. static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
  2255. gfn_t *nr_pages, bool write)
  2256. {
  2257. if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
  2258. return KVM_HVA_ERR_BAD;
  2259. if (memslot_is_readonly(slot) && write)
  2260. return KVM_HVA_ERR_RO_BAD;
  2261. if (nr_pages)
  2262. *nr_pages = slot->npages - (gfn - slot->base_gfn);
  2263. return __gfn_to_hva_memslot(slot, gfn);
  2264. }
  2265. static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
  2266. gfn_t *nr_pages)
  2267. {
  2268. return __gfn_to_hva_many(slot, gfn, nr_pages, true);
  2269. }
  2270. unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
  2271. gfn_t gfn)
  2272. {
  2273. return gfn_to_hva_many(slot, gfn, NULL);
  2274. }
  2275. EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
  2276. unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
  2277. {
  2278. return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
  2279. }
  2280. EXPORT_SYMBOL_GPL(gfn_to_hva);
  2281. unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
  2282. {
  2283. return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
  2284. }
  2285. EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
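/*
 * Illustrative sketch (not part of the upstream file): a typical consumer of
 * the gfn->hva helpers above checks for the error sentinel before using the
 * address, and treats the result as a userspace pointer. The helper name
 * below is hypothetical.
 */
#if 0
static int example_read_guest_byte(struct kvm *kvm, gfn_t gfn, u8 *val)
{
	unsigned long hva = gfn_to_hva(kvm, gfn);

	/* gfn_to_hva() returns an error hva if the gfn has no usable memslot. */
	if (kvm_is_error_hva(hva))
		return -EFAULT;

	/* The hva is a userspace address and must be copied, not dereferenced. */
	if (copy_from_user(val, (void __user *)hva, sizeof(*val)))
		return -EFAULT;

	return 0;
}
#endif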
  2286. /*
  2287. * Return the hva of a @gfn and the R/W attribute if possible.
  2288. *
  2289. * @slot: the kvm_memory_slot which contains @gfn
  2290. * @gfn: the gfn to be translated
  2291. * @writable: used to return the read/write attribute of the @slot if the hva
  2292. * is valid and @writable is not NULL
  2293. */
  2294. unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
  2295. gfn_t gfn, bool *writable)
  2296. {
  2297. unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
  2298. if (!kvm_is_error_hva(hva) && writable)
  2299. *writable = !memslot_is_readonly(slot);
  2300. return hva;
  2301. }
  2302. unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
  2303. {
  2304. struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
  2305. return gfn_to_hva_memslot_prot(slot, gfn, writable);
  2306. }
  2307. unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
  2308. {
  2309. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2310. return gfn_to_hva_memslot_prot(slot, gfn, writable);
  2311. }
  2312. static inline int check_user_page_hwpoison(unsigned long addr)
  2313. {
  2314. int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
  2315. rc = get_user_pages(addr, 1, flags, NULL);
  2316. return rc == -EHWPOISON;
  2317. }
  2318. /*
2319. * The fast path to get the writable pfn which will be stored in @pfn;
2320. * true indicates success, otherwise false is returned. It's also the
2321. * only part that can run in atomic context.
  2322. */
  2323. static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
  2324. bool *writable, kvm_pfn_t *pfn)
  2325. {
  2326. struct page *page[1];
  2327. /*
  2328. * Fast pin a writable pfn only if it is a write fault request
  2329. * or the caller allows to map a writable pfn for a read fault
  2330. * request.
  2331. */
  2332. if (!(write_fault || writable))
  2333. return false;
  2334. if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
  2335. *pfn = page_to_pfn(page[0]);
  2336. if (writable)
  2337. *writable = true;
  2338. return true;
  2339. }
  2340. return false;
  2341. }
  2342. /*
2343. * The slow path to get the pfn of the specified host virtual address;
2344. * 1 indicates success, -errno is returned if an error is detected.
  2345. */
  2346. static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
  2347. bool interruptible, bool *writable, kvm_pfn_t *pfn)
  2348. {
  2349. /*
  2350. * When a VCPU accesses a page that is not mapped into the secondary
  2351. * MMU, we lookup the page using GUP to map it, so the guest VCPU can
  2352. * make progress. We always want to honor NUMA hinting faults in that
  2353. * case, because GUP usage corresponds to memory accesses from the VCPU.
  2354. * Otherwise, we'd not trigger NUMA hinting faults once a page is
  2355. * mapped into the secondary MMU and gets accessed by a VCPU.
  2356. *
  2357. * Note that get_user_page_fast_only() and FOLL_WRITE for now
  2358. * implicitly honor NUMA hinting faults and don't need this flag.
  2359. */
  2360. unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT;
  2361. struct page *page;
  2362. int npages;
  2363. might_sleep();
  2364. if (writable)
  2365. *writable = write_fault;
  2366. if (write_fault)
  2367. flags |= FOLL_WRITE;
  2368. if (async)
  2369. flags |= FOLL_NOWAIT;
  2370. if (interruptible)
  2371. flags |= FOLL_INTERRUPTIBLE;
  2372. npages = get_user_pages_unlocked(addr, 1, &page, flags);
  2373. if (npages != 1)
  2374. return npages;
  2375. /* map read fault as writable if possible */
  2376. if (unlikely(!write_fault) && writable) {
  2377. struct page *wpage;
  2378. if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
  2379. *writable = true;
  2380. put_page(page);
  2381. page = wpage;
  2382. }
  2383. }
  2384. *pfn = page_to_pfn(page);
  2385. return npages;
  2386. }
  2387. static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
  2388. {
  2389. if (unlikely(!(vma->vm_flags & VM_READ)))
  2390. return false;
  2391. if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
  2392. return false;
  2393. return true;
  2394. }
  2395. static int kvm_try_get_pfn(kvm_pfn_t pfn)
  2396. {
  2397. struct page *page = kvm_pfn_to_refcounted_page(pfn);
  2398. if (!page)
  2399. return 1;
  2400. return get_page_unless_zero(page);
  2401. }
  2402. static int hva_to_pfn_remapped(struct vm_area_struct *vma,
  2403. unsigned long addr, bool write_fault,
  2404. bool *writable, kvm_pfn_t *p_pfn)
  2405. {
  2406. struct follow_pfnmap_args args = { .vma = vma, .address = addr };
  2407. kvm_pfn_t pfn;
  2408. int r;
  2409. r = follow_pfnmap_start(&args);
  2410. if (r) {
  2411. /*
  2412. * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
  2413. * not call the fault handler, so do it here.
  2414. */
  2415. bool unlocked = false;
  2416. r = fixup_user_fault(current->mm, addr,
  2417. (write_fault ? FAULT_FLAG_WRITE : 0),
  2418. &unlocked);
  2419. if (unlocked)
  2420. return -EAGAIN;
  2421. if (r)
  2422. return r;
  2423. r = follow_pfnmap_start(&args);
  2424. if (r)
  2425. return r;
  2426. }
  2427. if (write_fault && !args.writable) {
  2428. pfn = KVM_PFN_ERR_RO_FAULT;
  2429. goto out;
  2430. }
  2431. if (writable)
  2432. *writable = args.writable;
  2433. pfn = args.pfn;
  2434. /*
  2435. * Get a reference here because callers of *hva_to_pfn* and
  2436. * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
  2437. * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
  2438. * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
  2439. * simply do nothing for reserved pfns.
  2440. *
  2441. * Whoever called remap_pfn_range is also going to call e.g.
  2442. * unmap_mapping_range before the underlying pages are freed,
  2443. * causing a call to our MMU notifier.
  2444. *
  2445. * Certain IO or PFNMAP mappings can be backed with valid
  2446. * struct pages, but be allocated without refcounting e.g.,
  2447. * tail pages of non-compound higher order allocations, which
  2448. * would then underflow the refcount when the caller does the
  2449. * required put_page. Don't allow those pages here.
  2450. */
  2451. if (!kvm_try_get_pfn(pfn))
  2452. r = -EFAULT;
  2453. out:
  2454. follow_pfnmap_end(&args);
  2455. *p_pfn = pfn;
  2456. return r;
  2457. }
  2458. /*
  2459. * Pin guest page in memory and return its pfn.
  2460. * @addr: host virtual address which maps memory to the guest
  2461. * @atomic: whether this function is forbidden from sleeping
  2462. * @interruptible: whether the process can be interrupted by non-fatal signals
2463. * @async: whether this function needs to wait for IO to complete if the
2464. * host page is not in memory
2465. * @write_fault: whether we should get a writable host page
2466. * @writable: whether to allow mapping a writable host page for !@write_fault
  2467. *
  2468. * The function will map a writable host page for these two cases:
  2469. * 1): @write_fault = true
  2470. * 2): @write_fault = false && @writable, @writable will tell the caller
  2471. * whether the mapping is writable.
  2472. */
  2473. kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
  2474. bool *async, bool write_fault, bool *writable)
  2475. {
  2476. struct vm_area_struct *vma;
  2477. kvm_pfn_t pfn;
  2478. int npages, r;
  2479. /* we can do it either atomically or asynchronously, not both */
  2480. BUG_ON(atomic && async);
  2481. if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
  2482. return pfn;
  2483. if (atomic)
  2484. return KVM_PFN_ERR_FAULT;
  2485. npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
  2486. writable, &pfn);
  2487. if (npages == 1)
  2488. return pfn;
  2489. if (npages == -EINTR)
  2490. return KVM_PFN_ERR_SIGPENDING;
  2491. mmap_read_lock(current->mm);
  2492. if (npages == -EHWPOISON ||
  2493. (!async && check_user_page_hwpoison(addr))) {
  2494. pfn = KVM_PFN_ERR_HWPOISON;
  2495. goto exit;
  2496. }
  2497. retry:
  2498. vma = vma_lookup(current->mm, addr);
  2499. if (vma == NULL)
  2500. pfn = KVM_PFN_ERR_FAULT;
  2501. else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
  2502. r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn);
  2503. if (r == -EAGAIN)
  2504. goto retry;
  2505. if (r < 0)
  2506. pfn = KVM_PFN_ERR_FAULT;
  2507. } else {
  2508. if (async && vma_is_valid(vma, write_fault))
  2509. *async = true;
  2510. pfn = KVM_PFN_ERR_FAULT;
  2511. }
  2512. exit:
  2513. mmap_read_unlock(current->mm);
  2514. return pfn;
  2515. }
  2516. kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
  2517. bool atomic, bool interruptible, bool *async,
  2518. bool write_fault, bool *writable, hva_t *hva)
  2519. {
  2520. unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
  2521. if (hva)
  2522. *hva = addr;
  2523. if (kvm_is_error_hva(addr)) {
  2524. if (writable)
  2525. *writable = false;
  2526. return addr == KVM_HVA_ERR_RO_BAD ? KVM_PFN_ERR_RO_FAULT :
  2527. KVM_PFN_NOSLOT;
  2528. }
  2529. /* Do not map writable pfn in the readonly memslot. */
  2530. if (writable && memslot_is_readonly(slot)) {
  2531. *writable = false;
  2532. writable = NULL;
  2533. }
  2534. return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
  2535. writable);
  2536. }
  2537. EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
  2538. kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
  2539. bool *writable)
  2540. {
  2541. return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
  2542. NULL, write_fault, writable, NULL);
  2543. }
  2544. EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
  2545. kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
  2546. {
  2547. return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
  2548. NULL, NULL);
  2549. }
  2550. EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
  2551. kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
  2552. {
  2553. return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
  2554. NULL, NULL);
  2555. }
  2556. EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
  2557. kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
  2558. {
  2559. return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
  2560. }
  2561. EXPORT_SYMBOL_GPL(gfn_to_pfn);
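/*
 * Illustrative sketch (not part of the upstream file): callers of the
 * gfn->pfn helpers above must check for error/noslot pfns and drop the
 * reference a successful lookup takes. The helper name is hypothetical.
 */
#if 0
static void example_lookup_pfn(struct kvm *kvm, gfn_t gfn)
{
	bool writable;
	kvm_pfn_t pfn;

	/* Read access; @writable reports whether the mapping is also writable. */
	pfn = gfn_to_pfn_prot(kvm, gfn, false, &writable);
	if (is_error_noslot_pfn(pfn))
		return;

	pr_debug("gfn 0x%llx -> pfn 0x%llx (writable=%d)\n", gfn, pfn, writable);

	/* Drop the reference taken by the lookup; nothing was written. */
	kvm_release_pfn_clean(pfn);
}
#endif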
  2562. int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
  2563. struct page **pages, int nr_pages)
  2564. {
  2565. unsigned long addr;
  2566. gfn_t entry = 0;
  2567. addr = gfn_to_hva_many(slot, gfn, &entry);
  2568. if (kvm_is_error_hva(addr))
  2569. return -1;
  2570. if (entry < nr_pages)
  2571. return 0;
  2572. return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
  2573. }
  2574. EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
  2575. /*
  2576. * Do not use this helper unless you are absolutely certain the gfn _must_ be
  2577. * backed by 'struct page'. A valid example is if the backing memslot is
2578. * controlled by KVM. Note, if the returned page is valid, its refcount has
  2579. * been elevated by gfn_to_pfn().
  2580. */
  2581. struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
  2582. {
  2583. struct page *page;
  2584. kvm_pfn_t pfn;
  2585. pfn = gfn_to_pfn(kvm, gfn);
  2586. if (is_error_noslot_pfn(pfn))
  2587. return KVM_ERR_PTR_BAD_PAGE;
  2588. page = kvm_pfn_to_refcounted_page(pfn);
  2589. if (!page)
  2590. return KVM_ERR_PTR_BAD_PAGE;
  2591. return page;
  2592. }
  2593. EXPORT_SYMBOL_GPL(gfn_to_page);
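/*
 * Illustrative sketch (not part of the upstream file): gfn_to_page() hands
 * back an elevated reference, so the caller pairs it with
 * kvm_release_page_clean/dirty(). The helper name is hypothetical; for the
 * particular job shown here kvm_clear_guest() below would normally be used,
 * the point is the reference and dirty handling.
 */
#if 0
static int example_zero_guest_page(struct kvm *kvm, gfn_t gfn)
{
	struct page *page = gfn_to_page(kvm, gfn);
	void *hva;

	if (is_error_page(page))
		return -EFAULT;

	/* Map the page into the kernel, clear it, then drop the mapping. */
	hva = kmap_local_page(page);
	memset(hva, 0, PAGE_SIZE);
	kunmap_local(hva);

	/* Record the write in the dirty log, then drop the reference. */
	mark_page_dirty(kvm, gfn);
	kvm_release_page_dirty(page);
	return 0;
}
#endif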
  2594. void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
  2595. {
  2596. if (dirty)
  2597. kvm_release_pfn_dirty(pfn);
  2598. else
  2599. kvm_release_pfn_clean(pfn);
  2600. }
  2601. int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
  2602. {
  2603. kvm_pfn_t pfn;
  2604. void *hva = NULL;
  2605. struct page *page = KVM_UNMAPPED_PAGE;
  2606. if (!map)
  2607. return -EINVAL;
  2608. pfn = gfn_to_pfn(vcpu->kvm, gfn);
  2609. if (is_error_noslot_pfn(pfn))
  2610. return -EINVAL;
  2611. if (pfn_valid(pfn)) {
  2612. page = pfn_to_page(pfn);
  2613. hva = kmap(page);
  2614. #ifdef CONFIG_HAS_IOMEM
  2615. } else {
  2616. hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
  2617. #endif
  2618. }
  2619. if (!hva)
  2620. return -EFAULT;
  2621. map->page = page;
  2622. map->hva = hva;
  2623. map->pfn = pfn;
  2624. map->gfn = gfn;
  2625. return 0;
  2626. }
  2627. EXPORT_SYMBOL_GPL(kvm_vcpu_map);
  2628. void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
  2629. {
  2630. if (!map)
  2631. return;
  2632. if (!map->hva)
  2633. return;
  2634. if (map->page != KVM_UNMAPPED_PAGE)
  2635. kunmap(map->page);
  2636. #ifdef CONFIG_HAS_IOMEM
  2637. else
  2638. memunmap(map->hva);
  2639. #endif
  2640. if (dirty)
  2641. kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
  2642. kvm_release_pfn(map->pfn, dirty);
  2643. map->hva = NULL;
  2644. map->page = NULL;
  2645. }
  2646. EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
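/*
 * Illustrative sketch (not part of the upstream file): the map/unmap API
 * above gives short-lived kernel access to a guest page whether or not it is
 * backed by struct page. The helper name and the value written are
 * hypothetical.
 */
#if 0
static int example_poke_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_host_map map;
	int r;

	r = kvm_vcpu_map(vcpu, gfn, &map);
	if (r)
		return r;

	/* map.hva is a kernel mapping of the guest page. */
	*(u32 *)map.hva = 0xdeadbeef;

	/* dirty=true marks the gfn dirty and releases the pfn as dirty. */
	kvm_vcpu_unmap(vcpu, &map, true);
	return 0;
}
#endif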
  2647. static bool kvm_is_ad_tracked_page(struct page *page)
  2648. {
  2649. /*
  2650. * Per page-flags.h, pages tagged PG_reserved "should in general not be
  2651. * touched (e.g. set dirty) except by its owner".
  2652. */
  2653. return !PageReserved(page);
  2654. }
  2655. static void kvm_set_page_dirty(struct page *page)
  2656. {
  2657. if (kvm_is_ad_tracked_page(page))
  2658. SetPageDirty(page);
  2659. }
  2660. static void kvm_set_page_accessed(struct page *page)
  2661. {
  2662. if (kvm_is_ad_tracked_page(page))
  2663. mark_page_accessed(page);
  2664. }
  2665. void kvm_release_page_clean(struct page *page)
  2666. {
  2667. WARN_ON(is_error_page(page));
  2668. kvm_set_page_accessed(page);
  2669. put_page(page);
  2670. }
  2671. EXPORT_SYMBOL_GPL(kvm_release_page_clean);
  2672. void kvm_release_pfn_clean(kvm_pfn_t pfn)
  2673. {
  2674. struct page *page;
  2675. if (is_error_noslot_pfn(pfn))
  2676. return;
  2677. page = kvm_pfn_to_refcounted_page(pfn);
  2678. if (!page)
  2679. return;
  2680. kvm_release_page_clean(page);
  2681. }
  2682. EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
  2683. void kvm_release_page_dirty(struct page *page)
  2684. {
  2685. WARN_ON(is_error_page(page));
  2686. kvm_set_page_dirty(page);
  2687. kvm_release_page_clean(page);
  2688. }
  2689. EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
  2690. void kvm_release_pfn_dirty(kvm_pfn_t pfn)
  2691. {
  2692. struct page *page;
  2693. if (is_error_noslot_pfn(pfn))
  2694. return;
  2695. page = kvm_pfn_to_refcounted_page(pfn);
  2696. if (!page)
  2697. return;
  2698. kvm_release_page_dirty(page);
  2699. }
  2700. EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
  2701. /*
  2702. * Note, checking for an error/noslot pfn is the caller's responsibility when
  2703. * directly marking a page dirty/accessed. Unlike the "release" helpers, the
  2704. * "set" helpers are not to be used when the pfn might point at garbage.
  2705. */
  2706. void kvm_set_pfn_dirty(kvm_pfn_t pfn)
  2707. {
  2708. if (WARN_ON(is_error_noslot_pfn(pfn)))
  2709. return;
  2710. if (pfn_valid(pfn))
  2711. kvm_set_page_dirty(pfn_to_page(pfn));
  2712. }
  2713. EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
  2714. void kvm_set_pfn_accessed(kvm_pfn_t pfn)
  2715. {
  2716. if (WARN_ON(is_error_noslot_pfn(pfn)))
  2717. return;
  2718. if (pfn_valid(pfn))
  2719. kvm_set_page_accessed(pfn_to_page(pfn));
  2720. }
  2721. EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
  2722. static int next_segment(unsigned long len, int offset)
  2723. {
  2724. if (len > PAGE_SIZE - offset)
  2725. return PAGE_SIZE - offset;
  2726. else
  2727. return len;
  2728. }
  2729. /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
  2730. static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
  2731. void *data, int offset, int len)
  2732. {
  2733. int r;
  2734. unsigned long addr;
  2735. if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
  2736. return -EFAULT;
  2737. addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
  2738. if (kvm_is_error_hva(addr))
  2739. return -EFAULT;
  2740. r = __copy_from_user(data, (void __user *)addr + offset, len);
  2741. if (r)
  2742. return -EFAULT;
  2743. return 0;
  2744. }
  2745. int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
  2746. int len)
  2747. {
  2748. struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
  2749. return __kvm_read_guest_page(slot, gfn, data, offset, len);
  2750. }
  2751. EXPORT_SYMBOL_GPL(kvm_read_guest_page);
  2752. int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
  2753. int offset, int len)
  2754. {
  2755. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2756. return __kvm_read_guest_page(slot, gfn, data, offset, len);
  2757. }
  2758. EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
  2759. int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
  2760. {
  2761. gfn_t gfn = gpa >> PAGE_SHIFT;
  2762. int seg;
  2763. int offset = offset_in_page(gpa);
  2764. int ret;
  2765. while ((seg = next_segment(len, offset)) != 0) {
  2766. ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
  2767. if (ret < 0)
  2768. return ret;
  2769. offset = 0;
  2770. len -= seg;
  2771. data += seg;
  2772. ++gfn;
  2773. }
  2774. return 0;
  2775. }
  2776. EXPORT_SYMBOL_GPL(kvm_read_guest);
  2777. int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
  2778. {
  2779. gfn_t gfn = gpa >> PAGE_SHIFT;
  2780. int seg;
  2781. int offset = offset_in_page(gpa);
  2782. int ret;
  2783. while ((seg = next_segment(len, offset)) != 0) {
  2784. ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
  2785. if (ret < 0)
  2786. return ret;
  2787. offset = 0;
  2788. len -= seg;
  2789. data += seg;
  2790. ++gfn;
  2791. }
  2792. return 0;
  2793. }
  2794. EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
  2795. static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
  2796. void *data, int offset, unsigned long len)
  2797. {
  2798. int r;
  2799. unsigned long addr;
  2800. if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
  2801. return -EFAULT;
  2802. addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
  2803. if (kvm_is_error_hva(addr))
  2804. return -EFAULT;
  2805. pagefault_disable();
  2806. r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
  2807. pagefault_enable();
  2808. if (r)
  2809. return -EFAULT;
  2810. return 0;
  2811. }
  2812. int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
  2813. void *data, unsigned long len)
  2814. {
  2815. gfn_t gfn = gpa >> PAGE_SHIFT;
  2816. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2817. int offset = offset_in_page(gpa);
  2818. return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
  2819. }
  2820. EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
  2821. /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
  2822. static int __kvm_write_guest_page(struct kvm *kvm,
  2823. struct kvm_memory_slot *memslot, gfn_t gfn,
  2824. const void *data, int offset, int len)
  2825. {
  2826. int r;
  2827. unsigned long addr;
  2828. if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
  2829. return -EFAULT;
  2830. addr = gfn_to_hva_memslot(memslot, gfn);
  2831. if (kvm_is_error_hva(addr))
  2832. return -EFAULT;
  2833. r = __copy_to_user((void __user *)addr + offset, data, len);
  2834. if (r)
  2835. return -EFAULT;
  2836. mark_page_dirty_in_slot(kvm, memslot, gfn);
  2837. return 0;
  2838. }
  2839. int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
  2840. const void *data, int offset, int len)
  2841. {
  2842. struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
  2843. return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
  2844. }
  2845. EXPORT_SYMBOL_GPL(kvm_write_guest_page);
  2846. int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
  2847. const void *data, int offset, int len)
  2848. {
  2849. struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  2850. return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
  2851. }
  2852. EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
  2853. int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
  2854. unsigned long len)
  2855. {
  2856. gfn_t gfn = gpa >> PAGE_SHIFT;
  2857. int seg;
  2858. int offset = offset_in_page(gpa);
  2859. int ret;
  2860. while ((seg = next_segment(len, offset)) != 0) {
  2861. ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
  2862. if (ret < 0)
  2863. return ret;
  2864. offset = 0;
  2865. len -= seg;
  2866. data += seg;
  2867. ++gfn;
  2868. }
  2869. return 0;
  2870. }
  2871. EXPORT_SYMBOL_GPL(kvm_write_guest);
  2872. int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
  2873. unsigned long len)
  2874. {
  2875. gfn_t gfn = gpa >> PAGE_SHIFT;
  2876. int seg;
  2877. int offset = offset_in_page(gpa);
  2878. int ret;
  2879. while ((seg = next_segment(len, offset)) != 0) {
  2880. ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
  2881. if (ret < 0)
  2882. return ret;
  2883. offset = 0;
  2884. len -= seg;
  2885. data += seg;
  2886. ++gfn;
  2887. }
  2888. return 0;
  2889. }
  2890. EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
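/*
 * Illustrative sketch (not part of the upstream file): the read/write helpers
 * above take a guest physical address and transparently split the copy across
 * page (and memslot) boundaries. The helper name and layout are hypothetical.
 */
#if 0
static int example_bump_guest_counter(struct kvm *kvm, gpa_t gpa)
{
	u64 counter;
	int r;

	r = kvm_read_guest(kvm, gpa, &counter, sizeof(counter));
	if (r)
		return r;

	counter++;

	/* kvm_write_guest() also marks the touched pages dirty. */
	return kvm_write_guest(kvm, gpa, &counter, sizeof(counter));
}
#endif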
  2891. static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
  2892. struct gfn_to_hva_cache *ghc,
  2893. gpa_t gpa, unsigned long len)
  2894. {
  2895. int offset = offset_in_page(gpa);
  2896. gfn_t start_gfn = gpa >> PAGE_SHIFT;
  2897. gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
  2898. gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
  2899. gfn_t nr_pages_avail;
  2900. /* Update ghc->generation before performing any error checks. */
  2901. ghc->generation = slots->generation;
  2902. if (start_gfn > end_gfn) {
  2903. ghc->hva = KVM_HVA_ERR_BAD;
  2904. return -EINVAL;
  2905. }
  2906. /*
  2907. * If the requested region crosses two memslots, we still
  2908. * verify that the entire region is valid here.
  2909. */
  2910. for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
  2911. ghc->memslot = __gfn_to_memslot(slots, start_gfn);
  2912. ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
  2913. &nr_pages_avail);
  2914. if (kvm_is_error_hva(ghc->hva))
  2915. return -EFAULT;
  2916. }
  2917. /* Use the slow path for cross page reads and writes. */
  2918. if (nr_pages_needed == 1)
  2919. ghc->hva += offset;
  2920. else
  2921. ghc->memslot = NULL;
  2922. ghc->gpa = gpa;
  2923. ghc->len = len;
  2924. return 0;
  2925. }
  2926. int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2927. gpa_t gpa, unsigned long len)
  2928. {
  2929. struct kvm_memslots *slots = kvm_memslots(kvm);
  2930. return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
  2931. }
  2932. EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
  2933. int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2934. void *data, unsigned int offset,
  2935. unsigned long len)
  2936. {
  2937. struct kvm_memslots *slots = kvm_memslots(kvm);
  2938. int r;
  2939. gpa_t gpa = ghc->gpa + offset;
  2940. if (WARN_ON_ONCE(len + offset > ghc->len))
  2941. return -EINVAL;
  2942. if (slots->generation != ghc->generation) {
  2943. if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
  2944. return -EFAULT;
  2945. }
  2946. if (kvm_is_error_hva(ghc->hva))
  2947. return -EFAULT;
  2948. if (unlikely(!ghc->memslot))
  2949. return kvm_write_guest(kvm, gpa, data, len);
  2950. r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
  2951. if (r)
  2952. return -EFAULT;
  2953. mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
  2954. return 0;
  2955. }
  2956. EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
  2957. int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2958. void *data, unsigned long len)
  2959. {
  2960. return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
  2961. }
  2962. EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
  2963. int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2964. void *data, unsigned int offset,
  2965. unsigned long len)
  2966. {
  2967. struct kvm_memslots *slots = kvm_memslots(kvm);
  2968. int r;
  2969. gpa_t gpa = ghc->gpa + offset;
  2970. if (WARN_ON_ONCE(len + offset > ghc->len))
  2971. return -EINVAL;
  2972. if (slots->generation != ghc->generation) {
  2973. if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
  2974. return -EFAULT;
  2975. }
  2976. if (kvm_is_error_hva(ghc->hva))
  2977. return -EFAULT;
  2978. if (unlikely(!ghc->memslot))
  2979. return kvm_read_guest(kvm, gpa, data, len);
  2980. r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
  2981. if (r)
  2982. return -EFAULT;
  2983. return 0;
  2984. }
  2985. EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
  2986. int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  2987. void *data, unsigned long len)
  2988. {
  2989. return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
  2990. }
  2991. EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
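/*
 * Illustrative sketch (not part of the upstream file): for a guest location
 * that is updated frequently, a gfn_to_hva_cache avoids re-resolving the
 * memslot on every write; the cache is revalidated automatically when the
 * memslot generation changes. The struct and helper names are hypothetical.
 */
#if 0
struct example_shared_info {
	u64 sequence;
};

static int example_publish_sequence(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
				    gpa_t gpa, u64 seq)
{
	struct example_shared_info info = { .sequence = seq };
	int r;

	/* In real code the init would be done once, at setup time. */
	r = kvm_gfn_to_hva_cache_init(kvm, ghc, gpa, sizeof(info));
	if (r)
		return r;

	/* Every subsequent update can reuse @ghc directly. */
	return kvm_write_guest_cached(kvm, ghc, &info, sizeof(info));
}
#endif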
  2992. int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
  2993. {
  2994. const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
  2995. gfn_t gfn = gpa >> PAGE_SHIFT;
  2996. int seg;
  2997. int offset = offset_in_page(gpa);
  2998. int ret;
  2999. while ((seg = next_segment(len, offset)) != 0) {
  3000. ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
  3001. if (ret < 0)
  3002. return ret;
  3003. offset = 0;
  3004. len -= seg;
  3005. ++gfn;
  3006. }
  3007. return 0;
  3008. }
  3009. EXPORT_SYMBOL_GPL(kvm_clear_guest);
  3010. void mark_page_dirty_in_slot(struct kvm *kvm,
  3011. const struct kvm_memory_slot *memslot,
  3012. gfn_t gfn)
  3013. {
  3014. struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  3015. #ifdef CONFIG_HAVE_KVM_DIRTY_RING
  3016. if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
  3017. return;
  3018. WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
  3019. #endif
  3020. if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
  3021. unsigned long rel_gfn = gfn - memslot->base_gfn;
  3022. u32 slot = (memslot->as_id << 16) | memslot->id;
  3023. if (kvm->dirty_ring_size && vcpu)
  3024. kvm_dirty_ring_push(vcpu, slot, rel_gfn);
  3025. else if (memslot->dirty_bitmap)
  3026. set_bit_le(rel_gfn, memslot->dirty_bitmap);
  3027. }
  3028. }
  3029. EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
  3030. void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
  3031. {
  3032. struct kvm_memory_slot *memslot;
  3033. memslot = gfn_to_memslot(kvm, gfn);
  3034. mark_page_dirty_in_slot(kvm, memslot, gfn);
  3035. }
  3036. EXPORT_SYMBOL_GPL(mark_page_dirty);
  3037. void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
  3038. {
  3039. struct kvm_memory_slot *memslot;
  3040. memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
  3041. mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
  3042. }
  3043. EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
  3044. void kvm_sigset_activate(struct kvm_vcpu *vcpu)
  3045. {
  3046. if (!vcpu->sigset_active)
  3047. return;
  3048. /*
  3049. * This does a lockless modification of ->real_blocked, which is fine
  3050. * because, only current can change ->real_blocked and all readers of
  3051. * ->real_blocked don't care as long ->real_blocked is always a subset
  3052. * of ->blocked.
  3053. */
  3054. sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
  3055. }
  3056. void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
  3057. {
  3058. if (!vcpu->sigset_active)
  3059. return;
  3060. sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
  3061. sigemptyset(&current->real_blocked);
  3062. }
  3063. static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
  3064. {
  3065. unsigned int old, val, grow, grow_start;
  3066. old = val = vcpu->halt_poll_ns;
  3067. grow_start = READ_ONCE(halt_poll_ns_grow_start);
  3068. grow = READ_ONCE(halt_poll_ns_grow);
  3069. if (!grow)
  3070. goto out;
  3071. val *= grow;
  3072. if (val < grow_start)
  3073. val = grow_start;
  3074. vcpu->halt_poll_ns = val;
  3075. out:
  3076. trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
  3077. }
  3078. static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
  3079. {
  3080. unsigned int old, val, shrink, grow_start;
  3081. old = val = vcpu->halt_poll_ns;
  3082. shrink = READ_ONCE(halt_poll_ns_shrink);
  3083. grow_start = READ_ONCE(halt_poll_ns_grow_start);
  3084. if (shrink == 0)
  3085. val = 0;
  3086. else
  3087. val /= shrink;
  3088. if (val < grow_start)
  3089. val = 0;
  3090. vcpu->halt_poll_ns = val;
  3091. trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
  3092. }
  3093. static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
  3094. {
  3095. int ret = -EINTR;
  3096. int idx = srcu_read_lock(&vcpu->kvm->srcu);
  3097. if (kvm_arch_vcpu_runnable(vcpu))
  3098. goto out;
  3099. if (kvm_cpu_has_pending_timer(vcpu))
  3100. goto out;
  3101. if (signal_pending(current))
  3102. goto out;
  3103. if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
  3104. goto out;
  3105. ret = 0;
  3106. out:
  3107. srcu_read_unlock(&vcpu->kvm->srcu, idx);
  3108. return ret;
  3109. }
  3110. /*
  3111. * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
  3112. * pending. This is mostly used when halting a vCPU, but may also be used
  3113. * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
  3114. */
  3115. bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
  3116. {
  3117. struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
  3118. bool waited = false;
  3119. vcpu->stat.generic.blocking = 1;
  3120. preempt_disable();
  3121. kvm_arch_vcpu_blocking(vcpu);
  3122. prepare_to_rcuwait(wait);
  3123. preempt_enable();
  3124. for (;;) {
  3125. set_current_state(TASK_INTERRUPTIBLE);
  3126. if (kvm_vcpu_check_block(vcpu) < 0)
  3127. break;
  3128. waited = true;
  3129. schedule();
  3130. }
  3131. preempt_disable();
  3132. finish_rcuwait(wait);
  3133. kvm_arch_vcpu_unblocking(vcpu);
  3134. preempt_enable();
  3135. vcpu->stat.generic.blocking = 0;
  3136. return waited;
  3137. }
  3138. static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
  3139. ktime_t end, bool success)
  3140. {
  3141. struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
  3142. u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
  3143. ++vcpu->stat.generic.halt_attempted_poll;
  3144. if (success) {
  3145. ++vcpu->stat.generic.halt_successful_poll;
  3146. if (!vcpu_valid_wakeup(vcpu))
  3147. ++vcpu->stat.generic.halt_poll_invalid;
  3148. stats->halt_poll_success_ns += poll_ns;
  3149. KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
  3150. } else {
  3151. stats->halt_poll_fail_ns += poll_ns;
  3152. KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
  3153. }
  3154. }
  3155. static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
  3156. {
  3157. struct kvm *kvm = vcpu->kvm;
  3158. if (kvm->override_halt_poll_ns) {
  3159. /*
  3160. * Ensure kvm->max_halt_poll_ns is not read before
  3161. * kvm->override_halt_poll_ns.
  3162. *
  3163. * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
  3164. */
  3165. smp_rmb();
  3166. return READ_ONCE(kvm->max_halt_poll_ns);
  3167. }
  3168. return READ_ONCE(halt_poll_ns);
  3169. }
  3170. /*
  3171. * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
  3172. * polling is enabled, busy wait for a short time before blocking to avoid the
  3173. * expensive block+unblock sequence if a wake event arrives soon after the vCPU
  3174. * is halted.
  3175. */
  3176. void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
  3177. {
  3178. unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
  3179. bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
  3180. ktime_t start, cur, poll_end;
  3181. bool waited = false;
  3182. bool do_halt_poll;
  3183. u64 halt_ns;
  3184. if (vcpu->halt_poll_ns > max_halt_poll_ns)
  3185. vcpu->halt_poll_ns = max_halt_poll_ns;
  3186. do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
  3187. start = cur = poll_end = ktime_get();
  3188. if (do_halt_poll) {
  3189. ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
  3190. do {
  3191. if (kvm_vcpu_check_block(vcpu) < 0)
  3192. goto out;
  3193. cpu_relax();
  3194. poll_end = cur = ktime_get();
  3195. } while (kvm_vcpu_can_poll(cur, stop));
  3196. }
  3197. waited = kvm_vcpu_block(vcpu);
  3198. cur = ktime_get();
  3199. if (waited) {
  3200. vcpu->stat.generic.halt_wait_ns +=
  3201. ktime_to_ns(cur) - ktime_to_ns(poll_end);
  3202. KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
  3203. ktime_to_ns(cur) - ktime_to_ns(poll_end));
  3204. }
  3205. out:
  3206. /* The total time the vCPU was "halted", including polling time. */
  3207. halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
  3208. /*
  3209. * Note, halt-polling is considered successful so long as the vCPU was
  3210. * never actually scheduled out, i.e. even if the wake event arrived
3211. * after the end of the halt-polling loop itself, but before the full wait.
  3212. */
  3213. if (do_halt_poll)
  3214. update_halt_poll_stats(vcpu, start, poll_end, !waited);
  3215. if (halt_poll_allowed) {
  3216. /* Recompute the max halt poll time in case it changed. */
  3217. max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
  3218. if (!vcpu_valid_wakeup(vcpu)) {
  3219. shrink_halt_poll_ns(vcpu);
  3220. } else if (max_halt_poll_ns) {
  3221. if (halt_ns <= vcpu->halt_poll_ns)
  3222. ;
  3223. /* we had a long block, shrink polling */
  3224. else if (vcpu->halt_poll_ns &&
  3225. halt_ns > max_halt_poll_ns)
  3226. shrink_halt_poll_ns(vcpu);
  3227. /* we had a short halt and our poll time is too small */
  3228. else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
  3229. halt_ns < max_halt_poll_ns)
  3230. grow_halt_poll_ns(vcpu);
  3231. } else {
  3232. vcpu->halt_poll_ns = 0;
  3233. }
  3234. }
  3235. trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
  3236. }
  3237. EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
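/*
 * Illustrative sketch (not part of the upstream file): an architecture's
 * halt-instruction intercept typically just calls kvm_vcpu_halt() and lets
 * the generic code handle polling and blocking. The handler name and return
 * convention (1 == resume the guest) are hypothetical.
 */
#if 0
static int example_handle_halt_exit(struct kvm_vcpu *vcpu)
{
	/* Poll briefly (if enabled), then block until a wake event arrives. */
	kvm_vcpu_halt(vcpu);
	return 1;
}
#endif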
  3238. bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
  3239. {
  3240. if (__kvm_vcpu_wake_up(vcpu)) {
  3241. WRITE_ONCE(vcpu->ready, true);
  3242. ++vcpu->stat.generic.halt_wakeup;
  3243. return true;
  3244. }
  3245. return false;
  3246. }
  3247. EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
  3248. #ifndef CONFIG_S390
  3249. /*
  3250. * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
  3251. */
  3252. void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
  3253. {
  3254. int me, cpu;
  3255. if (kvm_vcpu_wake_up(vcpu))
  3256. return;
  3257. me = get_cpu();
  3258. /*
  3259. * The only state change done outside the vcpu mutex is IN_GUEST_MODE
  3260. * to EXITING_GUEST_MODE. Therefore the moderately expensive "should
  3261. * kick" check does not need atomic operations if kvm_vcpu_kick is used
  3262. * within the vCPU thread itself.
  3263. */
  3264. if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
  3265. if (vcpu->mode == IN_GUEST_MODE)
  3266. WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
  3267. goto out;
  3268. }
  3269. /*
  3270. * Note, the vCPU could get migrated to a different pCPU at any point
  3271. * after kvm_arch_vcpu_should_kick(), which could result in sending an
  3272. * IPI to the previous pCPU. But, that's ok because the purpose of the
  3273. * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
  3274. * vCPU also requires it to leave IN_GUEST_MODE.
  3275. */
  3276. if (kvm_arch_vcpu_should_kick(vcpu)) {
  3277. cpu = READ_ONCE(vcpu->cpu);
  3278. if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
  3279. smp_send_reschedule(cpu);
  3280. }
  3281. out:
  3282. put_cpu();
  3283. }
  3284. EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
  3285. #endif /* !CONFIG_S390 */
  3286. int kvm_vcpu_yield_to(struct kvm_vcpu *target)
  3287. {
  3288. struct pid *pid;
  3289. struct task_struct *task = NULL;
  3290. int ret = 0;
  3291. rcu_read_lock();
  3292. pid = rcu_dereference(target->pid);
  3293. if (pid)
  3294. task = get_pid_task(pid, PIDTYPE_PID);
  3295. rcu_read_unlock();
  3296. if (!task)
  3297. return ret;
  3298. ret = yield_to(task, 1);
  3299. put_task_struct(task);
  3300. return ret;
  3301. }
  3302. EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
  3303. /*
  3304. * Helper that checks whether a VCPU is eligible for directed yield.
3305. * The most eligible candidate to yield to is decided by the following heuristics:
  3306. *
  3307. * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
  3308. * (preempted lock holder), indicated by @in_spin_loop.
  3309. * Set at the beginning and cleared at the end of interception/PLE handler.
  3310. *
3311. * (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a
3312. * chance last time (it has mostly become eligible now since we probably
3313. * yielded to the lock holder in the last iteration. This is done by toggling
3314. * @dy_eligible each time a VCPU is checked for eligibility.)
  3315. *
  3316. * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
  3317. * to preempted lock-holder could result in wrong VCPU selection and CPU
  3318. * burning. Giving priority for a potential lock-holder increases lock
  3319. * progress.
  3320. *
3321. * Since the algorithm is based on heuristics, accessing another VCPU's data
3322. * without locking does no harm. It may result in trying to yield to the same
3323. * VCPU, failing, and continuing with the next VCPU, and so on.
  3324. */
  3325. static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
  3326. {
  3327. #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
  3328. bool eligible;
  3329. eligible = !vcpu->spin_loop.in_spin_loop ||
  3330. vcpu->spin_loop.dy_eligible;
  3331. if (vcpu->spin_loop.in_spin_loop)
  3332. kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
  3333. return eligible;
  3334. #else
  3335. return true;
  3336. #endif
  3337. }
  3338. /*
  3339. * Unlike kvm_arch_vcpu_runnable, this function is called outside
  3340. * a vcpu_load/vcpu_put pair. However, for most architectures
  3341. * kvm_arch_vcpu_runnable does not require vcpu_load.
  3342. */
  3343. bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
  3344. {
  3345. return kvm_arch_vcpu_runnable(vcpu);
  3346. }
  3347. static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
  3348. {
  3349. if (kvm_arch_dy_runnable(vcpu))
  3350. return true;
  3351. #ifdef CONFIG_KVM_ASYNC_PF
  3352. if (!list_empty_careful(&vcpu->async_pf.done))
  3353. return true;
  3354. #endif
  3355. return false;
  3356. }
  3357. /*
  3358. * By default, simply query the target vCPU's current mode when checking if a
  3359. * vCPU was preempted in kernel mode. All architectures except x86 (or more
3360. * specifically, except VMX) allow querying whether or not a vCPU is in kernel
  3361. * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
  3362. * directly for cross-vCPU checks is functionally correct and accurate.
  3363. */
  3364. bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
  3365. {
  3366. return kvm_arch_vcpu_in_kernel(vcpu);
  3367. }
  3368. bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
  3369. {
  3370. return false;
  3371. }
  3372. void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
  3373. {
  3374. struct kvm *kvm = me->kvm;
  3375. struct kvm_vcpu *vcpu;
  3376. int last_boosted_vcpu;
  3377. unsigned long i;
  3378. int yielded = 0;
  3379. int try = 3;
  3380. int pass;
  3381. last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
  3382. kvm_vcpu_set_in_spin_loop(me, true);
  3383. /*
  3384. * We boost the priority of a VCPU that is runnable but not
  3385. * currently running, because it got preempted by something
  3386. * else and called schedule in __vcpu_run. Hopefully that
  3387. * VCPU is holding the lock that we need and will release it.
  3388. * We approximate round-robin by starting at the last boosted VCPU.
  3389. */
  3390. for (pass = 0; pass < 2 && !yielded && try; pass++) {
  3391. kvm_for_each_vcpu(i, vcpu, kvm) {
  3392. if (!pass && i <= last_boosted_vcpu) {
  3393. i = last_boosted_vcpu;
  3394. continue;
  3395. } else if (pass && i > last_boosted_vcpu)
  3396. break;
  3397. if (!READ_ONCE(vcpu->ready))
  3398. continue;
  3399. if (vcpu == me)
  3400. continue;
  3401. if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
  3402. continue;
  3403. /*
  3404. * Treat the target vCPU as being in-kernel if it has a
  3405. * pending interrupt, as the vCPU trying to yield may
  3406. * be spinning waiting on IPI delivery, i.e. the target
  3407. * vCPU is in-kernel for the purposes of directed yield.
  3408. */
  3409. if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
  3410. !kvm_arch_dy_has_pending_interrupt(vcpu) &&
  3411. !kvm_arch_vcpu_preempted_in_kernel(vcpu))
  3412. continue;
  3413. if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
  3414. continue;
  3415. yielded = kvm_vcpu_yield_to(vcpu);
  3416. if (yielded > 0) {
  3417. WRITE_ONCE(kvm->last_boosted_vcpu, i);
  3418. break;
  3419. } else if (yielded < 0) {
  3420. try--;
  3421. if (!try)
  3422. break;
  3423. }
  3424. }
  3425. }
  3426. kvm_vcpu_set_in_spin_loop(me, false);
  3427. /* Ensure vcpu is not eligible during next spinloop */
  3428. kvm_vcpu_set_dy_eligible(me, false);
  3429. }
  3430. EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
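/*
 * Illustrative sketch (not part of the upstream file): a pause-loop /
 * wait-for-event intercept handler is the typical caller of
 * kvm_vcpu_on_spin(), yielding to a likely lock holder instead of letting the
 * spinning vCPU burn its time slice. The handler name is hypothetical.
 */
#if 0
static int example_handle_pause_exit(struct kvm_vcpu *vcpu)
{
	/*
	 * The spinning vCPU was in guest kernel mode, so prefer yielding to
	 * vCPUs that were preempted while running in kernel mode.
	 */
	kvm_vcpu_on_spin(vcpu, true);
	return 1;
}
#endif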
  3431. static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
  3432. {
  3433. #ifdef CONFIG_HAVE_KVM_DIRTY_RING
  3434. return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
  3435. (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
  3436. kvm->dirty_ring_size / PAGE_SIZE);
  3437. #else
  3438. return false;
  3439. #endif
  3440. }
  3441. static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
  3442. {
  3443. struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
  3444. struct page *page;
  3445. if (vmf->pgoff == 0)
  3446. page = virt_to_page(vcpu->run);
  3447. #ifdef CONFIG_X86
  3448. else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
  3449. page = virt_to_page(vcpu->arch.pio_data);
  3450. #endif
  3451. #ifdef CONFIG_KVM_MMIO
  3452. else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
  3453. page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
  3454. #endif
  3455. else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
  3456. page = kvm_dirty_ring_get_page(
  3457. &vcpu->dirty_ring,
  3458. vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
  3459. else
  3460. return kvm_arch_vcpu_fault(vcpu, vmf);
  3461. get_page(page);
  3462. vmf->page = page;
  3463. return 0;
  3464. }
  3465. static const struct vm_operations_struct kvm_vcpu_vm_ops = {
  3466. .fault = kvm_vcpu_fault,
  3467. };
  3468. static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
  3469. {
  3470. struct kvm_vcpu *vcpu = file->private_data;
  3471. unsigned long pages = vma_pages(vma);
  3472. if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
  3473. kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
  3474. ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
  3475. return -EINVAL;
  3476. vma->vm_ops = &kvm_vcpu_vm_ops;
  3477. return 0;
  3478. }
  3479. static int kvm_vcpu_release(struct inode *inode, struct file *filp)
  3480. {
  3481. struct kvm_vcpu *vcpu = filp->private_data;
  3482. kvm_put_kvm(vcpu->kvm);
  3483. return 0;
  3484. }
  3485. static struct file_operations kvm_vcpu_fops = {
  3486. .release = kvm_vcpu_release,
  3487. .unlocked_ioctl = kvm_vcpu_ioctl,
  3488. .mmap = kvm_vcpu_mmap,
  3489. .llseek = noop_llseek,
  3490. KVM_COMPAT(kvm_vcpu_compat_ioctl),
  3491. };
  3492. /*
  3493. * Allocates an inode for the vcpu.
  3494. */
  3495. static int create_vcpu_fd(struct kvm_vcpu *vcpu)
  3496. {
  3497. char name[8 + 1 + ITOA_MAX_LEN + 1];
  3498. snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
  3499. return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
  3500. }
  3501. #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
  3502. static int vcpu_get_pid(void *data, u64 *val)
  3503. {
  3504. struct kvm_vcpu *vcpu = data;
  3505. rcu_read_lock();
  3506. *val = pid_nr(rcu_dereference(vcpu->pid));
  3507. rcu_read_unlock();
  3508. return 0;
  3509. }
  3510. DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
  3511. static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
  3512. {
  3513. struct dentry *debugfs_dentry;
  3514. char dir_name[ITOA_MAX_LEN * 2];
  3515. if (!debugfs_initialized())
  3516. return;
  3517. snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
  3518. debugfs_dentry = debugfs_create_dir(dir_name,
  3519. vcpu->kvm->debugfs_dentry);
  3520. debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
  3521. &vcpu_get_pid_fops);
  3522. kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
  3523. }
  3524. #endif
  3525. /*
  3526. * Creates some virtual cpus. Good luck creating more than one.
  3527. */
  3528. static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
  3529. {
  3530. int r;
  3531. struct kvm_vcpu *vcpu;
  3532. struct page *page;
  3533. /*
  3534. * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
  3535. * too-large values instead of silently truncating.
  3536. *
  3537. * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
  3538. * changing the storage type (at the very least, IDs should be tracked
  3539. * as unsigned ints).
  3540. */
  3541. BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
  3542. if (id >= KVM_MAX_VCPU_IDS)
  3543. return -EINVAL;
  3544. mutex_lock(&kvm->lock);
  3545. if (kvm->created_vcpus >= kvm->max_vcpus) {
  3546. mutex_unlock(&kvm->lock);
  3547. return -EINVAL;
  3548. }
  3549. r = kvm_arch_vcpu_precreate(kvm, id);
  3550. if (r) {
  3551. mutex_unlock(&kvm->lock);
  3552. return r;
  3553. }
  3554. kvm->created_vcpus++;
  3555. mutex_unlock(&kvm->lock);
  3556. vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
  3557. if (!vcpu) {
  3558. r = -ENOMEM;
  3559. goto vcpu_decrement;
  3560. }
  3561. BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
  3562. page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
  3563. if (!page) {
  3564. r = -ENOMEM;
  3565. goto vcpu_free;
  3566. }
  3567. vcpu->run = page_address(page);
  3568. kvm_vcpu_init(vcpu, kvm, id);
  3569. r = kvm_arch_vcpu_create(vcpu);
  3570. if (r)
  3571. goto vcpu_free_run_page;
  3572. if (kvm->dirty_ring_size) {
  3573. r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
  3574. id, kvm->dirty_ring_size);
  3575. if (r)
  3576. goto arch_vcpu_destroy;
  3577. }
  3578. mutex_lock(&kvm->lock);
  3579. #ifdef CONFIG_LOCKDEP
  3580. /* Ensure that lockdep knows vcpu->mutex is taken *inside* kvm->lock */
  3581. mutex_lock(&vcpu->mutex);
  3582. mutex_unlock(&vcpu->mutex);
  3583. #endif
  3584. if (kvm_get_vcpu_by_id(kvm, id)) {
  3585. r = -EEXIST;
  3586. goto unlock_vcpu_destroy;
  3587. }
  3588. vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
  3589. r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
  3590. if (r)
  3591. goto unlock_vcpu_destroy;
  3592. /* Now it's all set up, let userspace reach it */
  3593. kvm_get_kvm(kvm);
  3594. r = create_vcpu_fd(vcpu);
  3595. if (r < 0)
  3596. goto kvm_put_xa_release;
  3597. if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
  3598. r = -EINVAL;
  3599. goto kvm_put_xa_release;
  3600. }
  3601. /*
3602. * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
3603. * pointer before the incremented value of kvm->online_vcpus.
  3604. */
  3605. smp_wmb();
  3606. atomic_inc(&kvm->online_vcpus);
  3607. mutex_unlock(&kvm->lock);
  3608. kvm_arch_vcpu_postcreate(vcpu);
  3609. kvm_create_vcpu_debugfs(vcpu);
  3610. return r;
  3611. kvm_put_xa_release:
  3612. kvm_put_kvm_no_destroy(kvm);
  3613. xa_release(&kvm->vcpu_array, vcpu->vcpu_idx);
  3614. unlock_vcpu_destroy:
  3615. mutex_unlock(&kvm->lock);
  3616. kvm_dirty_ring_free(&vcpu->dirty_ring);
  3617. arch_vcpu_destroy:
  3618. kvm_arch_vcpu_destroy(vcpu);
  3619. vcpu_free_run_page:
  3620. free_page((unsigned long)vcpu->run);
  3621. vcpu_free:
  3622. kmem_cache_free(kvm_vcpu_cache, vcpu);
  3623. vcpu_decrement:
  3624. mutex_lock(&kvm->lock);
  3625. kvm->created_vcpus--;
  3626. mutex_unlock(&kvm->lock);
  3627. return r;
  3628. }
  3629. static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
  3630. {
  3631. if (sigset) {
  3632. sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
  3633. vcpu->sigset_active = 1;
  3634. vcpu->sigset = *sigset;
  3635. } else
  3636. vcpu->sigset_active = 0;
  3637. return 0;
  3638. }
  3639. static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
  3640. size_t size, loff_t *offset)
  3641. {
  3642. struct kvm_vcpu *vcpu = file->private_data;
  3643. return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
  3644. &kvm_vcpu_stats_desc[0], &vcpu->stat,
  3645. sizeof(vcpu->stat), user_buffer, size, offset);
  3646. }
  3647. static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
  3648. {
  3649. struct kvm_vcpu *vcpu = file->private_data;
  3650. kvm_put_kvm(vcpu->kvm);
  3651. return 0;
  3652. }
  3653. static const struct file_operations kvm_vcpu_stats_fops = {
  3654. .owner = THIS_MODULE,
  3655. .read = kvm_vcpu_stats_read,
  3656. .release = kvm_vcpu_stats_release,
  3657. .llseek = noop_llseek,
  3658. };
  3659. static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
  3660. {
  3661. int fd;
  3662. struct file *file;
  3663. char name[15 + ITOA_MAX_LEN + 1];
  3664. snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
  3665. fd = get_unused_fd_flags(O_CLOEXEC);
  3666. if (fd < 0)
  3667. return fd;
  3668. file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
  3669. if (IS_ERR(file)) {
  3670. put_unused_fd(fd);
  3671. return PTR_ERR(file);
  3672. }
  3673. kvm_get_kvm(vcpu->kvm);
  3674. file->f_mode |= FMODE_PREAD;
  3675. fd_install(fd, file);
  3676. return fd;
  3677. }
  3678. #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
  3679. static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
  3680. struct kvm_pre_fault_memory *range)
  3681. {
  3682. int idx;
  3683. long r;
  3684. u64 full_size;
  3685. if (range->flags)
  3686. return -EINVAL;
  3687. if (!PAGE_ALIGNED(range->gpa) ||
  3688. !PAGE_ALIGNED(range->size) ||
  3689. range->gpa + range->size <= range->gpa)
  3690. return -EINVAL;
  3691. vcpu_load(vcpu);
  3692. idx = srcu_read_lock(&vcpu->kvm->srcu);
  3693. full_size = range->size;
  3694. do {
  3695. if (signal_pending(current)) {
  3696. r = -EINTR;
  3697. break;
  3698. }
  3699. r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
  3700. if (WARN_ON_ONCE(r == 0 || r == -EIO))
  3701. break;
  3702. if (r < 0)
  3703. break;
  3704. range->size -= r;
  3705. range->gpa += r;
  3706. cond_resched();
  3707. } while (range->size);
  3708. srcu_read_unlock(&vcpu->kvm->srcu, idx);
  3709. vcpu_put(vcpu);
  3710. /* Return success if at least one page was mapped successfully. */
  3711. return full_size == range->size ? r : 0;
  3712. }
  3713. #endif
  3714. static long kvm_vcpu_ioctl(struct file *filp,
  3715. unsigned int ioctl, unsigned long arg)
  3716. {
  3717. struct kvm_vcpu *vcpu = filp->private_data;
  3718. void __user *argp = (void __user *)arg;
  3719. int r;
  3720. struct kvm_fpu *fpu = NULL;
  3721. struct kvm_sregs *kvm_sregs = NULL;
  3722. if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
  3723. return -EIO;
  3724. if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
  3725. return -EINVAL;
  3726. /*
  3727. * Some architectures have vcpu ioctls that are asynchronous to vcpu
  3728. * execution; mutex_lock() would break them.
  3729. */
  3730. r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
  3731. if (r != -ENOIOCTLCMD)
  3732. return r;
  3733. if (mutex_lock_killable(&vcpu->mutex))
  3734. return -EINTR;
  3735. switch (ioctl) {
  3736. case KVM_RUN: {
  3737. struct pid *oldpid;
  3738. r = -EINVAL;
  3739. if (arg)
  3740. goto out;
  3741. oldpid = rcu_access_pointer(vcpu->pid);
  3742. if (unlikely(oldpid != task_pid(current))) {
  3743. /* The thread running this VCPU changed. */
  3744. struct pid *newpid;
  3745. r = kvm_arch_vcpu_run_pid_change(vcpu);
  3746. if (r)
  3747. break;
  3748. newpid = get_task_pid(current, PIDTYPE_PID);
  3749. rcu_assign_pointer(vcpu->pid, newpid);
  3750. if (oldpid)
  3751. synchronize_rcu();
  3752. put_pid(oldpid);
  3753. }
  3754. vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
  3755. r = kvm_arch_vcpu_ioctl_run(vcpu);
  3756. vcpu->wants_to_run = false;
  3757. trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
  3758. break;
  3759. }
  3760. case KVM_GET_REGS: {
  3761. struct kvm_regs *kvm_regs;
  3762. r = -ENOMEM;
  3763. kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
  3764. if (!kvm_regs)
  3765. goto out;
  3766. r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
  3767. if (r)
  3768. goto out_free1;
  3769. r = -EFAULT;
  3770. if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
  3771. goto out_free1;
  3772. r = 0;
  3773. out_free1:
  3774. kfree(kvm_regs);
  3775. break;
  3776. }
  3777. case KVM_SET_REGS: {
  3778. struct kvm_regs *kvm_regs;
  3779. kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
  3780. if (IS_ERR(kvm_regs)) {
  3781. r = PTR_ERR(kvm_regs);
  3782. goto out;
  3783. }
  3784. r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
  3785. kfree(kvm_regs);
  3786. break;
  3787. }
  3788. case KVM_GET_SREGS: {
  3789. kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
  3790. r = -ENOMEM;
  3791. if (!kvm_sregs)
  3792. goto out;
  3793. r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
  3794. if (r)
  3795. goto out;
  3796. r = -EFAULT;
  3797. if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
  3798. goto out;
  3799. r = 0;
  3800. break;
  3801. }
  3802. case KVM_SET_SREGS: {
  3803. kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
  3804. if (IS_ERR(kvm_sregs)) {
  3805. r = PTR_ERR(kvm_sregs);
  3806. kvm_sregs = NULL;
  3807. goto out;
  3808. }
  3809. r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
  3810. break;
  3811. }
  3812. case KVM_GET_MP_STATE: {
  3813. struct kvm_mp_state mp_state;
  3814. r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
  3815. if (r)
  3816. goto out;
  3817. r = -EFAULT;
  3818. if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
  3819. goto out;
  3820. r = 0;
  3821. break;
  3822. }
  3823. case KVM_SET_MP_STATE: {
  3824. struct kvm_mp_state mp_state;
  3825. r = -EFAULT;
  3826. if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
  3827. goto out;
  3828. r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
  3829. break;
  3830. }
  3831. case KVM_TRANSLATE: {
  3832. struct kvm_translation tr;
  3833. r = -EFAULT;
  3834. if (copy_from_user(&tr, argp, sizeof(tr)))
  3835. goto out;
  3836. r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
  3837. if (r)
  3838. goto out;
  3839. r = -EFAULT;
  3840. if (copy_to_user(argp, &tr, sizeof(tr)))
  3841. goto out;
  3842. r = 0;
  3843. break;
  3844. }
  3845. case KVM_SET_GUEST_DEBUG: {
  3846. struct kvm_guest_debug dbg;
  3847. r = -EFAULT;
  3848. if (copy_from_user(&dbg, argp, sizeof(dbg)))
  3849. goto out;
  3850. r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
  3851. break;
  3852. }
  3853. case KVM_SET_SIGNAL_MASK: {
  3854. struct kvm_signal_mask __user *sigmask_arg = argp;
  3855. struct kvm_signal_mask kvm_sigmask;
  3856. sigset_t sigset, *p;
  3857. p = NULL;
  3858. if (argp) {
  3859. r = -EFAULT;
  3860. if (copy_from_user(&kvm_sigmask, argp,
  3861. sizeof(kvm_sigmask)))
  3862. goto out;
  3863. r = -EINVAL;
  3864. if (kvm_sigmask.len != sizeof(sigset))
  3865. goto out;
  3866. r = -EFAULT;
  3867. if (copy_from_user(&sigset, sigmask_arg->sigset,
  3868. sizeof(sigset)))
  3869. goto out;
  3870. p = &sigset;
  3871. }
  3872. r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
  3873. break;
  3874. }
  3875. case KVM_GET_FPU: {
  3876. fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
  3877. r = -ENOMEM;
  3878. if (!fpu)
  3879. goto out;
  3880. r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
  3881. if (r)
  3882. goto out;
  3883. r = -EFAULT;
  3884. if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
  3885. goto out;
  3886. r = 0;
  3887. break;
  3888. }
  3889. case KVM_SET_FPU: {
  3890. fpu = memdup_user(argp, sizeof(*fpu));
  3891. if (IS_ERR(fpu)) {
  3892. r = PTR_ERR(fpu);
  3893. fpu = NULL;
  3894. goto out;
  3895. }
  3896. r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
  3897. break;
  3898. }
  3899. case KVM_GET_STATS_FD: {
  3900. r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
  3901. break;
  3902. }
  3903. #ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
  3904. case KVM_PRE_FAULT_MEMORY: {
  3905. struct kvm_pre_fault_memory range;
  3906. r = -EFAULT;
  3907. if (copy_from_user(&range, argp, sizeof(range)))
  3908. break;
  3909. r = kvm_vcpu_pre_fault_memory(vcpu, &range);
  3910. /* Pass back leftover range. */
  3911. if (copy_to_user(argp, &range, sizeof(range)))
  3912. r = -EFAULT;
  3913. break;
  3914. }
  3915. #endif
  3916. default:
  3917. r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
  3918. }
  3919. out:
  3920. mutex_unlock(&vcpu->mutex);
  3921. kfree(fpu);
  3922. kfree(kvm_sregs);
  3923. return r;
  3924. }
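/*
 * Minimal userspace sketch (not from this file): driving the KVM_RUN case
 * above.  kvm_fd and vcpu_fd are assumed to come from /dev/kvm and
 * KVM_CREATE_VCPU respectively; the shared struct kvm_run is mmap()ed from
 * the vCPU fd at offset 0.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static int run_vcpu_once(int kvm_fd, int vcpu_fd)
{
	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run;

	if (mmap_size < 0)
		return -1;

	run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   vcpu_fd, 0);
	if (run == MAP_FAILED)
		return -1;

	/* KVM_RUN takes no argument; a non-zero arg fails with -EINVAL. */
	if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
		return -1;

	printf("exit_reason = %u\n", run->exit_reason);
	munmap(run, mmap_size);
	return 0;
}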
  3925. #ifdef CONFIG_KVM_COMPAT
  3926. static long kvm_vcpu_compat_ioctl(struct file *filp,
  3927. unsigned int ioctl, unsigned long arg)
  3928. {
  3929. struct kvm_vcpu *vcpu = filp->private_data;
  3930. void __user *argp = compat_ptr(arg);
  3931. int r;
  3932. if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
  3933. return -EIO;
  3934. switch (ioctl) {
  3935. case KVM_SET_SIGNAL_MASK: {
  3936. struct kvm_signal_mask __user *sigmask_arg = argp;
  3937. struct kvm_signal_mask kvm_sigmask;
  3938. sigset_t sigset;
  3939. if (argp) {
  3940. r = -EFAULT;
  3941. if (copy_from_user(&kvm_sigmask, argp,
  3942. sizeof(kvm_sigmask)))
  3943. goto out;
  3944. r = -EINVAL;
  3945. if (kvm_sigmask.len != sizeof(compat_sigset_t))
  3946. goto out;
  3947. r = -EFAULT;
  3948. if (get_compat_sigset(&sigset,
  3949. (compat_sigset_t __user *)sigmask_arg->sigset))
  3950. goto out;
  3951. r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
  3952. } else
  3953. r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
  3954. break;
  3955. }
  3956. default:
  3957. r = kvm_vcpu_ioctl(filp, ioctl, arg);
  3958. }
  3959. out:
  3960. return r;
  3961. }
  3962. #endif
  3963. static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
  3964. {
  3965. struct kvm_device *dev = filp->private_data;
  3966. if (dev->ops->mmap)
  3967. return dev->ops->mmap(dev, vma);
  3968. return -ENODEV;
  3969. }
  3970. static int kvm_device_ioctl_attr(struct kvm_device *dev,
  3971. int (*accessor)(struct kvm_device *dev,
  3972. struct kvm_device_attr *attr),
  3973. unsigned long arg)
  3974. {
  3975. struct kvm_device_attr attr;
  3976. if (!accessor)
  3977. return -EPERM;
  3978. if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
  3979. return -EFAULT;
  3980. return accessor(dev, &attr);
  3981. }
  3982. static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
  3983. unsigned long arg)
  3984. {
  3985. struct kvm_device *dev = filp->private_data;
  3986. if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
  3987. return -EIO;
  3988. switch (ioctl) {
  3989. case KVM_SET_DEVICE_ATTR:
  3990. return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
  3991. case KVM_GET_DEVICE_ATTR:
  3992. return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
  3993. case KVM_HAS_DEVICE_ATTR:
  3994. return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
  3995. default:
  3996. if (dev->ops->ioctl)
  3997. return dev->ops->ioctl(dev, ioctl, arg);
  3998. return -ENOTTY;
  3999. }
  4000. }
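/*
 * Minimal userspace sketch (not from this file): exercising the attribute
 * accessors above on a device fd returned by KVM_CREATE_DEVICE.  The group
 * and attribute numbers are defined by the individual device type's UAPI
 * headers; set_example_attr is a made-up helper name.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_example_attr(int dev_fd, uint32_t group, uint64_t attr_id,
			    uint64_t *val)
{
	struct kvm_device_attr attr = {
		.group = group,
		.attr  = attr_id,
		.addr  = (uint64_t)(unsigned long)val,
	};

	/* KVM_HAS_DEVICE_ATTR only probes for support, no data is copied. */
	if (ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attr))
		return -1;

	return ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &attr);
}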
  4001. static int kvm_device_release(struct inode *inode, struct file *filp)
  4002. {
  4003. struct kvm_device *dev = filp->private_data;
  4004. struct kvm *kvm = dev->kvm;
  4005. if (dev->ops->release) {
  4006. mutex_lock(&kvm->lock);
  4007. list_del_rcu(&dev->vm_node);
  4008. synchronize_rcu();
  4009. dev->ops->release(dev);
  4010. mutex_unlock(&kvm->lock);
  4011. }
  4012. kvm_put_kvm(kvm);
  4013. return 0;
  4014. }
  4015. static struct file_operations kvm_device_fops = {
  4016. .unlocked_ioctl = kvm_device_ioctl,
  4017. .release = kvm_device_release,
  4018. KVM_COMPAT(kvm_device_ioctl),
  4019. .mmap = kvm_device_mmap,
  4020. };
  4021. struct kvm_device *kvm_device_from_filp(struct file *filp)
  4022. {
  4023. if (filp->f_op != &kvm_device_fops)
  4024. return NULL;
  4025. return filp->private_data;
  4026. }
  4027. static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
  4028. #ifdef CONFIG_KVM_MPIC
  4029. [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
  4030. [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
  4031. #endif
  4032. };
  4033. int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
  4034. {
  4035. if (type >= ARRAY_SIZE(kvm_device_ops_table))
  4036. return -ENOSPC;
  4037. if (kvm_device_ops_table[type] != NULL)
  4038. return -EEXIST;
  4039. kvm_device_ops_table[type] = ops;
  4040. return 0;
  4041. }
  4042. void kvm_unregister_device_ops(u32 type)
  4043. {
  4044. if (kvm_device_ops_table[type] != NULL)
  4045. kvm_device_ops_table[type] = NULL;
  4046. }
  4047. static int kvm_ioctl_create_device(struct kvm *kvm,
  4048. struct kvm_create_device *cd)
  4049. {
  4050. const struct kvm_device_ops *ops;
  4051. struct kvm_device *dev;
  4052. bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
  4053. int type;
  4054. int ret;
  4055. if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
  4056. return -ENODEV;
  4057. type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
  4058. ops = kvm_device_ops_table[type];
  4059. if (ops == NULL)
  4060. return -ENODEV;
  4061. if (test)
  4062. return 0;
  4063. dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
  4064. if (!dev)
  4065. return -ENOMEM;
  4066. dev->ops = ops;
  4067. dev->kvm = kvm;
  4068. mutex_lock(&kvm->lock);
  4069. ret = ops->create(dev, type);
  4070. if (ret < 0) {
  4071. mutex_unlock(&kvm->lock);
  4072. kfree(dev);
  4073. return ret;
  4074. }
  4075. list_add_rcu(&dev->vm_node, &kvm->devices);
  4076. mutex_unlock(&kvm->lock);
  4077. if (ops->init)
  4078. ops->init(dev);
  4079. kvm_get_kvm(kvm);
  4080. ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
  4081. if (ret < 0) {
  4082. kvm_put_kvm_no_destroy(kvm);
  4083. mutex_lock(&kvm->lock);
  4084. list_del_rcu(&dev->vm_node);
  4085. synchronize_rcu();
  4086. if (ops->release)
  4087. ops->release(dev);
  4088. mutex_unlock(&kvm->lock);
  4089. if (ops->destroy)
  4090. ops->destroy(dev);
  4091. return ret;
  4092. }
  4093. cd->fd = ret;
  4094. return 0;
  4095. }
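/*
 * Minimal userspace sketch (not from this file): KVM_CREATE_DEVICE with
 * KVM_CREATE_DEVICE_TEST asks whether a device type is known without
 * instantiating it (the "test" early return above), then creates it for
 * real.  KVM_DEV_TYPE_VFIO is just one example of a registered type.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int create_vfio_device(int vm_fd)
{
	struct kvm_create_device cd = {
		.type  = KVM_DEV_TYPE_VFIO,
		.flags = KVM_CREATE_DEVICE_TEST,
	};

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd))
		return -1;

	/* Really create it; on success cd.fd holds the new device fd. */
	cd.flags = 0;
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd))
		return -1;

	return cd.fd;
}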
  4096. static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
  4097. {
  4098. switch (arg) {
  4099. case KVM_CAP_USER_MEMORY:
  4100. case KVM_CAP_USER_MEMORY2:
  4101. case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
  4102. case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
  4103. case KVM_CAP_INTERNAL_ERROR_DATA:
  4104. #ifdef CONFIG_HAVE_KVM_MSI
  4105. case KVM_CAP_SIGNAL_MSI:
  4106. #endif
  4107. #ifdef CONFIG_HAVE_KVM_IRQCHIP
  4108. case KVM_CAP_IRQFD:
  4109. #endif
  4110. case KVM_CAP_IOEVENTFD_ANY_LENGTH:
  4111. case KVM_CAP_CHECK_EXTENSION_VM:
  4112. case KVM_CAP_ENABLE_CAP_VM:
  4113. case KVM_CAP_HALT_POLL:
  4114. return 1;
  4115. #ifdef CONFIG_KVM_MMIO
  4116. case KVM_CAP_COALESCED_MMIO:
  4117. return KVM_COALESCED_MMIO_PAGE_OFFSET;
  4118. case KVM_CAP_COALESCED_PIO:
  4119. return 1;
  4120. #endif
  4121. #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  4122. case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
  4123. return KVM_DIRTY_LOG_MANUAL_CAPS;
  4124. #endif
  4125. #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
  4126. case KVM_CAP_IRQ_ROUTING:
  4127. return KVM_MAX_IRQ_ROUTES;
  4128. #endif
  4129. #if KVM_MAX_NR_ADDRESS_SPACES > 1
  4130. case KVM_CAP_MULTI_ADDRESS_SPACE:
  4131. if (kvm)
  4132. return kvm_arch_nr_memslot_as_ids(kvm);
  4133. return KVM_MAX_NR_ADDRESS_SPACES;
  4134. #endif
  4135. case KVM_CAP_NR_MEMSLOTS:
  4136. return KVM_USER_MEM_SLOTS;
  4137. case KVM_CAP_DIRTY_LOG_RING:
  4138. #ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
  4139. return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
  4140. #else
  4141. return 0;
  4142. #endif
  4143. case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
  4144. #ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
  4145. return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
  4146. #else
  4147. return 0;
  4148. #endif
  4149. #ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
  4150. case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
  4151. #endif
  4152. case KVM_CAP_BINARY_STATS_FD:
  4153. case KVM_CAP_SYSTEM_EVENT_DATA:
  4154. case KVM_CAP_DEVICE_CTRL:
  4155. return 1;
  4156. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  4157. case KVM_CAP_MEMORY_ATTRIBUTES:
  4158. return kvm_supported_mem_attributes(kvm);
  4159. #endif
  4160. #ifdef CONFIG_KVM_PRIVATE_MEM
  4161. case KVM_CAP_GUEST_MEMFD:
  4162. return !kvm || kvm_arch_has_private_mem(kvm);
  4163. #endif
  4164. default:
  4165. break;
  4166. }
  4167. return kvm_vm_ioctl_check_extension(kvm, arg);
  4168. }
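/*
 * Minimal userspace sketch (not from this file): KVM_CHECK_EXTENSION is
 * answered by the generic helper above.  It works on the /dev/kvm fd and,
 * since KVM_CAP_CHECK_EXTENSION_VM is advertised, on a VM fd as well, where
 * the answer can be VM-specific.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int has_cap(int fd, long cap)
{
	/* 0 means unsupported; a positive value means supported. */
	return ioctl(fd, KVM_CHECK_EXTENSION, cap);
}
/* e.g. has_cap(kvm_fd, KVM_CAP_USER_MEMORY2), has_cap(vm_fd, KVM_CAP_NR_MEMSLOTS) */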
  4169. static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
  4170. {
  4171. int r;
  4172. if (!KVM_DIRTY_LOG_PAGE_OFFSET)
  4173. return -EINVAL;
4174. /* The size must be a power of two. */
  4175. if (!size || (size & (size - 1)))
  4176. return -EINVAL;
4177. /* Must be at least one page and large enough to hold the reserved entries. */
  4178. if (size < kvm_dirty_ring_get_rsvd_entries() *
  4179. sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
  4180. return -EINVAL;
  4181. if (size > KVM_DIRTY_RING_MAX_ENTRIES *
  4182. sizeof(struct kvm_dirty_gfn))
  4183. return -E2BIG;
4184. /* The ring size can only be set once. */
  4185. if (kvm->dirty_ring_size)
  4186. return -EINVAL;
  4187. mutex_lock(&kvm->lock);
  4188. if (kvm->created_vcpus) {
4189. /* The size cannot be changed after vCPUs have been created. */
  4190. r = -EINVAL;
  4191. } else {
  4192. kvm->dirty_ring_size = size;
  4193. r = 0;
  4194. }
  4195. mutex_unlock(&kvm->lock);
  4196. return r;
  4197. }
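/*
 * Minimal userspace sketch (not from this file): enabling the dirty ring per
 * the checks above.  The byte size must be a power of two, at least one page,
 * large enough for the reserved entries, and it must be set before any vCPU
 * is created.  65536 is only an example value, and the capability may be
 * KVM_CAP_DIRTY_LOG_RING or KVM_CAP_DIRTY_LOG_RING_ACQ_REL depending on what
 * the kernel advertises.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_dirty_ring(int vm_fd, __u32 bytes /* e.g. 65536 */)
{
	struct kvm_enable_cap cap = {
		.cap     = KVM_CAP_DIRTY_LOG_RING_ACQ_REL,
		.args[0] = bytes,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}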
  4198. static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
  4199. {
  4200. unsigned long i;
  4201. struct kvm_vcpu *vcpu;
  4202. int cleared = 0;
  4203. if (!kvm->dirty_ring_size)
  4204. return -EINVAL;
  4205. mutex_lock(&kvm->slots_lock);
  4206. kvm_for_each_vcpu(i, vcpu, kvm)
  4207. cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
  4208. mutex_unlock(&kvm->slots_lock);
  4209. if (cleared)
  4210. kvm_flush_remote_tlbs(kvm);
  4211. return cleared;
  4212. }
  4213. int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
  4214. struct kvm_enable_cap *cap)
  4215. {
  4216. return -EINVAL;
  4217. }
  4218. bool kvm_are_all_memslots_empty(struct kvm *kvm)
  4219. {
  4220. int i;
  4221. lockdep_assert_held(&kvm->slots_lock);
  4222. for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
  4223. if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
  4224. return false;
  4225. }
  4226. return true;
  4227. }
  4228. EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
  4229. static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
  4230. struct kvm_enable_cap *cap)
  4231. {
  4232. switch (cap->cap) {
  4233. #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  4234. case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
  4235. u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
  4236. if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
  4237. allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
  4238. if (cap->flags || (cap->args[0] & ~allowed_options))
  4239. return -EINVAL;
  4240. kvm->manual_dirty_log_protect = cap->args[0];
  4241. return 0;
  4242. }
  4243. #endif
  4244. case KVM_CAP_HALT_POLL: {
  4245. if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
  4246. return -EINVAL;
  4247. kvm->max_halt_poll_ns = cap->args[0];
  4248. /*
  4249. * Ensure kvm->override_halt_poll_ns does not become visible
  4250. * before kvm->max_halt_poll_ns.
  4251. *
  4252. * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
  4253. */
  4254. smp_wmb();
  4255. kvm->override_halt_poll_ns = true;
  4256. return 0;
  4257. }
  4258. case KVM_CAP_DIRTY_LOG_RING:
  4259. case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
  4260. if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
  4261. return -EINVAL;
  4262. return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
  4263. case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
  4264. int r = -EINVAL;
  4265. if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
  4266. !kvm->dirty_ring_size || cap->flags)
  4267. return r;
  4268. mutex_lock(&kvm->slots_lock);
  4269. /*
  4270. * For simplicity, allow enabling ring+bitmap if and only if
  4271. * there are no memslots, e.g. to ensure all memslots allocate
  4272. * a bitmap after the capability is enabled.
  4273. */
  4274. if (kvm_are_all_memslots_empty(kvm)) {
  4275. kvm->dirty_ring_with_bitmap = true;
  4276. r = 0;
  4277. }
  4278. mutex_unlock(&kvm->slots_lock);
  4279. return r;
  4280. }
  4281. default:
  4282. return kvm_vm_ioctl_enable_cap(kvm, cap);
  4283. }
  4284. }
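/*
 * Minimal userspace sketch (not from this file): KVM_CAP_HALT_POLL takes the
 * per-VM maximum halt-polling time in nanoseconds in args[0]; flags must be
 * zero and the value must fit in an unsigned int, exactly as checked above.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_max_halt_poll_ns(int vm_fd, unsigned int ns)
{
	struct kvm_enable_cap cap = {
		.cap     = KVM_CAP_HALT_POLL,
		.args[0] = ns,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}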
  4285. static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
  4286. size_t size, loff_t *offset)
  4287. {
  4288. struct kvm *kvm = file->private_data;
  4289. return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
  4290. &kvm_vm_stats_desc[0], &kvm->stat,
  4291. sizeof(kvm->stat), user_buffer, size, offset);
  4292. }
  4293. static int kvm_vm_stats_release(struct inode *inode, struct file *file)
  4294. {
  4295. struct kvm *kvm = file->private_data;
  4296. kvm_put_kvm(kvm);
  4297. return 0;
  4298. }
  4299. static const struct file_operations kvm_vm_stats_fops = {
  4300. .owner = THIS_MODULE,
  4301. .read = kvm_vm_stats_read,
  4302. .release = kvm_vm_stats_release,
  4303. .llseek = noop_llseek,
  4304. };
  4305. static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
  4306. {
  4307. int fd;
  4308. struct file *file;
  4309. fd = get_unused_fd_flags(O_CLOEXEC);
  4310. if (fd < 0)
  4311. return fd;
  4312. file = anon_inode_getfile("kvm-vm-stats",
  4313. &kvm_vm_stats_fops, kvm, O_RDONLY);
  4314. if (IS_ERR(file)) {
  4315. put_unused_fd(fd);
  4316. return PTR_ERR(file);
  4317. }
  4318. kvm_get_kvm(kvm);
  4319. file->f_mode |= FMODE_PREAD;
  4320. fd_install(fd, file);
  4321. return fd;
  4322. }
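/*
 * Minimal userspace sketch (not from this file): the fd returned by
 * KVM_GET_STATS_FD supports pread() (FMODE_PREAD is set above) and begins
 * with a struct kvm_stats_header that locates the descriptors and the data
 * values; dump_vm_stats_header is a made-up helper name.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

static int dump_vm_stats_header(int vm_fd)
{
	struct kvm_stats_header hdr;
	int stats_fd = ioctl(vm_fd, KVM_GET_STATS_FD, NULL);

	if (stats_fd < 0)
		return -1;

	if (pread(stats_fd, &hdr, sizeof(hdr), 0) != sizeof(hdr)) {
		close(stats_fd);
		return -1;
	}

	printf("%u stat descriptors, data at offset %u\n",
	       hdr.num_desc, hdr.data_offset);
	close(stats_fd);
	return 0;
}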
  4323. #define SANITY_CHECK_MEM_REGION_FIELD(field) \
  4324. do { \
  4325. BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \
  4326. offsetof(struct kvm_userspace_memory_region2, field)); \
  4327. BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \
  4328. sizeof_field(struct kvm_userspace_memory_region2, field)); \
  4329. } while (0)
  4330. static long kvm_vm_ioctl(struct file *filp,
  4331. unsigned int ioctl, unsigned long arg)
  4332. {
  4333. struct kvm *kvm = filp->private_data;
  4334. void __user *argp = (void __user *)arg;
  4335. int r;
  4336. if (kvm->mm != current->mm || kvm->vm_dead)
  4337. return -EIO;
  4338. switch (ioctl) {
  4339. case KVM_CREATE_VCPU:
  4340. r = kvm_vm_ioctl_create_vcpu(kvm, arg);
  4341. break;
  4342. case KVM_ENABLE_CAP: {
  4343. struct kvm_enable_cap cap;
  4344. r = -EFAULT;
  4345. if (copy_from_user(&cap, argp, sizeof(cap)))
  4346. goto out;
  4347. r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
  4348. break;
  4349. }
  4350. case KVM_SET_USER_MEMORY_REGION2:
  4351. case KVM_SET_USER_MEMORY_REGION: {
  4352. struct kvm_userspace_memory_region2 mem;
  4353. unsigned long size;
  4354. if (ioctl == KVM_SET_USER_MEMORY_REGION) {
  4355. /*
  4356. * Fields beyond struct kvm_userspace_memory_region shouldn't be
  4357. * accessed, but avoid leaking kernel memory in case of a bug.
  4358. */
  4359. memset(&mem, 0, sizeof(mem));
  4360. size = sizeof(struct kvm_userspace_memory_region);
  4361. } else {
  4362. size = sizeof(struct kvm_userspace_memory_region2);
  4363. }
  4364. /* Ensure the common parts of the two structs are identical. */
  4365. SANITY_CHECK_MEM_REGION_FIELD(slot);
  4366. SANITY_CHECK_MEM_REGION_FIELD(flags);
  4367. SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
  4368. SANITY_CHECK_MEM_REGION_FIELD(memory_size);
  4369. SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
  4370. r = -EFAULT;
  4371. if (copy_from_user(&mem, argp, size))
  4372. goto out;
  4373. r = -EINVAL;
  4374. if (ioctl == KVM_SET_USER_MEMORY_REGION &&
  4375. (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
  4376. goto out;
  4377. r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
  4378. break;
  4379. }
  4380. case KVM_GET_DIRTY_LOG: {
  4381. struct kvm_dirty_log log;
  4382. r = -EFAULT;
  4383. if (copy_from_user(&log, argp, sizeof(log)))
  4384. goto out;
  4385. r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
  4386. break;
  4387. }
  4388. #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  4389. case KVM_CLEAR_DIRTY_LOG: {
  4390. struct kvm_clear_dirty_log log;
  4391. r = -EFAULT;
  4392. if (copy_from_user(&log, argp, sizeof(log)))
  4393. goto out;
  4394. r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
  4395. break;
  4396. }
  4397. #endif
  4398. #ifdef CONFIG_KVM_MMIO
  4399. case KVM_REGISTER_COALESCED_MMIO: {
  4400. struct kvm_coalesced_mmio_zone zone;
  4401. r = -EFAULT;
  4402. if (copy_from_user(&zone, argp, sizeof(zone)))
  4403. goto out;
  4404. r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
  4405. break;
  4406. }
  4407. case KVM_UNREGISTER_COALESCED_MMIO: {
  4408. struct kvm_coalesced_mmio_zone zone;
  4409. r = -EFAULT;
  4410. if (copy_from_user(&zone, argp, sizeof(zone)))
  4411. goto out;
  4412. r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
  4413. break;
  4414. }
  4415. #endif
  4416. case KVM_IRQFD: {
  4417. struct kvm_irqfd data;
  4418. r = -EFAULT;
  4419. if (copy_from_user(&data, argp, sizeof(data)))
  4420. goto out;
  4421. r = kvm_irqfd(kvm, &data);
  4422. break;
  4423. }
  4424. case KVM_IOEVENTFD: {
  4425. struct kvm_ioeventfd data;
  4426. r = -EFAULT;
  4427. if (copy_from_user(&data, argp, sizeof(data)))
  4428. goto out;
  4429. r = kvm_ioeventfd(kvm, &data);
  4430. break;
  4431. }
  4432. #ifdef CONFIG_HAVE_KVM_MSI
  4433. case KVM_SIGNAL_MSI: {
  4434. struct kvm_msi msi;
  4435. r = -EFAULT;
  4436. if (copy_from_user(&msi, argp, sizeof(msi)))
  4437. goto out;
  4438. r = kvm_send_userspace_msi(kvm, &msi);
  4439. break;
  4440. }
  4441. #endif
  4442. #ifdef __KVM_HAVE_IRQ_LINE
  4443. case KVM_IRQ_LINE_STATUS:
  4444. case KVM_IRQ_LINE: {
  4445. struct kvm_irq_level irq_event;
  4446. r = -EFAULT;
  4447. if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
  4448. goto out;
  4449. r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
  4450. ioctl == KVM_IRQ_LINE_STATUS);
  4451. if (r)
  4452. goto out;
  4453. r = -EFAULT;
  4454. if (ioctl == KVM_IRQ_LINE_STATUS) {
  4455. if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
  4456. goto out;
  4457. }
  4458. r = 0;
  4459. break;
  4460. }
  4461. #endif
  4462. #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
  4463. case KVM_SET_GSI_ROUTING: {
  4464. struct kvm_irq_routing routing;
  4465. struct kvm_irq_routing __user *urouting;
  4466. struct kvm_irq_routing_entry *entries = NULL;
  4467. r = -EFAULT;
  4468. if (copy_from_user(&routing, argp, sizeof(routing)))
  4469. goto out;
  4470. r = -EINVAL;
  4471. if (!kvm_arch_can_set_irq_routing(kvm))
  4472. goto out;
  4473. if (routing.nr > KVM_MAX_IRQ_ROUTES)
  4474. goto out;
  4475. if (routing.flags)
  4476. goto out;
  4477. if (routing.nr) {
  4478. urouting = argp;
  4479. entries = vmemdup_array_user(urouting->entries,
  4480. routing.nr, sizeof(*entries));
  4481. if (IS_ERR(entries)) {
  4482. r = PTR_ERR(entries);
  4483. goto out;
  4484. }
  4485. }
  4486. r = kvm_set_irq_routing(kvm, entries, routing.nr,
  4487. routing.flags);
  4488. kvfree(entries);
  4489. break;
  4490. }
  4491. #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
  4492. #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
  4493. case KVM_SET_MEMORY_ATTRIBUTES: {
  4494. struct kvm_memory_attributes attrs;
  4495. r = -EFAULT;
  4496. if (copy_from_user(&attrs, argp, sizeof(attrs)))
  4497. goto out;
  4498. r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
  4499. break;
  4500. }
  4501. #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
  4502. case KVM_CREATE_DEVICE: {
  4503. struct kvm_create_device cd;
  4504. r = -EFAULT;
  4505. if (copy_from_user(&cd, argp, sizeof(cd)))
  4506. goto out;
  4507. r = kvm_ioctl_create_device(kvm, &cd);
  4508. if (r)
  4509. goto out;
  4510. r = -EFAULT;
  4511. if (copy_to_user(argp, &cd, sizeof(cd)))
  4512. goto out;
  4513. r = 0;
  4514. break;
  4515. }
  4516. case KVM_CHECK_EXTENSION:
  4517. r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
  4518. break;
  4519. case KVM_RESET_DIRTY_RINGS:
  4520. r = kvm_vm_ioctl_reset_dirty_pages(kvm);
  4521. break;
  4522. case KVM_GET_STATS_FD:
  4523. r = kvm_vm_ioctl_get_stats_fd(kvm);
  4524. break;
  4525. #ifdef CONFIG_KVM_PRIVATE_MEM
  4526. case KVM_CREATE_GUEST_MEMFD: {
  4527. struct kvm_create_guest_memfd guest_memfd;
  4528. r = -EFAULT;
  4529. if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
  4530. goto out;
  4531. r = kvm_gmem_create(kvm, &guest_memfd);
  4532. break;
  4533. }
  4534. #endif
  4535. default:
  4536. r = kvm_arch_vm_ioctl(filp, ioctl, arg);
  4537. }
  4538. out:
  4539. return r;
  4540. }
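/*
 * Minimal userspace sketch (not from this file): registering guest memory
 * through the KVM_SET_USER_MEMORY_REGION path above (the v2 ioctl shares the
 * same leading fields).  Anonymous host memory, slot 0 and the given GPA are
 * arbitrary example choices; add_memslot is a made-up helper name.
 */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static int add_memslot(int vm_fd, __u64 gpa, __u64 size)
{
	void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
			  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.guest_phys_addr = gpa,
		.memory_size     = size,
		.userspace_addr  = (__u64)(unsigned long)host,
	};

	if (host == MAP_FAILED)
		return -1;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}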
  4541. #ifdef CONFIG_KVM_COMPAT
  4542. struct compat_kvm_dirty_log {
  4543. __u32 slot;
  4544. __u32 padding1;
  4545. union {
  4546. compat_uptr_t dirty_bitmap; /* one bit per page */
  4547. __u64 padding2;
  4548. };
  4549. };
  4550. struct compat_kvm_clear_dirty_log {
  4551. __u32 slot;
  4552. __u32 num_pages;
  4553. __u64 first_page;
  4554. union {
  4555. compat_uptr_t dirty_bitmap; /* one bit per page */
  4556. __u64 padding2;
  4557. };
  4558. };
  4559. long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
  4560. unsigned long arg)
  4561. {
  4562. return -ENOTTY;
  4563. }
  4564. static long kvm_vm_compat_ioctl(struct file *filp,
  4565. unsigned int ioctl, unsigned long arg)
  4566. {
  4567. struct kvm *kvm = filp->private_data;
  4568. int r;
  4569. if (kvm->mm != current->mm || kvm->vm_dead)
  4570. return -EIO;
  4571. r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
  4572. if (r != -ENOTTY)
  4573. return r;
  4574. switch (ioctl) {
  4575. #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
  4576. case KVM_CLEAR_DIRTY_LOG: {
  4577. struct compat_kvm_clear_dirty_log compat_log;
  4578. struct kvm_clear_dirty_log log;
  4579. if (copy_from_user(&compat_log, (void __user *)arg,
  4580. sizeof(compat_log)))
  4581. return -EFAULT;
  4582. log.slot = compat_log.slot;
  4583. log.num_pages = compat_log.num_pages;
  4584. log.first_page = compat_log.first_page;
  4585. log.padding2 = compat_log.padding2;
  4586. log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
  4587. r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
  4588. break;
  4589. }
  4590. #endif
  4591. case KVM_GET_DIRTY_LOG: {
  4592. struct compat_kvm_dirty_log compat_log;
  4593. struct kvm_dirty_log log;
  4594. if (copy_from_user(&compat_log, (void __user *)arg,
  4595. sizeof(compat_log)))
  4596. return -EFAULT;
  4597. log.slot = compat_log.slot;
  4598. log.padding1 = compat_log.padding1;
  4599. log.padding2 = compat_log.padding2;
  4600. log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
  4601. r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
  4602. break;
  4603. }
  4604. default:
  4605. r = kvm_vm_ioctl(filp, ioctl, arg);
  4606. }
  4607. return r;
  4608. }
  4609. #endif
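/*
 * Minimal userspace sketch (not from this file): KVM_GET_DIRTY_LOG as
 * translated by the compat path above.  The caller supplies the bitmap (one
 * bit per page, rounded up to 64-bit words) and the memslot is assumed to
 * have been created with KVM_MEM_LOG_DIRTY_PAGES; fetch_dirty_log is a
 * made-up helper name.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int fetch_dirty_log(int vm_fd, __u32 slot, __u64 npages)
{
	struct kvm_dirty_log log = { .slot = slot };
	size_t bitmap_bytes = ((npages + 63) / 64) * 8;
	int r;

	log.dirty_bitmap = calloc(1, bitmap_bytes);
	if (!log.dirty_bitmap)
		return -1;

	r = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	free(log.dirty_bitmap);
	return r;
}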
  4610. static struct file_operations kvm_vm_fops = {
  4611. .release = kvm_vm_release,
  4612. .unlocked_ioctl = kvm_vm_ioctl,
  4613. .llseek = noop_llseek,
  4614. KVM_COMPAT(kvm_vm_compat_ioctl),
  4615. };
  4616. bool file_is_kvm(struct file *file)
  4617. {
  4618. return file && file->f_op == &kvm_vm_fops;
  4619. }
  4620. EXPORT_SYMBOL_GPL(file_is_kvm);
  4621. static int kvm_dev_ioctl_create_vm(unsigned long type)
  4622. {
  4623. char fdname[ITOA_MAX_LEN + 1];
  4624. int r, fd;
  4625. struct kvm *kvm;
  4626. struct file *file;
  4627. fd = get_unused_fd_flags(O_CLOEXEC);
  4628. if (fd < 0)
  4629. return fd;
  4630. snprintf(fdname, sizeof(fdname), "%d", fd);
  4631. kvm = kvm_create_vm(type, fdname);
  4632. if (IS_ERR(kvm)) {
  4633. r = PTR_ERR(kvm);
  4634. goto put_fd;
  4635. }
  4636. file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
  4637. if (IS_ERR(file)) {
  4638. r = PTR_ERR(file);
  4639. goto put_kvm;
  4640. }
  4641. /*
  4642. * Don't call kvm_put_kvm anymore at this point; file->f_op is
  4643. * already set, with ->release() being kvm_vm_release(). In error
  4644. * cases it will be called by the final fput(file) and will take
  4645. * care of doing kvm_put_kvm(kvm).
  4646. */
  4647. kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
  4648. fd_install(fd, file);
  4649. return fd;
  4650. put_kvm:
  4651. kvm_put_kvm(kvm);
  4652. put_fd:
  4653. put_unused_fd(fd);
  4654. return r;
  4655. }
  4656. static long kvm_dev_ioctl(struct file *filp,
  4657. unsigned int ioctl, unsigned long arg)
  4658. {
  4659. int r = -EINVAL;
  4660. switch (ioctl) {
  4661. case KVM_GET_API_VERSION:
  4662. if (arg)
  4663. goto out;
  4664. r = KVM_API_VERSION;
  4665. break;
  4666. case KVM_CREATE_VM:
  4667. r = kvm_dev_ioctl_create_vm(arg);
  4668. break;
  4669. case KVM_CHECK_EXTENSION:
  4670. r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
  4671. break;
  4672. case KVM_GET_VCPU_MMAP_SIZE:
  4673. if (arg)
  4674. goto out;
  4675. r = PAGE_SIZE; /* struct kvm_run */
  4676. #ifdef CONFIG_X86
  4677. r += PAGE_SIZE; /* pio data page */
  4678. #endif
  4679. #ifdef CONFIG_KVM_MMIO
  4680. r += PAGE_SIZE; /* coalesced mmio ring page */
  4681. #endif
  4682. break;
  4683. default:
  4684. return kvm_arch_dev_ioctl(filp, ioctl, arg);
  4685. }
  4686. out:
  4687. return r;
  4688. }
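/*
 * Minimal userspace sketch (not from this file): the /dev/kvm ioctls handled
 * above.  KVM_GET_API_VERSION should be checked against KVM_API_VERSION (12)
 * before anything else; KVM_CREATE_VM type 0 is the default VM type.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int open_vm(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (kvm_fd < 0)
		return -1;

	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;

	return ioctl(kvm_fd, KVM_CREATE_VM, 0);
}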
  4689. static struct file_operations kvm_chardev_ops = {
  4690. .unlocked_ioctl = kvm_dev_ioctl,
  4691. .llseek = noop_llseek,
  4692. KVM_COMPAT(kvm_dev_ioctl),
  4693. };
  4694. static struct miscdevice kvm_dev = {
  4695. KVM_MINOR,
  4696. "kvm",
  4697. &kvm_chardev_ops,
  4698. };
  4699. #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
  4700. static bool enable_virt_at_load = true;
  4701. module_param(enable_virt_at_load, bool, 0444);
  4702. __visible bool kvm_rebooting;
  4703. EXPORT_SYMBOL_GPL(kvm_rebooting);
  4704. static DEFINE_PER_CPU(bool, virtualization_enabled);
  4705. static DEFINE_MUTEX(kvm_usage_lock);
  4706. static int kvm_usage_count;
  4707. __weak void kvm_arch_enable_virtualization(void)
  4708. {
  4709. }
  4710. __weak void kvm_arch_disable_virtualization(void)
  4711. {
  4712. }
  4713. static int kvm_enable_virtualization_cpu(void)
  4714. {
  4715. if (__this_cpu_read(virtualization_enabled))
  4716. return 0;
  4717. if (kvm_arch_enable_virtualization_cpu()) {
  4718. pr_info("kvm: enabling virtualization on CPU%d failed\n",
  4719. raw_smp_processor_id());
  4720. return -EIO;
  4721. }
  4722. __this_cpu_write(virtualization_enabled, true);
  4723. return 0;
  4724. }
  4725. static int kvm_online_cpu(unsigned int cpu)
  4726. {
  4727. /*
  4728. * Abort the CPU online process if hardware virtualization cannot
  4729. * be enabled. Otherwise running VMs would encounter unrecoverable
  4730. * errors when scheduled to this CPU.
  4731. */
  4732. return kvm_enable_virtualization_cpu();
  4733. }
  4734. static void kvm_disable_virtualization_cpu(void *ign)
  4735. {
  4736. if (!__this_cpu_read(virtualization_enabled))
  4737. return;
  4738. kvm_arch_disable_virtualization_cpu();
  4739. __this_cpu_write(virtualization_enabled, false);
  4740. }
  4741. static int kvm_offline_cpu(unsigned int cpu)
  4742. {
  4743. kvm_disable_virtualization_cpu(NULL);
  4744. return 0;
  4745. }
  4746. static void kvm_shutdown(void)
  4747. {
  4748. /*
  4749. * Disable hardware virtualization and set kvm_rebooting to indicate
  4750. * that KVM has asynchronously disabled hardware virtualization, i.e.
  4751. * that relevant errors and exceptions aren't entirely unexpected.
  4752. * Some flavors of hardware virtualization need to be disabled before
  4753. * transferring control to firmware (to perform shutdown/reboot), e.g.
  4754. * on x86, virtualization can block INIT interrupts, which are used by
  4755. * firmware to pull APs back under firmware control. Note, this path
  4756. * is used for both shutdown and reboot scenarios, i.e. neither name is
  4757. * 100% comprehensive.
  4758. */
  4759. pr_info("kvm: exiting hardware virtualization\n");
  4760. kvm_rebooting = true;
  4761. on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
  4762. }
  4763. static int kvm_suspend(void)
  4764. {
  4765. /*
  4766. * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
  4767. * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
  4768. * count is stable. Assert that kvm_usage_lock is not held to ensure
  4769. * the system isn't suspended while KVM is enabling hardware. Hardware
  4770. * enabling can be preempted, but the task cannot be frozen until it has
  4771. * dropped all locks (userspace tasks are frozen via a fake signal).
  4772. */
  4773. lockdep_assert_not_held(&kvm_usage_lock);
  4774. lockdep_assert_irqs_disabled();
  4775. kvm_disable_virtualization_cpu(NULL);
  4776. return 0;
  4777. }
  4778. static void kvm_resume(void)
  4779. {
  4780. lockdep_assert_not_held(&kvm_usage_lock);
  4781. lockdep_assert_irqs_disabled();
  4782. WARN_ON_ONCE(kvm_enable_virtualization_cpu());
  4783. }
  4784. static struct syscore_ops kvm_syscore_ops = {
  4785. .suspend = kvm_suspend,
  4786. .resume = kvm_resume,
  4787. .shutdown = kvm_shutdown,
  4788. };
  4789. static int kvm_enable_virtualization(void)
  4790. {
  4791. int r;
  4792. guard(mutex)(&kvm_usage_lock);
  4793. if (kvm_usage_count++)
  4794. return 0;
  4795. kvm_arch_enable_virtualization();
  4796. r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
  4797. kvm_online_cpu, kvm_offline_cpu);
  4798. if (r)
  4799. goto err_cpuhp;
  4800. register_syscore_ops(&kvm_syscore_ops);
  4801. /*
  4802. * Undo virtualization enabling and bail if the system is going down.
  4803. * If userspace initiated a forced reboot, e.g. reboot -f, then it's
  4804. * possible for an in-flight operation to enable virtualization after
  4805. * syscore_shutdown() is called, i.e. without kvm_shutdown() being
  4806. * invoked. Note, this relies on system_state being set _before_
  4807. * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
  4808. * or this CPU observes the impending shutdown. Which is why KVM uses
  4809. * a syscore ops hook instead of registering a dedicated reboot
  4810. * notifier (the latter runs before system_state is updated).
  4811. */
  4812. if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
  4813. system_state == SYSTEM_RESTART) {
  4814. r = -EBUSY;
  4815. goto err_rebooting;
  4816. }
  4817. return 0;
  4818. err_rebooting:
  4819. unregister_syscore_ops(&kvm_syscore_ops);
  4820. cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
  4821. err_cpuhp:
  4822. kvm_arch_disable_virtualization();
  4823. --kvm_usage_count;
  4824. return r;
  4825. }
  4826. static void kvm_disable_virtualization(void)
  4827. {
  4828. guard(mutex)(&kvm_usage_lock);
  4829. if (--kvm_usage_count)
  4830. return;
  4831. unregister_syscore_ops(&kvm_syscore_ops);
  4832. cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
  4833. kvm_arch_disable_virtualization();
  4834. }
  4835. static int kvm_init_virtualization(void)
  4836. {
  4837. if (enable_virt_at_load)
  4838. return kvm_enable_virtualization();
  4839. return 0;
  4840. }
  4841. static void kvm_uninit_virtualization(void)
  4842. {
  4843. if (enable_virt_at_load)
  4844. kvm_disable_virtualization();
  4845. }
  4846. #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
  4847. static int kvm_enable_virtualization(void)
  4848. {
  4849. return 0;
  4850. }
  4851. static int kvm_init_virtualization(void)
  4852. {
  4853. return 0;
  4854. }
  4855. static void kvm_disable_virtualization(void)
  4856. {
  4857. }
  4858. static void kvm_uninit_virtualization(void)
  4859. {
  4860. }
  4861. #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
  4862. static void kvm_iodevice_destructor(struct kvm_io_device *dev)
  4863. {
  4864. if (dev->ops->destructor)
  4865. dev->ops->destructor(dev);
  4866. }
  4867. static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
  4868. {
  4869. int i;
  4870. for (i = 0; i < bus->dev_count; i++) {
  4871. struct kvm_io_device *pos = bus->range[i].dev;
  4872. kvm_iodevice_destructor(pos);
  4873. }
  4874. kfree(bus);
  4875. }
  4876. static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
  4877. const struct kvm_io_range *r2)
  4878. {
  4879. gpa_t addr1 = r1->addr;
  4880. gpa_t addr2 = r2->addr;
  4881. if (addr1 < addr2)
  4882. return -1;
  4883. /* If r2->len == 0, match the exact address. If r2->len != 0,
  4884. * accept any overlapping write. Any order is acceptable for
  4885. * overlapping ranges, because kvm_io_bus_get_first_dev ensures
  4886. * we process all of them.
  4887. */
  4888. if (r2->len) {
  4889. addr1 += r1->len;
  4890. addr2 += r2->len;
  4891. }
  4892. if (addr1 > addr2)
  4893. return 1;
  4894. return 0;
  4895. }
  4896. static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
  4897. {
  4898. return kvm_io_bus_cmp(p1, p2);
  4899. }
  4900. static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
  4901. gpa_t addr, int len)
  4902. {
  4903. struct kvm_io_range *range, key;
  4904. int off;
  4905. key = (struct kvm_io_range) {
  4906. .addr = addr,
  4907. .len = len,
  4908. };
  4909. range = bsearch(&key, bus->range, bus->dev_count,
  4910. sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
  4911. if (range == NULL)
  4912. return -ENOENT;
  4913. off = range - bus->range;
  4914. while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
  4915. off--;
  4916. return off;
  4917. }
  4918. static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
  4919. struct kvm_io_range *range, const void *val)
  4920. {
  4921. int idx;
  4922. idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
  4923. if (idx < 0)
  4924. return -EOPNOTSUPP;
  4925. while (idx < bus->dev_count &&
  4926. kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
  4927. if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
  4928. range->len, val))
  4929. return idx;
  4930. idx++;
  4931. }
  4932. return -EOPNOTSUPP;
  4933. }
  4934. /* kvm_io_bus_write - called under kvm->slots_lock */
  4935. int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
  4936. int len, const void *val)
  4937. {
  4938. struct kvm_io_bus *bus;
  4939. struct kvm_io_range range;
  4940. int r;
  4941. range = (struct kvm_io_range) {
  4942. .addr = addr,
  4943. .len = len,
  4944. };
  4945. bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
  4946. if (!bus)
  4947. return -ENOMEM;
  4948. r = __kvm_io_bus_write(vcpu, bus, &range, val);
  4949. return r < 0 ? r : 0;
  4950. }
  4951. EXPORT_SYMBOL_GPL(kvm_io_bus_write);
  4952. /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
  4953. int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
  4954. gpa_t addr, int len, const void *val, long cookie)
  4955. {
  4956. struct kvm_io_bus *bus;
  4957. struct kvm_io_range range;
  4958. range = (struct kvm_io_range) {
  4959. .addr = addr,
  4960. .len = len,
  4961. };
  4962. bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
  4963. if (!bus)
  4964. return -ENOMEM;
  4965. /* First try the device referenced by cookie. */
  4966. if ((cookie >= 0) && (cookie < bus->dev_count) &&
  4967. (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
  4968. if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
  4969. val))
  4970. return cookie;
  4971. /*
  4972. * cookie contained garbage; fall back to search and return the
  4973. * correct cookie value.
  4974. */
  4975. return __kvm_io_bus_write(vcpu, bus, &range, val);
  4976. }
  4977. static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
  4978. struct kvm_io_range *range, void *val)
  4979. {
  4980. int idx;
  4981. idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
  4982. if (idx < 0)
  4983. return -EOPNOTSUPP;
  4984. while (idx < bus->dev_count &&
  4985. kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
  4986. if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
  4987. range->len, val))
  4988. return idx;
  4989. idx++;
  4990. }
  4991. return -EOPNOTSUPP;
  4992. }
  4993. /* kvm_io_bus_read - called under kvm->slots_lock */
  4994. int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
  4995. int len, void *val)
  4996. {
  4997. struct kvm_io_bus *bus;
  4998. struct kvm_io_range range;
  4999. int r;
  5000. range = (struct kvm_io_range) {
  5001. .addr = addr,
  5002. .len = len,
  5003. };
  5004. bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
  5005. if (!bus)
  5006. return -ENOMEM;
  5007. r = __kvm_io_bus_read(vcpu, bus, &range, val);
  5008. return r < 0 ? r : 0;
  5009. }
  5010. int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
  5011. int len, struct kvm_io_device *dev)
  5012. {
  5013. int i;
  5014. struct kvm_io_bus *new_bus, *bus;
  5015. struct kvm_io_range range;
  5016. lockdep_assert_held(&kvm->slots_lock);
  5017. bus = kvm_get_bus(kvm, bus_idx);
  5018. if (!bus)
  5019. return -ENOMEM;
  5020. /* exclude ioeventfd which is limited by maximum fd */
  5021. if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
  5022. return -ENOSPC;
  5023. new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
  5024. GFP_KERNEL_ACCOUNT);
  5025. if (!new_bus)
  5026. return -ENOMEM;
  5027. range = (struct kvm_io_range) {
  5028. .addr = addr,
  5029. .len = len,
  5030. .dev = dev,
  5031. };
  5032. for (i = 0; i < bus->dev_count; i++)
  5033. if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
  5034. break;
  5035. memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
  5036. new_bus->dev_count++;
  5037. new_bus->range[i] = range;
  5038. memcpy(new_bus->range + i + 1, bus->range + i,
  5039. (bus->dev_count - i) * sizeof(struct kvm_io_range));
  5040. rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
  5041. synchronize_srcu_expedited(&kvm->srcu);
  5042. kfree(bus);
  5043. return 0;
  5044. }
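/*
 * Minimal in-kernel sketch (not from this file): how a write-only device
 * would hook onto the MMIO bus with the registration helper above, in the
 * same way ioeventfd and coalesced MMIO do.  my_dev_* and the 0x10000/4
 * range are made-up names and values; a real device would allocate per-VM
 * state and keep it alive until it is unregistered.
 */
static int my_dev_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
			gpa_t addr, int len, const void *val)
{
	/* Returning 0 means "handled"; non-zero lets other devices try. */
	return 0;
}

static const struct kvm_io_device_ops my_dev_ops = {
	.write = my_dev_write,
};

static struct kvm_io_device my_dev;

static int my_dev_register(struct kvm *kvm)
{
	int r;

	kvm_iodevice_init(&my_dev, &my_dev_ops);
	mutex_lock(&kvm->slots_lock);
	r = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, 0x10000, 4, &my_dev);
	mutex_unlock(&kvm->slots_lock);
	return r;
}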
  5045. int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
  5046. struct kvm_io_device *dev)
  5047. {
  5048. int i;
  5049. struct kvm_io_bus *new_bus, *bus;
  5050. lockdep_assert_held(&kvm->slots_lock);
  5051. bus = kvm_get_bus(kvm, bus_idx);
  5052. if (!bus)
  5053. return 0;
  5054. for (i = 0; i < bus->dev_count; i++) {
  5055. if (bus->range[i].dev == dev) {
  5056. break;
  5057. }
  5058. }
  5059. if (i == bus->dev_count)
  5060. return 0;
  5061. new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
  5062. GFP_KERNEL_ACCOUNT);
  5063. if (new_bus) {
  5064. memcpy(new_bus, bus, struct_size(bus, range, i));
  5065. new_bus->dev_count--;
  5066. memcpy(new_bus->range + i, bus->range + i + 1,
  5067. flex_array_size(new_bus, range, new_bus->dev_count - i));
  5068. }
  5069. rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
  5070. synchronize_srcu_expedited(&kvm->srcu);
  5071. /*
  5072. * If NULL bus is installed, destroy the old bus, including all the
  5073. * attached devices. Otherwise, destroy the caller's device only.
  5074. */
  5075. if (!new_bus) {
  5076. pr_err("kvm: failed to shrink bus, removing it completely\n");
  5077. kvm_io_bus_destroy(bus);
  5078. return -ENOMEM;
  5079. }
  5080. kvm_iodevice_destructor(dev);
  5081. kfree(bus);
  5082. return 0;
  5083. }
  5084. struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
  5085. gpa_t addr)
  5086. {
  5087. struct kvm_io_bus *bus;
  5088. int dev_idx, srcu_idx;
  5089. struct kvm_io_device *iodev = NULL;
  5090. srcu_idx = srcu_read_lock(&kvm->srcu);
  5091. bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
  5092. if (!bus)
  5093. goto out_unlock;
  5094. dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
  5095. if (dev_idx < 0)
  5096. goto out_unlock;
  5097. iodev = bus->range[dev_idx].dev;
  5098. out_unlock:
  5099. srcu_read_unlock(&kvm->srcu, srcu_idx);
  5100. return iodev;
  5101. }
  5102. EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
  5103. static int kvm_debugfs_open(struct inode *inode, struct file *file,
  5104. int (*get)(void *, u64 *), int (*set)(void *, u64),
  5105. const char *fmt)
  5106. {
  5107. int ret;
  5108. struct kvm_stat_data *stat_data = inode->i_private;
  5109. /*
5110. * The debugfs files refer to the kvm struct, which is still valid
5111. * when kvm_destroy_vm is called. kvm_get_kvm_safe() avoids the race
5112. * between open and the removal of the debugfs directory.
  5113. */
  5114. if (!kvm_get_kvm_safe(stat_data->kvm))
  5115. return -ENOENT;
  5116. ret = simple_attr_open(inode, file, get,
  5117. kvm_stats_debugfs_mode(stat_data->desc) & 0222
  5118. ? set : NULL, fmt);
  5119. if (ret)
  5120. kvm_put_kvm(stat_data->kvm);
  5121. return ret;
  5122. }
  5123. static int kvm_debugfs_release(struct inode *inode, struct file *file)
  5124. {
  5125. struct kvm_stat_data *stat_data = inode->i_private;
  5126. simple_attr_release(inode, file);
  5127. kvm_put_kvm(stat_data->kvm);
  5128. return 0;
  5129. }
  5130. static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
  5131. {
  5132. *val = *(u64 *)((void *)(&kvm->stat) + offset);
  5133. return 0;
  5134. }
  5135. static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
  5136. {
  5137. *(u64 *)((void *)(&kvm->stat) + offset) = 0;
  5138. return 0;
  5139. }
  5140. static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
  5141. {
  5142. unsigned long i;
  5143. struct kvm_vcpu *vcpu;
  5144. *val = 0;
  5145. kvm_for_each_vcpu(i, vcpu, kvm)
  5146. *val += *(u64 *)((void *)(&vcpu->stat) + offset);
  5147. return 0;
  5148. }
  5149. static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
  5150. {
  5151. unsigned long i;
  5152. struct kvm_vcpu *vcpu;
  5153. kvm_for_each_vcpu(i, vcpu, kvm)
  5154. *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
  5155. return 0;
  5156. }
  5157. static int kvm_stat_data_get(void *data, u64 *val)
  5158. {
  5159. int r = -EFAULT;
  5160. struct kvm_stat_data *stat_data = data;
  5161. switch (stat_data->kind) {
  5162. case KVM_STAT_VM:
  5163. r = kvm_get_stat_per_vm(stat_data->kvm,
  5164. stat_data->desc->desc.offset, val);
  5165. break;
  5166. case KVM_STAT_VCPU:
  5167. r = kvm_get_stat_per_vcpu(stat_data->kvm,
  5168. stat_data->desc->desc.offset, val);
  5169. break;
  5170. }
  5171. return r;
  5172. }
  5173. static int kvm_stat_data_clear(void *data, u64 val)
  5174. {
  5175. int r = -EFAULT;
  5176. struct kvm_stat_data *stat_data = data;
  5177. if (val)
  5178. return -EINVAL;
  5179. switch (stat_data->kind) {
  5180. case KVM_STAT_VM:
  5181. r = kvm_clear_stat_per_vm(stat_data->kvm,
  5182. stat_data->desc->desc.offset);
  5183. break;
  5184. case KVM_STAT_VCPU:
  5185. r = kvm_clear_stat_per_vcpu(stat_data->kvm,
  5186. stat_data->desc->desc.offset);
  5187. break;
  5188. }
  5189. return r;
  5190. }
  5191. static int kvm_stat_data_open(struct inode *inode, struct file *file)
  5192. {
  5193. __simple_attr_check_format("%llu\n", 0ull);
  5194. return kvm_debugfs_open(inode, file, kvm_stat_data_get,
  5195. kvm_stat_data_clear, "%llu\n");
  5196. }
  5197. static const struct file_operations stat_fops_per_vm = {
  5198. .owner = THIS_MODULE,
  5199. .open = kvm_stat_data_open,
  5200. .release = kvm_debugfs_release,
  5201. .read = simple_attr_read,
  5202. .write = simple_attr_write,
  5203. };
  5204. static int vm_stat_get(void *_offset, u64 *val)
  5205. {
  5206. unsigned offset = (long)_offset;
  5207. struct kvm *kvm;
  5208. u64 tmp_val;
  5209. *val = 0;
  5210. mutex_lock(&kvm_lock);
  5211. list_for_each_entry(kvm, &vm_list, vm_list) {
  5212. kvm_get_stat_per_vm(kvm, offset, &tmp_val);
  5213. *val += tmp_val;
  5214. }
  5215. mutex_unlock(&kvm_lock);
  5216. return 0;
  5217. }
  5218. static int vm_stat_clear(void *_offset, u64 val)
  5219. {
  5220. unsigned offset = (long)_offset;
  5221. struct kvm *kvm;
  5222. if (val)
  5223. return -EINVAL;
  5224. mutex_lock(&kvm_lock);
  5225. list_for_each_entry(kvm, &vm_list, vm_list) {
  5226. kvm_clear_stat_per_vm(kvm, offset);
  5227. }
  5228. mutex_unlock(&kvm_lock);
  5229. return 0;
  5230. }
  5231. DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
  5232. DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
  5233. static int vcpu_stat_get(void *_offset, u64 *val)
  5234. {
  5235. unsigned offset = (long)_offset;
  5236. struct kvm *kvm;
  5237. u64 tmp_val;
  5238. *val = 0;
  5239. mutex_lock(&kvm_lock);
  5240. list_for_each_entry(kvm, &vm_list, vm_list) {
  5241. kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
  5242. *val += tmp_val;
  5243. }
  5244. mutex_unlock(&kvm_lock);
  5245. return 0;
  5246. }
  5247. static int vcpu_stat_clear(void *_offset, u64 val)
  5248. {
  5249. unsigned offset = (long)_offset;
  5250. struct kvm *kvm;
  5251. if (val)
  5252. return -EINVAL;
  5253. mutex_lock(&kvm_lock);
  5254. list_for_each_entry(kvm, &vm_list, vm_list) {
  5255. kvm_clear_stat_per_vcpu(kvm, offset);
  5256. }
  5257. mutex_unlock(&kvm_lock);
  5258. return 0;
  5259. }
  5260. DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
  5261. "%llu\n");
  5262. DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
  5263. static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
  5264. {
  5265. struct kobj_uevent_env *env;
  5266. unsigned long long created, active;
  5267. if (!kvm_dev.this_device || !kvm)
  5268. return;
  5269. mutex_lock(&kvm_lock);
  5270. if (type == KVM_EVENT_CREATE_VM) {
  5271. kvm_createvm_count++;
  5272. kvm_active_vms++;
  5273. } else if (type == KVM_EVENT_DESTROY_VM) {
  5274. kvm_active_vms--;
  5275. }
  5276. created = kvm_createvm_count;
  5277. active = kvm_active_vms;
  5278. mutex_unlock(&kvm_lock);
  5279. env = kzalloc(sizeof(*env), GFP_KERNEL);
  5280. if (!env)
  5281. return;
  5282. add_uevent_var(env, "CREATED=%llu", created);
  5283. add_uevent_var(env, "COUNT=%llu", active);
  5284. if (type == KVM_EVENT_CREATE_VM) {
  5285. add_uevent_var(env, "EVENT=create");
  5286. kvm->userspace_pid = task_pid_nr(current);
  5287. } else if (type == KVM_EVENT_DESTROY_VM) {
  5288. add_uevent_var(env, "EVENT=destroy");
  5289. }
  5290. add_uevent_var(env, "PID=%d", kvm->userspace_pid);
  5291. if (!IS_ERR(kvm->debugfs_dentry)) {
  5292. char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
  5293. if (p) {
  5294. tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
  5295. if (!IS_ERR(tmp))
  5296. add_uevent_var(env, "STATS_PATH=%s", tmp);
  5297. kfree(p);
  5298. }
  5299. }
5300. /* No need to check for failure, since at most 5 keys are added. */
  5301. env->envp[env->envp_idx++] = NULL;
  5302. kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
  5303. kfree(env);
  5304. }
  5305. static void kvm_init_debug(void)
  5306. {
  5307. const struct file_operations *fops;
  5308. const struct _kvm_stats_desc *pdesc;
  5309. int i;
  5310. kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
  5311. for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
  5312. pdesc = &kvm_vm_stats_desc[i];
  5313. if (kvm_stats_debugfs_mode(pdesc) & 0222)
  5314. fops = &vm_stat_fops;
  5315. else
  5316. fops = &vm_stat_readonly_fops;
  5317. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  5318. kvm_debugfs_dir,
  5319. (void *)(long)pdesc->desc.offset, fops);
  5320. }
  5321. for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
  5322. pdesc = &kvm_vcpu_stats_desc[i];
  5323. if (kvm_stats_debugfs_mode(pdesc) & 0222)
  5324. fops = &vcpu_stat_fops;
  5325. else
  5326. fops = &vcpu_stat_readonly_fops;
  5327. debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
  5328. kvm_debugfs_dir,
  5329. (void *)(long)pdesc->desc.offset, fops);
  5330. }
  5331. }
  5332. static inline
  5333. struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
  5334. {
  5335. return container_of(pn, struct kvm_vcpu, preempt_notifier);
  5336. }
  5337. static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
  5338. {
  5339. struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
  5340. WRITE_ONCE(vcpu->preempted, false);
  5341. WRITE_ONCE(vcpu->ready, false);
  5342. __this_cpu_write(kvm_running_vcpu, vcpu);
  5343. kvm_arch_vcpu_load(vcpu, cpu);
  5344. WRITE_ONCE(vcpu->scheduled_out, false);
  5345. }
  5346. static void kvm_sched_out(struct preempt_notifier *pn,
  5347. struct task_struct *next)
  5348. {
  5349. struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
  5350. WRITE_ONCE(vcpu->scheduled_out, true);
  5351. if (task_is_runnable(current) && vcpu->wants_to_run) {
  5352. WRITE_ONCE(vcpu->preempted, true);
  5353. WRITE_ONCE(vcpu->ready, true);
  5354. }
  5355. kvm_arch_vcpu_put(vcpu);
  5356. __this_cpu_write(kvm_running_vcpu, NULL);
  5357. }
  5358. /**
  5359. * kvm_get_running_vcpu - get the vcpu running on the current CPU.
  5360. *
  5361. * We can disable preemption locally around accessing the per-CPU variable,
  5362. * and use the resolved vcpu pointer after enabling preemption again,
  5363. * because even if the current thread is migrated to another CPU, reading
  5364. * the per-CPU value later will give us the same value as we update the
  5365. * per-CPU variable in the preempt notifier handlers.
  5366. */
  5367. struct kvm_vcpu *kvm_get_running_vcpu(void)
  5368. {
  5369. struct kvm_vcpu *vcpu;
  5370. preempt_disable();
  5371. vcpu = __this_cpu_read(kvm_running_vcpu);
  5372. preempt_enable();
  5373. return vcpu;
  5374. }
  5375. EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
  5376. /**
  5377. * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
  5378. */
  5379. struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
  5380. {
  5381. return &kvm_running_vcpu;
  5382. }
  5383. #ifdef CONFIG_GUEST_PERF_EVENTS
  5384. static unsigned int kvm_guest_state(void)
  5385. {
  5386. struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  5387. unsigned int state;
  5388. if (!kvm_arch_pmi_in_guest(vcpu))
  5389. return 0;
  5390. state = PERF_GUEST_ACTIVE;
  5391. if (!kvm_arch_vcpu_in_kernel(vcpu))
  5392. state |= PERF_GUEST_USER;
  5393. return state;
  5394. }
  5395. static unsigned long kvm_guest_get_ip(void)
  5396. {
  5397. struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
  5398. /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
  5399. if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
  5400. return 0;
  5401. return kvm_arch_vcpu_get_ip(vcpu);
  5402. }
  5403. static struct perf_guest_info_callbacks kvm_guest_cbs = {
  5404. .state = kvm_guest_state,
  5405. .get_ip = kvm_guest_get_ip,
  5406. .handle_intel_pt_intr = NULL,
  5407. };
  5408. void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
  5409. {
  5410. kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
  5411. perf_register_guest_info_callbacks(&kvm_guest_cbs);
  5412. }
  5413. void kvm_unregister_perf_callbacks(void)
  5414. {
  5415. perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
  5416. }
  5417. #endif
  5418. int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
  5419. {
  5420. int r;
  5421. int cpu;
  5422. /* A kmem cache lets us meet the alignment requirements of fx_save. */
  5423. if (!vcpu_align)
  5424. vcpu_align = __alignof__(struct kvm_vcpu);
  5425. kvm_vcpu_cache =
  5426. kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
  5427. SLAB_ACCOUNT,
  5428. offsetof(struct kvm_vcpu, arch),
  5429. offsetofend(struct kvm_vcpu, stats_id)
  5430. - offsetof(struct kvm_vcpu, arch),
  5431. NULL);
  5432. if (!kvm_vcpu_cache)
  5433. return -ENOMEM;
  5434. for_each_possible_cpu(cpu) {
  5435. if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
  5436. GFP_KERNEL, cpu_to_node(cpu))) {
  5437. r = -ENOMEM;
  5438. goto err_cpu_kick_mask;
  5439. }
  5440. }
  5441. r = kvm_irqfd_init();
  5442. if (r)
  5443. goto err_irqfd;
  5444. r = kvm_async_pf_init();
  5445. if (r)
  5446. goto err_async_pf;
  5447. kvm_chardev_ops.owner = module;
  5448. kvm_vm_fops.owner = module;
  5449. kvm_vcpu_fops.owner = module;
  5450. kvm_device_fops.owner = module;
  5451. kvm_preempt_ops.sched_in = kvm_sched_in;
  5452. kvm_preempt_ops.sched_out = kvm_sched_out;
  5453. kvm_init_debug();
  5454. r = kvm_vfio_ops_init();
  5455. if (WARN_ON_ONCE(r))
  5456. goto err_vfio;
  5457. kvm_gmem_init(module);
  5458. r = kvm_init_virtualization();
  5459. if (r)
  5460. goto err_virt;
  5461. /*
  5462. * Registration _must_ be the very last thing done, as this exposes
  5463. * /dev/kvm to userspace, i.e. all infrastructure must be setup!
  5464. */
  5465. r = misc_register(&kvm_dev);
  5466. if (r) {
  5467. pr_err("kvm: misc device register failed\n");
  5468. goto err_register;
  5469. }
  5470. return 0;
  5471. err_register:
  5472. kvm_uninit_virtualization();
  5473. err_virt:
  5474. kvm_vfio_ops_exit();
  5475. err_vfio:
  5476. kvm_async_pf_deinit();
  5477. err_async_pf:
  5478. kvm_irqfd_exit();
  5479. err_irqfd:
  5480. err_cpu_kick_mask:
  5481. for_each_possible_cpu(cpu)
  5482. free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
  5483. kmem_cache_destroy(kvm_vcpu_cache);
  5484. return r;
  5485. }
  5486. EXPORT_SYMBOL_GPL(kvm_init);
  5487. void kvm_exit(void)
  5488. {
  5489. int cpu;
  5490. /*
5491. * Note, unregistering /dev/kvm doesn't strictly need to come first, as
  5492. * fops_get(), a.k.a. try_module_get(), prevents acquiring references
  5493. * to KVM while the module is being stopped.
  5494. */
  5495. misc_deregister(&kvm_dev);
  5496. kvm_uninit_virtualization();
  5497. debugfs_remove_recursive(kvm_debugfs_dir);
  5498. for_each_possible_cpu(cpu)
  5499. free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
  5500. kmem_cache_destroy(kvm_vcpu_cache);
  5501. kvm_vfio_ops_exit();
  5502. kvm_async_pf_deinit();
  5503. kvm_irqfd_exit();
  5504. }
  5505. EXPORT_SYMBOL_GPL(kvm_exit);
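/*
 * Minimal in-kernel sketch (not from this file): architecture modules call
 * kvm_init() above from their module init, passing the size/alignment of
 * their vCPU container so the "kvm_vcpu" cache can hold it, and kvm_exit()
 * above from module exit.  struct my_vcpu and my_kvm_init/my_kvm_exit are
 * made-up names; real callers also do their own hardware setup first.
 */
struct my_vcpu {
	struct kvm_vcpu vcpu;	/* generic vCPU, typically the first member */
	/* arch-specific state would follow */
};

static int __init my_kvm_init(void)
{
	return kvm_init(sizeof(struct my_vcpu), __alignof__(struct my_vcpu),
			THIS_MODULE);
}
module_init(my_kvm_init);

static void __exit my_kvm_exit(void)
{
	kvm_exit();
}
module_exit(my_kvm_exit);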