// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/nospec.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;

static unsigned int m_hash_mask __ro_after_init;
static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
        if (!str)
                return 0;
        mhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
        if (!str)
                return 0;
        mphash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("mphash_entries=", set_mphash_entries);

static char * __initdata initramfs_options;
static int __init initramfs_options_setup(char *str)
{
        initramfs_options = str;
        return 1;
}
__setup("initramfs_options=", initramfs_options_setup);

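/*
 * The three __setup() handlers above consume kernel command-line options,
 * e.g. booting with "mhash_entries=262144" presizes the mount hash table
 * (the value is illustrative; anything simple_strtoul() parses with base 0
 * is accepted, including hex with an 0x prefix).
 */
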
static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);

/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);

static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */

struct mount_kattr {
        unsigned int attr_set;
        unsigned int attr_clr;
        unsigned int propagation;
        unsigned int lookup_flags;
        bool recurse;
        struct user_namespace *mnt_userns;
        struct mnt_idmap *mnt_idmap;
};

/* /sys/fs */
struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
        u64 seq_b = ns->seq;

        if (seq < seq_b)
                return -1;
        if (seq > seq_b)
                return 1;
        return 0;
}

static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
        if (!node)
                return NULL;
        return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}

static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
{
        struct mnt_namespace *ns_a = node_to_mnt_ns(a);
        struct mnt_namespace *ns_b = node_to_mnt_ns(b);
        u64 seq_a = ns_a->seq;

        return mnt_ns_cmp(seq_a, ns_b) < 0;
}

static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
        guard(write_lock)(&mnt_ns_tree_lock);
        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
}

static void mnt_ns_release(struct mnt_namespace *ns)
{
        lockdep_assert_not_held(&mnt_ns_tree_lock);

        /* keep alive for {list,stat}mount() */
        if (ns && refcount_dec_and_test(&ns->passive)) {
                put_user_ns(ns->user_ns);
                kfree(ns);
        }
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
        /* remove from global mount namespace list */
        if (!is_anon_ns(ns)) {
                guard(write_lock)(&mnt_ns_tree_lock);
                rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
        }

        mnt_ns_release(ns);
}

/*
 * Returns the mount namespace which either has the specified id, or the one
 * with the smallest id greater than it.
 */
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
{
        struct rb_node *node = mnt_ns_tree.rb_node;
        struct mnt_namespace *ret = NULL;

        lockdep_assert_held(&mnt_ns_tree_lock);

        while (node) {
                struct mnt_namespace *n = node_to_mnt_ns(node);

                if (mnt_ns_id <= n->seq) {
                        ret = node_to_mnt_ns(node);
                        if (mnt_ns_id == n->seq)
                                break;
                        node = node->rb_left;
                } else {
                        node = node->rb_right;
                }
        }
        return ret;
}

/*
 * Lookup a mount namespace by id and take a passive reference count. Taking a
 * passive reference means the mount namespace can be emptied if e.g., the last
 * task holding an active reference exits. To access the mounts of the
 * namespace the @namespace_sem must first be acquired. If the namespace has
 * already shut down before acquiring @namespace_sem, {list,stat}mount() will
 * see that the mount rbtree of the namespace is empty.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
        struct mnt_namespace *ns;

        guard(read_lock)(&mnt_ns_tree_lock);
        ns = mnt_ns_find_id_at(mnt_ns_id);
        if (!ns || ns->seq != mnt_ns_id)
                return NULL;

        refcount_inc(&ns->passive);
        return ns;
}

static inline void lock_mount_hash(void)
{
        write_seqlock(&mount_lock);
}

static inline void unlock_mount_hash(void)
{
        write_sequnlock(&mount_lock);
}

static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
        tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
        tmp = tmp + (tmp >> m_hash_shift);
        return &mount_hashtable[tmp & m_hash_mask];
}

static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
        unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
        tmp = tmp + (tmp >> mp_hash_shift);
        return &mountpoint_hashtable[tmp & mp_hash_mask];
}

static int mnt_alloc_id(struct mount *mnt)
{
        int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);

        if (res < 0)
                return res;
        mnt->mnt_id = res;
        mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
        return 0;
}

static void mnt_free_id(struct mount *mnt)
{
        ida_free(&mnt_id_ida, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
        int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

        if (res < 0)
                return res;
        mnt->mnt_group_id = res;
        return 0;
}

/*
 * Release a peer group ID
 */
void mnt_release_group_id(struct mount *mnt)
{
        ida_free(&mnt_group_ida, mnt->mnt_group_id);
        mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
        preempt_disable();
        mnt->mnt_count += n;
        preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
        int count = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
        }

        return count;
#else
        return mnt->mnt_count;
#endif
}

static struct mount *alloc_vfsmnt(const char *name)
{
        struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
        if (mnt) {
                int err;

                err = mnt_alloc_id(mnt);
                if (err)
                        goto out_free_cache;

                if (name) {
                        mnt->mnt_devname = kstrdup_const(name,
                                                         GFP_KERNEL_ACCOUNT);
                        if (!mnt->mnt_devname)
                                goto out_free_id;
                }

#ifdef CONFIG_SMP
                mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
                if (!mnt->mnt_pcp)
                        goto out_free_devname;

                this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
                mnt->mnt_count = 1;
                mnt->mnt_writers = 0;
#endif

                INIT_HLIST_NODE(&mnt->mnt_hash);
                INIT_LIST_HEAD(&mnt->mnt_child);
                INIT_LIST_HEAD(&mnt->mnt_mounts);
                INIT_LIST_HEAD(&mnt->mnt_list);
                INIT_LIST_HEAD(&mnt->mnt_expire);
                INIT_LIST_HEAD(&mnt->mnt_share);
                INIT_LIST_HEAD(&mnt->mnt_slave_list);
                INIT_LIST_HEAD(&mnt->mnt_slave);
                INIT_HLIST_NODE(&mnt->mnt_mp_list);
                INIT_LIST_HEAD(&mnt->mnt_umounting);
                INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
                RB_CLEAR_NODE(&mnt->mnt_node);
                mnt->mnt.mnt_idmap = &nop_mnt_idmap;
        }
        return mnt;

#ifdef CONFIG_SMP
out_free_devname:
        kfree_const(mnt->mnt_devname);
#endif
out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*. This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt)
{
        return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers++;
#endif
}

static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers--;
#endif
}

static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        unsigned int count = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
        }

        return count;
#else
        return mnt->mnt_writers;
#endif
}

static int mnt_is_readonly(struct vfsmount *mnt)
{
        if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
                return 1;
        /*
         * The barrier pairs with the barrier in sb_start_ro_state_change()
         * making sure that if we don't see s_readonly_remount set yet, we
         * also will not see any superblock / mount flag changes done by
         * remount. It also pairs with the barrier in sb_end_ro_state_change()
         * assuring that if we see s_readonly_remount already cleared, we will
         * see the values of superblock / mount flags updated by remount.
         */
        smp_rmb();
        return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink(). We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against the filesystem
 * being frozen. When the write operation is finished, mnt_put_write_access()
 * must be called. This is effectively a refcount.
 */
int mnt_get_write_access(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        int ret = 0;

        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we enter the
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
        might_lock(&mount_lock.lock);
        while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        cpu_relax();
                } else {
                        /*
                         * This prevents priority inversion if the task
                         * setting MNT_WRITE_HOLD got preempted on a remote
                         * CPU, and it prevents livelock if the task setting
                         * MNT_WRITE_HOLD has a lower priority and is bound to
                         * the same CPU as the task that is spinning here.
                         */
                        preempt_enable();
                        lock_mount_hash();
                        unlock_mount_hash();
                        preempt_disable();
                }
        }
        /*
         * The barrier pairs with the barrier in sb_start_ro_state_change()
         * making sure that if we see MNT_WRITE_HOLD cleared, we will also see
         * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
         * mnt_is_readonly() and bail in case we are racing with remount
         * read-only.
         */
        smp_rmb();
        if (mnt_is_readonly(m)) {
                mnt_dec_writers(mnt);
                ret = -EROFS;
        }
        preempt_enable();

        return ret;
}
EXPORT_SYMBOL_GPL(mnt_get_write_access);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success. When the write operation is
 * finished, mnt_drop_write() must be called. This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
        int ret;

        sb_start_write(m->mnt_sb);
        ret = mnt_get_write_access(m);
        if (ret)
                sb_end_write(m->mnt_sb);
        return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

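/*
 * Typical caller pattern (illustrative sketch, not a caller in this file):
 *
 *      err = mnt_want_write(path->mnt);
 *      if (err)
 *              return err;
 *      ... modify the filesystem ...
 *      mnt_drop_write(path->mnt);
 *
 * The matching mnt_drop_write() drops the writer count and releases the
 * freeze protection taken via sb_start_write().
 */
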
/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_get_write_access, but if @file is already open for write it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the check for emergency r/o remounts. This must be
 * paired with mnt_put_write_access_file.
 */
int mnt_get_write_access_file(struct file *file)
{
        if (file->f_mode & FMODE_WRITER) {
                /*
                 * Superblock may have become readonly while there are still
                 * writable fd's, e.g. due to a fs error with errors=remount-ro
                 */
                if (__mnt_is_readonly(file->f_path.mnt))
                        return -EROFS;
                return 0;
        }
        return mnt_get_write_access(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but if the file is already open for writing it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the freeze protection and the check for emergency r/o
 * remounts. This must be paired with mnt_drop_write_file.
 */
int mnt_want_write_file(struct file *file)
{
        int ret;

        sb_start_write(file_inode(file)->i_sb);
        ret = mnt_get_write_access_file(file);
        if (ret)
                sb_end_write(file_inode(file)->i_sb);
        return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

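/*
 * As with mnt_want_write(), the expected pairing (illustrative sketch):
 *
 *      err = mnt_want_write_file(file);
 *      if (err)
 *              return err;
 *      ... write through file ...
 *      mnt_drop_write_file(file);
 */
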
/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it. Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
        preempt_disable();
        mnt_dec_writers(real_mount(mnt));
        preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows the filesystem to be frozen again. Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
        mnt_put_write_access(mnt);
        sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

void mnt_put_write_access_file(struct file *file)
{
        if (!(file->f_mode & FMODE_WRITER))
                mnt_put_write_access(file->f_path.mnt);
}

void mnt_drop_write_file(struct file *file)
{
        mnt_put_write_access_file(file);
        sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 *          setting MNT_WRITE_HOLD.
 * Return: On success 0 is returned.
 *         On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt)
{
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
         * should be visible before we do.
         */
        smp_mb();

        /*
         * With writers on hold, if this value is zero, then there are
         * definitely no active writers (although held writers may subsequently
         * increment the count, they'll have to wait, and decrement it after
         * seeing MNT_READONLY).
         *
         * It is OK to have counter incremented on one CPU and decremented on
         * another: the sum will add up correctly. The danger would be when we
         * sum up each counter, if we read a counter before it is incremented,
         * but then read another CPU's count which it has been subsequently
         * decremented from -- we would see more decrements than we should.
         * MNT_WRITE_HOLD protects against this scenario, because
         * mnt_want_write first increments count, then smp_mb, then spins on
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
        if (mnt_get_writers(mnt) > 0)
                return -EBUSY;

        return 0;
}

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a successful call to
 * mnt_hold_writers().
 *
 * Context: This function expects lock_mount_hash() to be held.
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
        /*
         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
         * that become unheld will see MNT_READONLY.
         */
        smp_wmb();
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}

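/*
 * Sketch of the hold/unhold pairing (illustrative; mnt_make_readonly()
 * below is the in-file example of the pattern):
 *
 *      lock_mount_hash();
 *      ret = mnt_hold_writers(mnt);
 *      if (!ret)
 *              ... change properties that writers must not see change ...
 *      mnt_unhold_writers(mnt);
 *      unlock_mount_hash();
 */
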
static int mnt_make_readonly(struct mount *mnt)
{
        int ret;

        ret = mnt_hold_writers(mnt);
        if (!ret)
                mnt->mnt.mnt_flags |= MNT_READONLY;
        mnt_unhold_writers(mnt);
        return ret;
}

int sb_prepare_remount_readonly(struct super_block *sb)
{
        struct mount *mnt;
        int err = 0;

        /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;

        lock_mount_hash();
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        err = mnt_hold_writers(mnt);
                        if (err)
                                break;
                }
        }
        if (!err && atomic_long_read(&sb->s_remove_count))
                err = -EBUSY;

        if (!err)
                sb_start_ro_state_change(sb);
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
        unlock_mount_hash();

        return err;
}

static void free_vfsmnt(struct mount *mnt)
{
        mnt_idmap_put(mnt_idmap(&mnt->mnt));
        kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
        free_percpu(mnt->mnt_pcp);
#endif
        kmem_cache_free(mnt_cache, mnt);
}

static void delayed_free_vfsmnt(struct rcu_head *head)
{
        free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

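/*
 * Return convention of __legitimize_mnt(), as consumed by legitimize_mnt()
 * below:
 *       0 - a reference was acquired and the mount is legitimate;
 *       1 - failed, no reference is held, the caller may simply retry;
 *      -1 - failed, a reference is still held and the caller must mntput().
 */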
/* call under rcu_read_lock */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        struct mount *mnt;
        if (read_seqretry(&mount_lock, seq))
                return 1;
        if (bastard == NULL)
                return 0;
        mnt = real_mount(bastard);
        mnt_add_count(mnt, 1);
        smp_mb();               // see mntput_no_expire() and do_umount()
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;
        lock_mount_hash();
        if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
                mnt_add_count(mnt, -1);
                unlock_mount_hash();
                return 1;
        }
        unlock_mount_hash();
        /* caller will mntput() */
        return -1;
}

/* call under rcu_read_lock */
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        int res = __legitimize_mnt(bastard, seq);
        if (likely(!res))
                return true;
        if (unlikely(res < 0)) {
                rcu_read_unlock();
                mntput(bastard);
                rcu_read_lock();
        }
        return false;
}

/**
 * __lookup_mnt - find first child mount
 * @mnt: parent mount
 * @dentry: mountpoint
 *
 * If @mnt has a child mount @c mounted at @dentry, find and return it.
 *
 * Note that the child mount @c need not be unique. There are cases
 * where shadow mounts are created. For example, during mount
 * propagation when a source mount @mnt whose root got overmounted by a
 * mount @o after path lookup but before @namespace_sem could be
 * acquired gets copied and propagated. So @mnt gets copied including
 * @o. When @mnt is propagated to a destination mount @d that already
 * has another mount @n mounted at the same mountpoint then the source
 * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
 * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
 * on @dentry.
 *
 * Return: The first child of @mnt mounted at @dentry, or NULL.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct hlist_head *head = m_hash(mnt, dentry);
        struct mount *p;

        hlist_for_each_entry_rcu(p, head, mnt_hash)
                if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                        return p;
        return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
        struct mount *child_mnt;
        struct vfsmount *m;
        unsigned seq;

        rcu_read_lock();
        do {
                seq = read_seqbegin(&mount_lock);
                child_mnt = __lookup_mnt(path->mnt, path->dentry);
                m = child_mnt ? &child_mnt->mnt : NULL;
        } while (!legitimize_mnt(m, seq));
        rcu_read_unlock();
        return m;
}

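/*
 * Since lookup_mnt() returns with a reference held, callers drop it with
 * mntput() when done, along these lines (illustrative sketch):
 *
 *      struct vfsmount *m = lookup_mnt(path);
 *
 *      if (m) {
 *              ... inspect the child mount ...
 *              mntput(m);
 *      }
 */
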
/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in this context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *mnt, *n;
        bool is_covered = false;

        down_read(&namespace_sem);
        rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                is_covered = (mnt->mnt_mountpoint == dentry);
                if (is_covered)
                        break;
        }
        up_read(&namespace_sem);

        return is_covered;
}

static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
        struct hlist_head *chain = mp_hash(dentry);
        struct mountpoint *mp;

        hlist_for_each_entry(mp, chain, m_hash) {
                if (mp->m_dentry == dentry) {
                        mp->m_count++;
                        return mp;
                }
        }
        return NULL;
}

static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
mountpoint:
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        mp = new;
        new = NULL;
done:
        kfree(new);
        return mp;
}

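/*
 * Both lookup_mountpoint() and get_mountpoint() return with mp->m_count
 * elevated; the reference is dropped again via put_mountpoint() below once
 * the mountpoint is no longer needed.
 */
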
/*
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 */
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
        if (!--mp->m_count) {
                struct dentry *dentry = mp->m_dentry;
                BUG_ON(!hlist_empty(&mp->m_list));
                spin_lock(&dentry->d_lock);
                dentry->d_flags &= ~DCACHE_MOUNTED;
                spin_unlock(&dentry->d_lock);
                dput_to_list(dentry, list);
                hlist_del(&mp->m_hash);
                kfree(mp);
        }
}

/* called with namespace_lock and vfsmount lock */
static void put_mountpoint(struct mountpoint *mp)
{
        __put_mountpoint(mp, &ex_mountpoints);
}

static inline int check_mnt(struct mount *mnt)
{
        return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (ns) {
                ns->event = ++event;
                wake_up_interruptible(&ns->poll);
        }
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (ns && ns->event != event) {
                ns->event = event;
                wake_up_interruptible(&ns->poll);
        }
}

/*
 * vfsmount lock must be held for write
 */
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
        struct mountpoint *mp;
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        list_del_init(&mnt->mnt_child);
        hlist_del_init_rcu(&mnt->mnt_hash);
        hlist_del_init(&mnt->mnt_mp_list);
        mp = mnt->mnt_mp;
        mnt->mnt_mp = NULL;
        return mp;
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        put_mountpoint(unhash_mnt(mnt));
}

/*
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
                        struct mountpoint *mp,
                        struct mount *child_mnt)
{
        mp->m_count++;
        mnt_add_count(mnt, 1);  /* essentially, that's mntget */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/**
 * mnt_set_mountpoint_beneath - mount a mount beneath another one
 *
 * @new_parent: the source mount
 * @top_mnt:    the mount beneath which @new_parent is mounted
 * @new_mp:     the new mountpoint of @top_mnt on @new_parent
 *
 * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
 * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
 * @new_mp. And mount @new_parent on the old parent and old
 * mountpoint of @top_mnt.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void mnt_set_mountpoint_beneath(struct mount *new_parent,
                                       struct mount *top_mnt,
                                       struct mountpoint *new_mp)
{
        struct mount *old_top_parent = top_mnt->mnt_parent;
        struct mountpoint *old_top_mp = top_mnt->mnt_mp;

        mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
        mnt_change_mountpoint(new_parent, new_mp, top_mnt);
}

static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
        hlist_add_head_rcu(&mnt->mnt_hash,
                           m_hash(&parent->mnt, mnt->mnt_mountpoint));
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @parent:  the parent
 * @mnt:     the new mount
 * @mp:      the new mountpoint
 * @beneath: whether to mount @mnt beneath or on top of @parent
 *
 * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
 * to @parent's child mount list and to @mount_hashtable.
 *
 * If @beneath is true, remove @mnt from its current parent and
 * mountpoint and mount it on @mp on @parent, and mount @parent on the
 * old parent and old mountpoint of @mnt. Finally, attach @parent to
 * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
 *
 * Note, when __attach_mnt() is called @mnt->mnt_parent already points
 * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
                       struct mountpoint *mp, bool beneath)
{
        if (beneath)
                mnt_set_mountpoint_beneath(mnt, parent, mp);
        else
                mnt_set_mountpoint(parent, mp, mnt);
        /*
         * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
         * beneath @parent then @mnt will need to be attached to
         * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
         * isn't the same mount as @parent.
         */
        __attach_mnt(mnt, mnt->mnt_parent);
}

void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        struct mountpoint *old_mp = mnt->mnt_mp;
        struct mount *old_parent = mnt->mnt_parent;

        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        attach_mnt(mnt, parent, mp, false);

        put_mountpoint(old_mp);
        mnt_add_count(old_parent, -1);
}

  1001. static inline struct mount *node_to_mount(struct rb_node *node)
  1002. {
  1003. return node ? rb_entry(node, struct mount, mnt_node) : NULL;
  1004. }
  1005. static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
  1006. {
  1007. struct rb_node **link = &ns->mounts.rb_node;
  1008. struct rb_node *parent = NULL;
  1009. WARN_ON(mnt_ns_attached(mnt));
  1010. mnt->mnt_ns = ns;
  1011. while (*link) {
  1012. parent = *link;
  1013. if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
  1014. link = &parent->rb_left;
  1015. else
  1016. link = &parent->rb_right;
  1017. }
  1018. rb_link_node(&mnt->mnt_node, parent, link);
  1019. rb_insert_color(&mnt->mnt_node, &ns->mounts);
  1020. }
  1021. /*
  1022. * vfsmount lock must be held for write
  1023. */
  1024. static void commit_tree(struct mount *mnt)
  1025. {
  1026. struct mount *parent = mnt->mnt_parent;
  1027. struct mount *m;
  1028. LIST_HEAD(head);
  1029. struct mnt_namespace *n = parent->mnt_ns;
  1030. BUG_ON(parent == mnt);
  1031. list_add_tail(&head, &mnt->mnt_list);
  1032. while (!list_empty(&head)) {
  1033. m = list_first_entry(&head, typeof(*m), mnt_list);
  1034. list_del(&m->mnt_list);
  1035. mnt_add_to_ns(n, m);
  1036. }
  1037. n->nr_mounts += n->pending_mounts;
  1038. n->pending_mounts = 0;
  1039. __attach_mnt(mnt, parent);
  1040. touch_mnt_namespace(n);
  1041. }
  1042. static struct mount *next_mnt(struct mount *p, struct mount *root)
  1043. {
  1044. struct list_head *next = p->mnt_mounts.next;
  1045. if (next == &p->mnt_mounts) {
  1046. while (1) {
  1047. if (p == root)
  1048. return NULL;
  1049. next = p->mnt_child.next;
  1050. if (next != &p->mnt_parent->mnt_mounts)
  1051. break;
  1052. p = p->mnt_parent;
  1053. }
  1054. }
  1055. return list_entry(next, struct mount, mnt_child);
  1056. }
  1057. static struct mount *skip_mnt_tree(struct mount *p)
  1058. {
  1059. struct list_head *prev = p->mnt_mounts.prev;
  1060. while (prev != &p->mnt_mounts) {
  1061. p = list_entry(prev, struct mount, mnt_child);
  1062. prev = p->mnt_mounts.prev;
  1063. }
  1064. return p;
  1065. }
  1066. /**
  1067. * vfs_create_mount - Create a mount for a configured superblock
  1068. * @fc: The configuration context with the superblock attached
  1069. *
  1070. * Create a mount to an already configured superblock. If necessary, the
  1071. * caller should invoke vfs_get_tree() before calling this.
  1072. *
  1073. * Note that this does not attach the mount to anything.
  1074. */
  1075. struct vfsmount *vfs_create_mount(struct fs_context *fc)
  1076. {
  1077. struct mount *mnt;
  1078. if (!fc->root)
  1079. return ERR_PTR(-EINVAL);
  1080. mnt = alloc_vfsmnt(fc->source ?: "none");
  1081. if (!mnt)
  1082. return ERR_PTR(-ENOMEM);
  1083. if (fc->sb_flags & SB_KERNMOUNT)
  1084. mnt->mnt.mnt_flags = MNT_INTERNAL;
  1085. atomic_inc(&fc->root->d_sb->s_active);
  1086. mnt->mnt.mnt_sb = fc->root->d_sb;
  1087. mnt->mnt.mnt_root = dget(fc->root);
  1088. mnt->mnt_mountpoint = mnt->mnt.mnt_root;
  1089. mnt->mnt_parent = mnt;
  1090. lock_mount_hash();
  1091. list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
  1092. unlock_mount_hash();
  1093. return &mnt->mnt;
  1094. }
  1095. EXPORT_SYMBOL(vfs_create_mount);
  1096. struct vfsmount *fc_mount(struct fs_context *fc)
  1097. {
  1098. int err = vfs_get_tree(fc);
  1099. if (!err) {
  1100. up_write(&fc->root->d_sb->s_umount);
  1101. return vfs_create_mount(fc);
  1102. }
  1103. return ERR_PTR(err);
  1104. }
  1105. EXPORT_SYMBOL(fc_mount);
  1106. struct vfsmount *vfs_kern_mount(struct file_system_type *type,
  1107. int flags, const char *name,
  1108. void *data)
  1109. {
  1110. struct fs_context *fc;
  1111. struct vfsmount *mnt;
  1112. int ret = 0;
  1113. if (!type)
  1114. return ERR_PTR(-EINVAL);
  1115. fc = fs_context_for_mount(type, flags);
  1116. if (IS_ERR(fc))
  1117. return ERR_CAST(fc);
  1118. if (name)
  1119. ret = vfs_parse_fs_string(fc, "source",
  1120. name, strlen(name));
  1121. if (!ret)
  1122. ret = parse_monolithic_mount_data(fc, data);
  1123. if (!ret)
  1124. mnt = fc_mount(fc);
  1125. else
  1126. mnt = ERR_PTR(ret);
  1127. put_fs_context(fc);
  1128. return mnt;
  1129. }
  1130. EXPORT_SYMBOL_GPL(vfs_kern_mount);
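/*
 * A minimal in-kernel usage sketch (hypothetical caller, not part of
 * this file): a subsystem that wants a private instance of its
 * registered file_system_type can do
 *
 *    struct vfsmount *mnt;
 *
 *    mnt = vfs_kern_mount(&my_fs_type, SB_KERNMOUNT, "none", NULL);
 *    if (IS_ERR(mnt))
 *        return PTR_ERR(mnt);
 *    ...
 *    kern_unmount(mnt);    // release the long-held reference
 */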
  1131. struct vfsmount *
  1132. vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
  1133. const char *name, void *data)
  1134. {
  1135. /* Until it is worked out how to pass the user namespace
  1136. * through from the parent mount to the submount don't support
  1137. * unprivileged mounts with submounts.
  1138. */
  1139. if (mountpoint->d_sb->s_user_ns != &init_user_ns)
  1140. return ERR_PTR(-EPERM);
  1141. return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
  1142. }
  1143. EXPORT_SYMBOL_GPL(vfs_submount);
  1144. static struct mount *clone_mnt(struct mount *old, struct dentry *root,
  1145. int flag)
  1146. {
  1147. struct super_block *sb = old->mnt.mnt_sb;
  1148. struct mount *mnt;
  1149. int err;
  1150. mnt = alloc_vfsmnt(old->mnt_devname);
  1151. if (!mnt)
  1152. return ERR_PTR(-ENOMEM);
  1153. if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
  1154. mnt->mnt_group_id = 0; /* not a peer of original */
  1155. else
  1156. mnt->mnt_group_id = old->mnt_group_id;
  1157. if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
  1158. err = mnt_alloc_group_id(mnt);
  1159. if (err)
  1160. goto out_free;
  1161. }
  1162. mnt->mnt.mnt_flags = old->mnt.mnt_flags;
  1163. mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
  1164. atomic_inc(&sb->s_active);
  1165. mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
  1166. mnt->mnt.mnt_sb = sb;
  1167. mnt->mnt.mnt_root = dget(root);
  1168. mnt->mnt_mountpoint = mnt->mnt.mnt_root;
  1169. mnt->mnt_parent = mnt;
  1170. lock_mount_hash();
  1171. list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
  1172. unlock_mount_hash();
  1173. if ((flag & CL_SLAVE) ||
  1174. ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
  1175. list_add(&mnt->mnt_slave, &old->mnt_slave_list);
  1176. mnt->mnt_master = old;
  1177. CLEAR_MNT_SHARED(mnt);
  1178. } else if (!(flag & CL_PRIVATE)) {
  1179. if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
  1180. list_add(&mnt->mnt_share, &old->mnt_share);
  1181. if (IS_MNT_SLAVE(old))
  1182. list_add(&mnt->mnt_slave, &old->mnt_slave);
  1183. mnt->mnt_master = old->mnt_master;
  1184. } else {
  1185. CLEAR_MNT_SHARED(mnt);
  1186. }
  1187. if (flag & CL_MAKE_SHARED)
  1188. set_mnt_shared(mnt);
  1189. /* stick the duplicate mount on the same expiry list
  1190. * as the original if that was on one */
  1191. if (flag & CL_EXPIRE) {
  1192. if (!list_empty(&old->mnt_expire))
  1193. list_add(&mnt->mnt_expire, &old->mnt_expire);
  1194. }
  1195. return mnt;
  1196. out_free:
  1197. mnt_free_id(mnt);
  1198. free_vfsmnt(mnt);
  1199. return ERR_PTR(err);
  1200. }
  1201. static void cleanup_mnt(struct mount *mnt)
  1202. {
  1203. struct hlist_node *p;
  1204. struct mount *m;
  1205. /*
  1206. * The warning here probably indicates that somebody messed
  1207. * up a mnt_want/drop_write() pair. If this happens, the
  1208. * filesystem was probably unable to make r/w->r/o transitions.
  1209. * The locking used to deal with mnt_count decrement provides barriers,
  1210. * so mnt_get_writers() below is safe.
  1211. */
  1212. WARN_ON(mnt_get_writers(mnt));
  1213. if (unlikely(mnt->mnt_pins.first))
  1214. mnt_pin_kill(mnt);
  1215. hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
  1216. hlist_del(&m->mnt_umount);
  1217. mntput(&m->mnt);
  1218. }
  1219. fsnotify_vfsmount_delete(&mnt->mnt);
  1220. dput(mnt->mnt.mnt_root);
  1221. deactivate_super(mnt->mnt.mnt_sb);
  1222. mnt_free_id(mnt);
  1223. call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
  1224. }
  1225. static void __cleanup_mnt(struct rcu_head *head)
  1226. {
  1227. cleanup_mnt(container_of(head, struct mount, mnt_rcu));
  1228. }
  1229. static LLIST_HEAD(delayed_mntput_list);
  1230. static void delayed_mntput(struct work_struct *unused)
  1231. {
  1232. struct llist_node *node = llist_del_all(&delayed_mntput_list);
  1233. struct mount *m, *t;
  1234. llist_for_each_entry_safe(m, t, node, mnt_llist)
  1235. cleanup_mnt(m);
  1236. }
  1237. static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
  1238. static void mntput_no_expire(struct mount *mnt)
  1239. {
  1240. LIST_HEAD(list);
  1241. int count;
  1242. rcu_read_lock();
  1243. if (likely(READ_ONCE(mnt->mnt_ns))) {
  1244. /*
  1245. * Since we don't do lock_mount_hash() here,
  1246. * ->mnt_ns can change under us. However, if it's
  1247. * non-NULL, then there's a reference that won't
  1248. * be dropped until after an RCU delay done after
  1249. * turning ->mnt_ns NULL. So if we observe it
  1250. * non-NULL under rcu_read_lock(), the reference
  1251. * we are dropping is not the final one.
  1252. */
  1253. mnt_add_count(mnt, -1);
  1254. rcu_read_unlock();
  1255. return;
  1256. }
  1257. lock_mount_hash();
  1258. /*
  1259. * make sure that if __legitimize_mnt() has not seen us grab
  1260. * mount_lock, we'll see their refcount increment here.
  1261. */
  1262. smp_mb();
  1263. mnt_add_count(mnt, -1);
  1264. count = mnt_get_count(mnt);
  1265. if (count != 0) {
  1266. WARN_ON(count < 0);
  1267. rcu_read_unlock();
  1268. unlock_mount_hash();
  1269. return;
  1270. }
  1271. if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
  1272. rcu_read_unlock();
  1273. unlock_mount_hash();
  1274. return;
  1275. }
  1276. mnt->mnt.mnt_flags |= MNT_DOOMED;
  1277. rcu_read_unlock();
  1278. list_del(&mnt->mnt_instance);
  1279. if (unlikely(!list_empty(&mnt->mnt_mounts))) {
  1280. struct mount *p, *tmp;
  1281. list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
  1282. __put_mountpoint(unhash_mnt(p), &list);
  1283. hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
  1284. }
  1285. }
  1286. unlock_mount_hash();
  1287. shrink_dentry_list(&list);
  1288. if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
  1289. struct task_struct *task = current;
  1290. if (likely(!(task->flags & PF_KTHREAD))) {
  1291. init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
  1292. if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
  1293. return;
  1294. }
  1295. if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
  1296. schedule_delayed_work(&delayed_mntput_work, 1);
  1297. return;
  1298. }
  1299. cleanup_mnt(mnt);
  1300. }
  1301. void mntput(struct vfsmount *mnt)
  1302. {
  1303. if (mnt) {
  1304. struct mount *m = real_mount(mnt);
  1305. /* avoid cacheline pingpong */
  1306. if (unlikely(m->mnt_expiry_mark))
  1307. WRITE_ONCE(m->mnt_expiry_mark, 0);
  1308. mntput_no_expire(m);
  1309. }
  1310. }
  1311. EXPORT_SYMBOL(mntput);
  1312. struct vfsmount *mntget(struct vfsmount *mnt)
  1313. {
  1314. if (mnt)
  1315. mnt_add_count(real_mount(mnt), 1);
  1316. return mnt;
  1317. }
  1318. EXPORT_SYMBOL(mntget);
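/*
 * mntget()/mntput() calls must pair exactly. The common pattern of
 * pinning a whole struct path (a sketch; path_get()/path_put() wrap
 * exactly this pair of operations):
 *
 *    mntget(path->mnt);    // pin the mount
 *    dget(path->dentry);   // pin the dentry
 *    ...
 *    dput(path->dentry);
 *    mntput(path->mnt);
 */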
  1319. /*
  1320. * Make a mount point inaccessible to new lookups.
  1321. * Because there may still be current users, the caller MUST WAIT
  1322. * for an RCU grace period before destroying the mount point.
  1323. */
  1324. void mnt_make_shortterm(struct vfsmount *mnt)
  1325. {
  1326. if (mnt)
  1327. real_mount(mnt)->mnt_ns = NULL;
  1328. }
  1329. /**
  1330. * path_is_mountpoint() - Check if path is a mount in the current namespace.
  1331. * @path: path to check
  1332. *
  1333. * d_mountpoint() can only be used reliably to establish if a dentry is
  1334. * not mounted in any namespace and that common case is handled inline.
  1335. * d_mountpoint() isn't aware of the possibility there may be multiple
  1336. * mounts using a given dentry in a different namespace. This function
  1337. * checks if the passed in path is a mountpoint rather than the dentry
  1338. * alone.
  1339. */
  1340. bool path_is_mountpoint(const struct path *path)
  1341. {
  1342. unsigned seq;
  1343. bool res;
  1344. if (!d_mountpoint(path->dentry))
  1345. return false;
  1346. rcu_read_lock();
  1347. do {
  1348. seq = read_seqbegin(&mount_lock);
  1349. res = __path_is_mountpoint(path);
  1350. } while (read_seqretry(&mount_lock, seq));
  1351. rcu_read_unlock();
  1352. return res;
  1353. }
  1354. EXPORT_SYMBOL(path_is_mountpoint);
  1355. struct vfsmount *mnt_clone_internal(const struct path *path)
  1356. {
  1357. struct mount *p;
  1358. p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
  1359. if (IS_ERR(p))
  1360. return ERR_CAST(p);
  1361. p->mnt.mnt_flags |= MNT_INTERNAL;
  1362. return &p->mnt;
  1363. }
  1364. /*
1365. * Returns the mount which either has the specified mnt_id, or, failing that,
1366. * the mount with the smallest id greater than the specified one.
  1367. */
  1368. static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
  1369. {
  1370. struct rb_node *node = ns->mounts.rb_node;
  1371. struct mount *ret = NULL;
  1372. while (node) {
  1373. struct mount *m = node_to_mount(node);
  1374. if (mnt_id <= m->mnt_id_unique) {
  1375. ret = node_to_mount(node);
  1376. if (mnt_id == m->mnt_id_unique)
  1377. break;
  1378. node = node->rb_left;
  1379. } else {
  1380. node = node->rb_right;
  1381. }
  1382. }
  1383. return ret;
  1384. }
  1385. /*
1386. * Returns the mount which either has the specified mnt_id, or, failing that,
1387. * the mount with the largest id smaller than the specified one.
  1388. */
  1389. static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
  1390. {
  1391. struct rb_node *node = ns->mounts.rb_node;
  1392. struct mount *ret = NULL;
  1393. while (node) {
  1394. struct mount *m = node_to_mount(node);
  1395. if (mnt_id >= m->mnt_id_unique) {
  1396. ret = node_to_mount(node);
  1397. if (mnt_id == m->mnt_id_unique)
  1398. break;
  1399. node = node->rb_right;
  1400. } else {
  1401. node = node->rb_left;
  1402. }
  1403. }
  1404. return ret;
  1405. }
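/*
 * A worked example of the two lookups above: with mounts whose
 * mnt_id_unique values are {3, 5, 9},
 *
 *    mnt_find_id_at(ns, 5)            returns mount 5 (exact match)
 *    mnt_find_id_at(ns, 4)            returns mount 5 (smallest id above)
 *    mnt_find_id_at_reverse(ns, 4)    returns mount 3 (largest id below)
 *    mnt_find_id_at(ns, 10)           returns NULL
 */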
  1406. #ifdef CONFIG_PROC_FS
  1407. /* iterator; we want it to have access to namespace_sem, thus here... */
  1408. static void *m_start(struct seq_file *m, loff_t *pos)
  1409. {
  1410. struct proc_mounts *p = m->private;
  1411. down_read(&namespace_sem);
  1412. return mnt_find_id_at(p->ns, *pos);
  1413. }
  1414. static void *m_next(struct seq_file *m, void *v, loff_t *pos)
  1415. {
  1416. struct mount *next = NULL, *mnt = v;
  1417. struct rb_node *node = rb_next(&mnt->mnt_node);
  1418. ++*pos;
  1419. if (node) {
  1420. next = node_to_mount(node);
  1421. *pos = next->mnt_id_unique;
  1422. }
  1423. return next;
  1424. }
  1425. static void m_stop(struct seq_file *m, void *v)
  1426. {
  1427. up_read(&namespace_sem);
  1428. }
  1429. static int m_show(struct seq_file *m, void *v)
  1430. {
  1431. struct proc_mounts *p = m->private;
  1432. struct mount *r = v;
  1433. return p->show(m, &r->mnt);
  1434. }
  1435. const struct seq_operations mounts_op = {
  1436. .start = m_start,
  1437. .next = m_next,
  1438. .stop = m_stop,
  1439. .show = m_show,
  1440. };
  1441. #endif /* CONFIG_PROC_FS */
  1442. /**
  1443. * may_umount_tree - check if a mount tree is busy
  1444. * @m: root of mount tree
  1445. *
  1446. * This is called to check if a tree of mounts has any
  1447. * open files, pwds, chroots or sub mounts that are
  1448. * busy.
  1449. */
  1450. int may_umount_tree(struct vfsmount *m)
  1451. {
  1452. struct mount *mnt = real_mount(m);
  1453. int actual_refs = 0;
  1454. int minimum_refs = 0;
  1455. struct mount *p;
  1456. BUG_ON(!m);
  1457. /* write lock needed for mnt_get_count */
  1458. lock_mount_hash();
  1459. for (p = mnt; p; p = next_mnt(p, mnt)) {
  1460. actual_refs += mnt_get_count(p);
  1461. minimum_refs += 2;
  1462. }
  1463. unlock_mount_hash();
  1464. if (actual_refs > minimum_refs)
  1465. return 0;
  1466. return 1;
  1467. }
  1468. EXPORT_SYMBOL(may_umount_tree);
  1469. /**
  1470. * may_umount - check if a mount point is busy
  1471. * @mnt: root of mount
  1472. *
  1473. * This is called to check if a mount point has any
  1474. * open files, pwds, chroots or sub mounts. If the
  1475. * mount has sub mounts this will return busy
  1476. * regardless of whether the sub mounts are busy.
  1477. *
  1478. * Doesn't take quota and stuff into account. IOW, in some cases it will
  1479. * give false negatives. The main reason why it's here is that we need
  1480. * a non-destructive way to look for easily umountable filesystems.
  1481. */
  1482. int may_umount(struct vfsmount *mnt)
  1483. {
  1484. int ret = 1;
  1485. down_read(&namespace_sem);
  1486. lock_mount_hash();
  1487. if (propagate_mount_busy(real_mount(mnt), 2))
  1488. ret = 0;
  1489. unlock_mount_hash();
  1490. up_read(&namespace_sem);
  1491. return ret;
  1492. }
  1493. EXPORT_SYMBOL(may_umount);
  1494. static void namespace_unlock(void)
  1495. {
  1496. struct hlist_head head;
  1497. struct hlist_node *p;
  1498. struct mount *m;
  1499. LIST_HEAD(list);
  1500. hlist_move_list(&unmounted, &head);
  1501. list_splice_init(&ex_mountpoints, &list);
  1502. up_write(&namespace_sem);
  1503. shrink_dentry_list(&list);
  1504. if (likely(hlist_empty(&head)))
  1505. return;
  1506. synchronize_rcu_expedited();
  1507. hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
  1508. hlist_del(&m->mnt_umount);
  1509. mntput(&m->mnt);
  1510. }
  1511. }
  1512. static inline void namespace_lock(void)
  1513. {
  1514. down_write(&namespace_sem);
  1515. }
  1516. enum umount_tree_flags {
  1517. UMOUNT_SYNC = 1,
  1518. UMOUNT_PROPAGATE = 2,
  1519. UMOUNT_CONNECTED = 4,
  1520. };
  1521. static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
  1522. {
  1523. /* Leaving mounts connected is only valid for lazy umounts */
  1524. if (how & UMOUNT_SYNC)
  1525. return true;
  1526. /* A mount without a parent has nothing to be connected to */
  1527. if (!mnt_has_parent(mnt))
  1528. return true;
  1529. /* Because the reference counting rules change when mounts are
  1530. * unmounted and connected, umounted mounts may not be
  1531. * connected to mounted mounts.
  1532. */
  1533. if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
  1534. return true;
  1535. /* Has it been requested that the mount remain connected? */
  1536. if (how & UMOUNT_CONNECTED)
  1537. return false;
  1538. /* Is the mount locked such that it needs to remain connected? */
  1539. if (IS_MNT_LOCKED(mnt))
  1540. return false;
  1541. /* By default disconnect the mount */
  1542. return true;
  1543. }
  1544. /*
  1545. * mount_lock must be held
  1546. * namespace_sem must be held for write
  1547. */
  1548. static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
  1549. {
  1550. LIST_HEAD(tmp_list);
  1551. struct mount *p;
  1552. if (how & UMOUNT_PROPAGATE)
  1553. propagate_mount_unlock(mnt);
  1554. /* Gather the mounts to umount */
  1555. for (p = mnt; p; p = next_mnt(p, mnt)) {
  1556. p->mnt.mnt_flags |= MNT_UMOUNT;
  1557. if (mnt_ns_attached(p))
  1558. move_from_ns(p, &tmp_list);
  1559. else
  1560. list_move(&p->mnt_list, &tmp_list);
  1561. }
  1562. /* Hide the mounts from mnt_mounts */
  1563. list_for_each_entry(p, &tmp_list, mnt_list) {
  1564. list_del_init(&p->mnt_child);
  1565. }
  1566. /* Add propagated mounts to the tmp_list */
  1567. if (how & UMOUNT_PROPAGATE)
  1568. propagate_umount(&tmp_list);
  1569. while (!list_empty(&tmp_list)) {
  1570. struct mnt_namespace *ns;
  1571. bool disconnect;
  1572. p = list_first_entry(&tmp_list, struct mount, mnt_list);
  1573. list_del_init(&p->mnt_expire);
  1574. list_del_init(&p->mnt_list);
  1575. ns = p->mnt_ns;
  1576. if (ns) {
  1577. ns->nr_mounts--;
  1578. __touch_mnt_namespace(ns);
  1579. }
  1580. p->mnt_ns = NULL;
  1581. if (how & UMOUNT_SYNC)
  1582. p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
  1583. disconnect = disconnect_mount(p, how);
  1584. if (mnt_has_parent(p)) {
  1585. mnt_add_count(p->mnt_parent, -1);
  1586. if (!disconnect) {
  1587. /* Don't forget about p */
  1588. list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
  1589. } else {
  1590. umount_mnt(p);
  1591. }
  1592. }
  1593. change_mnt_propagation(p, MS_PRIVATE);
  1594. if (disconnect)
  1595. hlist_add_head(&p->mnt_umount, &unmounted);
  1596. }
  1597. }
  1598. static void shrink_submounts(struct mount *mnt);
  1599. static int do_umount_root(struct super_block *sb)
  1600. {
  1601. int ret = 0;
  1602. down_write(&sb->s_umount);
  1603. if (!sb_rdonly(sb)) {
  1604. struct fs_context *fc;
  1605. fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
  1606. SB_RDONLY);
  1607. if (IS_ERR(fc)) {
  1608. ret = PTR_ERR(fc);
  1609. } else {
  1610. ret = parse_monolithic_mount_data(fc, NULL);
  1611. if (!ret)
  1612. ret = reconfigure_super(fc);
  1613. put_fs_context(fc);
  1614. }
  1615. }
  1616. up_write(&sb->s_umount);
  1617. return ret;
  1618. }
  1619. static int do_umount(struct mount *mnt, int flags)
  1620. {
  1621. struct super_block *sb = mnt->mnt.mnt_sb;
  1622. int retval;
  1623. retval = security_sb_umount(&mnt->mnt, flags);
  1624. if (retval)
  1625. return retval;
  1626. /*
  1627. * Allow userspace to request a mountpoint be expired rather than
  1628. * unmounting unconditionally. Unmount only happens if:
  1629. * (1) the mark is already set (the mark is cleared by mntput())
  1630. * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
  1631. */
  1632. if (flags & MNT_EXPIRE) {
  1633. if (&mnt->mnt == current->fs->root.mnt ||
  1634. flags & (MNT_FORCE | MNT_DETACH))
  1635. return -EINVAL;
  1636. /*
  1637. * probably don't strictly need the lock here if we examined
  1638. * all race cases, but it's a slowpath.
  1639. */
  1640. lock_mount_hash();
  1641. if (mnt_get_count(mnt) != 2) {
  1642. unlock_mount_hash();
  1643. return -EBUSY;
  1644. }
  1645. unlock_mount_hash();
  1646. if (!xchg(&mnt->mnt_expiry_mark, 1))
  1647. return -EAGAIN;
  1648. }
  1649. /*
  1650. * If we may have to abort operations to get out of this
  1651. * mount, and they will themselves hold resources we must
  1652. * allow the fs to do things. In the Unix tradition of
  1653. * 'Gee thats tricky lets do it in userspace' the umount_begin
  1654. * might fail to complete on the first run through as other tasks
  1655. * must return, and the like. Thats for the mount program to worry
  1656. * about for the moment.
  1657. */
  1658. if (flags & MNT_FORCE && sb->s_op->umount_begin) {
  1659. sb->s_op->umount_begin(sb);
  1660. }
  1661. /*
  1662. * No sense to grab the lock for this test, but test itself looks
  1663. * somewhat bogus. Suggestions for better replacement?
  1664. * Ho-hum... In principle, we might treat that as umount + switch
  1665. * to rootfs. GC would eventually take care of the old vfsmount.
  1666. * Actually it makes sense, especially if rootfs would contain a
  1667. * /reboot - static binary that would close all descriptors and
1668. * call reboot(2). Then init(8) could umount root and exec /reboot.
  1669. */
  1670. if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
  1671. /*
  1672. * Special case for "unmounting" root ...
  1673. * we just try to remount it readonly.
  1674. */
  1675. if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
  1676. return -EPERM;
  1677. return do_umount_root(sb);
  1678. }
  1679. namespace_lock();
  1680. lock_mount_hash();
  1681. /* Recheck MNT_LOCKED with the locks held */
  1682. retval = -EINVAL;
  1683. if (mnt->mnt.mnt_flags & MNT_LOCKED)
  1684. goto out;
  1685. event++;
  1686. if (flags & MNT_DETACH) {
  1687. if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
  1688. umount_tree(mnt, UMOUNT_PROPAGATE);
  1689. retval = 0;
  1690. } else {
  1691. smp_mb(); // paired with __legitimize_mnt()
  1692. shrink_submounts(mnt);
  1693. retval = -EBUSY;
  1694. if (!propagate_mount_busy(mnt, 2)) {
  1695. if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
  1696. umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
  1697. retval = 0;
  1698. }
  1699. }
  1700. out:
  1701. unlock_mount_hash();
  1702. namespace_unlock();
  1703. return retval;
  1704. }
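/*
 * The MNT_EXPIRE protocol above is driven from userspace by calling
 * umount2(2) twice. A sketch (error handling elided):
 *
 *    #include <errno.h>
 *    #include <sys/mount.h>
 *
 *    // Returns 0 once the mount has actually gone away.
 *    int expire_mount(const char *target)
 *    {
 *        // First call sets the expiry mark and fails with EAGAIN.
 *        if (umount2(target, MNT_EXPIRE) == 0)
 *            return 0;
 *        if (errno != EAGAIN)
 *            return -1;
 *        // Second call succeeds only if nothing used the mount in
 *        // between (any mntput() clears the mark).
 *        return umount2(target, MNT_EXPIRE);
 *    }
 */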
  1705. /*
  1706. * __detach_mounts - lazily unmount all mounts on the specified dentry
  1707. *
1708. * During unlink, rmdir, and d_drop it is possible to lose the path
  1709. * to an existing mountpoint, and wind up leaking the mount.
  1710. * detach_mounts allows lazily unmounting those mounts instead of
  1711. * leaking them.
  1712. *
  1713. * The caller may hold dentry->d_inode->i_mutex.
  1714. */
  1715. void __detach_mounts(struct dentry *dentry)
  1716. {
  1717. struct mountpoint *mp;
  1718. struct mount *mnt;
  1719. namespace_lock();
  1720. lock_mount_hash();
  1721. mp = lookup_mountpoint(dentry);
  1722. if (!mp)
  1723. goto out_unlock;
  1724. event++;
  1725. while (!hlist_empty(&mp->m_list)) {
  1726. mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
  1727. if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
  1728. umount_mnt(mnt);
  1729. hlist_add_head(&mnt->mnt_umount, &unmounted);
  1730. }
  1731. else umount_tree(mnt, UMOUNT_CONNECTED);
  1732. }
  1733. put_mountpoint(mp);
  1734. out_unlock:
  1735. unlock_mount_hash();
  1736. namespace_unlock();
  1737. }
  1738. /*
  1739. * Is the caller allowed to modify his namespace?
  1740. */
  1741. bool may_mount(void)
  1742. {
  1743. return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
  1744. }
  1745. static void warn_mandlock(void)
  1746. {
  1747. pr_warn_once("=======================================================\n"
  1748. "WARNING: The mand mount option has been deprecated and\n"
  1749. " and is ignored by this kernel. Remove the mand\n"
  1750. " option from the mount to silence this warning.\n"
  1751. "=======================================================\n");
  1752. }
  1753. static int can_umount(const struct path *path, int flags)
  1754. {
  1755. struct mount *mnt = real_mount(path->mnt);
  1756. struct super_block *sb = path->dentry->d_sb;
  1757. if (!may_mount())
  1758. return -EPERM;
  1759. if (!path_mounted(path))
  1760. return -EINVAL;
  1761. if (!check_mnt(mnt))
  1762. return -EINVAL;
  1763. if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
  1764. return -EINVAL;
  1765. if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
  1766. return -EPERM;
  1767. return 0;
  1768. }
  1769. // caller is responsible for flags being sane
  1770. int path_umount(struct path *path, int flags)
  1771. {
  1772. struct mount *mnt = real_mount(path->mnt);
  1773. int ret;
  1774. ret = can_umount(path, flags);
  1775. if (!ret)
  1776. ret = do_umount(mnt, flags);
  1777. /* we mustn't call path_put() as that would clear mnt_expiry_mark */
  1778. dput(path->dentry);
  1779. mntput_no_expire(mnt);
  1780. return ret;
  1781. }
  1782. static int ksys_umount(char __user *name, int flags)
  1783. {
  1784. int lookup_flags = LOOKUP_MOUNTPOINT;
  1785. struct path path;
  1786. int ret;
  1787. // basic validity checks done first
  1788. if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
  1789. return -EINVAL;
  1790. if (!(flags & UMOUNT_NOFOLLOW))
  1791. lookup_flags |= LOOKUP_FOLLOW;
  1792. ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
  1793. if (ret)
  1794. return ret;
  1795. return path_umount(&path, flags);
  1796. }
  1797. SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
  1798. {
  1799. return ksys_umount(name, flags);
  1800. }
  1801. #ifdef __ARCH_WANT_SYS_OLDUMOUNT
  1802. /*
  1803. * The 2.0 compatible umount. No flags.
  1804. */
  1805. SYSCALL_DEFINE1(oldumount, char __user *, name)
  1806. {
  1807. return ksys_umount(name, 0);
  1808. }
  1809. #endif
  1810. static bool is_mnt_ns_file(struct dentry *dentry)
  1811. {
  1812. struct ns_common *ns;
  1813. /* Is this a proxy for a mount namespace? */
  1814. if (dentry->d_op != &ns_dentry_operations)
  1815. return false;
  1816. ns = d_inode(dentry)->i_private;
  1817. return ns->ops == &mntns_operations;
  1818. }
  1819. struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
  1820. {
  1821. return &mnt->ns;
  1822. }
  1823. struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
  1824. {
  1825. guard(read_lock)(&mnt_ns_tree_lock);
  1826. for (;;) {
  1827. struct rb_node *node;
  1828. if (previous)
  1829. node = rb_prev(&mntns->mnt_ns_tree_node);
  1830. else
  1831. node = rb_next(&mntns->mnt_ns_tree_node);
  1832. if (!node)
  1833. return ERR_PTR(-ENOENT);
  1834. mntns = node_to_mnt_ns(node);
  1835. node = &mntns->mnt_ns_tree_node;
  1836. if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
  1837. continue;
  1838. /*
  1839. * Holding mnt_ns_tree_lock prevents the mount namespace from
1840. * being freed but it may well be on its deathbed. We want an
  1841. * active reference, not just a passive one here as we're
  1842. * persisting the mount namespace.
  1843. */
  1844. if (!refcount_inc_not_zero(&mntns->ns.count))
  1845. continue;
  1846. return mntns;
  1847. }
  1848. }
  1849. static bool mnt_ns_loop(struct dentry *dentry)
  1850. {
  1851. /* Could bind mounting the mount namespace inode cause a
  1852. * mount namespace loop?
  1853. */
  1854. struct mnt_namespace *mnt_ns;
  1855. if (!is_mnt_ns_file(dentry))
  1856. return false;
  1857. mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
  1858. return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
  1859. }
  1860. struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
  1861. int flag)
  1862. {
  1863. struct mount *res, *src_parent, *src_root_child, *src_mnt,
  1864. *dst_parent, *dst_mnt;
  1865. if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
  1866. return ERR_PTR(-EINVAL);
  1867. if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
  1868. return ERR_PTR(-EINVAL);
  1869. res = dst_mnt = clone_mnt(src_root, dentry, flag);
  1870. if (IS_ERR(dst_mnt))
  1871. return dst_mnt;
  1872. src_parent = src_root;
  1873. dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
  1874. list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
  1875. if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
  1876. continue;
  1877. for (src_mnt = src_root_child; src_mnt;
  1878. src_mnt = next_mnt(src_mnt, src_root_child)) {
  1879. if (!(flag & CL_COPY_UNBINDABLE) &&
  1880. IS_MNT_UNBINDABLE(src_mnt)) {
  1881. if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
  1882. /* Both unbindable and locked. */
  1883. dst_mnt = ERR_PTR(-EPERM);
  1884. goto out;
  1885. } else {
  1886. src_mnt = skip_mnt_tree(src_mnt);
  1887. continue;
  1888. }
  1889. }
  1890. if (!(flag & CL_COPY_MNT_NS_FILE) &&
  1891. is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
  1892. src_mnt = skip_mnt_tree(src_mnt);
  1893. continue;
  1894. }
  1895. while (src_parent != src_mnt->mnt_parent) {
  1896. src_parent = src_parent->mnt_parent;
  1897. dst_mnt = dst_mnt->mnt_parent;
  1898. }
  1899. src_parent = src_mnt;
  1900. dst_parent = dst_mnt;
  1901. dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
  1902. if (IS_ERR(dst_mnt))
  1903. goto out;
  1904. lock_mount_hash();
  1905. list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
  1906. attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
  1907. unlock_mount_hash();
  1908. }
  1909. }
  1910. return res;
  1911. out:
  1912. if (res) {
  1913. lock_mount_hash();
  1914. umount_tree(res, UMOUNT_SYNC);
  1915. unlock_mount_hash();
  1916. }
  1917. return dst_mnt;
  1918. }
  1919. /* Caller should check returned pointer for errors */
  1920. struct vfsmount *collect_mounts(const struct path *path)
  1921. {
  1922. struct mount *tree;
  1923. namespace_lock();
  1924. if (!check_mnt(real_mount(path->mnt)))
  1925. tree = ERR_PTR(-EINVAL);
  1926. else
  1927. tree = copy_tree(real_mount(path->mnt), path->dentry,
  1928. CL_COPY_ALL | CL_PRIVATE);
  1929. namespace_unlock();
  1930. if (IS_ERR(tree))
  1931. return ERR_CAST(tree);
  1932. return &tree->mnt;
  1933. }
  1934. static void free_mnt_ns(struct mnt_namespace *);
  1935. static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
  1936. void dissolve_on_fput(struct vfsmount *mnt)
  1937. {
  1938. struct mnt_namespace *ns;
  1939. namespace_lock();
  1940. lock_mount_hash();
  1941. ns = real_mount(mnt)->mnt_ns;
  1942. if (ns) {
  1943. if (is_anon_ns(ns))
  1944. umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
  1945. else
  1946. ns = NULL;
  1947. }
  1948. unlock_mount_hash();
  1949. namespace_unlock();
  1950. if (ns)
  1951. free_mnt_ns(ns);
  1952. }
  1953. void drop_collected_mounts(struct vfsmount *mnt)
  1954. {
  1955. namespace_lock();
  1956. lock_mount_hash();
  1957. umount_tree(real_mount(mnt), 0);
  1958. unlock_mount_hash();
  1959. namespace_unlock();
  1960. }
  1961. static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
  1962. {
  1963. struct mount *child;
  1964. list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
  1965. if (!is_subdir(child->mnt_mountpoint, dentry))
  1966. continue;
  1967. if (child->mnt.mnt_flags & MNT_LOCKED)
  1968. return true;
  1969. }
  1970. return false;
  1971. }
  1972. bool has_locked_children(struct mount *mnt, struct dentry *dentry)
  1973. {
  1974. bool res;
  1975. read_seqlock_excl(&mount_lock);
  1976. res = __has_locked_children(mnt, dentry);
  1977. read_sequnlock_excl(&mount_lock);
  1978. return res;
  1979. }
  1980. /**
  1981. * clone_private_mount - create a private clone of a path
  1982. * @path: path to clone
  1983. *
  1984. * This creates a new vfsmount, which will be the clone of @path. The new mount
  1985. * will not be attached anywhere in the namespace and will be private (i.e.
  1986. * changes to the originating mount won't be propagated into this).
  1987. *
  1988. * Release with mntput().
  1989. */
  1990. struct vfsmount *clone_private_mount(const struct path *path)
  1991. {
  1992. struct mount *old_mnt = real_mount(path->mnt);
  1993. struct mount *new_mnt;
  1994. down_read(&namespace_sem);
  1995. if (IS_MNT_UNBINDABLE(old_mnt))
  1996. goto invalid;
  1997. if (!check_mnt(old_mnt))
  1998. goto invalid;
  1999. if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) {
  2000. up_read(&namespace_sem);
  2001. return ERR_PTR(-EPERM);
  2002. }
  2003. if (__has_locked_children(old_mnt, path->dentry))
  2004. goto invalid;
  2005. new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
  2006. up_read(&namespace_sem);
  2007. if (IS_ERR(new_mnt))
  2008. return ERR_CAST(new_mnt);
  2009. /* Longterm mount to be removed by kern_unmount*() */
  2010. new_mnt->mnt_ns = MNT_NS_INTERNAL;
  2011. return &new_mnt->mnt;
  2012. invalid:
  2013. up_read(&namespace_sem);
  2014. return ERR_PTR(-EINVAL);
  2015. }
  2016. EXPORT_SYMBOL_GPL(clone_private_mount);
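/*
 * A minimal in-kernel usage sketch (hypothetical caller; stacking
 * filesystems such as overlayfs use this helper for their lower
 * layers):
 *
 *    struct vfsmount *m;
 *
 *    m = clone_private_mount(&lower_path);
 *    if (IS_ERR(m))
 *        return PTR_ERR(m);
 *    ...
 *    kern_unmount(m);    // matches the longterm-mount note above
 */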
  2017. int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
  2018. struct vfsmount *root)
  2019. {
  2020. struct mount *mnt;
  2021. int res = f(root, arg);
  2022. if (res)
  2023. return res;
  2024. list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
  2025. res = f(&mnt->mnt, arg);
  2026. if (res)
  2027. return res;
  2028. }
  2029. return 0;
  2030. }
  2031. static void lock_mnt_tree(struct mount *mnt)
  2032. {
  2033. struct mount *p;
  2034. for (p = mnt; p; p = next_mnt(p, mnt)) {
  2035. int flags = p->mnt.mnt_flags;
  2036. /* Don't allow unprivileged users to change mount flags */
  2037. flags |= MNT_LOCK_ATIME;
  2038. if (flags & MNT_READONLY)
  2039. flags |= MNT_LOCK_READONLY;
  2040. if (flags & MNT_NODEV)
  2041. flags |= MNT_LOCK_NODEV;
  2042. if (flags & MNT_NOSUID)
  2043. flags |= MNT_LOCK_NOSUID;
  2044. if (flags & MNT_NOEXEC)
  2045. flags |= MNT_LOCK_NOEXEC;
  2046. /* Don't allow unprivileged users to reveal what is under a mount */
  2047. if (list_empty(&p->mnt_expire))
  2048. flags |= MNT_LOCKED;
  2049. p->mnt.mnt_flags = flags;
  2050. }
  2051. }
  2052. static void cleanup_group_ids(struct mount *mnt, struct mount *end)
  2053. {
  2054. struct mount *p;
  2055. for (p = mnt; p != end; p = next_mnt(p, mnt)) {
  2056. if (p->mnt_group_id && !IS_MNT_SHARED(p))
  2057. mnt_release_group_id(p);
  2058. }
  2059. }
  2060. static int invent_group_ids(struct mount *mnt, bool recurse)
  2061. {
  2062. struct mount *p;
  2063. for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
  2064. if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
  2065. int err = mnt_alloc_group_id(p);
  2066. if (err) {
  2067. cleanup_group_ids(mnt, p);
  2068. return err;
  2069. }
  2070. }
  2071. }
  2072. return 0;
  2073. }
  2074. int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
  2075. {
  2076. unsigned int max = READ_ONCE(sysctl_mount_max);
  2077. unsigned int mounts = 0;
  2078. struct mount *p;
  2079. if (ns->nr_mounts >= max)
  2080. return -ENOSPC;
  2081. max -= ns->nr_mounts;
  2082. if (ns->pending_mounts >= max)
  2083. return -ENOSPC;
  2084. max -= ns->pending_mounts;
  2085. for (p = mnt; p; p = next_mnt(p, mnt))
  2086. mounts++;
  2087. if (mounts > max)
  2088. return -ENOSPC;
  2089. ns->pending_mounts += mounts;
  2090. return 0;
  2091. }
  2092. enum mnt_tree_flags_t {
  2093. MNT_TREE_MOVE = BIT(0),
  2094. MNT_TREE_BENEATH = BIT(1),
  2095. };
  2096. /**
  2097. * attach_recursive_mnt - attach a source mount tree
  2098. * @source_mnt: mount tree to be attached
  2099. * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
  2100. * @dest_mp: the mountpoint @source_mnt will be mounted at
  2101. * @flags: modify how @source_mnt is supposed to be attached
  2102. *
2103. * NOTE: the table below explains the semantics when a source mount
  2104. * of a given type is attached to a destination mount of a given type.
  2105. * ---------------------------------------------------------------------------
  2106. * | BIND MOUNT OPERATION |
  2107. * |**************************************************************************
  2108. * | source-->| shared | private | slave | unbindable |
  2109. * | dest | | | | |
  2110. * | | | | | | |
  2111. * | v | | | | |
  2112. * |**************************************************************************
  2113. * | shared | shared (++) | shared (+) | shared(+++)| invalid |
  2114. * | | | | | |
  2115. * |non-shared| shared (+) | private | slave (*) | invalid |
  2116. * ***************************************************************************
  2117. * A bind operation clones the source mount and mounts the clone on the
  2118. * destination mount.
  2119. *
  2120. * (++) the cloned mount is propagated to all the mounts in the propagation
  2121. * tree of the destination mount and the cloned mount is added to
  2122. * the peer group of the source mount.
  2123. * (+) the cloned mount is created under the destination mount and is marked
  2124. * as shared. The cloned mount is added to the peer group of the source
  2125. * mount.
  2126. * (+++) the mount is propagated to all the mounts in the propagation tree
  2127. * of the destination mount and the cloned mount is made slave
  2128. * of the same master as that of the source mount. The cloned mount
  2129. * is marked as 'shared and slave'.
  2130. * (*) the cloned mount is made a slave of the same master as that of the
  2131. * source mount.
  2132. *
  2133. * ---------------------------------------------------------------------------
  2134. * | MOVE MOUNT OPERATION |
  2135. * |**************************************************************************
  2136. * | source-->| shared | private | slave | unbindable |
  2137. * | dest | | | | |
  2138. * | | | | | | |
  2139. * | v | | | | |
  2140. * |**************************************************************************
  2141. * | shared | shared (+) | shared (+) | shared(+++) | invalid |
  2142. * | | | | | |
  2143. * |non-shared| shared (+*) | private | slave (*) | unbindable |
  2144. * ***************************************************************************
  2145. *
  2146. * (+) the mount is moved to the destination. And is then propagated to
  2147. * all the mounts in the propagation tree of the destination mount.
  2148. * (+*) the mount is moved to the destination.
  2149. * (+++) the mount is moved to the destination and is then propagated to
  2150. * all the mounts belonging to the destination mount's propagation tree.
  2151. * the mount is marked as 'shared and slave'.
  2152. * (*) the mount continues to be a slave at the new location.
  2153. *
2154. * if the source mount is a tree, the operations explained above are
2155. * applied to each mount in the tree.
  2156. * Must be called without spinlocks held, since this function can sleep
  2157. * in allocations.
  2158. *
  2159. * Context: The function expects namespace_lock() to be held.
  2160. * Return: If @source_mnt was successfully attached 0 is returned.
  2161. * Otherwise a negative error code is returned.
  2162. */
  2163. static int attach_recursive_mnt(struct mount *source_mnt,
  2164. struct mount *top_mnt,
  2165. struct mountpoint *dest_mp,
  2166. enum mnt_tree_flags_t flags)
  2167. {
  2168. struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
  2169. HLIST_HEAD(tree_list);
  2170. struct mnt_namespace *ns = top_mnt->mnt_ns;
  2171. struct mountpoint *smp;
  2172. struct mount *child, *dest_mnt, *p;
  2173. struct hlist_node *n;
  2174. int err = 0;
  2175. bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
  2176. /*
  2177. * Preallocate a mountpoint in case the new mounts need to be
  2178. * mounted beneath mounts on the same mountpoint.
  2179. */
  2180. smp = get_mountpoint(source_mnt->mnt.mnt_root);
  2181. if (IS_ERR(smp))
  2182. return PTR_ERR(smp);
  2183. /* Is there space to add these mounts to the mount namespace? */
  2184. if (!moving) {
  2185. err = count_mounts(ns, source_mnt);
  2186. if (err)
  2187. goto out;
  2188. }
  2189. if (beneath)
  2190. dest_mnt = top_mnt->mnt_parent;
  2191. else
  2192. dest_mnt = top_mnt;
  2193. if (IS_MNT_SHARED(dest_mnt)) {
  2194. err = invent_group_ids(source_mnt, true);
  2195. if (err)
  2196. goto out;
  2197. err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
  2198. }
  2199. lock_mount_hash();
  2200. if (err)
  2201. goto out_cleanup_ids;
  2202. if (IS_MNT_SHARED(dest_mnt)) {
  2203. for (p = source_mnt; p; p = next_mnt(p, source_mnt))
  2204. set_mnt_shared(p);
  2205. }
  2206. if (moving) {
  2207. if (beneath)
  2208. dest_mp = smp;
  2209. unhash_mnt(source_mnt);
  2210. attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
  2211. touch_mnt_namespace(source_mnt->mnt_ns);
  2212. } else {
  2213. if (source_mnt->mnt_ns) {
  2214. LIST_HEAD(head);
  2215. /* move from anon - the caller will destroy */
  2216. for (p = source_mnt; p; p = next_mnt(p, source_mnt))
  2217. move_from_ns(p, &head);
  2218. list_del_init(&head);
  2219. }
  2220. if (beneath)
  2221. mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
  2222. else
  2223. mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
  2224. commit_tree(source_mnt);
  2225. }
  2226. hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
  2227. struct mount *q;
  2228. hlist_del_init(&child->mnt_hash);
  2229. /* Notice when we are propagating across user namespaces */
  2230. if (child->mnt_parent->mnt_ns->user_ns != user_ns)
  2231. lock_mnt_tree(child);
  2232. child->mnt.mnt_flags &= ~MNT_LOCKED;
  2233. q = __lookup_mnt(&child->mnt_parent->mnt,
  2234. child->mnt_mountpoint);
  2235. if (q)
  2236. mnt_change_mountpoint(child, smp, q);
  2237. commit_tree(child);
  2238. }
  2239. put_mountpoint(smp);
  2240. unlock_mount_hash();
  2241. return 0;
  2242. out_cleanup_ids:
  2243. while (!hlist_empty(&tree_list)) {
  2244. child = hlist_entry(tree_list.first, struct mount, mnt_hash);
  2245. child->mnt_parent->mnt_ns->pending_mounts = 0;
  2246. umount_tree(child, UMOUNT_SYNC);
  2247. }
  2248. unlock_mount_hash();
  2249. cleanup_group_ids(source_mnt, NULL);
  2250. out:
  2251. ns->pending_mounts = 0;
  2252. read_seqlock_excl(&mount_lock);
  2253. put_mountpoint(smp);
  2254. read_sequnlock_excl(&mount_lock);
  2255. return err;
  2256. }
  2257. /**
  2258. * do_lock_mount - lock mount and mountpoint
  2259. * @path: target path
  2260. * @beneath: whether the intention is to mount beneath @path
  2261. *
  2262. * Follow the mount stack on @path until the top mount @mnt is found. If
2263. * the initial @path->{mnt,dentry} is a mountpoint, look up the first
  2264. * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
  2265. * until nothing is stacked on top of it anymore.
  2266. *
  2267. * Acquire the inode_lock() on the top mount's ->mnt_root to protect
  2268. * against concurrent removal of the new mountpoint from another mount
  2269. * namespace.
  2270. *
2271. * If @beneath is requested, the inode_lock() on @mnt's mountpoint
2272. * @mp on @mnt->mnt_parent must be acquired. This protects against a
2273. * concurrent unlink of @mp->m_dentry from another mount namespace
2274. * where @mnt doesn't have a child mount mounted on @mp. A concurrent
  2275. * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
  2276. * on top of it for @beneath.
  2277. *
  2278. * In addition, @beneath needs to make sure that @mnt hasn't been
  2279. * unmounted or moved from its current mountpoint in between dropping
  2280. * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
  2281. * being unmounted would be detected later by e.g., calling
  2282. * check_mnt(mnt) in the function it's called from. For the @beneath
  2283. * case however, it's useful to detect it directly in do_lock_mount().
  2284. * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
  2285. * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
  2286. * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
  2287. *
  2288. * Return: Either the target mountpoint on the top mount or the top
  2289. * mount's mountpoint.
  2290. */
  2291. static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
  2292. {
  2293. struct vfsmount *mnt = path->mnt;
  2294. struct dentry *dentry;
  2295. struct mountpoint *mp = ERR_PTR(-ENOENT);
  2296. struct path under = {};
  2297. for (;;) {
  2298. struct mount *m = real_mount(mnt);
  2299. if (beneath) {
  2300. path_put(&under);
  2301. read_seqlock_excl(&mount_lock);
  2302. under.mnt = mntget(&m->mnt_parent->mnt);
  2303. under.dentry = dget(m->mnt_mountpoint);
  2304. read_sequnlock_excl(&mount_lock);
  2305. dentry = under.dentry;
  2306. } else {
  2307. dentry = path->dentry;
  2308. }
  2309. inode_lock(dentry->d_inode);
  2310. namespace_lock();
  2311. if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
  2312. break; // not to be mounted on
  2313. if (beneath && unlikely(m->mnt_mountpoint != dentry ||
  2314. &m->mnt_parent->mnt != under.mnt)) {
  2315. namespace_unlock();
  2316. inode_unlock(dentry->d_inode);
  2317. continue; // got moved
  2318. }
  2319. mnt = lookup_mnt(path);
  2320. if (unlikely(mnt)) {
  2321. namespace_unlock();
  2322. inode_unlock(dentry->d_inode);
  2323. path_put(path);
  2324. path->mnt = mnt;
  2325. path->dentry = dget(mnt->mnt_root);
  2326. continue; // got overmounted
  2327. }
  2328. mp = get_mountpoint(dentry);
  2329. if (IS_ERR(mp))
  2330. break;
  2331. if (beneath) {
  2332. /*
  2333. * @under duplicates the references that will stay
  2334. * at least until namespace_unlock(), so the path_put()
  2335. * below is safe (and OK to do under namespace_lock -
  2336. * we are not dropping the final references here).
  2337. */
  2338. path_put(&under);
  2339. }
  2340. return mp;
  2341. }
  2342. namespace_unlock();
  2343. inode_unlock(dentry->d_inode);
  2344. if (beneath)
  2345. path_put(&under);
  2346. return mp;
  2347. }
  2348. static inline struct mountpoint *lock_mount(struct path *path)
  2349. {
  2350. return do_lock_mount(path, false);
  2351. }
  2352. static void unlock_mount(struct mountpoint *where)
  2353. {
  2354. inode_unlock(where->m_dentry->d_inode);
  2355. read_seqlock_excl(&mount_lock);
  2356. put_mountpoint(where);
  2357. read_sequnlock_excl(&mount_lock);
  2358. namespace_unlock();
  2359. }
  2360. static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
  2361. {
  2362. if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
  2363. return -EINVAL;
  2364. if (d_is_dir(mp->m_dentry) !=
  2365. d_is_dir(mnt->mnt.mnt_root))
  2366. return -ENOTDIR;
  2367. return attach_recursive_mnt(mnt, p, mp, 0);
  2368. }
  2369. static int may_change_propagation(const struct mount *m)
  2370. {
  2371. struct mnt_namespace *ns = m->mnt_ns;
  2372. // it must be mounted in some namespace
  2373. if (IS_ERR_OR_NULL(ns)) // is_mounted()
  2374. return -EINVAL;
  2375. // and the caller must be admin in userns of that namespace
  2376. if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
  2377. return -EPERM;
  2378. return 0;
  2379. }
  2380. /*
  2381. * Sanity check the flags to change_mnt_propagation.
  2382. */
  2383. static int flags_to_propagation_type(int ms_flags)
  2384. {
  2385. int type = ms_flags & ~(MS_REC | MS_SILENT);
  2386. /* Fail if any non-propagation flags are set */
  2387. if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
  2388. return 0;
  2389. /* Only one propagation flag should be set */
  2390. if (!is_power_of_2(type))
  2391. return 0;
  2392. return type;
  2393. }
  2394. /*
  2395. * recursively change the type of the mountpoint.
  2396. */
  2397. static int do_change_type(struct path *path, int ms_flags)
  2398. {
  2399. struct mount *m;
  2400. struct mount *mnt = real_mount(path->mnt);
  2401. int recurse = ms_flags & MS_REC;
  2402. int type;
  2403. int err = 0;
  2404. if (!path_mounted(path))
  2405. return -EINVAL;
  2406. type = flags_to_propagation_type(ms_flags);
  2407. if (!type)
  2408. return -EINVAL;
  2409. namespace_lock();
  2410. err = may_change_propagation(mnt);
  2411. if (err)
  2412. goto out_unlock;
  2413. if (type == MS_SHARED) {
  2414. err = invent_group_ids(mnt, recurse);
  2415. if (err)
  2416. goto out_unlock;
  2417. }
  2418. lock_mount_hash();
  2419. for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
  2420. change_mnt_propagation(m, type);
  2421. unlock_mount_hash();
  2422. out_unlock:
  2423. namespace_unlock();
  2424. return err;
  2425. }
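/*
 * do_change_type() serves the classic userspace idiom for cutting off
 * propagation before building a container or private namespace. A
 * sketch (error handling elided):
 *
 *    #include <sys/mount.h>
 *
 *    // Recursively mark every mount private so that later mount and
 *    // umount activity no longer propagates to the original peers.
 *    mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
 */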
  2426. static struct mount *__do_loopback(struct path *old_path, int recurse)
  2427. {
  2428. struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
  2429. if (IS_MNT_UNBINDABLE(old))
  2430. return mnt;
  2431. if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
  2432. return mnt;
  2433. if (!recurse && __has_locked_children(old, old_path->dentry))
  2434. return mnt;
  2435. if (recurse)
  2436. mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
  2437. else
  2438. mnt = clone_mnt(old, old_path->dentry, 0);
  2439. if (!IS_ERR(mnt))
  2440. mnt->mnt.mnt_flags &= ~MNT_LOCKED;
  2441. return mnt;
  2442. }
  2443. /*
  2444. * do loopback mount.
  2445. */
  2446. static int do_loopback(struct path *path, const char *old_name,
  2447. int recurse)
  2448. {
  2449. struct path old_path;
  2450. struct mount *mnt = NULL, *parent;
  2451. struct mountpoint *mp;
  2452. int err;
  2453. if (!old_name || !*old_name)
  2454. return -EINVAL;
  2455. err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
  2456. if (err)
  2457. return err;
  2458. err = -EINVAL;
  2459. if (mnt_ns_loop(old_path.dentry))
  2460. goto out;
  2461. mp = lock_mount(path);
  2462. if (IS_ERR(mp)) {
  2463. err = PTR_ERR(mp);
  2464. goto out;
  2465. }
  2466. parent = real_mount(path->mnt);
  2467. if (!check_mnt(parent))
  2468. goto out2;
  2469. mnt = __do_loopback(&old_path, recurse);
  2470. if (IS_ERR(mnt)) {
  2471. err = PTR_ERR(mnt);
  2472. goto out2;
  2473. }
  2474. err = graft_tree(mnt, parent, mp);
  2475. if (err) {
  2476. lock_mount_hash();
  2477. umount_tree(mnt, UMOUNT_SYNC);
  2478. unlock_mount_hash();
  2479. }
  2480. out2:
  2481. unlock_mount(mp);
  2482. out:
  2483. path_put(&old_path);
  2484. return err;
  2485. }
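/*
 * do_loopback() backs plain bind mounts. A userspace sketch (error
 * handling elided):
 *
 *    #include <sys/mount.h>
 *
 *    mount("/src", "/dst", NULL, MS_BIND, NULL);           // mount --bind
 *    mount("/src", "/dst", NULL, MS_BIND | MS_REC, NULL);  // mount --rbind
 */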
  2486. static struct file *open_detached_copy(struct path *path, bool recursive)
  2487. {
  2488. struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
  2489. struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
  2490. struct mount *mnt, *p;
  2491. struct file *file;
  2492. if (IS_ERR(ns))
  2493. return ERR_CAST(ns);
  2494. namespace_lock();
  2495. mnt = __do_loopback(path, recursive);
  2496. if (IS_ERR(mnt)) {
  2497. namespace_unlock();
  2498. free_mnt_ns(ns);
  2499. return ERR_CAST(mnt);
  2500. }
  2501. lock_mount_hash();
  2502. for (p = mnt; p; p = next_mnt(p, mnt)) {
  2503. mnt_add_to_ns(ns, p);
  2504. ns->nr_mounts++;
  2505. }
  2506. ns->root = mnt;
  2507. mntget(&mnt->mnt);
  2508. unlock_mount_hash();
  2509. namespace_unlock();
  2510. mntput(path->mnt);
  2511. path->mnt = &mnt->mnt;
  2512. file = dentry_open(path, O_PATH, current_cred());
  2513. if (IS_ERR(file))
  2514. dissolve_on_fput(path->mnt);
  2515. else
  2516. file->f_mode |= FMODE_NEED_UNMOUNT;
  2517. return file;
  2518. }
  2519. SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
  2520. {
  2521. struct file *file;
  2522. struct path path;
  2523. int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
  2524. bool detached = flags & OPEN_TREE_CLONE;
  2525. int error;
  2526. int fd;
  2527. BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
  2528. if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
  2529. AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
  2530. OPEN_TREE_CLOEXEC))
  2531. return -EINVAL;
  2532. if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
  2533. return -EINVAL;
  2534. if (flags & AT_NO_AUTOMOUNT)
  2535. lookup_flags &= ~LOOKUP_AUTOMOUNT;
  2536. if (flags & AT_SYMLINK_NOFOLLOW)
  2537. lookup_flags &= ~LOOKUP_FOLLOW;
  2538. if (flags & AT_EMPTY_PATH)
  2539. lookup_flags |= LOOKUP_EMPTY;
  2540. if (detached && !may_mount())
  2541. return -EPERM;
  2542. fd = get_unused_fd_flags(flags & O_CLOEXEC);
  2543. if (fd < 0)
  2544. return fd;
  2545. error = user_path_at(dfd, filename, lookup_flags, &path);
  2546. if (unlikely(error)) {
  2547. file = ERR_PTR(error);
  2548. } else {
  2549. if (detached)
  2550. file = open_detached_copy(&path, flags & AT_RECURSIVE);
  2551. else
  2552. file = dentry_open(&path, O_PATH, current_cred());
  2553. path_put(&path);
  2554. }
  2555. if (IS_ERR(file)) {
  2556. put_unused_fd(fd);
  2557. return PTR_ERR(file);
  2558. }
  2559. fd_install(fd, file);
  2560. return fd;
  2561. }
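/*
 * open_tree(2) pairs with move_mount(2) in the new mount API: detach a
 * copy of a subtree, then attach it elsewhere. A sketch, assuming the
 * uapi headers expose the flags, with error handling elided:
 *
 *    #include <fcntl.h>
 *    #include <linux/mount.h>
 *    #include <sys/syscall.h>
 *    #include <unistd.h>
 *
 *    int fd = syscall(SYS_open_tree, AT_FDCWD, "/src",
 *                     OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
 *    syscall(SYS_move_mount, fd, "", AT_FDCWD, "/dst",
 *            MOVE_MOUNT_F_EMPTY_PATH);
 *    close(fd);
 */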

/*
 * Don't allow locked mount flags to be cleared.
 *
 * No locks need to be held here while testing the various MNT_LOCK
 * flags because those flags can never be cleared once they are set.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
	unsigned int fl = mnt->mnt.mnt_flags;

	if ((fl & MNT_LOCK_READONLY) &&
	    !(mnt_flags & MNT_READONLY))
		return false;

	if ((fl & MNT_LOCK_NODEV) &&
	    !(mnt_flags & MNT_NODEV))
		return false;

	if ((fl & MNT_LOCK_NOSUID) &&
	    !(mnt_flags & MNT_NOSUID))
		return false;

	if ((fl & MNT_LOCK_NOEXEC) &&
	    !(mnt_flags & MNT_NOEXEC))
		return false;

	if ((fl & MNT_LOCK_ATIME) &&
	    ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
		return false;

	return true;
}

static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
	bool readonly_request = (mnt_flags & MNT_READONLY);

	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
		return 0;

	if (readonly_request)
		return mnt_make_readonly(mnt);

	mnt->mnt.mnt_flags &= ~MNT_READONLY;
	return 0;
}

static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
	mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
	mnt->mnt.mnt_flags = mnt_flags;
	touch_mnt_namespace(mnt->mnt_ns);
}

static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
	struct super_block *sb = mnt->mnt_sb;

	if (!__mnt_is_readonly(mnt) &&
	    (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
	    (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
		char *buf, *mntpath;

		buf = (char *)__get_free_page(GFP_KERNEL);
		if (buf)
			mntpath = d_path(mountpoint, buf, PAGE_SIZE);
		else
			mntpath = ERR_PTR(-ENOMEM);
		if (IS_ERR(mntpath))
			mntpath = "(unknown)";

		pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
			sb->s_type->name,
			is_mounted(mnt) ? "remounted" : "mounted",
			mntpath, &sb->s_time_max,
			(unsigned long long)sb->s_time_max);

		sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
		if (buf)
			free_page((unsigned long)buf);
	}
}

/*
 * Handle reconfiguration of the mountpoint only without alteration of the
 * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
 * to mount(2).
 */
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);
	int ret;

	if (!check_mnt(mnt))
		return -EINVAL;

	if (!path_mounted(path))
		return -EINVAL;

	if (!can_change_locked_flags(mnt, mnt_flags))
		return -EPERM;

	/*
	 * We're only checking whether the superblock is read-only not
	 * changing it, so only take down_read(&sb->s_umount).
	 */
	down_read(&sb->s_umount);
	lock_mount_hash();
	ret = change_mount_ro_state(mnt, mnt_flags);
	if (ret == 0)
		set_mount_attributes(mnt, mnt_flags);
	unlock_mount_hash();
	up_read(&sb->s_umount);

	mnt_warn_timestamp_expiry(path, &mnt->mnt);

	return ret;
}
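
/*
 * Illustrative userspace trigger (example paths): making a single bind
 * mount read-only via MS_REMOUNT|MS_BIND, without touching the superblock
 * or any other mount of the same filesystem:
 *
 *	#include <sys/mount.h>
 *
 *	mount("/mnt/data", "/mnt/ro", NULL, MS_BIND, NULL);
 *	mount(NULL, "/mnt/ro", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
 */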

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
		      int mnt_flags, void *data)
{
	int err;
	struct super_block *sb = path->mnt->mnt_sb;
	struct mount *mnt = real_mount(path->mnt);
	struct fs_context *fc;

	if (!check_mnt(mnt))
		return -EINVAL;

	if (!path_mounted(path))
		return -EINVAL;

	if (!can_change_locked_flags(mnt, mnt_flags))
		return -EPERM;

	fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
	if (IS_ERR(fc))
		return PTR_ERR(fc);

	/*
	 * Indicate to the filesystem that the remount request is coming
	 * from the legacy mount system call.
	 */
	fc->oldapi = true;

	err = parse_monolithic_mount_data(fc, data);
	if (!err) {
		down_write(&sb->s_umount);
		err = -EPERM;
		if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
			err = reconfigure_super(fc);
			if (!err) {
				lock_mount_hash();
				set_mount_attributes(mnt, mnt_flags);
				unlock_mount_hash();
			}
		}
		up_write(&sb->s_umount);
	}

	mnt_warn_timestamp_expiry(path, &mnt->mnt);

	put_fs_context(fc);
	return err;
}

static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;

	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}

/*
 * Check that there aren't references to earlier/same mount namespaces in the
 * specified subtree.  Such references can act as pins for mount namespaces
 * that aren't checked by the mount-cycle checking code, thereby allowing
 * cycles to be made.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
	struct mount *p;
	bool ret = false;

	lock_mount_hash();
	for (p = subtree; p; p = next_mnt(p, subtree))
		if (mnt_ns_loop(p->mnt.mnt_root))
			goto out;

	ret = true;
out:
	unlock_mount_hash();
	return ret;
}

static int do_set_group(struct path *from_path, struct path *to_path)
{
	struct mount *from, *to;
	int err;

	from = real_mount(from_path->mnt);
	to = real_mount(to_path->mnt);

	namespace_lock();

	err = may_change_propagation(from);
	if (err)
		goto out;
	err = may_change_propagation(to);
	if (err)
		goto out;

	err = -EINVAL;
	/* To and From paths should be mount roots */
	if (!path_mounted(from_path))
		goto out;
	if (!path_mounted(to_path))
		goto out;

	/* Setting sharing groups is only allowed across same superblock */
	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
		goto out;

	/* From mount root should be wider than To mount root */
	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
		goto out;

	/* From mount should not have locked children in place of To's root */
	if (__has_locked_children(from, to->mnt.mnt_root))
		goto out;

	/* Setting sharing groups is only allowed on private mounts */
	if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
		goto out;

	/* From should not be private */
	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
		goto out;

	if (IS_MNT_SLAVE(from)) {
		struct mount *m = from->mnt_master;

		list_add(&to->mnt_slave, &from->mnt_slave);
		to->mnt_master = m;
	}

	if (IS_MNT_SHARED(from)) {
		to->mnt_group_id = from->mnt_group_id;
		list_add(&to->mnt_share, &from->mnt_share);
		lock_mount_hash();
		set_mnt_shared(to);
		unlock_mount_hash();
	}

	err = 0;
out:
	namespace_unlock();
	return err;
}
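
/*
 * Illustrative userspace counterpart (example paths): move_mount(2) with
 * MOVE_MOUNT_SET_GROUP ends up here, copying the sharing group of one
 * mount of a superblock onto another, still-private mount of the same
 * superblock:
 *
 *	syscall(SYS_move_mount, AT_FDCWD, "/mnt/shared-src",
 *		AT_FDCWD, "/mnt/private-clone", MOVE_MOUNT_SET_GROUP);
 */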

/**
 * path_overmounted - check if path is overmounted
 * @path: path to check
 *
 * Check if path is overmounted, i.e., if there's a mount on top of
 * @path->mnt with @path->dentry as mountpoint.
 *
 * Context: namespace_sem must be held at least shared.
 * MUST NOT be called under lock_mount_hash() (there one should just
 * call __lookup_mnt() and check if it returns NULL).
 * Return: If path is overmounted true is returned, false if not.
 */
static inline bool path_overmounted(const struct path *path)
{
	unsigned seq = read_seqbegin(&mount_lock);
	bool no_child;

	rcu_read_lock();
	no_child = !__lookup_mnt(path->mnt, path->dentry);
	rcu_read_unlock();
	if (need_seqretry(&mount_lock, seq)) {
		read_seqlock_excl(&mount_lock);
		no_child = !__lookup_mnt(path->mnt, path->dentry);
		read_sequnlock_excl(&mount_lock);
	}
	return unlikely(!no_child);
}

/**
 * can_move_mount_beneath - check that we can mount beneath the top mount
 * @from: mount to mount beneath
 * @to: mount under which to mount
 * @mp: mountpoint of @to
 *
 * - Make sure that @to->dentry is actually the root of a mount under
 *   which we can mount another mount.
 * - Make sure that nothing can be mounted beneath the caller's current
 *   root or the rootfs of the namespace.
 * - Make sure that the caller can unmount the topmost mount ensuring
 *   that the caller could reveal the underlying mountpoint.
 * - Ensure that nothing has been mounted on top of @from before we
 *   grabbed @namespace_sem to avoid creating pointless shadow mounts.
 * - Prevent mounting beneath a mount if the propagation relationship
 *   between the source mount, parent mount, and top mount would lead to
 *   nonsensical mount trees.
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: On success 0, and on error a negative error code is returned.
 */
static int can_move_mount_beneath(const struct path *from,
				  const struct path *to,
				  const struct mountpoint *mp)
{
	struct mount *mnt_from = real_mount(from->mnt),
		     *mnt_to = real_mount(to->mnt),
		     *parent_mnt_to = mnt_to->mnt_parent;

	if (!mnt_has_parent(mnt_to))
		return -EINVAL;

	if (!path_mounted(to))
		return -EINVAL;

	if (IS_MNT_LOCKED(mnt_to))
		return -EINVAL;

	/* Avoid creating shadow mounts during mount propagation. */
	if (path_overmounted(from))
		return -EINVAL;

	/*
	 * Mounting beneath the rootfs only makes sense when the
	 * semantics of pivot_root(".", ".") are used.
	 */
	if (&mnt_to->mnt == current->fs->root.mnt)
		return -EINVAL;
	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
		return -EINVAL;

	for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
		if (p == mnt_to)
			return -EINVAL;

	/*
	 * If the parent mount propagates to the child mount this would
	 * mean mounting @mnt_from on @mnt_to->mnt_parent and then
	 * propagating a copy @c of @mnt_from on top of @mnt_to. This
	 * defeats the whole purpose of mounting beneath another mount.
	 */
	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
		return -EINVAL;

	/*
	 * If @mnt_to->mnt_parent propagates to @mnt_from this would
	 * mean propagating a copy @c of @mnt_from on top of @mnt_from.
	 * Afterwards @mnt_from would be mounted on top of
	 * @mnt_to->mnt_parent and @mnt_to would be unmounted from
	 * @mnt_to->mnt_parent and remounted on @mnt_from. But since @c is
	 * already mounted on @mnt_from, @mnt_to would ultimately be
	 * remounted on top of @c. Afterwards, @mnt_from would be
	 * covered by a copy @c of @mnt_from and @c would be covered by
	 * @mnt_from itself. This defeats the whole purpose of mounting
	 * @mnt_from beneath @mnt_to.
	 */
	if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
		return -EINVAL;

	return 0;
}
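
/*
 * Illustrative userspace usage (example paths): mounting beneath the top
 * mount lets a mount be replaced atomically: attach the new mount
 * underneath, then lazily unmount the old one on top:
 *
 *	int fd = syscall(SYS_open_tree, AT_FDCWD, "/mnt/new",
 *			 OPEN_TREE_CLONE | AT_RECURSIVE);
 *	syscall(SYS_move_mount, fd, "", AT_FDCWD, "/mnt/target",
 *		MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
 *	umount2("/mnt/target", MNT_DETACH);	// reveals the new mount
 */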

static int do_move_mount(struct path *old_path, struct path *new_path,
			 bool beneath)
{
	struct mnt_namespace *ns;
	struct mount *p;
	struct mount *old;
	struct mount *parent;
	struct mountpoint *mp, *old_mp;
	int err;
	bool attached;
	enum mnt_tree_flags_t flags = 0;

	mp = do_lock_mount(new_path, beneath);
	if (IS_ERR(mp))
		return PTR_ERR(mp);

	old = real_mount(old_path->mnt);
	p = real_mount(new_path->mnt);
	parent = old->mnt_parent;
	attached = mnt_has_parent(old);
	if (attached)
		flags |= MNT_TREE_MOVE;
	old_mp = old->mnt_mp;
	ns = old->mnt_ns;

	err = -EINVAL;
	/* The mountpoint must be in our namespace. */
	if (!check_mnt(p))
		goto out;

	/* The thing moved must be mounted... */
	if (!is_mounted(&old->mnt))
		goto out;

	/* ... and either ours or the root of anon namespace */
	if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
		goto out;

	if (old->mnt.mnt_flags & MNT_LOCKED)
		goto out;

	if (!path_mounted(old_path))
		goto out;

	if (d_is_dir(new_path->dentry) !=
	    d_is_dir(old_path->dentry))
		goto out;
	/*
	 * Don't move a mount residing in a shared parent.
	 */
	if (attached && IS_MNT_SHARED(parent))
		goto out;

	if (beneath) {
		err = can_move_mount_beneath(old_path, new_path, mp);
		if (err)
			goto out;

		err = -EINVAL;
		p = p->mnt_parent;
		flags |= MNT_TREE_BENEATH;
	}

	/*
	 * Don't move a mount tree containing unbindable mounts to a destination
	 * mount which is shared.
	 */
	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
		goto out;
	err = -ELOOP;
	if (!check_for_nsfs_mounts(old))
		goto out;
	for (; mnt_has_parent(p); p = p->mnt_parent)
		if (p == old)
			goto out;

	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
	if (err)
		goto out;

	/* if the mount is moved, it should no longer expire
	 * automatically */
	list_del_init(&old->mnt_expire);
	if (attached)
		put_mountpoint(old_mp);
out:
	unlock_mount(mp);
	if (!err) {
		if (attached)
			mntput_no_expire(parent);
		else
			free_mnt_ns(ns);
	}
	return err;
}

static int do_move_mount_old(struct path *path, const char *old_name)
{
	struct path old_path;
	int err;

	if (!old_name || !*old_name)
		return -EINVAL;

	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
	if (err)
		return err;

	err = do_move_mount(&old_path, path, false);
	path_put(&old_path);
	return err;
}

/*
 * add a mount into a namespace's mount tree
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
			const struct path *path, int mnt_flags)
{
	struct mount *parent = real_mount(path->mnt);

	mnt_flags &= ~MNT_INTERNAL_FLAGS;

	if (unlikely(!check_mnt(parent))) {
		/* that's acceptable only for automounts done in private ns */
		if (!(mnt_flags & MNT_SHRINKABLE))
			return -EINVAL;
		/* ... and for those we'd better have mountpoint still alive */
		if (!parent->mnt_ns)
			return -EINVAL;
	}

	/* Refuse the same filesystem on the same mount point */
	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
		return -EBUSY;

	if (d_is_symlink(newmnt->mnt.mnt_root))
		return -EINVAL;

	newmnt->mnt.mnt_flags = mnt_flags;
	return graft_tree(newmnt, parent, mp);
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);

/*
 * Create a new mount using a superblock configuration and request it
 * be added to the namespace tree.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
			   unsigned int mnt_flags)
{
	struct vfsmount *mnt;
	struct mountpoint *mp;
	struct super_block *sb = fc->root->d_sb;
	int error;

	error = security_sb_kern_mount(sb);
	if (!error && mount_too_revealing(sb, &mnt_flags))
		error = -EPERM;

	if (unlikely(error)) {
		fc_drop_locked(fc);
		return error;
	}

	up_write(&sb->s_umount);

	mnt = vfs_create_mount(fc);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	mnt_warn_timestamp_expiry(mountpoint, mnt);

	mp = lock_mount(mountpoint);
	if (IS_ERR(mp)) {
		mntput(mnt);
		return PTR_ERR(mp);
	}
	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
	unlock_mount(mp);
	if (error < 0)
		mntput(mnt);
	return error;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
			int mnt_flags, const char *name, void *data)
{
	struct file_system_type *type;
	struct fs_context *fc;
	const char *subtype = NULL;
	int err = 0;

	if (!fstype)
		return -EINVAL;

	type = get_fs_type(fstype);
	if (!type)
		return -ENODEV;

	if (type->fs_flags & FS_HAS_SUBTYPE) {
		subtype = strchr(fstype, '.');
		if (subtype) {
			subtype++;
			if (!*subtype) {
				put_filesystem(type);
				return -EINVAL;
			}
		}
	}

	fc = fs_context_for_mount(type, sb_flags);
	put_filesystem(type);
	if (IS_ERR(fc))
		return PTR_ERR(fc);

	/*
	 * Indicate to the filesystem that the mount request is coming
	 * from the legacy mount system call.
	 */
	fc->oldapi = true;

	if (subtype)
		err = vfs_parse_fs_string(fc, "subtype",
					  subtype, strlen(subtype));
	if (!err && name)
		err = vfs_parse_fs_string(fc, "source", name, strlen(name));
	if (!err)
		err = parse_monolithic_mount_data(fc, data);
	if (!err && !mount_capable(fc))
		err = -EPERM;
	if (!err)
		err = vfs_get_tree(fc);
	if (!err)
		err = do_new_mount_fc(fc, path, mnt_flags);

	put_fs_context(fc);
	return err;
}

int finish_automount(struct vfsmount *m, const struct path *path)
{
	struct dentry *dentry = path->dentry;
	struct mountpoint *mp;
	struct mount *mnt;
	int err;

	if (!m)
		return 0;
	if (IS_ERR(m))
		return PTR_ERR(m);

	mnt = real_mount(m);
	/* The new mount record should have at least 2 refs to prevent it being
	 * expired before we get a chance to add it
	 */
	BUG_ON(mnt_get_count(mnt) < 2);

	if (m->mnt_sb == path->mnt->mnt_sb &&
	    m->mnt_root == dentry) {
		err = -ELOOP;
		goto discard;
	}

	/*
	 * we don't want to use lock_mount() - in this case finding something
	 * that overmounts our mountpoint means "quietly drop what we've
	 * got", not "try to mount it on top".
	 */
	inode_lock(dentry->d_inode);
	namespace_lock();
	if (unlikely(cant_mount(dentry))) {
		err = -ENOENT;
		goto discard_locked;
	}
	if (path_overmounted(path)) {
		err = 0;
		goto discard_locked;
	}
	mp = get_mountpoint(dentry);
	if (IS_ERR(mp)) {
		err = PTR_ERR(mp);
		goto discard_locked;
	}

	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
	unlock_mount(mp);
	if (unlikely(err))
		goto discard;
	mntput(m);
	return 0;

discard_locked:
	namespace_unlock();
	inode_unlock(dentry->d_inode);
discard:
	/* remove m from any expiration list it may be on */
	if (!list_empty(&mnt->mnt_expire)) {
		namespace_lock();
		list_del_init(&mnt->mnt_expire);
		namespace_unlock();
	}
	/* drop both of the references guaranteed by the BUG_ON() above */
	mntput(m);
	mntput(m);
	return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
	namespace_lock();

	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

	namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
	struct mount *mnt, *next;
	LIST_HEAD(graveyard);

	if (list_empty(mounts))
		return;

	namespace_lock();
	lock_mount_hash();

	/* extract from the expiration list every vfsmount that matches the
	 * following criteria:
	 * - only referenced by its parent vfsmount
	 * - still marked for expiry (marked on the last call here; marks are
	 *   cleared by mntput())
	 */
	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
			propagate_mount_busy(mnt, 1))
			continue;
		list_move(&mnt->mnt_expire, &graveyard);
	}
	while (!list_empty(&graveyard)) {
		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
		touch_mnt_namespace(mnt->mnt_ns);
		umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
	}
	unlock_mount_hash();
	namespace_unlock();
}
EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
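
/*
 * Sketch of how the expiry machinery is typically driven (hypothetical
 * names, modelled on the automount-style filesystems that use this API):
 * submounts created by ->d_automount() are put on a per-fs list with
 * mnt_set_expiry(), and a periodic worker reaps whatever has gone unused
 * since its last run:
 *
 *	static LIST_HEAD(example_automount_list);
 *
 *	static void example_expiry_worker(struct work_struct *work)
 *	{
 *		mark_mounts_for_expiry(&example_automount_list);
 *		schedule_delayed_work(to_delayed_work(work), 10 * 60 * HZ);
 *	}
 *
 * A mount survives the first call (it only gets marked) and is unmounted
 * on the next call if nothing has touched it in between, since any use
 * clears the mark via mntput().
 */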

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
	struct mount *this_parent = parent;
	struct list_head *next;
	int found = 0;

repeat:
	next = this_parent->mnt_mounts.next;
resume:
	while (next != &this_parent->mnt_mounts) {
		struct list_head *tmp = next;
		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

		next = tmp->next;
		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
			continue;
		/*
		 * Descend a level if the mnt_mounts list is non-empty.
		 */
		if (!list_empty(&mnt->mnt_mounts)) {
			this_parent = mnt;
			goto repeat;
		}

		if (!propagate_mount_busy(mnt, 1)) {
			list_move_tail(&mnt->mnt_expire, graveyard);
			found++;
		}
	}
	/*
	 * All done at this level ... ascend and resume the search
	 */
	if (this_parent != parent) {
		next = this_parent->mnt_child.next;
		this_parent = this_parent->mnt_parent;
		goto resume;
	}
	return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
	LIST_HEAD(graveyard);
	struct mount *m;

	/* extract submounts of 'mountpoint' from the expiration list */
	while (select_submounts(mnt, &graveyard)) {
		while (!list_empty(&graveyard)) {
			m = list_first_entry(&graveyard, struct mount,
						mnt_expire);
			touch_mnt_namespace(m->mnt_ns);
			umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
		}
	}
}

static void *copy_mount_options(const void __user *data)
{
	char *copy;
	unsigned left, offset;

	if (!data)
		return NULL;

	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!copy)
		return ERR_PTR(-ENOMEM);

	left = copy_from_user(copy, data, PAGE_SIZE);

	/*
	 * Not all architectures have an exact copy_from_user(). Resort to
	 * byte at a time.
	 */
	offset = PAGE_SIZE - left;
	while (left) {
		char c;
		if (get_user(c, (const char __user *)data + offset))
			break;
		copy[offset] = c;
		left--;
		offset++;
	}

	if (left == PAGE_SIZE) {
		kfree(copy);
		return ERR_PTR(-EFAULT);
	}

	return copy;
}

static char *copy_mount_string(const void __user *data)
{
	return data ? strndup_user(data, PATH_MAX) : NULL;
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (i.e. read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
int path_mount(const char *dev_name, struct path *path,
		const char *type_page, unsigned long flags, void *data_page)
{
	unsigned int mnt_flags = 0, sb_flags;
	int ret;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	if (flags & MS_NOUSER)
		return -EINVAL;

	ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
	if (ret)
		return ret;
	if (!may_mount())
		return -EPERM;
	if (flags & SB_MANDLOCK)
		warn_mandlock();

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;
	if (flags & MS_NOSYMFOLLOW)
		mnt_flags |= MNT_NOSYMFOLLOW;

	/* The default atime for remount is preservation */
	if ((flags & MS_REMOUNT) &&
	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
		       MS_STRICTATIME)) == 0)) {
		mnt_flags &= ~MNT_ATIME_MASK;
		mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
	}

	sb_flags = flags & (SB_RDONLY |
			    SB_SYNCHRONOUS |
			    SB_MANDLOCK |
			    SB_DIRSYNC |
			    SB_SILENT |
			    SB_POSIXACL |
			    SB_LAZYTIME |
			    SB_I_VERSION);

	if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
		return do_reconfigure_mnt(path, mnt_flags);
	if (flags & MS_REMOUNT)
		return do_remount(path, flags, sb_flags, mnt_flags, data_page);
	if (flags & MS_BIND)
		return do_loopback(path, dev_name, flags & MS_REC);
	if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		return do_change_type(path, flags);
	if (flags & MS_MOVE)
		return do_move_mount_old(path, dev_name);

	return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
			    data_page);
}

long do_mount(const char *dev_name, const char __user *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int ret;

	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
	if (ret)
		return ret;
	ret = path_mount(dev_name, &path, type_page, flags, data_page);
	path_put(&path);
	return ret;
}

static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
}

static void dec_mnt_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}

static void free_mnt_ns(struct mnt_namespace *ns)
{
	if (!is_anon_ns(ns))
		ns_free_inum(&ns->ns);
	dec_mnt_namespaces(ns->ucounts);
	mnt_ns_tree_remove(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  A 64bit
 * number incrementing even a billion times a second would take more
 * than 580 years to wrap, which is effectively never, so we can
 * ignore the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
	struct mnt_namespace *new_ns;
	struct ucounts *ucounts;
	int ret;

	ucounts = inc_mnt_namespaces(user_ns);
	if (!ucounts)
		return ERR_PTR(-ENOSPC);

	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
	if (!new_ns) {
		dec_mnt_namespaces(ucounts);
		return ERR_PTR(-ENOMEM);
	}
	if (!anon) {
		ret = ns_alloc_inum(&new_ns->ns);
		if (ret) {
			kfree(new_ns);
			dec_mnt_namespaces(ucounts);
			return ERR_PTR(ret);
		}
	}
	new_ns->ns.ops = &mntns_operations;
	if (!anon)
		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	refcount_set(&new_ns->ns.count, 1);
	refcount_set(&new_ns->passive, 1);
	new_ns->mounts = RB_ROOT;
	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
	init_waitqueue_head(&new_ns->poll);
	new_ns->user_ns = get_user_ns(user_ns);
	new_ns->ucounts = ucounts;
	return new_ns;
}

__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
	int copy_flags;

	BUG_ON(!ns);

	if (likely(!(flags & CLONE_NEWNS))) {
		get_mnt_ns(ns);
		return ns;
	}

	old = ns->root;

	new_ns = alloc_mnt_ns(user_ns, false);
	if (IS_ERR(new_ns))
		return new_ns;

	namespace_lock();
	/* First pass: copy the tree topology */
	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SHARED_TO_SLAVE;
	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		namespace_unlock();
		ns_free_inum(&new_ns->ns);
		dec_mnt_namespaces(new_ns->ucounts);
		mnt_ns_release(new_ns);
		return ERR_CAST(new);
	}
	if (user_ns != ns->user_ns) {
		lock_mount_hash();
		lock_mnt_tree(new);
		unlock_mount_hash();
	}
	new_ns->root = new;

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */
	p = old;
	q = new;
	while (p) {
		mnt_add_to_ns(new_ns, q);
		new_ns->nr_mounts++;
		if (new_fs) {
			if (&p->mnt == new_fs->root.mnt) {
				new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
			}
			if (&p->mnt == new_fs->pwd.mnt) {
				new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
			}
		}
		p = next_mnt(p, old);
		q = next_mnt(q, new);
		if (!q)
			break;
		// an mntns binding we'd skipped?
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(skip_mnt_tree(p), old);
	}
	mnt_ns_tree_add(new_ns);
	namespace_unlock();

	if (rootmnt)
		mntput(rootmnt);
	if (pwdmnt)
		mntput(pwdmnt);

	return new_ns;
}

struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
	struct mount *mnt = real_mount(m);
	struct mnt_namespace *ns;
	struct super_block *s;
	struct path path;
	int err;

	ns = alloc_mnt_ns(&init_user_ns, true);
	if (IS_ERR(ns)) {
		mntput(m);
		return ERR_CAST(ns);
	}
	ns->root = mnt;
	ns->nr_mounts++;
	mnt_add_to_ns(ns, mnt);

	err = vfs_path_lookup(m->mnt_root, m,
			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

	put_mnt_ns(ns);

	if (err)
		return ERR_PTR(err);

	/* trade a vfsmount reference for active sb one */
	s = path.mnt->mnt_sb;
	atomic_inc(&s->s_active);
	mntput(path.mnt);
	/* lock the sucker */
	down_write(&s->s_umount);
	/* ... and return the root of (sub)tree on it */
	return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);
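
/*
 * Sketch of a typical mount_subtree() caller (hypothetical shape): a
 * filesystem mounts an internal tree and hands back the dentry of a
 * sub-path on it.  Note that mount_subtree() consumes the passed-in
 * vfsmount reference even on error and, on success, returns with an
 * active reference to the superblock and s_umount held for write:
 *
 *	struct vfsmount *mnt = vfs_kern_mount(type, SB_KERNMOUNT, name, data);
 *
 *	if (IS_ERR(mnt))
 *		return ERR_CAST(mnt);
 *	return mount_subtree(mnt, "/some/export/path");
 */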

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	void *options;

	kernel_type = copy_mount_string(type);
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	options = copy_mount_options(data);
	ret = PTR_ERR(options);
	if (IS_ERR(options))
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);

	kfree(options);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}

#define FSMOUNT_VALID_FLAGS \
	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \
	 MOUNT_ATTR_NOSYMFOLLOW)

#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

#define MOUNT_SETATTR_PROPAGATION_FLAGS \
	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)

static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
{
	unsigned int mnt_flags = 0;

	if (attr_flags & MOUNT_ATTR_RDONLY)
		mnt_flags |= MNT_READONLY;
	if (attr_flags & MOUNT_ATTR_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (attr_flags & MOUNT_ATTR_NODEV)
		mnt_flags |= MNT_NODEV;
	if (attr_flags & MOUNT_ATTR_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (attr_flags & MOUNT_ATTR_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
		mnt_flags |= MNT_NOSYMFOLLOW;

	return mnt_flags;
}

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
		unsigned int, attr_flags)
{
	struct mnt_namespace *ns;
	struct fs_context *fc;
	struct file *file;
	struct path newmount;
	struct mount *mnt;
	struct fd f;
	unsigned int mnt_flags = 0;
	long ret;

	if (!may_mount())
		return -EPERM;

	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
		return -EINVAL;

	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
		return -EINVAL;

	mnt_flags = attr_flags_to_mnt_flags(attr_flags);

	switch (attr_flags & MOUNT_ATTR__ATIME) {
	case MOUNT_ATTR_STRICTATIME:
		break;
	case MOUNT_ATTR_NOATIME:
		mnt_flags |= MNT_NOATIME;
		break;
	case MOUNT_ATTR_RELATIME:
		mnt_flags |= MNT_RELATIME;
		break;
	default:
		return -EINVAL;
	}

	f = fdget(fs_fd);
	if (!fd_file(f))
		return -EBADF;

	ret = -EINVAL;
	if (fd_file(f)->f_op != &fscontext_fops)
		goto err_fsfd;

	fc = fd_file(f)->private_data;

	ret = mutex_lock_interruptible(&fc->uapi_mutex);
	if (ret < 0)
		goto err_fsfd;

	/* There must be a valid superblock or we can't mount it */
	ret = -EINVAL;
	if (!fc->root)
		goto err_unlock;

	ret = -EPERM;
	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
		pr_warn("VFS: Mount too revealing\n");
		goto err_unlock;
	}

	ret = -EBUSY;
	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
		goto err_unlock;

	if (fc->sb_flags & SB_MANDLOCK)
		warn_mandlock();

	newmount.mnt = vfs_create_mount(fc);
	if (IS_ERR(newmount.mnt)) {
		ret = PTR_ERR(newmount.mnt);
		goto err_unlock;
	}
	newmount.dentry = dget(fc->root);
	newmount.mnt->mnt_flags = mnt_flags;

	/* We've done the mount bit - now move the file context into more or
	 * less the same state as if we'd done an fspick().  We don't want to
	 * do any memory allocation or anything like that at this point as we
	 * don't want to have to handle any errors incurred.
	 */
	vfs_clean_context(fc);

	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
	if (IS_ERR(ns)) {
		ret = PTR_ERR(ns);
		goto err_path;
	}
	mnt = real_mount(newmount.mnt);
	ns->root = mnt;
	ns->nr_mounts = 1;
	mnt_add_to_ns(ns, mnt);
	mntget(newmount.mnt);

	/* Attach to an apparent O_PATH fd with a note that we need to unmount
	 * it, not just simply put it.
	 */
	file = dentry_open(&newmount, O_PATH, fc->cred);
	if (IS_ERR(file)) {
		dissolve_on_fput(newmount.mnt);
		ret = PTR_ERR(file);
		goto err_path;
	}
	file->f_mode |= FMODE_NEED_UNMOUNT;

	ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
	if (ret >= 0)
		fd_install(ret, file);
	else
		fput(file);

err_path:
	path_put(&newmount);
err_unlock:
	mutex_unlock(&fc->uapi_mutex);
err_fsfd:
	fdput(f);
	return ret;
}
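
/*
 * Illustrative userspace pipeline (filesystem type, option and path are
 * examples): configure a superblock with fsopen()/fsconfig(), turn it into
 * a detached mount with fsmount(), then attach it with move_mount():
 *
 *	int fsfd = syscall(SYS_fsopen, "tmpfs", FSOPEN_CLOEXEC);
 *	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "size", "16M", 0);
 *	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC,
 *			  MOUNT_ATTR_NODEV | MOUNT_ATTR_NOSUID);
 *	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt/scratch",
 *		MOVE_MOUNT_F_EMPTY_PATH);
 */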

/*
 * Move a mount from one place to another.  In combination with
 * fsopen()/fsmount() this is used to install a new mount and in combination
 * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
 * a mount subtree.
 *
 * Note the flags value is a combination of MOVE_MOUNT_* flags.
 */
SYSCALL_DEFINE5(move_mount,
		int, from_dfd, const char __user *, from_pathname,
		int, to_dfd, const char __user *, to_pathname,
		unsigned int, flags)
{
	struct path from_path, to_path;
	unsigned int lflags;
	int ret = 0;

	if (!may_mount())
		return -EPERM;

	if (flags & ~MOVE_MOUNT__MASK)
		return -EINVAL;

	if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
	    (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
		return -EINVAL;

	/* If someone gives a pathname, they aren't permitted to move
	 * from an fd that requires unmount as we can't get at the flag
	 * to clear it afterwards.
	 */
	lflags = 0;
	if (flags & MOVE_MOUNT_F_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
	if (flags & MOVE_MOUNT_F_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;

	ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
	if (ret < 0)
		return ret;

	lflags = 0;
	if (flags & MOVE_MOUNT_T_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
	if (flags & MOVE_MOUNT_T_EMPTY_PATH)	lflags |= LOOKUP_EMPTY;

	ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
	if (ret < 0)
		goto out_from;

	ret = security_move_mount(&from_path, &to_path);
	if (ret < 0)
		goto out_to;

	if (flags & MOVE_MOUNT_SET_GROUP)
		ret = do_set_group(&from_path, &to_path);
	else
		ret = do_move_mount(&from_path, &to_path,
				    (flags & MOVE_MOUNT_BENEATH));

out_to:
	path_put(&to_path);
out_from:
	path_put(&from_path);
	return ret;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
			 const struct path *root)
{
	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
		dentry = mnt->mnt_mountpoint;
		mnt = mnt->mnt_parent;
	}
	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

bool path_is_under(const struct path *path1, const struct path *path2)
{
	bool res;
	read_seqlock_excl(&mount_lock);
	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
	read_sequnlock_excl(&mount_lock);
	return res;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	struct path new, old, root;
	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
	struct mountpoint *old_mp, *root_mp;
	int error;

	if (!may_mount())
		return -EPERM;

	error = user_path_at(AT_FDCWD, new_root,
			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
	if (error)
		goto out0;

	error = user_path_at(AT_FDCWD, put_old,
			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
	if (error)
		goto out1;

	error = security_sb_pivotroot(&old, &new);
	if (error)
		goto out2;

	get_fs_root(current->fs, &root);
	old_mp = lock_mount(&old);
	error = PTR_ERR(old_mp);
	if (IS_ERR(old_mp))
		goto out3;

	error = -EINVAL;
	new_mnt = real_mount(new.mnt);
	root_mnt = real_mount(root.mnt);
	old_mnt = real_mount(old.mnt);
	ex_parent = new_mnt->mnt_parent;
	root_parent = root_mnt->mnt_parent;
	if (IS_MNT_SHARED(old_mnt) ||
	    IS_MNT_SHARED(ex_parent) ||
	    IS_MNT_SHARED(root_parent))
		goto out4;
	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
		goto out4;
	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
		goto out4;
	error = -ENOENT;
	if (d_unlinked(new.dentry))
		goto out4;
	error = -EBUSY;
	if (new_mnt == root_mnt || old_mnt == root_mnt)
		goto out4; /* loop, on the same file system */
	error = -EINVAL;
	if (!path_mounted(&root))
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(root_mnt))
		goto out4; /* not attached */
	if (!path_mounted(&new))
		goto out4; /* not a mountpoint */
	if (!mnt_has_parent(new_mnt))
		goto out4; /* not attached */
	/* make sure we can reach put_old from new_root */
	if (!is_path_reachable(old_mnt, old.dentry, &new))
		goto out4;
	/* make certain new is below the root */
	if (!is_path_reachable(new_mnt, new.dentry, &root))
		goto out4;
	lock_mount_hash();
	umount_mnt(new_mnt);
	root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
	}
	/* mount old root on put_old */
	attach_mnt(root_mnt, old_mnt, old_mp, false);
	/* mount new_root on / */
	attach_mnt(new_mnt, root_parent, root_mp, false);
	mnt_add_count(root_parent, -1);
	touch_mnt_namespace(current->nsproxy->mnt_ns);
	/* A moved mount should not expire automatically */
	list_del_init(&new_mnt->mnt_expire);
	put_mountpoint(root_mp);
	unlock_mount_hash();
	chroot_fs_refs(&root, &new);
	error = 0;
out4:
	unlock_mount(old_mp);
	if (!error)
		mntput_no_expire(ex_parent);
out3:
	path_put(&root);
out2:
	path_put(&old);
out1:
	path_put(&new);
out0:
	return error;
}
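
/*
 * Illustrative userspace usage (example path): the classic container
 * setup, run inside a fresh mount namespace, using the pivot_root(".", ".")
 * idiom mentioned above; the old root ends up stacked on top of the new
 * root at the same mountpoint and is then detached:
 *
 *	mount("/newroot", "/newroot", NULL, MS_BIND | MS_REC, NULL);
 *	chdir("/newroot");
 *	syscall(SYS_pivot_root, ".", ".");
 *	umount2(".", MNT_DETACH);
 *	chdir("/");
 */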

static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
	unsigned int flags = mnt->mnt.mnt_flags;

	/* flags to clear */
	flags &= ~kattr->attr_clr;
	/* flags to raise */
	flags |= kattr->attr_set;

	return flags;
}

static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
	struct vfsmount *m = &mnt->mnt;
	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;

	if (!kattr->mnt_idmap)
		return 0;

	/*
	 * Creating an idmapped mount with the filesystem wide idmapping
	 * doesn't make sense so block that. We don't allow mushy semantics.
	 */
	if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
		return -EINVAL;

	/*
	 * Once a mount has been idmapped we don't allow it to change its
	 * mapping. It makes things simpler and callers can just create
	 * another bind-mount they can idmap if they want to.
	 */
	if (is_idmapped_mnt(m))
		return -EPERM;

	/* The underlying filesystem doesn't support idmapped mounts yet. */
	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
		return -EINVAL;

	/* The filesystem has turned off idmapped mounts. */
	if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
		return -EINVAL;

	/* We're not controlling the superblock. */
	if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
		return -EPERM;

	/* Mount has already been visible in the filesystem hierarchy. */
	if (!is_anon_ns(mnt->mnt_ns))
		return -EINVAL;

	return 0;
}

/**
 * mnt_allow_writers() - check whether the attribute change allows writers
 * @kattr: the new mount attributes
 * @mnt: the mount to which @kattr will be applied
 *
 * Check whether the new mount attributes in @kattr allow concurrent writers.
 *
 * Return: true if concurrent writers remain allowed, false if they need to
 *         be held off via mnt_hold_writers().
 */
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
				     const struct mount *mnt)
{
	return (!(kattr->attr_set & MNT_READONLY) ||
		(mnt->mnt.mnt_flags & MNT_READONLY)) &&
	       !kattr->mnt_idmap;
}

static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
{
	struct mount *m;
	int err;

	for (m = mnt; m; m = next_mnt(m, mnt)) {
		if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
			err = -EPERM;
			break;
		}

		err = can_idmap_mount(kattr, m);
		if (err)
			break;

		if (!mnt_allow_writers(kattr, m)) {
			err = mnt_hold_writers(m);
			if (err)
				break;
		}

		if (!kattr->recurse)
			return 0;
	}

	if (err) {
		struct mount *p;

		/*
		 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
		 * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
		 * mounts and needs to take care to include the first mount.
		 */
		for (p = mnt; p; p = next_mnt(p, mnt)) {
			/* If we had to hold writers unblock them. */
			if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
				mnt_unhold_writers(p);

			/*
			 * We're done once the first mount we changed got
			 * MNT_WRITE_HOLD unset.
			 */
			if (p == m)
				break;
		}
	}
	return err;
}

static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
	if (!kattr->mnt_idmap)
		return;

	/*
	 * Pairs with smp_load_acquire() in mnt_idmap().
	 *
	 * Since we only allow a mount to change the idmapping once and
	 * verified this in can_idmap_mount() we know that the mount has
	 * @nop_mnt_idmap attached to it. So there's no need to drop any
	 * references.
	 */
	smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
}

static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
	struct mount *m;

	for (m = mnt; m; m = next_mnt(m, mnt)) {
		unsigned int flags;

		do_idmap_mount(kattr, m);
		flags = recalc_flags(kattr, m);
		WRITE_ONCE(m->mnt.mnt_flags, flags);

		/* If we had to hold writers unblock them. */
		if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
			mnt_unhold_writers(m);

		if (kattr->propagation)
			change_mnt_propagation(m, kattr->propagation);
		if (!kattr->recurse)
			break;
	}
	touch_mnt_namespace(mnt->mnt_ns);
}

static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
	struct mount *mnt = real_mount(path->mnt);
	int err = 0;

	if (!path_mounted(path))
		return -EINVAL;

	if (kattr->mnt_userns) {
		struct mnt_idmap *mnt_idmap;

		mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
		if (IS_ERR(mnt_idmap))
			return PTR_ERR(mnt_idmap);
		kattr->mnt_idmap = mnt_idmap;
	}

	if (kattr->propagation) {
		/*
		 * Only take namespace_lock() if we're actually changing
		 * propagation.
		 */
		namespace_lock();
		if (kattr->propagation == MS_SHARED) {
			err = invent_group_ids(mnt, kattr->recurse);
			if (err) {
				namespace_unlock();
				return err;
			}
		}
	}

	err = -EINVAL;
	lock_mount_hash();

	/* Ensure that this isn't anything purely vfs internal. */
	if (!is_mounted(&mnt->mnt))
		goto out;

	/*
	 * If this is an attached mount make sure it's located in the caller's
	 * mount namespace. If it's not don't let the caller interact with it.
	 *
	 * If this mount doesn't have a parent it's most often simply a
	 * detached mount with an anonymous mount namespace. IOW, something
	 * that's simply not attached yet. But there are apparently also users
	 * that do change mount properties on the rootfs itself. That obviously
	 * neither has a parent nor is it a detached mount so we cannot
	 * unconditionally check for detached mounts.
	 */
	if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
		goto out;

	/*
	 * First, we get the mount tree in a shape where we can change mount
	 * properties without failure. If we succeeded to do so we commit all
	 * changes and if we failed we clean up.
	 */
	err = mount_setattr_prepare(kattr, mnt);
	if (!err)
		mount_setattr_commit(kattr, mnt);

out:
	unlock_mount_hash();

	if (kattr->propagation) {
		if (err)
			cleanup_group_ids(mnt, NULL);
		namespace_unlock();
	}

	return err;
}

static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
				struct mount_kattr *kattr, unsigned int flags)
{
	int err = 0;
	struct ns_common *ns;
	struct user_namespace *mnt_userns;
	struct fd f;

	if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
		return 0;

	/*
	 * We currently do not support clearing an idmapped mount. If this ever
	 * is a use-case we can revisit this but for now let's keep it simple
	 * and not allow it.
	 */
	if (attr->attr_clr & MOUNT_ATTR_IDMAP)
		return -EINVAL;

	if (attr->userns_fd > INT_MAX)
		return -EINVAL;

	f = fdget(attr->userns_fd);
	if (!fd_file(f))
		return -EBADF;

	if (!proc_ns_file(fd_file(f))) {
		err = -EINVAL;
		goto out_fput;
	}

	ns = get_proc_ns(file_inode(fd_file(f)));
	if (ns->ops->type != CLONE_NEWUSER) {
		err = -EINVAL;
		goto out_fput;
	}

	/*
	 * The initial idmapping cannot be used to create an idmapped
	 * mount. We use the initial idmapping as an indicator of a mount
	 * that is not idmapped. It can simply be passed into helpers that
	 * are aware of idmapped mounts as a convenient shortcut. A user
	 * can just create a dedicated identity mapping to achieve the same
	 * result.
	 */
	mnt_userns = container_of(ns, struct user_namespace, ns);
	if (mnt_userns == &init_user_ns) {
		err = -EPERM;
		goto out_fput;
	}

	/* We're not controlling the target namespace. */
	if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
		err = -EPERM;
		goto out_fput;
	}

	kattr->mnt_userns = get_user_ns(mnt_userns);

out_fput:
	fdput(f);
	return err;
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
			     struct mount_kattr *kattr, unsigned int flags)
{
	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;

	if (flags & AT_NO_AUTOMOUNT)
		lookup_flags &= ~LOOKUP_AUTOMOUNT;
	if (flags & AT_SYMLINK_NOFOLLOW)
		lookup_flags &= ~LOOKUP_FOLLOW;
	if (flags & AT_EMPTY_PATH)
		lookup_flags |= LOOKUP_EMPTY;

	*kattr = (struct mount_kattr) {
		.lookup_flags	= lookup_flags,
		.recurse	= !!(flags & AT_RECURSIVE),
	};

	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
		return -EINVAL;
	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
		return -EINVAL;
	kattr->propagation = attr->propagation;

	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
		return -EINVAL;

	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);

	/*
	 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
	 * users wanting to transition to a different atime setting cannot
	 * simply specify the atime setting in @attr_set, but must also
	 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
	 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
	 * @attr_clr and that @attr_set can't have any atime bits set if
	 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
	 */
	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
		if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
			return -EINVAL;

		/*
		 * Clear all previous time settings as they are mutually
		 * exclusive.
		 */
		kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
		switch (attr->attr_set & MOUNT_ATTR__ATIME) {
		case MOUNT_ATTR_RELATIME:
			kattr->attr_set |= MNT_RELATIME;
			break;
		case MOUNT_ATTR_NOATIME:
			kattr->attr_set |= MNT_NOATIME;
			break;
		case MOUNT_ATTR_STRICTATIME:
			break;
		default:
			return -EINVAL;
		}
	} else {
		if (attr->attr_set & MOUNT_ATTR__ATIME)
			return -EINVAL;
	}

	return build_mount_idmapped(attr, usize, kattr, flags);
}

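/*
 * Illustrative userspace view of the atime rule enforced above (a sketch,
 * not part of this file): switching a mount to noatime requires clearing
 * the whole atime enum and setting the new value in the same request:
 *
 *	struct mount_attr attr = {
 *		.attr_set = MOUNT_ATTR_NOATIME,
 *		.attr_clr = MOUNT_ATTR__ATIME,
 *	};
 *
 * Setting MOUNT_ATTR_NOATIME without MOUNT_ATTR__ATIME in attr_clr is
 * rejected with -EINVAL. Note that MOUNT_ATTR_RELATIME has the value 0,
 * so clearing MOUNT_ATTR__ATIME with no atime bits in attr_set yields
 * relatime, per the switch statement above.
 */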
static void finish_mount_kattr(struct mount_kattr *kattr)
{
	put_user_ns(kattr->mnt_userns);
	kattr->mnt_userns = NULL;

	if (kattr->mnt_idmap)
		mnt_idmap_put(kattr->mnt_idmap);
}

SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
		unsigned int, flags, struct mount_attr __user *, uattr,
		size_t, usize)
{
	int err;
	struct path target;
	struct mount_attr attr;
	struct mount_kattr kattr;

	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);

	if (flags & ~(AT_EMPTY_PATH |
		      AT_RECURSIVE |
		      AT_SYMLINK_NOFOLLOW |
		      AT_NO_AUTOMOUNT))
		return -EINVAL;

	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
		return -EINVAL;

	if (!may_mount())
		return -EPERM;

	err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
	if (err)
		return err;

	/* Don't bother walking through the mounts if this is a nop. */
	if (attr.attr_set == 0 &&
	    attr.attr_clr == 0 &&
	    attr.propagation == 0)
		return 0;

	err = build_mount_kattr(&attr, usize, &kattr, flags);
	if (err)
		return err;

	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
	if (!err) {
		err = do_mount_setattr(&target, &kattr);
		path_put(&target);
	}
	finish_mount_kattr(&kattr);
	return err;
}

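/*
 * Illustrative userspace use of mount_setattr(2) (a sketch, not part of
 * this file): make an entire subtree read-only and nosuid in one call by
 * combining AT_RECURSIVE with the attribute mask:
 *
 *	struct mount_attr attr = {
 *		.attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID,
 *	};
 *	mount_setattr(AT_FDCWD, "/mnt", AT_RECURSIVE, &attr, sizeof(attr));
 */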
int show_path(struct seq_file *m, struct dentry *root)
{
	if (root->d_sb->s_op->show_path)
		return root->d_sb->s_op->show_path(m, root);

	seq_dentry(m, root, " \t\n\\");
	return 0;
}

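/*
 * mnt_find_id_at() returns the first mount with a unique id greater than
 * or equal to the one requested, so check for an exact match before
 * handing the mount back.
 */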
static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
{
	struct mount *mnt = mnt_find_id_at(ns, id);

	if (!mnt || mnt->mnt_id_unique != id)
		return NULL;

	return &mnt->mnt;
}

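/*
 * Kernel-internal state for a single statmount() call: the userspace
 * destination buffer, the mount being queried, the caller's (or target
 * namespace's) root for reachability checks, and a seq_file that collects
 * the variable-length strings appended after struct statmount.
 */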
struct kstatmount {
	struct statmount __user *buf;
	size_t bufsize;
	struct vfsmount *mnt;
	u64 mask;
	struct path root;
	struct statmount sm;
	struct seq_file seq;
};

static u64 mnt_to_attr_flags(struct vfsmount *mnt)
{
	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
	u64 attr_flags = 0;

	if (mnt_flags & MNT_READONLY)
		attr_flags |= MOUNT_ATTR_RDONLY;
	if (mnt_flags & MNT_NOSUID)
		attr_flags |= MOUNT_ATTR_NOSUID;
	if (mnt_flags & MNT_NODEV)
		attr_flags |= MOUNT_ATTR_NODEV;
	if (mnt_flags & MNT_NOEXEC)
		attr_flags |= MOUNT_ATTR_NOEXEC;
	if (mnt_flags & MNT_NODIRATIME)
		attr_flags |= MOUNT_ATTR_NODIRATIME;
	if (mnt_flags & MNT_NOSYMFOLLOW)
		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;

	if (mnt_flags & MNT_NOATIME)
		attr_flags |= MOUNT_ATTR_NOATIME;
	else if (mnt_flags & MNT_RELATIME)
		attr_flags |= MOUNT_ATTR_RELATIME;
	else
		attr_flags |= MOUNT_ATTR_STRICTATIME;

	if (is_idmapped_mnt(mnt))
		attr_flags |= MOUNT_ATTR_IDMAP;

	return attr_flags;
}

static u64 mnt_to_propagation_flags(struct mount *m)
{
	u64 propagation = 0;

	if (IS_MNT_SHARED(m))
		propagation |= MS_SHARED;
	if (IS_MNT_SLAVE(m))
		propagation |= MS_SLAVE;
	if (IS_MNT_UNBINDABLE(m))
		propagation |= MS_UNBINDABLE;
	if (!propagation)
		propagation |= MS_PRIVATE;

	return propagation;
}

static void statmount_sb_basic(struct kstatmount *s)
{
	struct super_block *sb = s->mnt->mnt_sb;

	s->sm.mask |= STATMOUNT_SB_BASIC;
	s->sm.sb_dev_major = MAJOR(sb->s_dev);
	s->sm.sb_dev_minor = MINOR(sb->s_dev);
	s->sm.sb_magic = sb->s_magic;
	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
}

static void statmount_mnt_basic(struct kstatmount *s)
{
	struct mount *m = real_mount(s->mnt);

	s->sm.mask |= STATMOUNT_MNT_BASIC;
	s->sm.mnt_id = m->mnt_id_unique;
	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
	s->sm.mnt_id_old = m->mnt_id;
	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
}

static void statmount_propagate_from(struct kstatmount *s)
{
	struct mount *m = real_mount(s->mnt);

	s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
	if (IS_MNT_SLAVE(m))
		s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
}

static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
{
	int ret;
	size_t start = seq->count;

	ret = show_path(seq, s->mnt->mnt_root);
	if (ret)
		return ret;

	if (unlikely(seq_has_overflowed(seq)))
		return -EAGAIN;

	/*
	 * Unescape the result. It would be better if the supplied string
	 * were not escaped in the first place, but that's a pretty invasive
	 * change.
	 */
	seq->buf[seq->count] = '\0';
	seq->count = start;
	seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
	return 0;
}

static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
{
	struct vfsmount *mnt = s->mnt;
	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
	int err;

	err = seq_path_root(seq, &mnt_path, &s->root, "");
	return err == SEQ_SKIP ? 0 : err;
}

static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
{
	struct super_block *sb = s->mnt->mnt_sb;

	seq_puts(seq, sb->s_type->name);
	return 0;
}

static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
	s->sm.mask |= STATMOUNT_MNT_NS_ID;
	s->sm.mnt_ns_id = ns->seq;
}

static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
{
	struct vfsmount *mnt = s->mnt;
	struct super_block *sb = mnt->mnt_sb;
	size_t start = seq->count;
	int err;

	err = security_sb_show_options(seq, sb);
	if (err)
		return err;

	if (sb->s_op->show_options) {
		err = sb->s_op->show_options(seq, mnt->mnt_root);
		if (err)
			return err;
	}

	if (unlikely(seq_has_overflowed(seq)))
		return -EAGAIN;

	if (seq->count == start)
		return 0;

	/* skip leading comma */
	memmove(seq->buf + start, seq->buf + start + 1,
		seq->count - start - 1);
	seq->count--;

	return 0;
}

static int statmount_string(struct kstatmount *s, u64 flag)
{
	int ret;
	size_t kbufsize;
	struct seq_file *seq = &s->seq;
	struct statmount *sm = &s->sm;
	u32 start, *offp;

	/* Reserve an empty string at the beginning for any unset offsets */
	if (!seq->count)
		seq_putc(seq, 0);

	start = seq->count;

	switch (flag) {
	case STATMOUNT_FS_TYPE:
		offp = &sm->fs_type;
		ret = statmount_fs_type(s, seq);
		break;
	case STATMOUNT_MNT_ROOT:
		offp = &sm->mnt_root;
		ret = statmount_mnt_root(s, seq);
		break;
	case STATMOUNT_MNT_POINT:
		offp = &sm->mnt_point;
		ret = statmount_mnt_point(s, seq);
		break;
	case STATMOUNT_MNT_OPTS:
		offp = &sm->mnt_opts;
		ret = statmount_mnt_opts(s, seq);
		break;
	default:
		WARN_ON_ONCE(true);
		return -EINVAL;
	}

	if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
		return -EOVERFLOW;
	if (kbufsize >= s->bufsize)
		return -EOVERFLOW;

	/* signal a retry */
	if (unlikely(seq_has_overflowed(seq)))
		return -EAGAIN;

	if (ret)
		return ret;

	seq->buf[seq->count++] = '\0';
	sm->mask |= flag;
	*offp = start;
	return 0;
}

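/*
 * The result of statmount() is laid out in the user buffer as the fixed
 * struct statmount immediately followed by the string table assembled in
 * @seq; the string fields of the struct (fs_type, mnt_root, ...) hold
 * byte offsets into that table. sm->size reports the total number of
 * bytes written.
 */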
static int copy_statmount_to_user(struct kstatmount *s)
{
	struct statmount *sm = &s->sm;
	struct seq_file *seq = &s->seq;
	char __user *str = ((char __user *)s->buf) + sizeof(*sm);
	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));

	if (seq->count && copy_to_user(str, seq->buf, seq->count))
		return -EFAULT;

	/* Return the number of bytes copied to the buffer */
	sm->size = copysize + seq->count;
	if (copy_to_user(s->buf, sm, copysize))
		return -EFAULT;

	return 0;
}

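/*
 * Step to the previous or next mount in a namespace's rbtree of mounts,
 * which is kept sorted by mnt_id_unique; returns NULL when iteration runs
 * off either end of the tree.
 */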
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
	struct rb_node *node;

	if (reverse)
		node = rb_prev(&curr->mnt_node);
	else
		node = rb_next(&curr->mnt_node);

	return node_to_mount(node);
}

static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
{
	struct mount *first, *child;

	rwsem_assert_held(&namespace_sem);

	/* We're looking at our own ns, just use get_fs_root. */
	if (ns == current->nsproxy->mnt_ns) {
		get_fs_root(current->fs, root);
		return 0;
	}

	/*
	 * We have to find the first mount in our ns and use that; however,
	 * it may not exist, so handle that case properly.
	 */
	if (RB_EMPTY_ROOT(&ns->mounts))
		return -ENOENT;

	first = child = ns->root;
	for (;;) {
		child = listmnt_next(child, false);
		if (!child)
			return -ENOENT;
		if (child->mnt_parent == first)
			break;
	}

	root->mnt = mntget(&child->mnt);
	root->dentry = dget(root->mnt->mnt_root);
	return 0;
}

static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
			struct mnt_namespace *ns)
{
	struct mount *m;
	int err;

	/* Has the namespace already been emptied? */
	if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
		return -ENOENT;

	s->mnt = lookup_mnt_in_ns(mnt_id, ns);
	if (!s->mnt)
		return -ENOENT;

	err = grab_requested_root(ns, &s->root);
	if (err)
		return err;

	/*
	 * Don't trigger audit denials. We just want to determine what
	 * mounts to show users.
	 */
	m = real_mount(s->mnt);
	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	err = security_sb_statfs(s->mnt->mnt_root);
	if (err)
		return err;

	if (s->mask & STATMOUNT_SB_BASIC)
		statmount_sb_basic(s);

	if (s->mask & STATMOUNT_MNT_BASIC)
		statmount_mnt_basic(s);

	if (s->mask & STATMOUNT_PROPAGATE_FROM)
		statmount_propagate_from(s);

	if (s->mask & STATMOUNT_FS_TYPE)
		err = statmount_string(s, STATMOUNT_FS_TYPE);

	if (!err && s->mask & STATMOUNT_MNT_ROOT)
		err = statmount_string(s, STATMOUNT_MNT_ROOT);

	if (!err && s->mask & STATMOUNT_MNT_POINT)
		err = statmount_string(s, STATMOUNT_MNT_POINT);

	if (!err && s->mask & STATMOUNT_MNT_OPTS)
		err = statmount_string(s, STATMOUNT_MNT_OPTS);

	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
		statmount_mnt_ns_id(s, ns);

	if (err)
		return err;

	return 0;
}

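/*
 * A string helper returns -EAGAIN when the kernel-side seq buffer filled
 * up before the requested string was fully rendered. The syscall entry
 * point reacts by doubling the buffer and retrying, giving up once the
 * size would exceed MAX_RW_COUNT.
 */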
static inline bool retry_statmount(const long ret, size_t *seq_size)
{
	if (likely(ret != -EAGAIN))
		return false;
	if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
		return false;
	if (unlikely(*seq_size > MAX_RW_COUNT))
		return false;
	return true;
}

#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)

static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
			      struct statmount __user *buf, size_t bufsize,
			      size_t seq_size)
{
	if (!access_ok(buf, bufsize))
		return -EFAULT;

	memset(ks, 0, sizeof(*ks));
	ks->mask = kreq->param;
	ks->buf = buf;
	ks->bufsize = bufsize;

	if (ks->mask & STATMOUNT_STRING_REQ) {
		if (bufsize == sizeof(ks->sm))
			return -EOVERFLOW;

		ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
		if (!ks->seq.buf)
			return -ENOMEM;

		ks->seq.size = seq_size;
	}

	return 0;
}

static int copy_mnt_id_req(const struct mnt_id_req __user *req,
			   struct mnt_id_req *kreq)
{
	int ret;
	size_t usize;

	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);

	ret = get_user(usize, &req->size);
	if (ret)
		return -EFAULT;
	if (unlikely(usize > PAGE_SIZE))
		return -E2BIG;
	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
		return -EINVAL;
	memset(kreq, 0, sizeof(*kreq));
	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
	if (ret)
		return ret;
	if (kreq->spare != 0)
		return -EINVAL;
	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
		return -EINVAL;
	return 0;
}

/*
 * If the user requested a specific mount namespace id, look it up and
 * return it; otherwise simply grab a passive reference on the current
 * mount namespace and return that. Note that copy_mnt_id_req() still
 * rejects a non-zero ->spare, so the fd-based lookup below is not yet
 * reachable from the statmount()/listmount() entry points.
 */
static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
	struct mnt_namespace *mnt_ns;

	if (kreq->mnt_ns_id && kreq->spare)
		return ERR_PTR(-EINVAL);

	if (kreq->mnt_ns_id)
		return lookup_mnt_ns(kreq->mnt_ns_id);

	if (kreq->spare) {
		struct ns_common *ns;

		CLASS(fd, f)(kreq->spare);
		if (fd_empty(f))
			return ERR_PTR(-EBADF);

		if (!proc_ns_file(fd_file(f)))
			return ERR_PTR(-EINVAL);

		ns = get_proc_ns(file_inode(fd_file(f)));
		if (ns->ops->type != CLONE_NEWNS)
			return ERR_PTR(-EINVAL);

		mnt_ns = to_mnt_ns(ns);
	} else {
		mnt_ns = current->nsproxy->mnt_ns;
	}

	refcount_inc(&mnt_ns->passive);
	return mnt_ns;
}

SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
		struct statmount __user *, buf, size_t, bufsize,
		unsigned int, flags)
{
	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
	struct kstatmount *ks __free(kfree) = NULL;
	struct mnt_id_req kreq;
	/* We currently support retrieval of 3 strings. */
	size_t seq_size = 3 * PATH_MAX;
	int ret;

	if (flags)
		return -EINVAL;

	ret = copy_mnt_id_req(req, &kreq);
	if (ret)
		return ret;

	ns = grab_requested_mnt_ns(&kreq);
	if (!ns)
		return -ENOENT;

	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -ENOENT;

	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
	if (!ks)
		return -ENOMEM;

retry:
	ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
	if (ret)
		return ret;

	scoped_guard(rwsem_read, &namespace_sem)
		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);

	if (!ret)
		ret = copy_statmount_to_user(ks);
	kvfree(ks->seq.buf);
	path_put(&ks->root);
	if (retry_statmount(ret, &seq_size))
		goto retry;
	return ret;
}

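/*
 * Illustrative userspace use of statmount(2) (a sketch, not part of this
 * file): fetch the filesystem type and mount point of a mount by its
 * 64-bit unique id, as returned by listmount() or by statx() with
 * STATX_MNT_ID_UNIQUE:
 *
 *	char buf[4096];
 *	struct statmount *sm = (struct statmount *)buf;
 *	struct mnt_id_req req = {
 *		.size = MNT_ID_REQ_SIZE_VER0,
 *		.mnt_id = id,
 *		.param = STATMOUNT_FS_TYPE | STATMOUNT_MNT_POINT,
 *	};
 *
 *	syscall(SYS_statmount, &req, sm, sizeof(buf), 0);
 *	printf("%s on %s\n", buf + sizeof(*sm) + sm->fs_type,
 *	       buf + sizeof(*sm) + sm->mnt_point);
 */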
struct klistmount {
	u64 last_mnt_id;
	u64 mnt_parent_id;
	u64 *kmnt_ids;
	u32 nr_mnt_ids;
	struct mnt_namespace *ns;
	struct path root;
};

static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
	struct mnt_namespace *ns = kls->ns;
	u64 mnt_parent_id = kls->mnt_parent_id;
	u64 last_mnt_id = kls->last_mnt_id;
	u64 *mnt_ids = kls->kmnt_ids;
	size_t nr_mnt_ids = kls->nr_mnt_ids;
	struct path orig;
	struct mount *r, *first;
	ssize_t ret;

	rwsem_assert_held(&namespace_sem);

	ret = grab_requested_root(ns, &kls->root);
	if (ret)
		return ret;

	if (mnt_parent_id == LSMT_ROOT) {
		orig = kls->root;
	} else {
		orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
		if (!orig.mnt)
			return -ENOENT;
		orig.dentry = orig.mnt->mnt_root;
	}

	/*
	 * Don't trigger audit denials. We just want to determine what
	 * mounts to show users.
	 */
	if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	ret = security_sb_statfs(orig.dentry);
	if (ret)
		return ret;

	if (!last_mnt_id) {
		if (reverse)
			first = node_to_mount(rb_last(&ns->mounts));
		else
			first = node_to_mount(rb_first(&ns->mounts));
	} else {
		if (reverse)
			first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
		else
			first = mnt_find_id_at(ns, last_mnt_id + 1);
	}

	for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
		if (r->mnt_id_unique == mnt_parent_id)
			continue;
		if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
			continue;
		*mnt_ids = r->mnt_id_unique;
		mnt_ids++;
		nr_mnt_ids--;
		ret++;
	}
	return ret;
}

static void __free_klistmount_free(const struct klistmount *kls)
{
	path_put(&kls->root);
	kvfree(kls->kmnt_ids);
	mnt_ns_release(kls->ns);
}

static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
				     size_t nr_mnt_ids)
{
	u64 last_mnt_id = kreq->param;

	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
	if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
		return -EINVAL;

	kls->last_mnt_id = last_mnt_id;

	kls->nr_mnt_ids = nr_mnt_ids;
	kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
				       GFP_KERNEL_ACCOUNT);
	if (!kls->kmnt_ids)
		return -ENOMEM;

	kls->ns = grab_requested_mnt_ns(kreq);
	if (!kls->ns)
		return -ENOENT;

	kls->mnt_parent_id = kreq->mnt_id;
	return 0;
}

SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
		u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
	struct klistmount kls __free(klistmount_free) = {};
	const size_t maxcount = 1000000;
	struct mnt_id_req kreq;
	ssize_t ret;

	if (flags & ~LISTMOUNT_REVERSE)
		return -EINVAL;

	/*
	 * If the mount namespace really has more than 1 million mounts the
	 * caller must iterate over the mount namespace (and reconsider their
	 * system design...).
	 */
	if (unlikely(nr_mnt_ids > maxcount))
		return -EOVERFLOW;

	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
		return -EFAULT;

	ret = copy_mnt_id_req(req, &kreq);
	if (ret)
		return ret;

	ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids);
	if (ret)
		return ret;

	if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
	    !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN))
		return -ENOENT;

	scoped_guard(rwsem_read, &namespace_sem)
		ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
	if (ret <= 0)
		return ret;

	if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids)))
		return -EFAULT;

	return ret;
}

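/*
 * Illustrative userspace use of listmount(2) (a sketch, not part of this
 * file): enumerate every mount below the root of the caller's namespace,
 * resuming via ->param when the result array fills up:
 *
 *	u64 ids[256];
 *	struct mnt_id_req req = {
 *		.size = MNT_ID_REQ_SIZE_VER0,
 *		.mnt_id = LSMT_ROOT,	// list the whole namespace
 *	};
 *	ssize_t n;
 *
 *	while ((n = syscall(SYS_listmount, &req, ids, 256, 0)) > 0) {
 *		// process ids[0..n-1], e.g. feed each to statmount()
 *		req.param = ids[n - 1];	// continue after the last id seen
 *	}
 */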
static void __init init_mount_tree(void)
{
	struct vfsmount *mnt;
	struct mount *m;
	struct mnt_namespace *ns;
	struct path root;

	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
	if (IS_ERR(mnt))
		panic("Can't create rootfs");

	ns = alloc_mnt_ns(&init_user_ns, false);
	if (IS_ERR(ns))
		panic("Can't allocate initial namespace");
	m = real_mount(mnt);
	ns->root = m;
	ns->nr_mounts = 1;
	mnt_add_to_ns(ns, m);
	init_task.nsproxy->mnt_ns = ns;
	get_mnt_ns(ns);

	root.mnt = mnt;
	root.dentry = mnt->mnt_root;
	mnt->mnt_flags |= MNT_LOCKED;

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);

	mnt_ns_tree_add(ns);
}

void __init mnt_init(void)
{
	int err;

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	mount_hashtable = alloc_large_system_hash("Mount-cache",
				sizeof(struct hlist_head),
				mhash_entries, 19,
				HASH_ZERO,
				&m_hash_shift, &m_hash_mask, 0, 0);
	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				sizeof(struct hlist_head),
				mphash_entries, 19,
				HASH_ZERO,
				&mp_hash_shift, &mp_hash_mask, 0, 0);

	if (!mount_hashtable || !mountpoint_hashtable)
		panic("Failed to allocate mount hash table\n");

	kernfs_init();

	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	shmem_init();
	init_rootfs();
	init_mount_tree();
}

void put_mnt_ns(struct mnt_namespace *ns)
{
	if (!refcount_dec_and_test(&ns->ns.count))
		return;
	drop_collected_mounts(&ns->root->mnt);
	free_mnt_ns(ns);
}

struct vfsmount *kern_mount(struct file_system_type *type)
{
	struct vfsmount *mnt;

	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
	if (!IS_ERR(mnt)) {
		/*
		 * This is a long-term mount; don't release mnt until we
		 * unmount it, just before the file system is unregistered.
		 */
		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
	}
	return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount);

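/*
 * Counterpart to kern_mount(): drop the long-term marker and wait an RCU
 * grace period, so that lazy path walks which may still be treating the
 * mount as long-term have finished before the final mntput().
 */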
void kern_unmount(struct vfsmount *mnt)
{
	/* release long term mount so mount point can be released */
	if (!IS_ERR(mnt)) {
		mnt_make_shortterm(mnt);
		synchronize_rcu();	/* yecchhh... */
		mntput(mnt);
	}
}
EXPORT_SYMBOL(kern_unmount);

void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
	unsigned int i;

	for (i = 0; i < num; i++)
		mnt_make_shortterm(mnt[i]);
	synchronize_rcu_expedited();
	for (i = 0; i < num; i++)
		mntput(mnt[i]);
}
EXPORT_SYMBOL(kern_unmount_array);

bool our_mnt(struct vfsmount *mnt)
{
	return check_mnt(real_mount(mnt));
}

bool current_chrooted(void)
{
	/* Does the current process have a non-standard root? */
	struct path ns_root;
	struct path fs_root;
	bool chrooted;

	/* Find the namespace root */
	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
	ns_root.dentry = ns_root.mnt->mnt_root;
	path_get(&ns_root);
	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
		;

	get_fs_root(current->fs, &fs_root);

	chrooted = !path_equal(&fs_root, &ns_root);

	path_put(&fs_root);
	path_put(&ns_root);

	return chrooted;
}

static bool mnt_already_visible(struct mnt_namespace *ns,
				const struct super_block *sb,
				int *new_mnt_flags)
{
	int new_flags = *new_mnt_flags;
	struct mount *mnt, *n;
	bool visible = false;

	down_read(&namespace_sem);
	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
		struct mount *child;
		int mnt_flags;

		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
			continue;

		/* This mount is not fully visible if its root directory
		 * is not the root directory of the filesystem.
		 */
		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
			continue;

		/* A local view of the mount flags */
		mnt_flags = mnt->mnt.mnt_flags;

		/* Don't miss readonly hidden in the superblock flags */
		if (sb_rdonly(mnt->mnt.mnt_sb))
			mnt_flags |= MNT_LOCK_READONLY;

		/* Verify the mount flags are equal to or more permissive
		 * than the proposed new mount.
		 */
		if ((mnt_flags & MNT_LOCK_READONLY) &&
		    !(new_flags & MNT_READONLY))
			continue;
		if ((mnt_flags & MNT_LOCK_ATIME) &&
		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
			continue;

		/* This mount is not fully visible if there are any
		 * locked child mounts that cover anything except for
		 * empty directories.
		 */
		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
			struct inode *inode = child->mnt_mountpoint->d_inode;
			/* Only worry about locked mounts */
			if (!(child->mnt.mnt_flags & MNT_LOCKED))
				continue;
			/* Is the directory permanently empty? */
			if (!is_empty_dir_inode(inode))
				goto next;
		}
		/* Preserve the locked attributes */
		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
					       MNT_LOCK_ATIME);
		visible = true;
		goto found;
	next:	;
	}
found:
	up_read(&namespace_sem);
	return visible;
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	unsigned long s_iflags;

	if (ns->user_ns == &init_user_ns)
		return false;

	/* Can this filesystem be too revealing? */
	s_iflags = sb->s_iflags;
	if (!(s_iflags & SB_I_USERNS_VISIBLE))
		return false;

	if ((s_iflags & required_iflags) != required_iflags) {
		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
			  required_iflags);
		return true;
	}

	return !mnt_already_visible(ns, sb, new_mnt_flags);
}

bool mnt_may_suid(struct vfsmount *mnt)
{
	/*
	 * Foreign mounts (accessed via fchdir or through /proc
	 * symlinks) are always treated as if they are nosuid. This
	 * prevents namespaces from trusting potentially unsafe
	 * suid/sgid bits, file caps, or security labels that originate
	 * in other namespaces.
	 */
	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
	       current_in_userns(mnt->mnt_sb->s_user_ns);
}

static struct ns_common *mntns_get(struct task_struct *task)
{
	struct ns_common *ns = NULL;
	struct nsproxy *nsproxy;

	task_lock(task);
	nsproxy = task->nsproxy;
	if (nsproxy) {
		ns = &nsproxy->mnt_ns->ns;
		get_mnt_ns(to_mnt_ns(ns));
	}
	task_unlock(task);

	return ns;
}

static void mntns_put(struct ns_common *ns)
{
	put_mnt_ns(to_mnt_ns(ns));
}

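/*
 * setns(2) backend for mount namespaces. The caller needs CAP_SYS_ADMIN
 * over the target mount namespace plus CAP_SYS_CHROOT and CAP_SYS_ADMIN
 * in its own user namespace, must not share its fs_struct with another
 * task (e.g. via CLONE_FS), and cannot install an anonymous mount
 * namespace. On success the task's root and cwd are reset to the root of
 * the new namespace.
 */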
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
	struct nsproxy *nsproxy = nsset->nsproxy;
	struct fs_struct *fs = nsset->fs;
	struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
	struct user_namespace *user_ns = nsset->cred->user_ns;
	struct path root;
	int err;

	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(user_ns, CAP_SYS_CHROOT) ||
	    !ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	if (is_anon_ns(mnt_ns))
		return -EINVAL;

	if (fs->users != 1)
		return -EINVAL;

	get_mnt_ns(mnt_ns);
	old_mnt_ns = nsproxy->mnt_ns;
	nsproxy->mnt_ns = mnt_ns;

	/* Find the root */
	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
				"/", LOOKUP_DOWN, &root);
	if (err) {
		/* revert to old namespace */
		nsproxy->mnt_ns = old_mnt_ns;
		put_mnt_ns(mnt_ns);
		return err;
	}

	put_mnt_ns(old_mnt_ns);

	/* Update the pwd and root */
	set_fs_pwd(fs, &root);
	set_fs_root(fs, &root);

	path_put(&root);
	return 0;
}

static struct user_namespace *mntns_owner(struct ns_common *ns)
{
	return to_mnt_ns(ns)->user_ns;
}

const struct proc_ns_operations mntns_operations = {
	.name		= "mnt",
	.type		= CLONE_NEWNS,
	.get		= mntns_get,
	.put		= mntns_put,
	.install	= mntns_install,
	.owner		= mntns_owner,
};

#ifdef CONFIG_SYSCTL
static struct ctl_table fs_namespace_sysctls[] = {
	{
		.procname	= "mount-max",
		.data		= &sysctl_mount_max,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
};

static int __init init_fs_namespace_sysctls(void)
{
	register_sysctl_init("fs", fs_namespace_sysctls);
	return 0;
}
fs_initcall(init_fs_namespace_sysctls);
#endif /* CONFIG_SYSCTL */