shmem.c 142 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383
238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047
214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370
  1. /*
  2. * Resizable virtual memory filesystem for Linux.
  3. *
  4. * Copyright (C) 2000 Linus Torvalds.
  5. * 2000 Transmeta Corp.
  6. * 2000-2001 Christoph Rohland
  7. * 2000-2001 SAP AG
  8. * 2002 Red Hat Inc.
  9. * Copyright (C) 2002-2011 Hugh Dickins.
  10. * Copyright (C) 2011 Google Inc.
  11. * Copyright (C) 2002-2005 VERITAS Software Corporation.
  12. * Copyright (C) 2004 Andi Kleen, SuSE Labs
  13. *
  14. * Extended attribute support for tmpfs:
  15. * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
  16. * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
  17. *
  18. * tiny-shmem:
  19. * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
  20. *
  21. * This file is released under the GPL.
  22. */
  23. #include <linux/fs.h>
  24. #include <linux/init.h>
  25. #include <linux/vfs.h>
  26. #include <linux/mount.h>
  27. #include <linux/ramfs.h>
  28. #include <linux/pagemap.h>
  29. #include <linux/file.h>
  30. #include <linux/fileattr.h>
  31. #include <linux/mm.h>
  32. #include <linux/random.h>
  33. #include <linux/sched/signal.h>
  34. #include <linux/export.h>
  35. #include <linux/shmem_fs.h>
  36. #include <linux/swap.h>
  37. #include <linux/uio.h>
  38. #include <linux/hugetlb.h>
  39. #include <linux/fs_parser.h>
  40. #include <linux/swapfile.h>
  41. #include <linux/iversion.h>
  42. #include "swap.h"
  43. static struct vfsmount *shm_mnt __ro_after_init;
  44. #ifdef CONFIG_SHMEM
  45. /*
  46. * This virtual memory filesystem is heavily based on the ramfs. It
  47. * extends ramfs by the ability to use swap and honor resource limits
  48. * which makes it a completely usable filesystem.
  49. */
  50. #include <linux/xattr.h>
  51. #include <linux/exportfs.h>
  52. #include <linux/posix_acl.h>
  53. #include <linux/posix_acl_xattr.h>
  54. #include <linux/mman.h>
  55. #include <linux/string.h>
  56. #include <linux/slab.h>
  57. #include <linux/backing-dev.h>
  58. #include <linux/writeback.h>
  59. #include <linux/pagevec.h>
  60. #include <linux/percpu_counter.h>
  61. #include <linux/falloc.h>
  62. #include <linux/splice.h>
  63. #include <linux/security.h>
  64. #include <linux/swapops.h>
  65. #include <linux/mempolicy.h>
  66. #include <linux/namei.h>
  67. #include <linux/ctype.h>
  68. #include <linux/migrate.h>
  69. #include <linux/highmem.h>
  70. #include <linux/seq_file.h>
  71. #include <linux/magic.h>
  72. #include <linux/syscalls.h>
  73. #include <linux/fcntl.h>
  74. #include <uapi/linux/memfd.h>
  75. #include <linux/rmap.h>
  76. #include <linux/uuid.h>
  77. #include <linux/quotaops.h>
  78. #include <linux/rcupdate_wait.h>
  79. #include <linux/uaccess.h>
  80. #include "internal.h"
  81. #define BLOCKS_PER_PAGE (PAGE_SIZE/512)
  82. #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
  83. /* Pretend that each entry is of this size in directory's i_size */
  84. #define BOGO_DIRENT_SIZE 20
  85. /* Pretend that one inode + its dentry occupy this much memory */
  86. #define BOGO_INODE_SIZE 1024
  87. /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
  88. #define SHORT_SYMLINK_LEN 128
  89. /*
  90. * shmem_fallocate communicates with shmem_fault or shmem_writepage via
  91. * inode->i_private (with i_rwsem making sure that it has only one user at
  92. * a time): we would prefer not to enlarge the shmem inode just for that.
  93. */
  94. struct shmem_falloc {
  95. wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
  96. pgoff_t start; /* start of range currently being fallocated */
  97. pgoff_t next; /* the next page offset to be fallocated */
  98. pgoff_t nr_falloced; /* how many new pages have been fallocated */
  99. pgoff_t nr_unswapped; /* how often writepage refused to swap out */
  100. };
  101. struct shmem_options {
  102. unsigned long long blocks;
  103. unsigned long long inodes;
  104. struct mempolicy *mpol;
  105. kuid_t uid;
  106. kgid_t gid;
  107. umode_t mode;
  108. bool full_inums;
  109. int huge;
  110. int seen;
  111. bool noswap;
  112. unsigned short quota_types;
  113. struct shmem_quota_limits qlimits;
  114. #define SHMEM_SEEN_BLOCKS 1
  115. #define SHMEM_SEEN_INODES 2
  116. #define SHMEM_SEEN_HUGE 4
  117. #define SHMEM_SEEN_INUMS 8
  118. #define SHMEM_SEEN_NOSWAP 16
  119. #define SHMEM_SEEN_QUOTA 32
  120. };
  121. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  122. static unsigned long huge_shmem_orders_always __read_mostly;
  123. static unsigned long huge_shmem_orders_madvise __read_mostly;
  124. static unsigned long huge_shmem_orders_inherit __read_mostly;
  125. static unsigned long huge_shmem_orders_within_size __read_mostly;
  126. #endif
  127. #ifdef CONFIG_TMPFS
  128. static unsigned long shmem_default_max_blocks(void)
  129. {
  130. return totalram_pages() / 2;
  131. }
  132. static unsigned long shmem_default_max_inodes(void)
  133. {
  134. unsigned long nr_pages = totalram_pages();
  135. return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
  136. ULONG_MAX / BOGO_INODE_SIZE);
  137. }
  138. #endif
  139. static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  140. struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
  141. struct vm_area_struct *vma, vm_fault_t *fault_type);
  142. static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  143. {
  144. return sb->s_fs_info;
  145. }
  146. /*
  147. * shmem_file_setup pre-accounts the whole fixed size of a VM object,
  148. * for shared memory and for shared anonymous (/dev/zero) mappings
  149. * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
  150. * consistent with the pre-accounting of private mappings ...
  151. */
  152. static inline int shmem_acct_size(unsigned long flags, loff_t size)
  153. {
  154. return (flags & VM_NORESERVE) ?
  155. 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
  156. }
  157. static inline void shmem_unacct_size(unsigned long flags, loff_t size)
  158. {
  159. if (!(flags & VM_NORESERVE))
  160. vm_unacct_memory(VM_ACCT(size));
  161. }
  162. static inline int shmem_reacct_size(unsigned long flags,
  163. loff_t oldsize, loff_t newsize)
  164. {
  165. if (!(flags & VM_NORESERVE)) {
  166. if (VM_ACCT(newsize) > VM_ACCT(oldsize))
  167. return security_vm_enough_memory_mm(current->mm,
  168. VM_ACCT(newsize) - VM_ACCT(oldsize));
  169. else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
  170. vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
  171. }
  172. return 0;
  173. }
  174. /*
  175. * ... whereas tmpfs objects are accounted incrementally as
  176. * pages are allocated, in order to allow large sparse files.
  177. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
  178. * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  179. */
  180. static inline int shmem_acct_blocks(unsigned long flags, long pages)
  181. {
  182. if (!(flags & VM_NORESERVE))
  183. return 0;
  184. return security_vm_enough_memory_mm(current->mm,
  185. pages * VM_ACCT(PAGE_SIZE));
  186. }
  187. static inline void shmem_unacct_blocks(unsigned long flags, long pages)
  188. {
  189. if (flags & VM_NORESERVE)
  190. vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
  191. }
  192. static int shmem_inode_acct_blocks(struct inode *inode, long pages)
  193. {
  194. struct shmem_inode_info *info = SHMEM_I(inode);
  195. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  196. int err = -ENOSPC;
  197. if (shmem_acct_blocks(info->flags, pages))
  198. return err;
  199. might_sleep(); /* when quotas */
  200. if (sbinfo->max_blocks) {
  201. if (!percpu_counter_limited_add(&sbinfo->used_blocks,
  202. sbinfo->max_blocks, pages))
  203. goto unacct;
  204. err = dquot_alloc_block_nodirty(inode, pages);
  205. if (err) {
  206. percpu_counter_sub(&sbinfo->used_blocks, pages);
  207. goto unacct;
  208. }
  209. } else {
  210. err = dquot_alloc_block_nodirty(inode, pages);
  211. if (err)
  212. goto unacct;
  213. }
  214. return 0;
  215. unacct:
  216. shmem_unacct_blocks(info->flags, pages);
  217. return err;
  218. }
  219. static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
  220. {
  221. struct shmem_inode_info *info = SHMEM_I(inode);
  222. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  223. might_sleep(); /* when quotas */
  224. dquot_free_block_nodirty(inode, pages);
  225. if (sbinfo->max_blocks)
  226. percpu_counter_sub(&sbinfo->used_blocks, pages);
  227. shmem_unacct_blocks(info->flags, pages);
  228. }
  229. static const struct super_operations shmem_ops;
  230. static const struct address_space_operations shmem_aops;
  231. static const struct file_operations shmem_file_operations;
  232. static const struct inode_operations shmem_inode_operations;
  233. static const struct inode_operations shmem_dir_inode_operations;
  234. static const struct inode_operations shmem_special_inode_operations;
  235. static const struct vm_operations_struct shmem_vm_ops;
  236. static const struct vm_operations_struct shmem_anon_vm_ops;
  237. static struct file_system_type shmem_fs_type;
  238. bool shmem_mapping(struct address_space *mapping)
  239. {
  240. return mapping->a_ops == &shmem_aops;
  241. }
  242. EXPORT_SYMBOL_GPL(shmem_mapping);
  243. bool vma_is_anon_shmem(struct vm_area_struct *vma)
  244. {
  245. return vma->vm_ops == &shmem_anon_vm_ops;
  246. }
  247. bool vma_is_shmem(struct vm_area_struct *vma)
  248. {
  249. return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
  250. }
  251. static LIST_HEAD(shmem_swaplist);
  252. static DEFINE_MUTEX(shmem_swaplist_mutex);
  253. #ifdef CONFIG_TMPFS_QUOTA
  254. static int shmem_enable_quotas(struct super_block *sb,
  255. unsigned short quota_types)
  256. {
  257. int type, err = 0;
  258. sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
  259. for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
  260. if (!(quota_types & (1 << type)))
  261. continue;
  262. err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
  263. DQUOT_USAGE_ENABLED |
  264. DQUOT_LIMITS_ENABLED);
  265. if (err)
  266. goto out_err;
  267. }
  268. return 0;
  269. out_err:
  270. pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
  271. type, err);
  272. for (type--; type >= 0; type--)
  273. dquot_quota_off(sb, type);
  274. return err;
  275. }
  276. static void shmem_disable_quotas(struct super_block *sb)
  277. {
  278. int type;
  279. for (type = 0; type < SHMEM_MAXQUOTAS; type++)
  280. dquot_quota_off(sb, type);
  281. }
  282. static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
  283. {
  284. return SHMEM_I(inode)->i_dquot;
  285. }
  286. #endif /* CONFIG_TMPFS_QUOTA */
  287. /*
  288. * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
  289. * produces a novel ino for the newly allocated inode.
  290. *
  291. * It may also be called when making a hard link to permit the space needed by
  292. * each dentry. However, in that case, no new inode number is needed since that
  293. * internally draws from another pool of inode numbers (currently global
  294. * get_next_ino()). This case is indicated by passing NULL as inop.
  295. */
  296. #define SHMEM_INO_BATCH 1024
  297. static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
  298. {
  299. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  300. ino_t ino;
  301. if (!(sb->s_flags & SB_KERNMOUNT)) {
  302. raw_spin_lock(&sbinfo->stat_lock);
  303. if (sbinfo->max_inodes) {
  304. if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
  305. raw_spin_unlock(&sbinfo->stat_lock);
  306. return -ENOSPC;
  307. }
  308. sbinfo->free_ispace -= BOGO_INODE_SIZE;
  309. }
  310. if (inop) {
  311. ino = sbinfo->next_ino++;
  312. if (unlikely(is_zero_ino(ino)))
  313. ino = sbinfo->next_ino++;
  314. if (unlikely(!sbinfo->full_inums &&
  315. ino > UINT_MAX)) {
  316. /*
  317. * Emulate get_next_ino uint wraparound for
  318. * compatibility
  319. */
  320. if (IS_ENABLED(CONFIG_64BIT))
  321. pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
  322. __func__, MINOR(sb->s_dev));
  323. sbinfo->next_ino = 1;
  324. ino = sbinfo->next_ino++;
  325. }
  326. *inop = ino;
  327. }
  328. raw_spin_unlock(&sbinfo->stat_lock);
  329. } else if (inop) {
  330. /*
  331. * __shmem_file_setup, one of our callers, is lock-free: it
  332. * doesn't hold stat_lock in shmem_reserve_inode since
  333. * max_inodes is always 0, and is called from potentially
  334. * unknown contexts. As such, use a per-cpu batched allocator
  335. * which doesn't require the per-sb stat_lock unless we are at
  336. * the batch boundary.
  337. *
  338. * We don't need to worry about inode{32,64} since SB_KERNMOUNT
  339. * shmem mounts are not exposed to userspace, so we don't need
  340. * to worry about things like glibc compatibility.
  341. */
  342. ino_t *next_ino;
  343. next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
  344. ino = *next_ino;
  345. if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
  346. raw_spin_lock(&sbinfo->stat_lock);
  347. ino = sbinfo->next_ino;
  348. sbinfo->next_ino += SHMEM_INO_BATCH;
  349. raw_spin_unlock(&sbinfo->stat_lock);
  350. if (unlikely(is_zero_ino(ino)))
  351. ino++;
  352. }
  353. *inop = ino;
  354. *next_ino = ++ino;
  355. put_cpu();
  356. }
  357. return 0;
  358. }
  359. static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
  360. {
  361. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  362. if (sbinfo->max_inodes) {
  363. raw_spin_lock(&sbinfo->stat_lock);
  364. sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
  365. raw_spin_unlock(&sbinfo->stat_lock);
  366. }
  367. }
  368. /**
  369. * shmem_recalc_inode - recalculate the block usage of an inode
  370. * @inode: inode to recalc
  371. * @alloced: the change in number of pages allocated to inode
  372. * @swapped: the change in number of pages swapped from inode
  373. *
  374. * We have to calculate the free blocks since the mm can drop
  375. * undirtied hole pages behind our back.
  376. *
  377. * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
  378. * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
  379. */
  380. static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
  381. {
  382. struct shmem_inode_info *info = SHMEM_I(inode);
  383. long freed;
  384. spin_lock(&info->lock);
  385. info->alloced += alloced;
  386. info->swapped += swapped;
  387. freed = info->alloced - info->swapped -
  388. READ_ONCE(inode->i_mapping->nrpages);
  389. /*
  390. * Special case: whereas normally shmem_recalc_inode() is called
  391. * after i_mapping->nrpages has already been adjusted (up or down),
  392. * shmem_writepage() has to raise swapped before nrpages is lowered -
  393. * to stop a racing shmem_recalc_inode() from thinking that a page has
  394. * been freed. Compensate here, to avoid the need for a followup call.
  395. */
  396. if (swapped > 0)
  397. freed += swapped;
  398. if (freed > 0)
  399. info->alloced -= freed;
  400. spin_unlock(&info->lock);
  401. /* The quota case may block */
  402. if (freed > 0)
  403. shmem_inode_unacct_blocks(inode, freed);
  404. }
  405. bool shmem_charge(struct inode *inode, long pages)
  406. {
  407. struct address_space *mapping = inode->i_mapping;
  408. if (shmem_inode_acct_blocks(inode, pages))
  409. return false;
  410. /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
  411. xa_lock_irq(&mapping->i_pages);
  412. mapping->nrpages += pages;
  413. xa_unlock_irq(&mapping->i_pages);
  414. shmem_recalc_inode(inode, pages, 0);
  415. return true;
  416. }
  417. void shmem_uncharge(struct inode *inode, long pages)
  418. {
  419. /* pages argument is currently unused: keep it to help debugging */
  420. /* nrpages adjustment done by __filemap_remove_folio() or caller */
  421. shmem_recalc_inode(inode, 0, 0);
  422. }
  423. /*
  424. * Replace item expected in xarray by a new item, while holding xa_lock.
  425. */
  426. static int shmem_replace_entry(struct address_space *mapping,
  427. pgoff_t index, void *expected, void *replacement)
  428. {
  429. XA_STATE(xas, &mapping->i_pages, index);
  430. void *item;
  431. VM_BUG_ON(!expected);
  432. VM_BUG_ON(!replacement);
  433. item = xas_load(&xas);
  434. if (item != expected)
  435. return -ENOENT;
  436. xas_store(&xas, replacement);
  437. return 0;
  438. }
  439. /*
  440. * Sometimes, before we decide whether to proceed or to fail, we must check
  441. * that an entry was not already brought back from swap by a racing thread.
  442. *
  443. * Checking folio is not enough: by the time a swapcache folio is locked, it
  444. * might be reused, and again be swapcache, using the same swap as before.
  445. */
  446. static bool shmem_confirm_swap(struct address_space *mapping,
  447. pgoff_t index, swp_entry_t swap)
  448. {
  449. return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
  450. }
  451. /*
  452. * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
  453. *
  454. * SHMEM_HUGE_NEVER:
  455. * disables huge pages for the mount;
  456. * SHMEM_HUGE_ALWAYS:
  457. * enables huge pages for the mount;
  458. * SHMEM_HUGE_WITHIN_SIZE:
  459. * only allocate huge pages if the page will be fully within i_size,
  460. * also respect fadvise()/madvise() hints;
  461. * SHMEM_HUGE_ADVISE:
  462. * only allocate huge pages if requested with fadvise()/madvise();
  463. */
  464. #define SHMEM_HUGE_NEVER 0
  465. #define SHMEM_HUGE_ALWAYS 1
  466. #define SHMEM_HUGE_WITHIN_SIZE 2
  467. #define SHMEM_HUGE_ADVISE 3
  468. /*
  469. * Special values.
  470. * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
  471. *
  472. * SHMEM_HUGE_DENY:
  473. * disables huge on shm_mnt and all mounts, for emergency use;
  474. * SHMEM_HUGE_FORCE:
  475. * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
  476. *
  477. */
  478. #define SHMEM_HUGE_DENY (-1)
  479. #define SHMEM_HUGE_FORCE (-2)
  480. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  481. /* ifdef here to avoid bloating shmem.o when not necessary */
  482. static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
  483. static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
  484. loff_t write_end, bool shmem_huge_force,
  485. struct vm_area_struct *vma,
  486. unsigned long vm_flags)
  487. {
  488. struct mm_struct *mm = vma ? vma->vm_mm : NULL;
  489. loff_t i_size;
  490. if (!S_ISREG(inode->i_mode))
  491. return false;
  492. if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
  493. return false;
  494. if (shmem_huge == SHMEM_HUGE_DENY)
  495. return false;
  496. if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
  497. return true;
  498. switch (SHMEM_SB(inode->i_sb)->huge) {
  499. case SHMEM_HUGE_ALWAYS:
  500. return true;
  501. case SHMEM_HUGE_WITHIN_SIZE:
  502. index = round_up(index + 1, HPAGE_PMD_NR);
  503. i_size = max(write_end, i_size_read(inode));
  504. i_size = round_up(i_size, PAGE_SIZE);
  505. if (i_size >> PAGE_SHIFT >= index)
  506. return true;
  507. fallthrough;
  508. case SHMEM_HUGE_ADVISE:
  509. if (mm && (vm_flags & VM_HUGEPAGE))
  510. return true;
  511. fallthrough;
  512. default:
  513. return false;
  514. }
  515. }
  516. static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
  517. loff_t write_end, bool shmem_huge_force,
  518. struct vm_area_struct *vma, unsigned long vm_flags)
  519. {
  520. if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
  521. return false;
  522. return __shmem_huge_global_enabled(inode, index, write_end,
  523. shmem_huge_force, vma, vm_flags);
  524. }
  525. #if defined(CONFIG_SYSFS)
  526. static int shmem_parse_huge(const char *str)
  527. {
  528. if (!strcmp(str, "never"))
  529. return SHMEM_HUGE_NEVER;
  530. if (!strcmp(str, "always"))
  531. return SHMEM_HUGE_ALWAYS;
  532. if (!strcmp(str, "within_size"))
  533. return SHMEM_HUGE_WITHIN_SIZE;
  534. if (!strcmp(str, "advise"))
  535. return SHMEM_HUGE_ADVISE;
  536. if (!strcmp(str, "deny"))
  537. return SHMEM_HUGE_DENY;
  538. if (!strcmp(str, "force"))
  539. return SHMEM_HUGE_FORCE;
  540. return -EINVAL;
  541. }
  542. #endif
  543. #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
  544. static const char *shmem_format_huge(int huge)
  545. {
  546. switch (huge) {
  547. case SHMEM_HUGE_NEVER:
  548. return "never";
  549. case SHMEM_HUGE_ALWAYS:
  550. return "always";
  551. case SHMEM_HUGE_WITHIN_SIZE:
  552. return "within_size";
  553. case SHMEM_HUGE_ADVISE:
  554. return "advise";
  555. case SHMEM_HUGE_DENY:
  556. return "deny";
  557. case SHMEM_HUGE_FORCE:
  558. return "force";
  559. default:
  560. VM_BUG_ON(1);
  561. return "bad_val";
  562. }
  563. }
  564. #endif
  565. static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
  566. struct shrink_control *sc, unsigned long nr_to_free)
  567. {
  568. LIST_HEAD(list), *pos, *next;
  569. struct inode *inode;
  570. struct shmem_inode_info *info;
  571. struct folio *folio;
  572. unsigned long batch = sc ? sc->nr_to_scan : 128;
  573. unsigned long split = 0, freed = 0;
  574. if (list_empty(&sbinfo->shrinklist))
  575. return SHRINK_STOP;
  576. spin_lock(&sbinfo->shrinklist_lock);
  577. list_for_each_safe(pos, next, &sbinfo->shrinklist) {
  578. info = list_entry(pos, struct shmem_inode_info, shrinklist);
  579. /* pin the inode */
  580. inode = igrab(&info->vfs_inode);
  581. /* inode is about to be evicted */
  582. if (!inode) {
  583. list_del_init(&info->shrinklist);
  584. goto next;
  585. }
  586. list_move(&info->shrinklist, &list);
  587. next:
  588. sbinfo->shrinklist_len--;
  589. if (!--batch)
  590. break;
  591. }
  592. spin_unlock(&sbinfo->shrinklist_lock);
  593. list_for_each_safe(pos, next, &list) {
  594. pgoff_t next, end;
  595. loff_t i_size;
  596. int ret;
  597. info = list_entry(pos, struct shmem_inode_info, shrinklist);
  598. inode = &info->vfs_inode;
  599. if (nr_to_free && freed >= nr_to_free)
  600. goto move_back;
  601. i_size = i_size_read(inode);
  602. folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
  603. if (!folio || xa_is_value(folio))
  604. goto drop;
  605. /* No large folio at the end of the file: nothing to split */
  606. if (!folio_test_large(folio)) {
  607. folio_put(folio);
  608. goto drop;
  609. }
  610. /* Check if there is anything to gain from splitting */
  611. next = folio_next_index(folio);
  612. end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
  613. if (end <= folio->index || end >= next) {
  614. folio_put(folio);
  615. goto drop;
  616. }
  617. /*
  618. * Move the inode on the list back to shrinklist if we failed
  619. * to lock the page at this time.
  620. *
  621. * Waiting for the lock may lead to deadlock in the
  622. * reclaim path.
  623. */
  624. if (!folio_trylock(folio)) {
  625. folio_put(folio);
  626. goto move_back;
  627. }
  628. ret = split_folio(folio);
  629. folio_unlock(folio);
  630. folio_put(folio);
  631. /* If split failed move the inode on the list back to shrinklist */
  632. if (ret)
  633. goto move_back;
  634. freed += next - end;
  635. split++;
  636. drop:
  637. list_del_init(&info->shrinklist);
  638. goto put;
  639. move_back:
  640. /*
  641. * Make sure the inode is either on the global list or deleted
  642. * from any local list before iput() since it could be deleted
  643. * in another thread once we put the inode (then the local list
  644. * is corrupted).
  645. */
  646. spin_lock(&sbinfo->shrinklist_lock);
  647. list_move(&info->shrinklist, &sbinfo->shrinklist);
  648. sbinfo->shrinklist_len++;
  649. spin_unlock(&sbinfo->shrinklist_lock);
  650. put:
  651. iput(inode);
  652. }
  653. return split;
  654. }
  655. static long shmem_unused_huge_scan(struct super_block *sb,
  656. struct shrink_control *sc)
  657. {
  658. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  659. if (!READ_ONCE(sbinfo->shrinklist_len))
  660. return SHRINK_STOP;
  661. return shmem_unused_huge_shrink(sbinfo, sc, 0);
  662. }
  663. static long shmem_unused_huge_count(struct super_block *sb,
  664. struct shrink_control *sc)
  665. {
  666. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  667. return READ_ONCE(sbinfo->shrinklist_len);
  668. }
  669. #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
  670. #define shmem_huge SHMEM_HUGE_DENY
  671. static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
  672. struct shrink_control *sc, unsigned long nr_to_free)
  673. {
  674. return 0;
  675. }
  676. static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
  677. loff_t write_end, bool shmem_huge_force,
  678. struct vm_area_struct *vma, unsigned long vm_flags)
  679. {
  680. return false;
  681. }
  682. #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  683. static void shmem_update_stats(struct folio *folio, int nr_pages)
  684. {
  685. if (folio_test_pmd_mappable(folio))
  686. __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
  687. __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
  688. __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
  689. }
  690. /*
  691. * Somewhat like filemap_add_folio, but error if expected item has gone.
  692. */
  693. static int shmem_add_to_page_cache(struct folio *folio,
  694. struct address_space *mapping,
  695. pgoff_t index, void *expected, gfp_t gfp)
  696. {
  697. XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
  698. long nr = folio_nr_pages(folio);
  699. VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
  700. VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  701. VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
  702. folio_ref_add(folio, nr);
  703. folio->mapping = mapping;
  704. folio->index = index;
  705. gfp &= GFP_RECLAIM_MASK;
  706. folio_throttle_swaprate(folio, gfp);
  707. do {
  708. xas_lock_irq(&xas);
  709. if (expected != xas_find_conflict(&xas)) {
  710. xas_set_err(&xas, -EEXIST);
  711. goto unlock;
  712. }
  713. if (expected && xas_find_conflict(&xas)) {
  714. xas_set_err(&xas, -EEXIST);
  715. goto unlock;
  716. }
  717. xas_store(&xas, folio);
  718. if (xas_error(&xas))
  719. goto unlock;
  720. shmem_update_stats(folio, nr);
  721. mapping->nrpages += nr;
  722. unlock:
  723. xas_unlock_irq(&xas);
  724. } while (xas_nomem(&xas, gfp));
  725. if (xas_error(&xas)) {
  726. folio->mapping = NULL;
  727. folio_ref_sub(folio, nr);
  728. return xas_error(&xas);
  729. }
  730. return 0;
  731. }
  732. /*
  733. * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
  734. */
  735. static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
  736. {
  737. struct address_space *mapping = folio->mapping;
  738. long nr = folio_nr_pages(folio);
  739. int error;
  740. xa_lock_irq(&mapping->i_pages);
  741. error = shmem_replace_entry(mapping, folio->index, folio, radswap);
  742. folio->mapping = NULL;
  743. mapping->nrpages -= nr;
  744. shmem_update_stats(folio, -nr);
  745. xa_unlock_irq(&mapping->i_pages);
  746. folio_put_refs(folio, nr);
  747. BUG_ON(error);
  748. }
  749. /*
  750. * Remove swap entry from page cache, free the swap and its page cache. Returns
  751. * the number of pages being freed. 0 means entry not found in XArray (0 pages
  752. * being freed).
  753. */
  754. static long shmem_free_swap(struct address_space *mapping,
  755. pgoff_t index, void *radswap)
  756. {
  757. int order = xa_get_order(&mapping->i_pages, index);
  758. void *old;
  759. old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
  760. if (old != radswap)
  761. return 0;
  762. free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
  763. return 1 << order;
  764. }
  765. /*
  766. * Determine (in bytes) how many of the shmem object's pages mapped by the
  767. * given offsets are swapped out.
  768. *
  769. * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
  770. * as long as the inode doesn't go away and racy results are not a problem.
  771. */
  772. unsigned long shmem_partial_swap_usage(struct address_space *mapping,
  773. pgoff_t start, pgoff_t end)
  774. {
  775. XA_STATE(xas, &mapping->i_pages, start);
  776. struct page *page;
  777. unsigned long swapped = 0;
  778. unsigned long max = end - 1;
  779. rcu_read_lock();
  780. xas_for_each(&xas, page, max) {
  781. if (xas_retry(&xas, page))
  782. continue;
  783. if (xa_is_value(page))
  784. swapped += 1 << xas_get_order(&xas);
  785. if (xas.xa_index == max)
  786. break;
  787. if (need_resched()) {
  788. xas_pause(&xas);
  789. cond_resched_rcu();
  790. }
  791. }
  792. rcu_read_unlock();
  793. return swapped << PAGE_SHIFT;
  794. }
  795. /*
  796. * Determine (in bytes) how many of the shmem object's pages mapped by the
  797. * given vma is swapped out.
  798. *
  799. * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
  800. * as long as the inode doesn't go away and racy results are not a problem.
  801. */
  802. unsigned long shmem_swap_usage(struct vm_area_struct *vma)
  803. {
  804. struct inode *inode = file_inode(vma->vm_file);
  805. struct shmem_inode_info *info = SHMEM_I(inode);
  806. struct address_space *mapping = inode->i_mapping;
  807. unsigned long swapped;
  808. /* Be careful as we don't hold info->lock */
  809. swapped = READ_ONCE(info->swapped);
  810. /*
  811. * The easier cases are when the shmem object has nothing in swap, or
  812. * the vma maps it whole. Then we can simply use the stats that we
  813. * already track.
  814. */
  815. if (!swapped)
  816. return 0;
  817. if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
  818. return swapped << PAGE_SHIFT;
  819. /* Here comes the more involved part */
  820. return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
  821. vma->vm_pgoff + vma_pages(vma));
  822. }
  823. /*
  824. * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
  825. */
  826. void shmem_unlock_mapping(struct address_space *mapping)
  827. {
  828. struct folio_batch fbatch;
  829. pgoff_t index = 0;
  830. folio_batch_init(&fbatch);
  831. /*
  832. * Minor point, but we might as well stop if someone else SHM_LOCKs it.
  833. */
  834. while (!mapping_unevictable(mapping) &&
  835. filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
  836. check_move_unevictable_folios(&fbatch);
  837. folio_batch_release(&fbatch);
  838. cond_resched();
  839. }
  840. }
  841. static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
  842. {
  843. struct folio *folio;
  844. /*
  845. * At first avoid shmem_get_folio(,,,SGP_READ): that fails
  846. * beyond i_size, and reports fallocated folios as holes.
  847. */
  848. folio = filemap_get_entry(inode->i_mapping, index);
  849. if (!folio)
  850. return folio;
  851. if (!xa_is_value(folio)) {
  852. folio_lock(folio);
  853. if (folio->mapping == inode->i_mapping)
  854. return folio;
  855. /* The folio has been swapped out */
  856. folio_unlock(folio);
  857. folio_put(folio);
  858. }
  859. /*
  860. * But read a folio back from swap if any of it is within i_size
  861. * (although in some cases this is just a waste of time).
  862. */
  863. folio = NULL;
  864. shmem_get_folio(inode, index, 0, &folio, SGP_READ);
  865. return folio;
  866. }
  867. /*
  868. * Remove range of pages and swap entries from page cache, and free them.
  869. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  870. */
  871. static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
  872. bool unfalloc)
  873. {
  874. struct address_space *mapping = inode->i_mapping;
  875. struct shmem_inode_info *info = SHMEM_I(inode);
  876. pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
  877. pgoff_t end = (lend + 1) >> PAGE_SHIFT;
  878. struct folio_batch fbatch;
  879. pgoff_t indices[PAGEVEC_SIZE];
  880. struct folio *folio;
  881. bool same_folio;
  882. long nr_swaps_freed = 0;
  883. pgoff_t index;
  884. int i;
  885. if (lend == -1)
  886. end = -1; /* unsigned, so actually very big */
  887. if (info->fallocend > start && info->fallocend <= end && !unfalloc)
  888. info->fallocend = start;
  889. folio_batch_init(&fbatch);
  890. index = start;
  891. while (index < end && find_lock_entries(mapping, &index, end - 1,
  892. &fbatch, indices)) {
  893. for (i = 0; i < folio_batch_count(&fbatch); i++) {
  894. folio = fbatch.folios[i];
  895. if (xa_is_value(folio)) {
  896. if (unfalloc)
  897. continue;
  898. nr_swaps_freed += shmem_free_swap(mapping,
  899. indices[i], folio);
  900. continue;
  901. }
  902. if (!unfalloc || !folio_test_uptodate(folio))
  903. truncate_inode_folio(mapping, folio);
  904. folio_unlock(folio);
  905. }
  906. folio_batch_remove_exceptionals(&fbatch);
  907. folio_batch_release(&fbatch);
  908. cond_resched();
  909. }
  910. /*
  911. * When undoing a failed fallocate, we want none of the partial folio
  912. * zeroing and splitting below, but shall want to truncate the whole
  913. * folio when !uptodate indicates that it was added by this fallocate,
  914. * even when [lstart, lend] covers only a part of the folio.
  915. */
  916. if (unfalloc)
  917. goto whole_folios;
  918. same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
  919. folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
  920. if (folio) {
  921. same_folio = lend < folio_pos(folio) + folio_size(folio);
  922. folio_mark_dirty(folio);
  923. if (!truncate_inode_partial_folio(folio, lstart, lend)) {
  924. start = folio_next_index(folio);
  925. if (same_folio)
  926. end = folio->index;
  927. }
  928. folio_unlock(folio);
  929. folio_put(folio);
  930. folio = NULL;
  931. }
  932. if (!same_folio)
  933. folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
  934. if (folio) {
  935. folio_mark_dirty(folio);
  936. if (!truncate_inode_partial_folio(folio, lstart, lend))
  937. end = folio->index;
  938. folio_unlock(folio);
  939. folio_put(folio);
  940. }
  941. whole_folios:
  942. index = start;
  943. while (index < end) {
  944. cond_resched();
  945. if (!find_get_entries(mapping, &index, end - 1, &fbatch,
  946. indices)) {
  947. /* If all gone or hole-punch or unfalloc, we're done */
  948. if (index == start || end != -1)
  949. break;
  950. /* But if truncating, restart to make sure all gone */
  951. index = start;
  952. continue;
  953. }
  954. for (i = 0; i < folio_batch_count(&fbatch); i++) {
  955. folio = fbatch.folios[i];
  956. if (xa_is_value(folio)) {
  957. long swaps_freed;
  958. if (unfalloc)
  959. continue;
  960. swaps_freed = shmem_free_swap(mapping, indices[i], folio);
  961. if (!swaps_freed) {
  962. /* Swap was replaced by page: retry */
  963. index = indices[i];
  964. break;
  965. }
  966. nr_swaps_freed += swaps_freed;
  967. continue;
  968. }
  969. folio_lock(folio);
  970. if (!unfalloc || !folio_test_uptodate(folio)) {
  971. if (folio_mapping(folio) != mapping) {
  972. /* Page was replaced by swap: retry */
  973. folio_unlock(folio);
  974. index = indices[i];
  975. break;
  976. }
  977. VM_BUG_ON_FOLIO(folio_test_writeback(folio),
  978. folio);
  979. if (!folio_test_large(folio)) {
  980. truncate_inode_folio(mapping, folio);
  981. } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
  982. /*
  983. * If we split a page, reset the loop so
  984. * that we pick up the new sub pages.
  985. * Otherwise the THP was entirely
  986. * dropped or the target range was
  987. * zeroed, so just continue the loop as
  988. * is.
  989. */
  990. if (!folio_test_large(folio)) {
  991. folio_unlock(folio);
  992. index = start;
  993. break;
  994. }
  995. }
  996. }
  997. folio_unlock(folio);
  998. }
  999. folio_batch_remove_exceptionals(&fbatch);
  1000. folio_batch_release(&fbatch);
  1001. }
  1002. shmem_recalc_inode(inode, 0, -nr_swaps_freed);
  1003. }
  1004. void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
  1005. {
  1006. shmem_undo_range(inode, lstart, lend, false);
  1007. inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
  1008. inode_inc_iversion(inode);
  1009. }
  1010. EXPORT_SYMBOL_GPL(shmem_truncate_range);
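/*
 * Fill in kstat for stat()/statx(): refresh the inode accounting if it
 * looks stale, translate tmpfs fsflags into STATX_ATTR_* bits, report a
 * huge-page block size when the mount allows huge pages, and expose the
 * creation time as btime.
 */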
  1011. static int shmem_getattr(struct mnt_idmap *idmap,
  1012. const struct path *path, struct kstat *stat,
  1013. u32 request_mask, unsigned int query_flags)
  1014. {
  1015. struct inode *inode = path->dentry->d_inode;
  1016. struct shmem_inode_info *info = SHMEM_I(inode);
  1017. if (info->alloced - info->swapped != inode->i_mapping->nrpages)
  1018. shmem_recalc_inode(inode, 0, 0);
  1019. if (info->fsflags & FS_APPEND_FL)
  1020. stat->attributes |= STATX_ATTR_APPEND;
  1021. if (info->fsflags & FS_IMMUTABLE_FL)
  1022. stat->attributes |= STATX_ATTR_IMMUTABLE;
  1023. if (info->fsflags & FS_NODUMP_FL)
  1024. stat->attributes |= STATX_ATTR_NODUMP;
  1025. stat->attributes_mask |= (STATX_ATTR_APPEND |
  1026. STATX_ATTR_IMMUTABLE |
  1027. STATX_ATTR_NODUMP);
  1028. generic_fillattr(idmap, request_mask, inode, stat);
  1029. if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
  1030. stat->blksize = HPAGE_PMD_SIZE;
  1031. if (request_mask & STATX_BTIME) {
  1032. stat->result_mask |= STATX_BTIME;
  1033. stat->btime.tv_sec = info->i_crtime.tv_sec;
  1034. stat->btime.tv_nsec = info->i_crtime.tv_nsec;
  1035. }
  1036. return 0;
  1037. }
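/*
 * Handle chmod/chown/truncate: honour F_SEAL_EXEC, F_SEAL_SHRINK and
 * F_SEAL_GROW, re-account a size change, unmap and truncate everything
 * beyond the new EOF, and transfer quota when the owner changes.
 */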
  1038. static int shmem_setattr(struct mnt_idmap *idmap,
  1039. struct dentry *dentry, struct iattr *attr)
  1040. {
  1041. struct inode *inode = d_inode(dentry);
  1042. struct shmem_inode_info *info = SHMEM_I(inode);
  1043. int error;
  1044. bool update_mtime = false;
  1045. bool update_ctime = true;
  1046. error = setattr_prepare(idmap, dentry, attr);
  1047. if (error)
  1048. return error;
  1049. if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
  1050. if ((inode->i_mode ^ attr->ia_mode) & 0111) {
  1051. return -EPERM;
  1052. }
  1053. }
  1054. if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
  1055. loff_t oldsize = inode->i_size;
  1056. loff_t newsize = attr->ia_size;
  1057. /* protected by i_rwsem */
  1058. if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
  1059. (newsize > oldsize && (info->seals & F_SEAL_GROW)))
  1060. return -EPERM;
  1061. if (newsize != oldsize) {
  1062. error = shmem_reacct_size(SHMEM_I(inode)->flags,
  1063. oldsize, newsize);
  1064. if (error)
  1065. return error;
  1066. i_size_write(inode, newsize);
  1067. update_mtime = true;
  1068. } else {
  1069. update_ctime = false;
  1070. }
  1071. if (newsize <= oldsize) {
  1072. loff_t holebegin = round_up(newsize, PAGE_SIZE);
  1073. if (oldsize > holebegin)
  1074. unmap_mapping_range(inode->i_mapping,
  1075. holebegin, 0, 1);
  1076. if (info->alloced)
  1077. shmem_truncate_range(inode,
  1078. newsize, (loff_t)-1);
  1079. /* unmap again to remove racily COWed private pages */
  1080. if (oldsize > holebegin)
  1081. unmap_mapping_range(inode->i_mapping,
  1082. holebegin, 0, 1);
  1083. }
  1084. }
  1085. if (is_quota_modification(idmap, inode, attr)) {
  1086. error = dquot_initialize(inode);
  1087. if (error)
  1088. return error;
  1089. }
  1090. /* Transfer quota accounting */
  1091. if (i_uid_needs_update(idmap, attr, inode) ||
  1092. i_gid_needs_update(idmap, attr, inode)) {
  1093. error = dquot_transfer(idmap, inode, attr);
  1094. if (error)
  1095. return error;
  1096. }
  1097. setattr_copy(idmap, inode, attr);
  1098. if (attr->ia_valid & ATTR_MODE)
  1099. error = posix_acl_chmod(idmap, dentry, inode->i_mode);
  1100. if (!error && update_ctime) {
  1101. inode_set_ctime_current(inode);
  1102. if (update_mtime)
  1103. inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
  1104. inode_inc_iversion(inode);
  1105. }
  1106. return error;
  1107. }
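/*
 * Final teardown of a shmem/tmpfs inode: drop the size accounting,
 * truncate everything, leave the shrinklist and (once shmem_unuse() has
 * finished with this inode) the swaplist, then free the xattrs and
 * release the reserved inode.
 */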
  1108. static void shmem_evict_inode(struct inode *inode)
  1109. {
  1110. struct shmem_inode_info *info = SHMEM_I(inode);
  1111. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1112. size_t freed = 0;
  1113. if (shmem_mapping(inode->i_mapping)) {
  1114. shmem_unacct_size(info->flags, inode->i_size);
  1115. inode->i_size = 0;
  1116. mapping_set_exiting(inode->i_mapping);
  1117. shmem_truncate_range(inode, 0, (loff_t)-1);
  1118. if (!list_empty(&info->shrinklist)) {
  1119. spin_lock(&sbinfo->shrinklist_lock);
  1120. if (!list_empty(&info->shrinklist)) {
  1121. list_del_init(&info->shrinklist);
  1122. sbinfo->shrinklist_len--;
  1123. }
  1124. spin_unlock(&sbinfo->shrinklist_lock);
  1125. }
  1126. while (!list_empty(&info->swaplist)) {
  1127. /* Wait while shmem_unuse() is scanning this inode... */
  1128. wait_var_event(&info->stop_eviction,
  1129. !atomic_read(&info->stop_eviction));
  1130. mutex_lock(&shmem_swaplist_mutex);
  1131. /* ...but beware of the race if we peeked too early */
  1132. if (!atomic_read(&info->stop_eviction))
  1133. list_del_init(&info->swaplist);
  1134. mutex_unlock(&shmem_swaplist_mutex);
  1135. }
  1136. }
  1137. simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
  1138. shmem_free_inode(inode->i_sb, freed);
  1139. WARN_ON(inode->i_blocks);
  1140. clear_inode(inode);
  1141. #ifdef CONFIG_TMPFS_QUOTA
  1142. dquot_free_inode(inode);
  1143. dquot_drop(inode);
  1144. #endif
  1145. }
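/*
 * Scan the mapping from 'start' for swap entries belonging to the swap
 * device 'type', collecting a batch of them (and their indices) for
 * shmem_unuse_swap_entries() to bring back into the page cache.
 */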
  1146. static int shmem_find_swap_entries(struct address_space *mapping,
  1147. pgoff_t start, struct folio_batch *fbatch,
  1148. pgoff_t *indices, unsigned int type)
  1149. {
  1150. XA_STATE(xas, &mapping->i_pages, start);
  1151. struct folio *folio;
  1152. swp_entry_t entry;
  1153. rcu_read_lock();
  1154. xas_for_each(&xas, folio, ULONG_MAX) {
  1155. if (xas_retry(&xas, folio))
  1156. continue;
  1157. if (!xa_is_value(folio))
  1158. continue;
  1159. entry = radix_to_swp_entry(folio);
  1160. /*
  1161. * swapin error entries can be found in the mapping. But they're
  1162. * deliberately ignored here as we've done everything we can do.
  1163. */
  1164. if (swp_type(entry) != type)
  1165. continue;
  1166. indices[folio_batch_count(fbatch)] = xas.xa_index;
  1167. if (!folio_batch_add(fbatch, folio))
  1168. break;
  1169. if (need_resched()) {
  1170. xas_pause(&xas);
  1171. cond_resched_rcu();
  1172. }
  1173. }
  1174. rcu_read_unlock();
  1175. return xas.xa_index;
  1176. }
  1177. /*
  1178. * Move the swapped pages for an inode to page cache. Returns the count
  1179. * of pages swapped in, or the error in case of failure.
  1180. */
  1181. static int shmem_unuse_swap_entries(struct inode *inode,
  1182. struct folio_batch *fbatch, pgoff_t *indices)
  1183. {
  1184. int i = 0;
  1185. int ret = 0;
  1186. int error = 0;
  1187. struct address_space *mapping = inode->i_mapping;
  1188. for (i = 0; i < folio_batch_count(fbatch); i++) {
  1189. struct folio *folio = fbatch->folios[i];
  1190. if (!xa_is_value(folio))
  1191. continue;
  1192. error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
  1193. mapping_gfp_mask(mapping), NULL, NULL);
  1194. if (error == 0) {
  1195. folio_unlock(folio);
  1196. folio_put(folio);
  1197. ret++;
  1198. }
  1199. if (error == -ENOMEM)
  1200. break;
  1201. error = 0;
  1202. }
  1203. return error ? error : ret;
  1204. }
  1205. /*
  1206. * If swap found in inode, free it and move page from swapcache to filecache.
  1207. */
  1208. static int shmem_unuse_inode(struct inode *inode, unsigned int type)
  1209. {
  1210. struct address_space *mapping = inode->i_mapping;
  1211. pgoff_t start = 0;
  1212. struct folio_batch fbatch;
  1213. pgoff_t indices[PAGEVEC_SIZE];
  1214. int ret = 0;
  1215. do {
  1216. folio_batch_init(&fbatch);
  1217. shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
  1218. if (folio_batch_count(&fbatch) == 0) {
  1219. ret = 0;
  1220. break;
  1221. }
  1222. ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
  1223. if (ret < 0)
  1224. break;
  1225. start = indices[folio_batch_count(&fbatch) - 1];
  1226. } while (true);
  1227. return ret;
  1228. }
  1229. /*
  1230. * Read all the shared memory data that resides in the swap
  1231. * device 'type' back into memory, so the swap device can be
  1232. * unused.
  1233. */
  1234. int shmem_unuse(unsigned int type)
  1235. {
  1236. struct shmem_inode_info *info, *next;
  1237. int error = 0;
  1238. if (list_empty(&shmem_swaplist))
  1239. return 0;
  1240. mutex_lock(&shmem_swaplist_mutex);
  1241. list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
  1242. if (!info->swapped) {
  1243. list_del_init(&info->swaplist);
  1244. continue;
  1245. }
  1246. /*
  1247. * Drop the swaplist mutex while searching the inode for swap;
  1248. * but before doing so, make sure shmem_evict_inode() will not
  1249. * remove placeholder inode from swaplist, nor let it be freed
  1250. * (igrab() would protect from unlink, but not from unmount).
  1251. */
  1252. atomic_inc(&info->stop_eviction);
  1253. mutex_unlock(&shmem_swaplist_mutex);
  1254. error = shmem_unuse_inode(&info->vfs_inode, type);
  1255. cond_resched();
  1256. mutex_lock(&shmem_swaplist_mutex);
  1257. next = list_next_entry(info, swaplist);
  1258. if (!info->swapped)
  1259. list_del_init(&info->swaplist);
  1260. if (atomic_dec_and_test(&info->stop_eviction))
  1261. wake_up_var(&info->stop_eviction);
  1262. if (error)
  1263. break;
  1264. }
  1265. mutex_unlock(&shmem_swaplist_mutex);
  1266. return error;
  1267. }
  1268. /*
  1269. * Move the page from the page cache to the swap cache.
  1270. */
  1271. static int shmem_writepage(struct page *page, struct writeback_control *wbc)
  1272. {
  1273. struct folio *folio = page_folio(page);
  1274. struct address_space *mapping = folio->mapping;
  1275. struct inode *inode = mapping->host;
  1276. struct shmem_inode_info *info = SHMEM_I(inode);
  1277. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1278. swp_entry_t swap;
  1279. pgoff_t index;
  1280. int nr_pages;
  1281. bool split = false;
  1282. /*
  1283. * Our capabilities prevent regular writeback or sync from ever calling
  1284. * shmem_writepage; but a stacking filesystem might use ->writepage of
  1285. * its underlying filesystem, in which case tmpfs should write out to
  1286. * swap only in response to memory pressure, and not for the writeback
  1287. * threads or sync.
  1288. */
  1289. if (WARN_ON_ONCE(!wbc->for_reclaim))
  1290. goto redirty;
  1291. if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
  1292. goto redirty;
  1293. if (!total_swap_pages)
  1294. goto redirty;
  1295. /*
  1296. * If CONFIG_THP_SWAP is not enabled, the large folio should be
  1297. * split when swapping.
  1298. *
  1299. * And shrinkage of pages beyond i_size does not split swap, so
  1300. * swapout of a large folio crossing i_size needs to split too
  1301. * (unless fallocate has been used to preallocate beyond EOF).
  1302. */
  1303. if (folio_test_large(folio)) {
  1304. index = shmem_fallocend(inode,
  1305. DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
  1306. if ((index > folio->index && index < folio_next_index(folio)) ||
  1307. !IS_ENABLED(CONFIG_THP_SWAP))
  1308. split = true;
  1309. }
  1310. if (split) {
  1311. try_split:
  1312. /* Ensure the subpages are still dirty */
  1313. folio_test_set_dirty(folio);
  1314. if (split_huge_page_to_list_to_order(page, wbc->list, 0))
  1315. goto redirty;
  1316. folio = page_folio(page);
  1317. folio_clear_dirty(folio);
  1318. }
  1319. index = folio->index;
  1320. nr_pages = folio_nr_pages(folio);
  1321. /*
  1322. * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
  1323. * value into swapfile.c, the only way we can correctly account for a
  1324. * fallocated folio arriving here is now to initialize it and write it.
  1325. *
  1326. * That's okay for a folio already fallocated earlier, but if we have
  1327. * not yet completed the fallocation, then (a) we want to keep track
  1328. * of this folio in case we have to undo it, and (b) it may not be a
  1329. * good idea to continue anyway, once we're pushing into swap. So
  1330. * reactivate the folio, and let shmem_fallocate() quit when too many.
  1331. */
  1332. if (!folio_test_uptodate(folio)) {
  1333. if (inode->i_private) {
  1334. struct shmem_falloc *shmem_falloc;
  1335. spin_lock(&inode->i_lock);
  1336. shmem_falloc = inode->i_private;
  1337. if (shmem_falloc &&
  1338. !shmem_falloc->waitq &&
  1339. index >= shmem_falloc->start &&
  1340. index < shmem_falloc->next)
  1341. shmem_falloc->nr_unswapped += nr_pages;
  1342. else
  1343. shmem_falloc = NULL;
  1344. spin_unlock(&inode->i_lock);
  1345. if (shmem_falloc)
  1346. goto redirty;
  1347. }
  1348. folio_zero_range(folio, 0, folio_size(folio));
  1349. flush_dcache_folio(folio);
  1350. folio_mark_uptodate(folio);
  1351. }
  1352. swap = folio_alloc_swap(folio);
  1353. if (!swap.val) {
  1354. if (nr_pages > 1)
  1355. goto try_split;
  1356. goto redirty;
  1357. }
  1358. /*
  1359. * Add inode to shmem_unuse()'s list of swapped-out inodes,
  1360. * if it's not already there. Do it now before the folio is
  1361. * moved to swap cache, when its pagelock no longer protects
  1362. * the inode from eviction. But don't unlock the mutex until
  1363. * we've incremented swapped, because shmem_unuse_inode() will
  1364. * prune a !swapped inode from the swaplist under this mutex.
  1365. */
  1366. mutex_lock(&shmem_swaplist_mutex);
  1367. if (list_empty(&info->swaplist))
  1368. list_add(&info->swaplist, &shmem_swaplist);
  1369. if (add_to_swap_cache(folio, swap,
  1370. __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
  1371. NULL) == 0) {
  1372. shmem_recalc_inode(inode, 0, nr_pages);
  1373. swap_shmem_alloc(swap, nr_pages);
  1374. shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
  1375. mutex_unlock(&shmem_swaplist_mutex);
  1376. BUG_ON(folio_mapped(folio));
  1377. return swap_writepage(&folio->page, wbc);
  1378. }
  1379. mutex_unlock(&shmem_swaplist_mutex);
  1380. put_swap_folio(folio, swap);
  1381. redirty:
  1382. folio_mark_dirty(folio);
  1383. if (wbc->for_reclaim)
  1384. return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
  1385. folio_unlock(folio);
  1386. return 0;
  1387. }
  1388. #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
  1389. static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
  1390. {
  1391. char buffer[64];
  1392. if (!mpol || mpol->mode == MPOL_DEFAULT)
  1393. return; /* show nothing */
  1394. mpol_to_str(buffer, sizeof(buffer), mpol);
  1395. seq_printf(seq, ",mpol=%s", buffer);
  1396. }
  1397. static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  1398. {
  1399. struct mempolicy *mpol = NULL;
  1400. if (sbinfo->mpol) {
  1401. raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
  1402. mpol = sbinfo->mpol;
  1403. mpol_get(mpol);
  1404. raw_spin_unlock(&sbinfo->stat_lock);
  1405. }
  1406. return mpol;
  1407. }
  1408. #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
  1409. static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
  1410. {
  1411. }
  1412. static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  1413. {
  1414. return NULL;
  1415. }
  1416. #endif /* CONFIG_NUMA && CONFIG_TMPFS */
  1417. static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
  1418. pgoff_t index, unsigned int order, pgoff_t *ilx);
  1419. static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
  1420. struct shmem_inode_info *info, pgoff_t index)
  1421. {
  1422. struct mempolicy *mpol;
  1423. pgoff_t ilx;
  1424. struct folio *folio;
  1425. mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
  1426. folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
  1427. mpol_cond_put(mpol);
  1428. return folio;
  1429. }
  1430. /*
  1431. * Make sure huge_gfp is always more limited than limit_gfp.
  1432. * Some of the flags set permissions, while others set limitations.
  1433. */
  1434. static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
  1435. {
  1436. gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
  1437. gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
  1438. gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
  1439. gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
  1440. /* Allow allocations only from the originally specified zones. */
  1441. result |= zoneflags;
  1442. /*
  1443. * Minimize the result gfp by taking the union with the deny flags,
  1444. * and the intersection of the allow flags.
  1445. */
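/*
 * For example: __GFP_IO ends up in the result only when both huge_gfp
 * and limit_gfp allow it, __GFP_NORETRY ends up in the result whenever
 * either mask set it, and the zone bits are always taken from limit_gfp.
 */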
  1446. result |= (limit_gfp & denyflags);
  1447. result |= (huge_gfp & limit_gfp) & allowflags;
  1448. return result;
  1449. }
  1450. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1451. unsigned long shmem_allowable_huge_orders(struct inode *inode,
  1452. struct vm_area_struct *vma, pgoff_t index,
  1453. loff_t write_end, bool shmem_huge_force)
  1454. {
  1455. unsigned long mask = READ_ONCE(huge_shmem_orders_always);
  1456. unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
  1457. unsigned long vm_flags = vma ? vma->vm_flags : 0;
  1458. pgoff_t aligned_index;
  1459. bool global_huge;
  1460. loff_t i_size;
  1461. int order;
  1462. if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
  1463. return 0;
  1464. global_huge = shmem_huge_global_enabled(inode, index, write_end,
  1465. shmem_huge_force, vma, vm_flags);
  1466. if (!vma || !vma_is_anon_shmem(vma)) {
  1467. /*
  1468. * For tmpfs, we now only support PMD sized THP if huge page
  1469. * is enabled, otherwise fallback to order 0.
  1470. */
  1471. return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
  1472. }
  1473. /*
  1474. * Following the 'deny' semantics of the top level, force the huge
  1475. * option off from all mounts.
  1476. */
  1477. if (shmem_huge == SHMEM_HUGE_DENY)
  1478. return 0;
  1479. /*
  1480. * Only allow inherit orders if the top-level value is 'force', which
  1481. * means non-PMD sized THP can not override 'huge' mount option now.
  1482. */
  1483. if (shmem_huge == SHMEM_HUGE_FORCE)
  1484. return READ_ONCE(huge_shmem_orders_inherit);
  1485. /* Allow mTHP that will be fully within i_size. */
  1486. order = highest_order(within_size_orders);
  1487. while (within_size_orders) {
  1488. aligned_index = round_up(index + 1, 1 << order);
  1489. i_size = round_up(i_size_read(inode), PAGE_SIZE);
  1490. if (i_size >> PAGE_SHIFT >= aligned_index) {
  1491. mask |= within_size_orders;
  1492. break;
  1493. }
  1494. order = next_order(&within_size_orders, order);
  1495. }
  1496. if (vm_flags & VM_HUGEPAGE)
  1497. mask |= READ_ONCE(huge_shmem_orders_madvise);
  1498. if (global_huge)
  1499. mask |= READ_ONCE(huge_shmem_orders_inherit);
  1500. return THP_ORDERS_ALL_FILE_DEFAULT & mask;
  1501. }
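/*
 * Filter the allowable orders down to those that fit the faulting vma
 * (if any) and whose naturally aligned extent in the page cache is still
 * entirely empty, so that a large folio can be added without conflict.
 */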
  1502. static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
  1503. struct address_space *mapping, pgoff_t index,
  1504. unsigned long orders)
  1505. {
  1506. struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
  1507. pgoff_t aligned_index;
  1508. unsigned long pages;
  1509. int order;
  1510. if (vma) {
  1511. orders = thp_vma_suitable_orders(vma, vmf->address, orders);
  1512. if (!orders)
  1513. return 0;
  1514. }
  1515. /* Find the highest order that can add into the page cache */
  1516. order = highest_order(orders);
  1517. while (orders) {
  1518. pages = 1UL << order;
  1519. aligned_index = round_down(index, pages);
  1520. /*
  1521. * Check for conflict before waiting on a huge allocation.
  1522. * Conflict might be that a huge page has just been allocated
  1523. * and added to page cache by a racing thread, or that there
  1524. * is already at least one small page in the huge extent.
  1525. * Be careful to retry when appropriate, but not forever!
  1526. * Elsewhere -EEXIST would be the right code, but not here.
  1527. */
  1528. if (!xa_find(&mapping->i_pages, &aligned_index,
  1529. aligned_index + pages - 1, XA_PRESENT))
  1530. break;
  1531. order = next_order(&orders, order);
  1532. }
  1533. return orders;
  1534. }
  1535. #else
  1536. static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
  1537. struct address_space *mapping, pgoff_t index,
  1538. unsigned long orders)
  1539. {
  1540. return 0;
  1541. }
  1542. #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
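/*
 * Allocate a folio of the requested order, applying the NUMA mempolicy
 * chosen by shmem_get_pgoff_policy() for this inode and index.
 */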
  1543. static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
  1544. struct shmem_inode_info *info, pgoff_t index)
  1545. {
  1546. struct mempolicy *mpol;
  1547. pgoff_t ilx;
  1548. struct folio *folio;
  1549. mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
  1550. folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
  1551. mpol_cond_put(mpol);
  1552. return folio;
  1553. }
  1554. static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
  1555. gfp_t gfp, struct inode *inode, pgoff_t index,
  1556. struct mm_struct *fault_mm, unsigned long orders)
  1557. {
  1558. struct address_space *mapping = inode->i_mapping;
  1559. struct shmem_inode_info *info = SHMEM_I(inode);
  1560. unsigned long suitable_orders = 0;
  1561. struct folio *folio = NULL;
  1562. long pages;
  1563. int error, order;
  1564. if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
  1565. orders = 0;
  1566. if (orders > 0) {
  1567. suitable_orders = shmem_suitable_orders(inode, vmf,
  1568. mapping, index, orders);
  1569. order = highest_order(suitable_orders);
  1570. while (suitable_orders) {
  1571. pages = 1UL << order;
  1572. index = round_down(index, pages);
  1573. folio = shmem_alloc_folio(gfp, order, info, index);
  1574. if (folio)
  1575. goto allocated;
  1576. if (pages == HPAGE_PMD_NR)
  1577. count_vm_event(THP_FILE_FALLBACK);
  1578. count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
  1579. order = next_order(&suitable_orders, order);
  1580. }
  1581. } else {
  1582. pages = 1;
  1583. folio = shmem_alloc_folio(gfp, 0, info, index);
  1584. }
  1585. if (!folio)
  1586. return ERR_PTR(-ENOMEM);
  1587. allocated:
  1588. __folio_set_locked(folio);
  1589. __folio_set_swapbacked(folio);
  1590. gfp &= GFP_RECLAIM_MASK;
  1591. error = mem_cgroup_charge(folio, fault_mm, gfp);
  1592. if (error) {
  1593. if (xa_find(&mapping->i_pages, &index,
  1594. index + pages - 1, XA_PRESENT)) {
  1595. error = -EEXIST;
  1596. } else if (pages > 1) {
  1597. if (pages == HPAGE_PMD_NR) {
  1598. count_vm_event(THP_FILE_FALLBACK);
  1599. count_vm_event(THP_FILE_FALLBACK_CHARGE);
  1600. }
  1601. count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
  1602. count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
  1603. }
  1604. goto unlock;
  1605. }
  1606. error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
  1607. if (error)
  1608. goto unlock;
  1609. error = shmem_inode_acct_blocks(inode, pages);
  1610. if (error) {
  1611. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1612. long freed;
  1613. /*
  1614. * Try to reclaim some space by splitting a few
  1615. * large folios beyond i_size on the filesystem.
  1616. */
  1617. shmem_unused_huge_shrink(sbinfo, NULL, pages);
  1618. /*
  1619. * And do a shmem_recalc_inode() to account for freed pages:
  1620. * except our folio is there in cache, so not quite balanced.
  1621. */
  1622. spin_lock(&info->lock);
  1623. freed = pages + info->alloced - info->swapped -
  1624. READ_ONCE(mapping->nrpages);
  1625. if (freed > 0)
  1626. info->alloced -= freed;
  1627. spin_unlock(&info->lock);
  1628. if (freed > 0)
  1629. shmem_inode_unacct_blocks(inode, freed);
  1630. error = shmem_inode_acct_blocks(inode, pages);
  1631. if (error) {
  1632. filemap_remove_folio(folio);
  1633. goto unlock;
  1634. }
  1635. }
  1636. shmem_recalc_inode(inode, pages, 0);
  1637. folio_add_lru(folio);
  1638. return folio;
  1639. unlock:
  1640. folio_unlock(folio);
  1641. folio_put(folio);
  1642. return ERR_PTR(error);
  1643. }
  1644. /*
  1645. * When a page is moved from swapcache to shmem filecache (either by the
  1646. * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
  1647. * shmem_unuse_inode()), it may have been read in earlier from swap, in
  1648. * ignorance of the mapping it belongs to. If that mapping has special
  1649. * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
  1650. * we may need to copy to a suitable page before moving to filecache.
  1651. *
  1652. * In a future release, this may well be extended to respect cpuset and
  1653. * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
  1654. * but for now it is a simple matter of zone.
  1655. */
  1656. static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
  1657. {
  1658. return folio_zonenum(folio) > gfp_zone(gfp);
  1659. }
  1660. static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
  1661. struct shmem_inode_info *info, pgoff_t index,
  1662. struct vm_area_struct *vma)
  1663. {
  1664. struct folio *new, *old = *foliop;
  1665. swp_entry_t entry = old->swap;
  1666. struct address_space *swap_mapping = swap_address_space(entry);
  1667. pgoff_t swap_index = swap_cache_index(entry);
  1668. XA_STATE(xas, &swap_mapping->i_pages, swap_index);
  1669. int nr_pages = folio_nr_pages(old);
  1670. int error = 0, i;
  1671. /*
  1672. * We have arrived here because our zones are constrained, so don't
  1673. * limit chance of success by further cpuset and node constraints.
  1674. */
  1675. gfp &= ~GFP_CONSTRAINT_MASK;
  1676. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1677. if (nr_pages > 1) {
  1678. gfp_t huge_gfp = vma_thp_gfp_mask(vma);
  1679. gfp = limit_gfp_mask(huge_gfp, gfp);
  1680. }
  1681. #endif
  1682. new = shmem_alloc_folio(gfp, folio_order(old), info, index);
  1683. if (!new)
  1684. return -ENOMEM;
  1685. folio_ref_add(new, nr_pages);
  1686. folio_copy(new, old);
  1687. flush_dcache_folio(new);
  1688. __folio_set_locked(new);
  1689. __folio_set_swapbacked(new);
  1690. folio_mark_uptodate(new);
  1691. new->swap = entry;
  1692. folio_set_swapcache(new);
  1693. /* Swap cache still stores N entries instead of a high-order entry */
  1694. xa_lock_irq(&swap_mapping->i_pages);
  1695. for (i = 0; i < nr_pages; i++) {
  1696. void *item = xas_load(&xas);
  1697. if (item != old) {
  1698. error = -ENOENT;
  1699. break;
  1700. }
  1701. xas_store(&xas, new);
  1702. xas_next(&xas);
  1703. }
  1704. if (!error) {
  1705. mem_cgroup_replace_folio(old, new);
  1706. shmem_update_stats(new, nr_pages);
  1707. shmem_update_stats(old, -nr_pages);
  1708. }
  1709. xa_unlock_irq(&swap_mapping->i_pages);
  1710. if (unlikely(error)) {
  1711. /*
  1712. * Is this possible? I think not, now that our callers
  1713. * check both the swapcache flag and folio->private
  1714. * after getting the folio lock; but be defensive.
  1715. * Reverse old to newpage for clear and free.
  1716. */
  1717. old = new;
  1718. } else {
  1719. folio_add_lru(new);
  1720. *foliop = new;
  1721. }
  1722. folio_clear_swapcache(old);
  1723. old->private = NULL;
  1724. folio_unlock(old);
  1725. /*
1726. * The old folio has been removed from the swap cache: drop its 'nr_pages'
1727. * references, as well as the one temporary reference taken from the swap
1728. * cache.
  1729. */
  1730. folio_put_refs(old, nr_pages + 1);
  1731. return error;
  1732. }
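/*
 * A swapin read of this entry failed: replace the swap entry in the
 * mapping with a poisoned entry, drop the folio from the swap cache and
 * free its swap, so the error is reported as -EIO rather than retried
 * forever.
 */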
  1733. static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
  1734. struct folio *folio, swp_entry_t swap)
  1735. {
  1736. struct address_space *mapping = inode->i_mapping;
  1737. swp_entry_t swapin_error;
  1738. void *old;
  1739. int nr_pages;
  1740. swapin_error = make_poisoned_swp_entry();
  1741. old = xa_cmpxchg_irq(&mapping->i_pages, index,
  1742. swp_to_radix_entry(swap),
  1743. swp_to_radix_entry(swapin_error), 0);
  1744. if (old != swp_to_radix_entry(swap))
  1745. return;
  1746. nr_pages = folio_nr_pages(folio);
  1747. folio_wait_writeback(folio);
  1748. delete_from_swap_cache(folio);
  1749. /*
  1750. * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
  1751. * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
  1752. * in shmem_evict_inode().
  1753. */
  1754. shmem_recalc_inode(inode, -nr_pages, -nr_pages);
  1755. swap_free_nr(swap, nr_pages);
  1756. }
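/*
 * Split a large swap entry stored in the page cache into order-0
 * entries, re-storing one contiguous swap entry per index across the
 * old range. Returns the order that was split (0 if no split was
 * needed), or a negative error.
 */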
  1757. static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
  1758. swp_entry_t swap, gfp_t gfp)
  1759. {
  1760. struct address_space *mapping = inode->i_mapping;
  1761. XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
  1762. void *alloced_shadow = NULL;
  1763. int alloced_order = 0, i;
  1764. /* Convert user data gfp flags to xarray node gfp flags */
  1765. gfp &= GFP_RECLAIM_MASK;
  1766. for (;;) {
  1767. int order = -1, split_order = 0;
  1768. void *old = NULL;
  1769. xas_lock_irq(&xas);
  1770. old = xas_load(&xas);
  1771. if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
  1772. xas_set_err(&xas, -EEXIST);
  1773. goto unlock;
  1774. }
  1775. order = xas_get_order(&xas);
  1776. /* Swap entry may have changed before we re-acquire the lock */
  1777. if (alloced_order &&
  1778. (old != alloced_shadow || order != alloced_order)) {
  1779. xas_destroy(&xas);
  1780. alloced_order = 0;
  1781. }
  1782. /* Try to split large swap entry in pagecache */
  1783. if (order > 0) {
  1784. if (!alloced_order) {
  1785. split_order = order;
  1786. goto unlock;
  1787. }
  1788. xas_split(&xas, old, order);
  1789. /*
1790. * Re-store the swap entries after splitting: the swap offsets of the
1791. * original large entry are contiguous, one per sub-page.
  1792. */
  1793. for (i = 0; i < 1 << order; i++) {
  1794. pgoff_t aligned_index = round_down(index, 1 << order);
  1795. swp_entry_t tmp;
  1796. tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
  1797. __xa_store(&mapping->i_pages, aligned_index + i,
  1798. swp_to_radix_entry(tmp), 0);
  1799. }
  1800. }
  1801. unlock:
  1802. xas_unlock_irq(&xas);
  1803. /* split needed, alloc here and retry. */
  1804. if (split_order) {
  1805. xas_split_alloc(&xas, old, split_order, gfp);
  1806. if (xas_error(&xas))
  1807. goto error;
  1808. alloced_shadow = old;
  1809. alloced_order = split_order;
  1810. xas_reset(&xas);
  1811. continue;
  1812. }
  1813. if (!xas_nomem(&xas, gfp))
  1814. break;
  1815. }
  1816. error:
  1817. if (xas_error(&xas))
  1818. return xas_error(&xas);
  1819. return alloced_order;
  1820. }
  1821. /*
  1822. * Swap in the folio pointed to by *foliop.
  1823. * Caller has to make sure that *foliop contains a valid swapped folio.
  1824. * Returns 0 and the folio in foliop if success. On failure, returns the
  1825. * error code and NULL in *foliop.
  1826. */
  1827. static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  1828. struct folio **foliop, enum sgp_type sgp,
  1829. gfp_t gfp, struct vm_area_struct *vma,
  1830. vm_fault_t *fault_type)
  1831. {
  1832. struct address_space *mapping = inode->i_mapping;
  1833. struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
  1834. struct shmem_inode_info *info = SHMEM_I(inode);
  1835. struct swap_info_struct *si;
  1836. struct folio *folio = NULL;
  1837. swp_entry_t swap;
  1838. int error, nr_pages;
  1839. VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
  1840. swap = radix_to_swp_entry(*foliop);
  1841. *foliop = NULL;
  1842. if (is_poisoned_swp_entry(swap))
  1843. return -EIO;
  1844. si = get_swap_device(swap);
  1845. if (!si) {
  1846. if (!shmem_confirm_swap(mapping, index, swap))
  1847. return -EEXIST;
  1848. else
  1849. return -EINVAL;
  1850. }
1851. /* Look it up and read it in... */
  1852. folio = swap_cache_get_folio(swap, NULL, 0);
  1853. if (!folio) {
  1854. int split_order;
  1855. /* Or update major stats only when swapin succeeds?? */
  1856. if (fault_type) {
  1857. *fault_type |= VM_FAULT_MAJOR;
  1858. count_vm_event(PGMAJFAULT);
  1859. count_memcg_event_mm(fault_mm, PGMAJFAULT);
  1860. }
  1861. /*
1862. * Swap devices can currently only swap in order-0 folios, so the
1863. * large swap entry stored in the pagecache must be split first,
1864. * if there is one.
  1865. */
  1866. split_order = shmem_split_large_entry(inode, index, swap, gfp);
  1867. if (split_order < 0) {
  1868. error = split_order;
  1869. goto failed;
  1870. }
  1871. /*
1872. * If the large swap entry has been split, recalculate the swap
1873. * entry for this index from its offset within the old
1874. * order-aligned range.
  1875. */
  1876. if (split_order > 0) {
  1877. pgoff_t offset = index - round_down(index, 1 << split_order);
  1878. swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
  1879. }
  1880. /* Here we actually start the io */
  1881. folio = shmem_swapin_cluster(swap, gfp, info, index);
  1882. if (!folio) {
  1883. error = -ENOMEM;
  1884. goto failed;
  1885. }
  1886. }
  1887. /* We have to do this with folio locked to prevent races */
  1888. folio_lock(folio);
  1889. if (!folio_test_swapcache(folio) ||
  1890. folio->swap.val != swap.val ||
  1891. !shmem_confirm_swap(mapping, index, swap)) {
  1892. error = -EEXIST;
  1893. goto unlock;
  1894. }
  1895. if (!folio_test_uptodate(folio)) {
  1896. error = -EIO;
  1897. goto failed;
  1898. }
  1899. folio_wait_writeback(folio);
  1900. nr_pages = folio_nr_pages(folio);
  1901. /*
  1902. * Some architectures may have to restore extra metadata to the
  1903. * folio after reading from swap.
  1904. */
  1905. arch_swap_restore(folio_swap(swap, folio), folio);
  1906. if (shmem_should_replace_folio(folio, gfp)) {
  1907. error = shmem_replace_folio(&folio, gfp, info, index, vma);
  1908. if (error)
  1909. goto failed;
  1910. }
  1911. error = shmem_add_to_page_cache(folio, mapping,
  1912. round_down(index, nr_pages),
  1913. swp_to_radix_entry(swap), gfp);
  1914. if (error)
  1915. goto failed;
  1916. shmem_recalc_inode(inode, 0, -nr_pages);
  1917. if (sgp == SGP_WRITE)
  1918. folio_mark_accessed(folio);
  1919. delete_from_swap_cache(folio);
  1920. folio_mark_dirty(folio);
  1921. swap_free_nr(swap, nr_pages);
  1922. put_swap_device(si);
  1923. *foliop = folio;
  1924. return 0;
  1925. failed:
  1926. if (!shmem_confirm_swap(mapping, index, swap))
  1927. error = -EEXIST;
  1928. if (error == -EIO)
  1929. shmem_set_folio_swapin_error(inode, index, folio, swap);
  1930. unlock:
  1931. if (folio) {
  1932. folio_unlock(folio);
  1933. folio_put(folio);
  1934. }
  1935. put_swap_device(si);
  1936. return error;
  1937. }
  1938. /*
  1939. * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
  1940. *
  1941. * If we allocate a new one we do not mark it dirty. That's up to the
  1942. * vm. If we swap it in we mark it dirty since we also free the swap
1943. * entry, because a page cannot live in both swap and the page cache.
  1944. *
  1945. * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
  1946. */
  1947. static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
  1948. loff_t write_end, struct folio **foliop, enum sgp_type sgp,
  1949. gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
  1950. {
  1951. struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
  1952. struct mm_struct *fault_mm;
  1953. struct folio *folio;
  1954. int error;
  1955. bool alloced;
  1956. unsigned long orders = 0;
  1957. if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
  1958. return -EINVAL;
  1959. if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
  1960. return -EFBIG;
  1961. repeat:
  1962. if (sgp <= SGP_CACHE &&
  1963. ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
  1964. return -EINVAL;
  1965. alloced = false;
  1966. fault_mm = vma ? vma->vm_mm : NULL;
  1967. folio = filemap_get_entry(inode->i_mapping, index);
  1968. if (folio && vma && userfaultfd_minor(vma)) {
  1969. if (!xa_is_value(folio))
  1970. folio_put(folio);
  1971. *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
  1972. return 0;
  1973. }
  1974. if (xa_is_value(folio)) {
  1975. error = shmem_swapin_folio(inode, index, &folio,
  1976. sgp, gfp, vma, fault_type);
  1977. if (error == -EEXIST)
  1978. goto repeat;
  1979. *foliop = folio;
  1980. return error;
  1981. }
  1982. if (folio) {
  1983. folio_lock(folio);
  1984. /* Has the folio been truncated or swapped out? */
  1985. if (unlikely(folio->mapping != inode->i_mapping)) {
  1986. folio_unlock(folio);
  1987. folio_put(folio);
  1988. goto repeat;
  1989. }
  1990. if (sgp == SGP_WRITE)
  1991. folio_mark_accessed(folio);
  1992. if (folio_test_uptodate(folio))
  1993. goto out;
  1994. /* fallocated folio */
  1995. if (sgp != SGP_READ)
  1996. goto clear;
  1997. folio_unlock(folio);
  1998. folio_put(folio);
  1999. }
  2000. /*
  2001. * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
  2002. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
  2003. */
  2004. *foliop = NULL;
  2005. if (sgp == SGP_READ)
  2006. return 0;
  2007. if (sgp == SGP_NOALLOC)
  2008. return -ENOENT;
  2009. /*
  2010. * Fast cache lookup and swap lookup did not find it: allocate.
  2011. */
  2012. if (vma && userfaultfd_missing(vma)) {
  2013. *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
  2014. return 0;
  2015. }
  2016. /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
  2017. orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
  2018. if (orders > 0) {
  2019. gfp_t huge_gfp;
  2020. huge_gfp = vma_thp_gfp_mask(vma);
  2021. huge_gfp = limit_gfp_mask(huge_gfp, gfp);
  2022. folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
  2023. inode, index, fault_mm, orders);
  2024. if (!IS_ERR(folio)) {
  2025. if (folio_test_pmd_mappable(folio))
  2026. count_vm_event(THP_FILE_ALLOC);
  2027. count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
  2028. goto alloced;
  2029. }
  2030. if (PTR_ERR(folio) == -EEXIST)
  2031. goto repeat;
  2032. }
  2033. folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
  2034. if (IS_ERR(folio)) {
  2035. error = PTR_ERR(folio);
  2036. if (error == -EEXIST)
  2037. goto repeat;
  2038. folio = NULL;
  2039. goto unlock;
  2040. }
  2041. alloced:
  2042. alloced = true;
  2043. if (folio_test_large(folio) &&
  2044. DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
  2045. folio_next_index(folio)) {
  2046. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  2047. struct shmem_inode_info *info = SHMEM_I(inode);
  2048. /*
  2049. * Part of the large folio is beyond i_size: subject
  2050. * to shrink under memory pressure.
  2051. */
  2052. spin_lock(&sbinfo->shrinklist_lock);
  2053. /*
  2054. * _careful to defend against unlocked access to
  2055. * ->shrink_list in shmem_unused_huge_shrink()
  2056. */
  2057. if (list_empty_careful(&info->shrinklist)) {
  2058. list_add_tail(&info->shrinklist,
  2059. &sbinfo->shrinklist);
  2060. sbinfo->shrinklist_len++;
  2061. }
  2062. spin_unlock(&sbinfo->shrinklist_lock);
  2063. }
  2064. if (sgp == SGP_WRITE)
  2065. folio_set_referenced(folio);
  2066. /*
  2067. * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
  2068. */
  2069. if (sgp == SGP_FALLOC)
  2070. sgp = SGP_WRITE;
  2071. clear:
  2072. /*
  2073. * Let SGP_WRITE caller clear ends if write does not fill folio;
  2074. * but SGP_FALLOC on a folio fallocated earlier must initialize
  2075. * it now, lest undo on failure cancel our earlier guarantee.
  2076. */
  2077. if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
  2078. long i, n = folio_nr_pages(folio);
  2079. for (i = 0; i < n; i++)
  2080. clear_highpage(folio_page(folio, i));
  2081. flush_dcache_folio(folio);
  2082. folio_mark_uptodate(folio);
  2083. }
  2084. /* Perhaps the file has been truncated since we checked */
  2085. if (sgp <= SGP_CACHE &&
  2086. ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
  2087. error = -EINVAL;
  2088. goto unlock;
  2089. }
  2090. out:
  2091. *foliop = folio;
  2092. return 0;
  2093. /*
  2094. * Error recovery.
  2095. */
  2096. unlock:
  2097. if (alloced)
  2098. filemap_remove_folio(folio);
  2099. shmem_recalc_inode(inode, 0, 0);
  2100. if (folio) {
  2101. folio_unlock(folio);
  2102. folio_put(folio);
  2103. }
  2104. return error;
  2105. }
  2106. /**
  2107. * shmem_get_folio - find, and lock a shmem folio.
  2108. * @inode: inode to search
  2109. * @index: the page index.
  2110. * @write_end: end of a write, could extend inode size
  2111. * @foliop: pointer to the folio if found
  2112. * @sgp: SGP_* flags to control behavior
  2113. *
  2114. * Looks up the page cache entry at @inode & @index. If a folio is
  2115. * present, it is returned locked with an increased refcount.
  2116. *
  2117. * If the caller modifies data in the folio, it must call folio_mark_dirty()
  2118. * before unlocking the folio to ensure that the folio is not reclaimed.
  2119. * There is no need to reserve space before calling folio_mark_dirty().
  2120. *
  2121. * When no folio is found, the behavior depends on @sgp:
  2122. * - for SGP_READ, *@foliop is %NULL and 0 is returned
  2123. * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
  2124. * - for all other flags a new folio is allocated, inserted into the
  2125. * page cache and returned locked in @foliop.
  2126. *
  2127. * Context: May sleep.
  2128. * Return: 0 if successful, else a negative error code.
  2129. */
  2130. int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
  2131. struct folio **foliop, enum sgp_type sgp)
  2132. {
  2133. return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
  2134. mapping_gfp_mask(inode->i_mapping), NULL, NULL);
  2135. }
  2136. EXPORT_SYMBOL_GPL(shmem_get_folio);
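/*
 * A minimal caller sketch (illustrative only, not part of the original
 * source), using only the behaviour documented above:
 *
 *	struct folio *folio;
 *	int err = shmem_get_folio(inode, index, 0, &folio, SGP_CACHE);
 *
 *	if (err)
 *		return err;
 *	if (folio) {	// NULL is only possible for SGP_READ on a hole
 *		... read or modify the folio contents ...
 *		folio_mark_dirty(folio);	// if contents were modified
 *		folio_unlock(folio);
 *		folio_put(folio);
 *	}
 */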
  2137. /*
  2138. * This is like autoremove_wake_function, but it removes the wait queue
  2139. * entry unconditionally - even if something else had already woken the
  2140. * target.
  2141. */
  2142. static int synchronous_wake_function(wait_queue_entry_t *wait,
  2143. unsigned int mode, int sync, void *key)
  2144. {
  2145. int ret = default_wake_function(wait, mode, sync, key);
  2146. list_del_init(&wait->entry);
  2147. return ret;
  2148. }
  2149. /*
  2150. * Trinity finds that probing a hole which tmpfs is punching can
  2151. * prevent the hole-punch from ever completing: which in turn
  2152. * locks writers out with its hold on i_rwsem. So refrain from
  2153. * faulting pages into the hole while it's being punched. Although
  2154. * shmem_undo_range() does remove the additions, it may be unable to
  2155. * keep up, as each new page needs its own unmap_mapping_range() call,
  2156. * and the i_mmap tree grows ever slower to scan if new vmas are added.
  2157. *
  2158. * It does not matter if we sometimes reach this check just before the
  2159. * hole-punch begins, so that one fault then races with the punch:
  2160. * we just need to make racing faults a rare case.
  2161. *
  2162. * The implementation below would be much simpler if we just used a
  2163. * standard mutex or completion: but we cannot take i_rwsem in fault,
  2164. * and bloating every shmem inode for this unlikely case would be sad.
  2165. */
  2166. static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
  2167. {
  2168. struct shmem_falloc *shmem_falloc;
  2169. struct file *fpin = NULL;
  2170. vm_fault_t ret = 0;
  2171. spin_lock(&inode->i_lock);
  2172. shmem_falloc = inode->i_private;
  2173. if (shmem_falloc &&
  2174. shmem_falloc->waitq &&
  2175. vmf->pgoff >= shmem_falloc->start &&
  2176. vmf->pgoff < shmem_falloc->next) {
  2177. wait_queue_head_t *shmem_falloc_waitq;
  2178. DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
  2179. ret = VM_FAULT_NOPAGE;
  2180. fpin = maybe_unlock_mmap_for_io(vmf, NULL);
  2181. shmem_falloc_waitq = shmem_falloc->waitq;
  2182. prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
  2183. TASK_UNINTERRUPTIBLE);
  2184. spin_unlock(&inode->i_lock);
  2185. schedule();
  2186. /*
  2187. * shmem_falloc_waitq points into the shmem_fallocate()
  2188. * stack of the hole-punching task: shmem_falloc_waitq
  2189. * is usually invalid by the time we reach here, but
  2190. * finish_wait() does not dereference it in that case;
  2191. * though i_lock needed lest racing with wake_up_all().
  2192. */
  2193. spin_lock(&inode->i_lock);
  2194. finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
  2195. }
  2196. spin_unlock(&inode->i_lock);
  2197. if (fpin) {
  2198. fput(fpin);
  2199. ret = VM_FAULT_RETRY;
  2200. }
  2201. return ret;
  2202. }
  2203. static vm_fault_t shmem_fault(struct vm_fault *vmf)
  2204. {
  2205. struct inode *inode = file_inode(vmf->vma->vm_file);
  2206. gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
  2207. struct folio *folio = NULL;
  2208. vm_fault_t ret = 0;
  2209. int err;
  2210. /*
  2211. * Trinity finds that probing a hole which tmpfs is punching can
  2212. * prevent the hole-punch from ever completing: noted in i_private.
  2213. */
  2214. if (unlikely(inode->i_private)) {
  2215. ret = shmem_falloc_wait(vmf, inode);
  2216. if (ret)
  2217. return ret;
  2218. }
  2219. WARN_ON_ONCE(vmf->page != NULL);
  2220. err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
  2221. gfp, vmf, &ret);
  2222. if (err)
  2223. return vmf_error(err);
  2224. if (folio) {
  2225. vmf->page = folio_file_page(folio, vmf->pgoff);
  2226. ret |= VM_FAULT_LOCKED;
  2227. }
  2228. return ret;
  2229. }
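/*
 * Pick an unmapped area for a shmem mapping: use the generic search,
 * but when transparent huge pages may be used, retry with an inflated
 * length so the returned address can be aligned to the huge page size
 * consistent with the file offset.
 */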
  2230. unsigned long shmem_get_unmapped_area(struct file *file,
  2231. unsigned long uaddr, unsigned long len,
  2232. unsigned long pgoff, unsigned long flags)
  2233. {
  2234. unsigned long addr;
  2235. unsigned long offset;
  2236. unsigned long inflated_len;
  2237. unsigned long inflated_addr;
  2238. unsigned long inflated_offset;
  2239. unsigned long hpage_size;
  2240. if (len > TASK_SIZE)
  2241. return -ENOMEM;
  2242. addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
  2243. flags);
  2244. if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
  2245. return addr;
  2246. if (IS_ERR_VALUE(addr))
  2247. return addr;
  2248. if (addr & ~PAGE_MASK)
  2249. return addr;
  2250. if (addr > TASK_SIZE - len)
  2251. return addr;
  2252. if (shmem_huge == SHMEM_HUGE_DENY)
  2253. return addr;
  2254. if (flags & MAP_FIXED)
  2255. return addr;
  2256. /*
  2257. * Our priority is to support MAP_SHARED mapped hugely;
  2258. * and support MAP_PRIVATE mapped hugely too, until it is COWed.
  2259. * But if caller specified an address hint and we allocated area there
  2260. * successfully, respect that as before.
  2261. */
  2262. if (uaddr == addr)
  2263. return addr;
  2264. hpage_size = HPAGE_PMD_SIZE;
  2265. if (shmem_huge != SHMEM_HUGE_FORCE) {
  2266. struct super_block *sb;
  2267. unsigned long __maybe_unused hpage_orders;
  2268. int order = 0;
  2269. if (file) {
  2270. VM_BUG_ON(file->f_op != &shmem_file_operations);
  2271. sb = file_inode(file)->i_sb;
  2272. } else {
  2273. /*
  2274. * Called directly from mm/mmap.c, or drivers/char/mem.c
  2275. * for "/dev/zero", to create a shared anonymous object.
  2276. */
  2277. if (IS_ERR(shm_mnt))
  2278. return addr;
  2279. sb = shm_mnt->mnt_sb;
  2280. /*
  2281. * Find the highest mTHP order used for anonymous shmem to
  2282. * provide a suitable alignment address.
  2283. */
  2284. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  2285. hpage_orders = READ_ONCE(huge_shmem_orders_always);
  2286. hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
  2287. hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
  2288. if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
  2289. hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
  2290. if (hpage_orders > 0) {
  2291. order = highest_order(hpage_orders);
  2292. hpage_size = PAGE_SIZE << order;
  2293. }
  2294. #endif
  2295. }
  2296. if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
  2297. return addr;
  2298. }
  2299. if (len < hpage_size)
  2300. return addr;
  2301. offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
  2302. if (offset && offset + len < 2 * hpage_size)
  2303. return addr;
  2304. if ((addr & (hpage_size - 1)) == offset)
  2305. return addr;
  2306. inflated_len = len + hpage_size - PAGE_SIZE;
  2307. if (inflated_len > TASK_SIZE)
  2308. return addr;
  2309. if (inflated_len < len)
  2310. return addr;
  2311. inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
  2312. inflated_len, 0, flags);
  2313. if (IS_ERR_VALUE(inflated_addr))
  2314. return addr;
  2315. if (inflated_addr & ~PAGE_MASK)
  2316. return addr;
  2317. inflated_offset = inflated_addr & (hpage_size - 1);
  2318. inflated_addr += offset - inflated_offset;
  2319. if (inflated_offset > offset)
  2320. inflated_addr += hpage_size;
  2321. if (inflated_addr > TASK_SIZE - len)
  2322. return addr;
  2323. return inflated_addr;
  2324. }
  2325. #ifdef CONFIG_NUMA
  2326. static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
  2327. {
  2328. struct inode *inode = file_inode(vma->vm_file);
  2329. return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
  2330. }
  2331. static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
  2332. unsigned long addr, pgoff_t *ilx)
  2333. {
  2334. struct inode *inode = file_inode(vma->vm_file);
  2335. pgoff_t index;
  2336. /*
  2337. * Bias interleave by inode number to distribute better across nodes;
  2338. * but this interface is independent of which page order is used, so
  2339. * supplies only that bias, letting caller apply the offset (adjusted
  2340. * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
  2341. */
  2342. *ilx = inode->i_ino;
  2343. index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  2344. return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
  2345. }
  2346. static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
  2347. pgoff_t index, unsigned int order, pgoff_t *ilx)
  2348. {
  2349. struct mempolicy *mpol;
  2350. /* Bias interleave by inode number to distribute better across nodes */
  2351. *ilx = info->vfs_inode.i_ino + (index >> order);
  2352. mpol = mpol_shared_policy_lookup(&info->policy, index);
  2353. return mpol ? mpol : get_task_policy(current);
  2354. }
  2355. #else
  2356. static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
  2357. pgoff_t index, unsigned int order, pgoff_t *ilx)
  2358. {
  2359. *ilx = 0;
  2360. return NULL;
  2361. }
  2362. #endif /* CONFIG_NUMA */
  2363. int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
  2364. {
  2365. struct inode *inode = file_inode(file);
  2366. struct shmem_inode_info *info = SHMEM_I(inode);
  2367. int retval = -ENOMEM;
  2368. /*
  2369. * What serializes the accesses to info->flags?
  2370. * ipc_lock_object() when called from shmctl_do_lock(),
  2371. * no serialization needed when called from shm_destroy().
  2372. */
  2373. if (lock && !(info->flags & VM_LOCKED)) {
  2374. if (!user_shm_lock(inode->i_size, ucounts))
  2375. goto out_nomem;
  2376. info->flags |= VM_LOCKED;
  2377. mapping_set_unevictable(file->f_mapping);
  2378. }
  2379. if (!lock && (info->flags & VM_LOCKED) && ucounts) {
  2380. user_shm_unlock(inode->i_size, ucounts);
  2381. info->flags &= ~VM_LOCKED;
  2382. mapping_clear_unevictable(file->f_mapping);
  2383. }
  2384. retval = 0;
  2385. out_nomem:
  2386. return retval;
  2387. }
  2388. static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
  2389. {
  2390. struct inode *inode = file_inode(file);
  2391. struct shmem_inode_info *info = SHMEM_I(inode);
  2392. int ret;
  2393. ret = seal_check_write(info->seals, vma);
  2394. if (ret)
  2395. return ret;
  2396. file_accessed(file);
  2397. /* This is anonymous shared memory if it is unlinked at the time of mmap */
  2398. if (inode->i_nlink)
  2399. vma->vm_ops = &shmem_vm_ops;
  2400. else
  2401. vma->vm_ops = &shmem_anon_vm_ops;
  2402. return 0;
  2403. }
  2404. static int shmem_file_open(struct inode *inode, struct file *file)
  2405. {
  2406. file->f_mode |= FMODE_CAN_ODIRECT;
  2407. return generic_file_open(inode, file);
  2408. }
  2409. #ifdef CONFIG_TMPFS_XATTR
  2410. static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
  2411. /*
  2412. * chattr's fsflags are unrelated to extended attributes,
  2413. * but tmpfs has chosen to enable them under the same config option.
  2414. */
  2415. static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
  2416. {
  2417. unsigned int i_flags = 0;
  2418. if (fsflags & FS_NOATIME_FL)
  2419. i_flags |= S_NOATIME;
  2420. if (fsflags & FS_APPEND_FL)
  2421. i_flags |= S_APPEND;
  2422. if (fsflags & FS_IMMUTABLE_FL)
  2423. i_flags |= S_IMMUTABLE;
  2424. /*
  2425. * But FS_NODUMP_FL does not require any action in i_flags.
  2426. */
  2427. inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
  2428. }
  2429. #else
  2430. static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
  2431. {
  2432. }
  2433. #define shmem_initxattrs NULL
  2434. #endif
  2435. static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
  2436. {
  2437. return &SHMEM_I(inode)->dir_offsets;
  2438. }
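/*
 * Allocate and initialize a new shmem/tmpfs inode: reserve an inode
 * from the superblock, set up the shmem_inode_info (seals, flags,
 * creation time, mempolicy), and select the inode/file operations by
 * file type.
 */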
  2439. static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
  2440. struct super_block *sb,
  2441. struct inode *dir, umode_t mode,
  2442. dev_t dev, unsigned long flags)
  2443. {
  2444. struct inode *inode;
  2445. struct shmem_inode_info *info;
  2446. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  2447. ino_t ino;
  2448. int err;
  2449. err = shmem_reserve_inode(sb, &ino);
  2450. if (err)
  2451. return ERR_PTR(err);
  2452. inode = new_inode(sb);
  2453. if (!inode) {
  2454. shmem_free_inode(sb, 0);
  2455. return ERR_PTR(-ENOSPC);
  2456. }
  2457. inode->i_ino = ino;
  2458. inode_init_owner(idmap, inode, dir, mode);
  2459. inode->i_blocks = 0;
  2460. simple_inode_init_ts(inode);
  2461. inode->i_generation = get_random_u32();
  2462. info = SHMEM_I(inode);
  2463. memset(info, 0, (char *)inode - (char *)info);
  2464. spin_lock_init(&info->lock);
  2465. atomic_set(&info->stop_eviction, 0);
  2466. info->seals = F_SEAL_SEAL;
  2467. info->flags = flags & VM_NORESERVE;
  2468. info->i_crtime = inode_get_mtime(inode);
  2469. info->fsflags = (dir == NULL) ? 0 :
  2470. SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
  2471. if (info->fsflags)
  2472. shmem_set_inode_flags(inode, info->fsflags);
  2473. INIT_LIST_HEAD(&info->shrinklist);
  2474. INIT_LIST_HEAD(&info->swaplist);
  2475. simple_xattrs_init(&info->xattrs);
  2476. cache_no_acl(inode);
  2477. if (sbinfo->noswap)
  2478. mapping_set_unevictable(inode->i_mapping);
  2479. mapping_set_large_folios(inode->i_mapping);
  2480. switch (mode & S_IFMT) {
  2481. default:
  2482. inode->i_op = &shmem_special_inode_operations;
  2483. init_special_inode(inode, mode, dev);
  2484. break;
  2485. case S_IFREG:
  2486. inode->i_mapping->a_ops = &shmem_aops;
  2487. inode->i_op = &shmem_inode_operations;
  2488. inode->i_fop = &shmem_file_operations;
  2489. mpol_shared_policy_init(&info->policy,
  2490. shmem_get_sbmpol(sbinfo));
  2491. break;
  2492. case S_IFDIR:
  2493. inc_nlink(inode);
  2494. /* Some things misbehave if size == 0 on a directory */
  2495. inode->i_size = 2 * BOGO_DIRENT_SIZE;
  2496. inode->i_op = &shmem_dir_inode_operations;
  2497. inode->i_fop = &simple_offset_dir_operations;
  2498. simple_offset_init(shmem_get_offset_ctx(inode));
  2499. break;
  2500. case S_IFLNK:
  2501. /*
  2502. * Must not load anything in the rbtree,
  2503. * mpol_free_shared_policy will not be called.
  2504. */
  2505. mpol_shared_policy_init(&info->policy, NULL);
  2506. break;
  2507. }
  2508. lockdep_annotate_inode_mutex_key(inode);
  2509. return inode;
  2510. }
  2511. #ifdef CONFIG_TMPFS_QUOTA
  2512. static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
  2513. struct super_block *sb, struct inode *dir,
  2514. umode_t mode, dev_t dev, unsigned long flags)
  2515. {
  2516. int err;
  2517. struct inode *inode;
  2518. inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
  2519. if (IS_ERR(inode))
  2520. return inode;
  2521. err = dquot_initialize(inode);
  2522. if (err)
  2523. goto errout;
  2524. err = dquot_alloc_inode(inode);
  2525. if (err) {
  2526. dquot_drop(inode);
  2527. goto errout;
  2528. }
  2529. return inode;
  2530. errout:
  2531. inode->i_flags |= S_NOQUOTA;
  2532. iput(inode);
  2533. return ERR_PTR(err);
  2534. }
  2535. #else
  2536. static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
  2537. struct super_block *sb, struct inode *dir,
  2538. umode_t mode, dev_t dev, unsigned long flags)
  2539. {
  2540. return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
  2541. }
  2542. #endif /* CONFIG_TMPFS_QUOTA */
  2543. #ifdef CONFIG_USERFAULTFD
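/*
 * Userfaultfd UFFDIO_COPY / UFFDIO_ZEROPAGE backend for shmem-backed VMAs:
 * account one block, allocate a folio (or reuse *foliop from an earlier
 * -ENOENT retry), copy the user data with page faults disabled so the copy
 * can be retried outside mmap_lock if it faults, then charge the folio,
 * add it to the page cache and install the PTE.
 */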
  2544. int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
  2545. struct vm_area_struct *dst_vma,
  2546. unsigned long dst_addr,
  2547. unsigned long src_addr,
  2548. uffd_flags_t flags,
  2549. struct folio **foliop)
  2550. {
  2551. struct inode *inode = file_inode(dst_vma->vm_file);
  2552. struct shmem_inode_info *info = SHMEM_I(inode);
  2553. struct address_space *mapping = inode->i_mapping;
  2554. gfp_t gfp = mapping_gfp_mask(mapping);
  2555. pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
  2556. void *page_kaddr;
  2557. struct folio *folio;
  2558. int ret;
  2559. pgoff_t max_off;
  2560. if (shmem_inode_acct_blocks(inode, 1)) {
  2561. /*
2562. * We may have got a folio, returned -ENOENT triggering a retry,
2563. * and now we find ourselves with -ENOMEM. Release the folio, to
2564. * avoid a BUG_ON in our caller.
  2565. */
  2566. if (unlikely(*foliop)) {
  2567. folio_put(*foliop);
  2568. *foliop = NULL;
  2569. }
  2570. return -ENOMEM;
  2571. }
  2572. if (!*foliop) {
  2573. ret = -ENOMEM;
  2574. folio = shmem_alloc_folio(gfp, 0, info, pgoff);
  2575. if (!folio)
  2576. goto out_unacct_blocks;
  2577. if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
  2578. page_kaddr = kmap_local_folio(folio, 0);
  2579. /*
  2580. * The read mmap_lock is held here. Despite the
  2581. * mmap_lock being read recursive a deadlock is still
  2582. * possible if a writer has taken a lock. For example:
  2583. *
  2584. * process A thread 1 takes read lock on own mmap_lock
  2585. * process A thread 2 calls mmap, blocks taking write lock
  2586. * process B thread 1 takes page fault, read lock on own mmap lock
  2587. * process B thread 2 calls mmap, blocks taking write lock
  2588. * process A thread 1 blocks taking read lock on process B
  2589. * process B thread 1 blocks taking read lock on process A
  2590. *
  2591. * Disable page faults to prevent potential deadlock
  2592. * and retry the copy outside the mmap_lock.
  2593. */
  2594. pagefault_disable();
  2595. ret = copy_from_user(page_kaddr,
  2596. (const void __user *)src_addr,
  2597. PAGE_SIZE);
  2598. pagefault_enable();
  2599. kunmap_local(page_kaddr);
  2600. /* fallback to copy_from_user outside mmap_lock */
  2601. if (unlikely(ret)) {
  2602. *foliop = folio;
  2603. ret = -ENOENT;
  2604. /* don't free the page */
  2605. goto out_unacct_blocks;
  2606. }
  2607. flush_dcache_folio(folio);
  2608. } else { /* ZEROPAGE */
  2609. clear_user_highpage(&folio->page, dst_addr);
  2610. }
  2611. } else {
  2612. folio = *foliop;
  2613. VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
  2614. *foliop = NULL;
  2615. }
  2616. VM_BUG_ON(folio_test_locked(folio));
  2617. VM_BUG_ON(folio_test_swapbacked(folio));
  2618. __folio_set_locked(folio);
  2619. __folio_set_swapbacked(folio);
  2620. __folio_mark_uptodate(folio);
  2621. ret = -EFAULT;
  2622. max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  2623. if (unlikely(pgoff >= max_off))
  2624. goto out_release;
  2625. ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
  2626. if (ret)
  2627. goto out_release;
  2628. ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
  2629. if (ret)
  2630. goto out_release;
  2631. ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
  2632. &folio->page, true, flags);
  2633. if (ret)
  2634. goto out_delete_from_cache;
  2635. shmem_recalc_inode(inode, 1, 0);
  2636. folio_unlock(folio);
  2637. return 0;
  2638. out_delete_from_cache:
  2639. filemap_remove_folio(folio);
  2640. out_release:
  2641. folio_unlock(folio);
  2642. folio_put(folio);
  2643. out_unacct_blocks:
  2644. shmem_inode_unacct_blocks(inode, 1);
  2645. return ret;
  2646. }
  2647. #endif /* CONFIG_USERFAULTFD */
  2648. #ifdef CONFIG_TMPFS
  2649. static const struct inode_operations shmem_symlink_inode_operations;
  2650. static const struct inode_operations shmem_short_symlink_operations;
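/*
 * Buffered write path: ->write_begin finds or allocates the folio covering
 * the write (refusing writes forbidden by memfd seals), and ->write_end
 * below marks the copied range uptodate and dirty.  i_rwsem is held by the
 * caller throughout.
 */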
  2651. static int
  2652. shmem_write_begin(struct file *file, struct address_space *mapping,
  2653. loff_t pos, unsigned len,
  2654. struct folio **foliop, void **fsdata)
  2655. {
  2656. struct inode *inode = mapping->host;
  2657. struct shmem_inode_info *info = SHMEM_I(inode);
  2658. pgoff_t index = pos >> PAGE_SHIFT;
  2659. struct folio *folio;
  2660. int ret = 0;
  2661. /* i_rwsem is held by caller */
  2662. if (unlikely(info->seals & (F_SEAL_GROW |
  2663. F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
  2664. if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
  2665. return -EPERM;
  2666. if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
  2667. return -EPERM;
  2668. }
  2669. ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
  2670. if (ret)
  2671. return ret;
  2672. if (folio_contain_hwpoisoned_page(folio)) {
  2673. folio_unlock(folio);
  2674. folio_put(folio);
  2675. return -EIO;
  2676. }
  2677. *foliop = folio;
  2678. return 0;
  2679. }
  2680. static int
  2681. shmem_write_end(struct file *file, struct address_space *mapping,
  2682. loff_t pos, unsigned len, unsigned copied,
  2683. struct folio *folio, void *fsdata)
  2684. {
  2685. struct inode *inode = mapping->host;
  2686. if (pos + copied > inode->i_size)
  2687. i_size_write(inode, pos + copied);
  2688. if (!folio_test_uptodate(folio)) {
  2689. if (copied < folio_size(folio)) {
  2690. size_t from = offset_in_folio(folio, pos);
  2691. folio_zero_segments(folio, 0, from,
  2692. from + copied, folio_size(folio));
  2693. }
  2694. folio_mark_uptodate(folio);
  2695. }
  2696. folio_mark_dirty(folio);
  2697. folio_unlock(folio);
  2698. folio_put(folio);
  2699. return copied;
  2700. }
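/*
 * Read path.  A hole in a tmpfs file has no folio backing it at all, so
 * when shmem_get_folio(..., SGP_READ) returns no folio the data is known
 * to be zero: the loop copies from the shared zero page for user-backed
 * iterators, or simply zeroes the iterator for pipes and the like, rather
 * than allocating memory just to read zeroes from it.
 */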
  2701. static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  2702. {
  2703. struct file *file = iocb->ki_filp;
  2704. struct inode *inode = file_inode(file);
  2705. struct address_space *mapping = inode->i_mapping;
  2706. pgoff_t index;
  2707. unsigned long offset;
  2708. int error = 0;
  2709. ssize_t retval = 0;
  2710. loff_t *ppos = &iocb->ki_pos;
  2711. index = *ppos >> PAGE_SHIFT;
  2712. offset = *ppos & ~PAGE_MASK;
  2713. for (;;) {
  2714. struct folio *folio = NULL;
  2715. struct page *page = NULL;
  2716. pgoff_t end_index;
  2717. unsigned long nr, ret;
  2718. loff_t i_size = i_size_read(inode);
  2719. end_index = i_size >> PAGE_SHIFT;
  2720. if (index > end_index)
  2721. break;
  2722. if (index == end_index) {
  2723. nr = i_size & ~PAGE_MASK;
  2724. if (nr <= offset)
  2725. break;
  2726. }
  2727. error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
  2728. if (error) {
  2729. if (error == -EINVAL)
  2730. error = 0;
  2731. break;
  2732. }
  2733. if (folio) {
  2734. folio_unlock(folio);
  2735. page = folio_file_page(folio, index);
  2736. if (PageHWPoison(page)) {
  2737. folio_put(folio);
  2738. error = -EIO;
  2739. break;
  2740. }
  2741. }
  2742. /*
2743. * We must re-evaluate i_size after getting the folio, since reads
2744. * (unlike writes) are called without i_rwsem protection against truncate.
  2745. */
  2746. nr = PAGE_SIZE;
  2747. i_size = i_size_read(inode);
  2748. end_index = i_size >> PAGE_SHIFT;
  2749. if (index == end_index) {
  2750. nr = i_size & ~PAGE_MASK;
  2751. if (nr <= offset) {
  2752. if (folio)
  2753. folio_put(folio);
  2754. break;
  2755. }
  2756. }
  2757. nr -= offset;
  2758. if (folio) {
  2759. /*
  2760. * If users can be writing to this page using arbitrary
  2761. * virtual addresses, take care about potential aliasing
  2762. * before reading the page on the kernel side.
  2763. */
  2764. if (mapping_writably_mapped(mapping))
  2765. flush_dcache_page(page);
  2766. /*
  2767. * Mark the page accessed if we read the beginning.
  2768. */
  2769. if (!offset)
  2770. folio_mark_accessed(folio);
  2771. /*
  2772. * Ok, we have the page, and it's up-to-date, so
  2773. * now we can copy it to user space...
  2774. */
  2775. ret = copy_page_to_iter(page, offset, nr, to);
  2776. folio_put(folio);
  2777. } else if (user_backed_iter(to)) {
  2778. /*
2779. * copy_to_user() tends to be so well optimized, and
2780. * clear_user() not so much, that it is noticeably
2781. * faster to copy the zero page instead of clearing.
  2782. */
  2783. ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
  2784. } else {
  2785. /*
2786. * But submitting the same page twice in a row to splice()
2787. * (or other non-user-backed destinations) can confuse the
2788. * consumer, so don't attempt that optimization on pipes etc.
  2789. */
  2790. ret = iov_iter_zero(nr, to);
  2791. }
  2792. retval += ret;
  2793. offset += ret;
  2794. index += offset >> PAGE_SHIFT;
  2795. offset &= ~PAGE_MASK;
  2796. if (!iov_iter_count(to))
  2797. break;
  2798. if (ret < nr) {
  2799. error = -EFAULT;
  2800. break;
  2801. }
  2802. cond_resched();
  2803. }
  2804. *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
  2805. file_accessed(file);
  2806. return retval ? retval : error;
  2807. }
  2808. static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  2809. {
  2810. struct file *file = iocb->ki_filp;
  2811. struct inode *inode = file->f_mapping->host;
  2812. ssize_t ret;
  2813. inode_lock(inode);
  2814. ret = generic_write_checks(iocb, from);
  2815. if (ret <= 0)
  2816. goto unlock;
  2817. ret = file_remove_privs(file);
  2818. if (ret)
  2819. goto unlock;
  2820. ret = file_update_time(file);
  2821. if (ret)
  2822. goto unlock;
  2823. ret = generic_perform_write(iocb, from);
  2824. unlock:
  2825. inode_unlock(inode);
  2826. return ret;
  2827. }
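/*
 * Minimal pipe_buffer operations for splicing the shared zero page into a
 * pipe: the zero page can never be stolen and needs no reference counting
 * of its own, so get/release/try_steal are all trivial.
 */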
  2828. static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
  2829. struct pipe_buffer *buf)
  2830. {
  2831. return true;
  2832. }
  2833. static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
  2834. struct pipe_buffer *buf)
  2835. {
  2836. }
  2837. static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
  2838. struct pipe_buffer *buf)
  2839. {
  2840. return false;
  2841. }
  2842. static const struct pipe_buf_operations zero_pipe_buf_ops = {
  2843. .release = zero_pipe_buf_release,
  2844. .try_steal = zero_pipe_buf_try_steal,
  2845. .get = zero_pipe_buf_get,
  2846. };
  2847. static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
  2848. loff_t fpos, size_t size)
  2849. {
  2850. size_t offset = fpos & ~PAGE_MASK;
  2851. size = min_t(size_t, size, PAGE_SIZE - offset);
  2852. if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
  2853. struct pipe_buffer *buf = pipe_head_buf(pipe);
  2854. *buf = (struct pipe_buffer) {
  2855. .ops = &zero_pipe_buf_ops,
  2856. .page = ZERO_PAGE(0),
  2857. .offset = offset,
  2858. .len = size,
  2859. };
  2860. pipe->head++;
  2861. }
  2862. return size;
  2863. }
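/*
 * splice(2) read: like shmem_file_read_iter(), but feeds whole folios into
 * the pipe, and represents holes with the shared zero page instead of
 * allocating backing memory for them.
 */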
  2864. static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
  2865. struct pipe_inode_info *pipe,
  2866. size_t len, unsigned int flags)
  2867. {
  2868. struct inode *inode = file_inode(in);
  2869. struct address_space *mapping = inode->i_mapping;
  2870. struct folio *folio = NULL;
  2871. size_t total_spliced = 0, used, npages, n, part;
  2872. loff_t isize;
  2873. int error = 0;
  2874. /* Work out how much data we can actually add into the pipe */
  2875. used = pipe_occupancy(pipe->head, pipe->tail);
  2876. npages = max_t(ssize_t, pipe->max_usage - used, 0);
  2877. len = min_t(size_t, len, npages * PAGE_SIZE);
  2878. do {
  2879. if (*ppos >= i_size_read(inode))
  2880. break;
  2881. error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
  2882. SGP_READ);
  2883. if (error) {
  2884. if (error == -EINVAL)
  2885. error = 0;
  2886. break;
  2887. }
  2888. if (folio) {
  2889. folio_unlock(folio);
  2890. if (folio_test_hwpoison(folio) ||
  2891. (folio_test_large(folio) &&
  2892. folio_test_has_hwpoisoned(folio))) {
  2893. error = -EIO;
  2894. break;
  2895. }
  2896. }
  2897. /*
2898. * i_size must be checked after we know the folio is uptodate.
2899. *
2900. * Checking i_size only after that allows us to calculate the
2901. * correct value for "part", so that the zero-filled tail of the
2902. * folio is not copied back to userspace (unless another truncate
2903. * extends the file - this is desired though).
  2904. */
  2905. isize = i_size_read(inode);
  2906. if (unlikely(*ppos >= isize))
  2907. break;
  2908. part = min_t(loff_t, isize - *ppos, len);
  2909. if (folio) {
  2910. /*
  2911. * If users can be writing to this page using arbitrary
  2912. * virtual addresses, take care about potential aliasing
  2913. * before reading the page on the kernel side.
  2914. */
  2915. if (mapping_writably_mapped(mapping))
  2916. flush_dcache_folio(folio);
  2917. folio_mark_accessed(folio);
  2918. /*
  2919. * Ok, we have the page, and it's up-to-date, so we can
  2920. * now splice it into the pipe.
  2921. */
  2922. n = splice_folio_into_pipe(pipe, folio, *ppos, part);
  2923. folio_put(folio);
  2924. folio = NULL;
  2925. } else {
  2926. n = splice_zeropage_into_pipe(pipe, *ppos, part);
  2927. }
  2928. if (!n)
  2929. break;
  2930. len -= n;
  2931. total_spliced += n;
  2932. *ppos += n;
  2933. in->f_ra.prev_pos = *ppos;
  2934. if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
  2935. break;
  2936. cond_resched();
  2937. } while (len);
  2938. if (folio)
  2939. folio_put(folio);
  2940. file_accessed(in);
  2941. return total_spliced ? total_spliced : error;
  2942. }
  2943. static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
  2944. {
  2945. struct address_space *mapping = file->f_mapping;
  2946. struct inode *inode = mapping->host;
  2947. if (whence != SEEK_DATA && whence != SEEK_HOLE)
  2948. return generic_file_llseek_size(file, offset, whence,
  2949. MAX_LFS_FILESIZE, i_size_read(inode));
  2950. if (offset < 0)
  2951. return -ENXIO;
  2952. inode_lock(inode);
  2953. /* We're holding i_rwsem so we can access i_size directly */
  2954. offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
  2955. if (offset >= 0)
  2956. offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
  2957. inode_unlock(inode);
  2958. return offset;
  2959. }
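/*
 * fallocate(2): FALLOC_FL_PUNCH_HOLE unmaps and truncates the range, using
 * inode->i_private to publish the range being punched so concurrent faults
 * on it can wait for the punch to complete; the default (preallocation)
 * mode instantiates folios one by one, and undoes the not-yet-written ones
 * if it fails part way through.
 */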
  2960. static long shmem_fallocate(struct file *file, int mode, loff_t offset,
  2961. loff_t len)
  2962. {
  2963. struct inode *inode = file_inode(file);
  2964. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  2965. struct shmem_inode_info *info = SHMEM_I(inode);
  2966. struct shmem_falloc shmem_falloc;
  2967. pgoff_t start, index, end, undo_fallocend;
  2968. int error;
  2969. if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
  2970. return -EOPNOTSUPP;
  2971. inode_lock(inode);
  2972. if (mode & FALLOC_FL_PUNCH_HOLE) {
  2973. struct address_space *mapping = file->f_mapping;
  2974. loff_t unmap_start = round_up(offset, PAGE_SIZE);
  2975. loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
  2976. DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
  2977. /* protected by i_rwsem */
  2978. if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
  2979. error = -EPERM;
  2980. goto out;
  2981. }
  2982. shmem_falloc.waitq = &shmem_falloc_waitq;
  2983. shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
  2984. shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
  2985. spin_lock(&inode->i_lock);
  2986. inode->i_private = &shmem_falloc;
  2987. spin_unlock(&inode->i_lock);
  2988. if ((u64)unmap_end > (u64)unmap_start)
  2989. unmap_mapping_range(mapping, unmap_start,
  2990. 1 + unmap_end - unmap_start, 0);
  2991. shmem_truncate_range(inode, offset, offset + len - 1);
  2992. /* No need to unmap again: hole-punching leaves COWed pages */
  2993. spin_lock(&inode->i_lock);
  2994. inode->i_private = NULL;
  2995. wake_up_all(&shmem_falloc_waitq);
  2996. WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
  2997. spin_unlock(&inode->i_lock);
  2998. error = 0;
  2999. goto out;
  3000. }
  3001. /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
  3002. error = inode_newsize_ok(inode, offset + len);
  3003. if (error)
  3004. goto out;
  3005. if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
  3006. error = -EPERM;
  3007. goto out;
  3008. }
  3009. start = offset >> PAGE_SHIFT;
  3010. end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
  3011. /* Try to avoid a swapstorm if len is impossible to satisfy */
  3012. if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
  3013. error = -ENOSPC;
  3014. goto out;
  3015. }
  3016. shmem_falloc.waitq = NULL;
  3017. shmem_falloc.start = start;
  3018. shmem_falloc.next = start;
  3019. shmem_falloc.nr_falloced = 0;
  3020. shmem_falloc.nr_unswapped = 0;
  3021. spin_lock(&inode->i_lock);
  3022. inode->i_private = &shmem_falloc;
  3023. spin_unlock(&inode->i_lock);
  3024. /*
  3025. * info->fallocend is only relevant when huge pages might be
  3026. * involved: to prevent split_huge_page() freeing fallocated
  3027. * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
  3028. */
  3029. undo_fallocend = info->fallocend;
  3030. if (info->fallocend < end)
  3031. info->fallocend = end;
  3032. for (index = start; index < end; ) {
  3033. struct folio *folio;
  3034. /*
  3035. * Check for fatal signal so that we abort early in OOM
  3036. * situations. We don't want to abort in case of non-fatal
  3037. * signals as large fallocate can take noticeable time and
  3038. * e.g. periodic timers may result in fallocate constantly
  3039. * restarting.
  3040. */
  3041. if (fatal_signal_pending(current))
  3042. error = -EINTR;
  3043. else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
  3044. error = -ENOMEM;
  3045. else
  3046. error = shmem_get_folio(inode, index, offset + len,
  3047. &folio, SGP_FALLOC);
  3048. if (error) {
  3049. info->fallocend = undo_fallocend;
  3050. /* Remove the !uptodate folios we added */
  3051. if (index > start) {
  3052. shmem_undo_range(inode,
  3053. (loff_t)start << PAGE_SHIFT,
  3054. ((loff_t)index << PAGE_SHIFT) - 1, true);
  3055. }
  3056. goto undone;
  3057. }
  3058. /*
  3059. * Here is a more important optimization than it appears:
  3060. * a second SGP_FALLOC on the same large folio will clear it,
  3061. * making it uptodate and un-undoable if we fail later.
  3062. */
  3063. index = folio_next_index(folio);
  3064. /* Beware 32-bit wraparound */
  3065. if (!index)
  3066. index--;
  3067. /*
  3068. * Inform shmem_writepage() how far we have reached.
  3069. * No need for lock or barrier: we have the page lock.
  3070. */
  3071. if (!folio_test_uptodate(folio))
  3072. shmem_falloc.nr_falloced += index - shmem_falloc.next;
  3073. shmem_falloc.next = index;
  3074. /*
  3075. * If !uptodate, leave it that way so that freeable folios
3076. * can be recognized if we need to roll back on error later.
  3077. * But mark it dirty so that memory pressure will swap rather
  3078. * than free the folios we are allocating (and SGP_CACHE folios
  3079. * might still be clean: we now need to mark those dirty too).
  3080. */
  3081. folio_mark_dirty(folio);
  3082. folio_unlock(folio);
  3083. folio_put(folio);
  3084. cond_resched();
  3085. }
  3086. if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
  3087. i_size_write(inode, offset + len);
  3088. undone:
  3089. spin_lock(&inode->i_lock);
  3090. inode->i_private = NULL;
  3091. spin_unlock(&inode->i_lock);
  3092. out:
  3093. if (!error)
  3094. file_modified(file);
  3095. inode_unlock(inode);
  3096. return error;
  3097. }
  3098. static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
  3099. {
  3100. struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
  3101. buf->f_type = TMPFS_MAGIC;
  3102. buf->f_bsize = PAGE_SIZE;
  3103. buf->f_namelen = NAME_MAX;
  3104. if (sbinfo->max_blocks) {
  3105. buf->f_blocks = sbinfo->max_blocks;
  3106. buf->f_bavail =
  3107. buf->f_bfree = sbinfo->max_blocks -
  3108. percpu_counter_sum(&sbinfo->used_blocks);
  3109. }
  3110. if (sbinfo->max_inodes) {
  3111. buf->f_files = sbinfo->max_inodes;
  3112. buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
  3113. }
  3114. /* else leave those fields 0 like simple_statfs */
  3115. buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
  3116. return 0;
  3117. }
  3118. /*
3119. * File creation. Allocate an inode, and we're done.
  3120. */
  3121. static int
  3122. shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
  3123. struct dentry *dentry, umode_t mode, dev_t dev)
  3124. {
  3125. struct inode *inode;
  3126. int error;
  3127. inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
  3128. if (IS_ERR(inode))
  3129. return PTR_ERR(inode);
  3130. error = simple_acl_create(dir, inode);
  3131. if (error)
  3132. goto out_iput;
  3133. error = security_inode_init_security(inode, dir, &dentry->d_name,
  3134. shmem_initxattrs, NULL);
  3135. if (error && error != -EOPNOTSUPP)
  3136. goto out_iput;
  3137. error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
  3138. if (error)
  3139. goto out_iput;
  3140. dir->i_size += BOGO_DIRENT_SIZE;
  3141. inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
  3142. inode_inc_iversion(dir);
  3143. d_instantiate(dentry, inode);
  3144. dget(dentry); /* Extra count - pin the dentry in core */
  3145. return error;
  3146. out_iput:
  3147. iput(inode);
  3148. return error;
  3149. }
  3150. static int
  3151. shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
  3152. struct file *file, umode_t mode)
  3153. {
  3154. struct inode *inode;
  3155. int error;
  3156. inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
  3157. if (IS_ERR(inode)) {
  3158. error = PTR_ERR(inode);
  3159. goto err_out;
  3160. }
  3161. error = security_inode_init_security(inode, dir, NULL,
  3162. shmem_initxattrs, NULL);
  3163. if (error && error != -EOPNOTSUPP)
  3164. goto out_iput;
  3165. error = simple_acl_create(dir, inode);
  3166. if (error)
  3167. goto out_iput;
  3168. d_tmpfile(file, inode);
  3169. err_out:
  3170. return finish_open_simple(file, error);
  3171. out_iput:
  3172. iput(inode);
  3173. return error;
  3174. }
  3175. static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
  3176. struct dentry *dentry, umode_t mode)
  3177. {
  3178. int error;
  3179. error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
  3180. if (error)
  3181. return error;
  3182. inc_nlink(dir);
  3183. return 0;
  3184. }
  3185. static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
  3186. struct dentry *dentry, umode_t mode, bool excl)
  3187. {
  3188. return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
  3189. }
  3190. /*
3191. * Link a file.
  3192. */
  3193. static int shmem_link(struct dentry *old_dentry, struct inode *dir,
  3194. struct dentry *dentry)
  3195. {
  3196. struct inode *inode = d_inode(old_dentry);
  3197. int ret = 0;
  3198. /*
  3199. * No ordinary (disk based) filesystem counts links as inodes;
  3200. * but each new link needs a new dentry, pinning lowmem, and
  3201. * tmpfs dentries cannot be pruned until they are unlinked.
  3202. * But if an O_TMPFILE file is linked into the tmpfs, the
  3203. * first link must skip that, to get the accounting right.
  3204. */
  3205. if (inode->i_nlink) {
  3206. ret = shmem_reserve_inode(inode->i_sb, NULL);
  3207. if (ret)
  3208. goto out;
  3209. }
  3210. ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
  3211. if (ret) {
  3212. if (inode->i_nlink)
  3213. shmem_free_inode(inode->i_sb, 0);
  3214. goto out;
  3215. }
  3216. dir->i_size += BOGO_DIRENT_SIZE;
  3217. inode_set_mtime_to_ts(dir,
  3218. inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
  3219. inode_inc_iversion(dir);
  3220. inc_nlink(inode);
  3221. ihold(inode); /* New dentry reference */
  3222. dget(dentry); /* Extra pinning count for the created dentry */
  3223. d_instantiate(dentry, inode);
  3224. out:
  3225. return ret;
  3226. }
  3227. static int shmem_unlink(struct inode *dir, struct dentry *dentry)
  3228. {
  3229. struct inode *inode = d_inode(dentry);
  3230. if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
  3231. shmem_free_inode(inode->i_sb, 0);
  3232. simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  3233. dir->i_size -= BOGO_DIRENT_SIZE;
  3234. inode_set_mtime_to_ts(dir,
  3235. inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
  3236. inode_inc_iversion(dir);
  3237. drop_nlink(inode);
  3238. dput(dentry); /* Undo the count from "create" - does all the work */
  3239. return 0;
  3240. }
  3241. static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
  3242. {
  3243. if (!simple_empty(dentry))
  3244. return -ENOTEMPTY;
  3245. drop_nlink(d_inode(dentry));
  3246. drop_nlink(dir);
  3247. return shmem_unlink(dir, dentry);
  3248. }
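/*
 * Create a whiteout (a 0:0 character device) in place of old_dentry, as
 * required for RENAME_WHITEOUT; typically used by overlay filesystems
 * when tmpfs serves as the upper layer.
 */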
  3249. static int shmem_whiteout(struct mnt_idmap *idmap,
  3250. struct inode *old_dir, struct dentry *old_dentry)
  3251. {
  3252. struct dentry *whiteout;
  3253. int error;
  3254. whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
  3255. if (!whiteout)
  3256. return -ENOMEM;
  3257. error = shmem_mknod(idmap, old_dir, whiteout,
  3258. S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
  3259. dput(whiteout);
  3260. if (error)
  3261. return error;
  3262. /*
  3263. * Cheat and hash the whiteout while the old dentry is still in
  3264. * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
  3265. *
  3266. * d_lookup() will consistently find one of them at this point,
  3267. * not sure which one, but that isn't even important.
  3268. */
  3269. d_rehash(whiteout);
  3270. return 0;
  3271. }
  3272. /*
3273. * The VFS layer already does all the dentry stuff for rename;
3274. * we just have to decrement the usage count for the target if
3275. * it exists, so that the VFS layer correctly frees it when it
3276. * gets overwritten.
  3277. */
  3278. static int shmem_rename2(struct mnt_idmap *idmap,
  3279. struct inode *old_dir, struct dentry *old_dentry,
  3280. struct inode *new_dir, struct dentry *new_dentry,
  3281. unsigned int flags)
  3282. {
  3283. struct inode *inode = d_inode(old_dentry);
  3284. int they_are_dirs = S_ISDIR(inode->i_mode);
  3285. int error;
  3286. if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
  3287. return -EINVAL;
  3288. if (flags & RENAME_EXCHANGE)
  3289. return simple_offset_rename_exchange(old_dir, old_dentry,
  3290. new_dir, new_dentry);
  3291. if (!simple_empty(new_dentry))
  3292. return -ENOTEMPTY;
  3293. if (flags & RENAME_WHITEOUT) {
  3294. error = shmem_whiteout(idmap, old_dir, old_dentry);
  3295. if (error)
  3296. return error;
  3297. }
  3298. error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
  3299. if (error)
  3300. return error;
  3301. if (d_really_is_positive(new_dentry)) {
  3302. (void) shmem_unlink(new_dir, new_dentry);
  3303. if (they_are_dirs) {
  3304. drop_nlink(d_inode(new_dentry));
  3305. drop_nlink(old_dir);
  3306. }
  3307. } else if (they_are_dirs) {
  3308. drop_nlink(old_dir);
  3309. inc_nlink(new_dir);
  3310. }
  3311. old_dir->i_size -= BOGO_DIRENT_SIZE;
  3312. new_dir->i_size += BOGO_DIRENT_SIZE;
  3313. simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
  3314. inode_inc_iversion(old_dir);
  3315. inode_inc_iversion(new_dir);
  3316. return 0;
  3317. }
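/*
 * Symlink creation: targets shorter than SHORT_SYMLINK_LEN are kmemdup'ed
 * straight into ->i_link and served by simple_get_link(); longer targets
 * are stored in the first page of the inode's page cache, like regular
 * file data, and read back via shmem_get_link().
 */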
  3318. static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
  3319. struct dentry *dentry, const char *symname)
  3320. {
  3321. int error;
  3322. int len;
  3323. struct inode *inode;
  3324. struct folio *folio;
  3325. len = strlen(symname) + 1;
  3326. if (len > PAGE_SIZE)
  3327. return -ENAMETOOLONG;
  3328. inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
  3329. VM_NORESERVE);
  3330. if (IS_ERR(inode))
  3331. return PTR_ERR(inode);
  3332. error = security_inode_init_security(inode, dir, &dentry->d_name,
  3333. shmem_initxattrs, NULL);
  3334. if (error && error != -EOPNOTSUPP)
  3335. goto out_iput;
  3336. error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
  3337. if (error)
  3338. goto out_iput;
  3339. inode->i_size = len-1;
  3340. if (len <= SHORT_SYMLINK_LEN) {
  3341. inode->i_link = kmemdup(symname, len, GFP_KERNEL);
  3342. if (!inode->i_link) {
  3343. error = -ENOMEM;
  3344. goto out_remove_offset;
  3345. }
  3346. inode->i_op = &shmem_short_symlink_operations;
  3347. } else {
  3348. inode_nohighmem(inode);
  3349. inode->i_mapping->a_ops = &shmem_aops;
  3350. error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
  3351. if (error)
  3352. goto out_remove_offset;
  3353. inode->i_op = &shmem_symlink_inode_operations;
  3354. memcpy(folio_address(folio), symname, len);
  3355. folio_mark_uptodate(folio);
  3356. folio_mark_dirty(folio);
  3357. folio_unlock(folio);
  3358. folio_put(folio);
  3359. }
  3360. dir->i_size += BOGO_DIRENT_SIZE;
  3361. inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
  3362. inode_inc_iversion(dir);
  3363. d_instantiate(dentry, inode);
  3364. dget(dentry);
  3365. return 0;
  3366. out_remove_offset:
  3367. simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  3368. out_iput:
  3369. iput(inode);
  3370. return error;
  3371. }
  3372. static void shmem_put_link(void *arg)
  3373. {
  3374. folio_mark_accessed(arg);
  3375. folio_put(arg);
  3376. }
  3377. static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
  3378. struct delayed_call *done)
  3379. {
  3380. struct folio *folio = NULL;
  3381. int error;
  3382. if (!dentry) {
  3383. folio = filemap_get_folio(inode->i_mapping, 0);
  3384. if (IS_ERR(folio))
  3385. return ERR_PTR(-ECHILD);
  3386. if (PageHWPoison(folio_page(folio, 0)) ||
  3387. !folio_test_uptodate(folio)) {
  3388. folio_put(folio);
  3389. return ERR_PTR(-ECHILD);
  3390. }
  3391. } else {
  3392. error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
  3393. if (error)
  3394. return ERR_PTR(error);
  3395. if (!folio)
  3396. return ERR_PTR(-ECHILD);
  3397. if (PageHWPoison(folio_page(folio, 0))) {
  3398. folio_unlock(folio);
  3399. folio_put(folio);
  3400. return ERR_PTR(-ECHILD);
  3401. }
  3402. folio_unlock(folio);
  3403. }
  3404. set_delayed_call(done, shmem_put_link, folio);
  3405. return folio_address(folio);
  3406. }
  3407. #ifdef CONFIG_TMPFS_XATTR
  3408. static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
  3409. {
  3410. struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
  3411. fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
  3412. return 0;
  3413. }
  3414. static int shmem_fileattr_set(struct mnt_idmap *idmap,
  3415. struct dentry *dentry, struct fileattr *fa)
  3416. {
  3417. struct inode *inode = d_inode(dentry);
  3418. struct shmem_inode_info *info = SHMEM_I(inode);
  3419. if (fileattr_has_fsx(fa))
  3420. return -EOPNOTSUPP;
  3421. if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
  3422. return -EOPNOTSUPP;
  3423. info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
  3424. (fa->flags & SHMEM_FL_USER_MODIFIABLE);
  3425. shmem_set_inode_flags(inode, info->fsflags);
  3426. inode_set_ctime_current(inode);
  3427. inode_inc_iversion(inode);
  3428. return 0;
  3429. }
  3430. /*
  3431. * Superblocks without xattr inode operations may get some security.* xattr
  3432. * support from the LSM "for free". As soon as we have any other xattrs
  3433. * like ACLs, we also need to implement the security.* handlers at
  3434. * filesystem level, though.
  3435. */
  3436. /*
  3437. * Callback for security_inode_init_security() for acquiring xattrs.
  3438. */
  3439. static int shmem_initxattrs(struct inode *inode,
  3440. const struct xattr *xattr_array, void *fs_info)
  3441. {
  3442. struct shmem_inode_info *info = SHMEM_I(inode);
  3443. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  3444. const struct xattr *xattr;
  3445. struct simple_xattr *new_xattr;
  3446. size_t ispace = 0;
  3447. size_t len;
  3448. if (sbinfo->max_inodes) {
  3449. for (xattr = xattr_array; xattr->name != NULL; xattr++) {
  3450. ispace += simple_xattr_space(xattr->name,
  3451. xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
  3452. }
  3453. if (ispace) {
  3454. raw_spin_lock(&sbinfo->stat_lock);
  3455. if (sbinfo->free_ispace < ispace)
  3456. ispace = 0;
  3457. else
  3458. sbinfo->free_ispace -= ispace;
  3459. raw_spin_unlock(&sbinfo->stat_lock);
  3460. if (!ispace)
  3461. return -ENOSPC;
  3462. }
  3463. }
  3464. for (xattr = xattr_array; xattr->name != NULL; xattr++) {
  3465. new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
  3466. if (!new_xattr)
  3467. break;
  3468. len = strlen(xattr->name) + 1;
  3469. new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
  3470. GFP_KERNEL_ACCOUNT);
  3471. if (!new_xattr->name) {
  3472. kvfree(new_xattr);
  3473. break;
  3474. }
  3475. memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
  3476. XATTR_SECURITY_PREFIX_LEN);
  3477. memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
  3478. xattr->name, len);
  3479. simple_xattr_add(&info->xattrs, new_xattr);
  3480. }
  3481. if (xattr->name != NULL) {
  3482. if (ispace) {
  3483. raw_spin_lock(&sbinfo->stat_lock);
  3484. sbinfo->free_ispace += ispace;
  3485. raw_spin_unlock(&sbinfo->stat_lock);
  3486. }
  3487. simple_xattrs_free(&info->xattrs, NULL);
  3488. return -ENOMEM;
  3489. }
  3490. return 0;
  3491. }
  3492. static int shmem_xattr_handler_get(const struct xattr_handler *handler,
  3493. struct dentry *unused, struct inode *inode,
  3494. const char *name, void *buffer, size_t size)
  3495. {
  3496. struct shmem_inode_info *info = SHMEM_I(inode);
  3497. name = xattr_full_name(handler, name);
  3498. return simple_xattr_get(&info->xattrs, name, buffer, size);
  3499. }
  3500. static int shmem_xattr_handler_set(const struct xattr_handler *handler,
  3501. struct mnt_idmap *idmap,
  3502. struct dentry *unused, struct inode *inode,
  3503. const char *name, const void *value,
  3504. size_t size, int flags)
  3505. {
  3506. struct shmem_inode_info *info = SHMEM_I(inode);
  3507. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  3508. struct simple_xattr *old_xattr;
  3509. size_t ispace = 0;
  3510. name = xattr_full_name(handler, name);
  3511. if (value && sbinfo->max_inodes) {
  3512. ispace = simple_xattr_space(name, size);
  3513. raw_spin_lock(&sbinfo->stat_lock);
  3514. if (sbinfo->free_ispace < ispace)
  3515. ispace = 0;
  3516. else
  3517. sbinfo->free_ispace -= ispace;
  3518. raw_spin_unlock(&sbinfo->stat_lock);
  3519. if (!ispace)
  3520. return -ENOSPC;
  3521. }
  3522. old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
  3523. if (!IS_ERR(old_xattr)) {
  3524. ispace = 0;
  3525. if (old_xattr && sbinfo->max_inodes)
  3526. ispace = simple_xattr_space(old_xattr->name,
  3527. old_xattr->size);
  3528. simple_xattr_free(old_xattr);
  3529. old_xattr = NULL;
  3530. inode_set_ctime_current(inode);
  3531. inode_inc_iversion(inode);
  3532. }
  3533. if (ispace) {
  3534. raw_spin_lock(&sbinfo->stat_lock);
  3535. sbinfo->free_ispace += ispace;
  3536. raw_spin_unlock(&sbinfo->stat_lock);
  3537. }
  3538. return PTR_ERR(old_xattr);
  3539. }
  3540. static const struct xattr_handler shmem_security_xattr_handler = {
  3541. .prefix = XATTR_SECURITY_PREFIX,
  3542. .get = shmem_xattr_handler_get,
  3543. .set = shmem_xattr_handler_set,
  3544. };
  3545. static const struct xattr_handler shmem_trusted_xattr_handler = {
  3546. .prefix = XATTR_TRUSTED_PREFIX,
  3547. .get = shmem_xattr_handler_get,
  3548. .set = shmem_xattr_handler_set,
  3549. };
  3550. static const struct xattr_handler shmem_user_xattr_handler = {
  3551. .prefix = XATTR_USER_PREFIX,
  3552. .get = shmem_xattr_handler_get,
  3553. .set = shmem_xattr_handler_set,
  3554. };
  3555. static const struct xattr_handler * const shmem_xattr_handlers[] = {
  3556. &shmem_security_xattr_handler,
  3557. &shmem_trusted_xattr_handler,
  3558. &shmem_user_xattr_handler,
  3559. NULL
  3560. };
  3561. static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
  3562. {
  3563. struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
  3564. return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
  3565. }
  3566. #endif /* CONFIG_TMPFS_XATTR */
  3567. static const struct inode_operations shmem_short_symlink_operations = {
  3568. .getattr = shmem_getattr,
  3569. .setattr = shmem_setattr,
  3570. .get_link = simple_get_link,
  3571. #ifdef CONFIG_TMPFS_XATTR
  3572. .listxattr = shmem_listxattr,
  3573. #endif
  3574. };
  3575. static const struct inode_operations shmem_symlink_inode_operations = {
  3576. .getattr = shmem_getattr,
  3577. .setattr = shmem_setattr,
  3578. .get_link = shmem_get_link,
  3579. #ifdef CONFIG_TMPFS_XATTR
  3580. .listxattr = shmem_listxattr,
  3581. #endif
  3582. };
  3583. static struct dentry *shmem_get_parent(struct dentry *child)
  3584. {
  3585. return ERR_PTR(-ESTALE);
  3586. }
  3587. static int shmem_match(struct inode *ino, void *vfh)
  3588. {
  3589. __u32 *fh = vfh;
  3590. __u64 inum = fh[2];
  3591. inum = (inum << 32) | fh[1];
  3592. return ino->i_ino == inum && fh[0] == ino->i_generation;
  3593. }
  3594. /* Find any alias of inode, but prefer a hashed alias */
  3595. static struct dentry *shmem_find_alias(struct inode *inode)
  3596. {
  3597. struct dentry *alias = d_find_alias(inode);
  3598. return alias ?: d_find_any_alias(inode);
  3599. }
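/*
 * NFS export: the file handle built by shmem_encode_fh() below carries the
 * inode generation and the 64-bit inode number split across two words.
 * Look the inode up under the same hash it was inserted with (ino +
 * generation) and let shmem_match() verify both fields.
 */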
  3600. static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
  3601. struct fid *fid, int fh_len, int fh_type)
  3602. {
  3603. struct inode *inode;
  3604. struct dentry *dentry = NULL;
  3605. u64 inum;
  3606. if (fh_len < 3)
  3607. return NULL;
  3608. inum = fid->raw[2];
  3609. inum = (inum << 32) | fid->raw[1];
  3610. inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
  3611. shmem_match, fid->raw);
  3612. if (inode) {
  3613. dentry = shmem_find_alias(inode);
  3614. iput(inode);
  3615. }
  3616. return dentry;
  3617. }
  3618. static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
  3619. struct inode *parent)
  3620. {
  3621. if (*len < 3) {
  3622. *len = 3;
  3623. return FILEID_INVALID;
  3624. }
  3625. if (inode_unhashed(inode)) {
3626. /* Unfortunately, insert_inode_hash() is not idempotent,
3627. * so as we hash inodes here rather than at creation
3628. * time, we need a lock to ensure we only try
3629. * to do it once.
  3630. */
  3631. static DEFINE_SPINLOCK(lock);
  3632. spin_lock(&lock);
  3633. if (inode_unhashed(inode))
  3634. __insert_inode_hash(inode,
  3635. inode->i_ino + inode->i_generation);
  3636. spin_unlock(&lock);
  3637. }
  3638. fh[0] = inode->i_generation;
  3639. fh[1] = inode->i_ino;
  3640. fh[2] = ((__u64)inode->i_ino) >> 32;
  3641. *len = 3;
  3642. return 1;
  3643. }
  3644. static const struct export_operations shmem_export_ops = {
  3645. .get_parent = shmem_get_parent,
  3646. .encode_fh = shmem_encode_fh,
  3647. .fh_to_dentry = shmem_fh_to_dentry,
  3648. };
  3649. enum shmem_param {
  3650. Opt_gid,
  3651. Opt_huge,
  3652. Opt_mode,
  3653. Opt_mpol,
  3654. Opt_nr_blocks,
  3655. Opt_nr_inodes,
  3656. Opt_size,
  3657. Opt_uid,
  3658. Opt_inode32,
  3659. Opt_inode64,
  3660. Opt_noswap,
  3661. Opt_quota,
  3662. Opt_usrquota,
  3663. Opt_grpquota,
  3664. Opt_usrquota_block_hardlimit,
  3665. Opt_usrquota_inode_hardlimit,
  3666. Opt_grpquota_block_hardlimit,
  3667. Opt_grpquota_inode_hardlimit,
  3668. };
  3669. static const struct constant_table shmem_param_enums_huge[] = {
  3670. {"never", SHMEM_HUGE_NEVER },
  3671. {"always", SHMEM_HUGE_ALWAYS },
  3672. {"within_size", SHMEM_HUGE_WITHIN_SIZE },
  3673. {"advise", SHMEM_HUGE_ADVISE },
  3674. {}
  3675. };
  3676. const struct fs_parameter_spec shmem_fs_parameters[] = {
  3677. fsparam_gid ("gid", Opt_gid),
  3678. fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
  3679. fsparam_u32oct("mode", Opt_mode),
  3680. fsparam_string("mpol", Opt_mpol),
  3681. fsparam_string("nr_blocks", Opt_nr_blocks),
  3682. fsparam_string("nr_inodes", Opt_nr_inodes),
  3683. fsparam_string("size", Opt_size),
  3684. fsparam_uid ("uid", Opt_uid),
  3685. fsparam_flag ("inode32", Opt_inode32),
  3686. fsparam_flag ("inode64", Opt_inode64),
  3687. fsparam_flag ("noswap", Opt_noswap),
  3688. #ifdef CONFIG_TMPFS_QUOTA
  3689. fsparam_flag ("quota", Opt_quota),
  3690. fsparam_flag ("usrquota", Opt_usrquota),
  3691. fsparam_flag ("grpquota", Opt_grpquota),
  3692. fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
  3693. fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
  3694. fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
  3695. fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
  3696. #endif
  3697. {}
  3698. };
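/*
 * Parse a single mount option.  "size" accepts the usual memparse()
 * suffixes plus a trailing '%' meaning a percentage of totalram_pages();
 * "nr_blocks" and "nr_inodes" take absolute limits; the quota hard limits
 * are only recognised when CONFIG_TMPFS_QUOTA is enabled.
 */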
  3699. static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
  3700. {
  3701. struct shmem_options *ctx = fc->fs_private;
  3702. struct fs_parse_result result;
  3703. unsigned long long size;
  3704. char *rest;
  3705. int opt;
  3706. kuid_t kuid;
  3707. kgid_t kgid;
  3708. opt = fs_parse(fc, shmem_fs_parameters, param, &result);
  3709. if (opt < 0)
  3710. return opt;
  3711. switch (opt) {
  3712. case Opt_size:
  3713. size = memparse(param->string, &rest);
  3714. if (*rest == '%') {
  3715. size <<= PAGE_SHIFT;
  3716. size *= totalram_pages();
  3717. do_div(size, 100);
  3718. rest++;
  3719. }
  3720. if (*rest)
  3721. goto bad_value;
  3722. ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
  3723. ctx->seen |= SHMEM_SEEN_BLOCKS;
  3724. break;
  3725. case Opt_nr_blocks:
  3726. ctx->blocks = memparse(param->string, &rest);
  3727. if (*rest || ctx->blocks > LONG_MAX)
  3728. goto bad_value;
  3729. ctx->seen |= SHMEM_SEEN_BLOCKS;
  3730. break;
  3731. case Opt_nr_inodes:
  3732. ctx->inodes = memparse(param->string, &rest);
  3733. if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
  3734. goto bad_value;
  3735. ctx->seen |= SHMEM_SEEN_INODES;
  3736. break;
  3737. case Opt_mode:
  3738. ctx->mode = result.uint_32 & 07777;
  3739. break;
  3740. case Opt_uid:
  3741. kuid = result.uid;
  3742. /*
  3743. * The requested uid must be representable in the
  3744. * filesystem's idmapping.
  3745. */
  3746. if (!kuid_has_mapping(fc->user_ns, kuid))
  3747. goto bad_value;
  3748. ctx->uid = kuid;
  3749. break;
  3750. case Opt_gid:
  3751. kgid = result.gid;
  3752. /*
  3753. * The requested gid must be representable in the
  3754. * filesystem's idmapping.
  3755. */
  3756. if (!kgid_has_mapping(fc->user_ns, kgid))
  3757. goto bad_value;
  3758. ctx->gid = kgid;
  3759. break;
  3760. case Opt_huge:
  3761. ctx->huge = result.uint_32;
  3762. if (ctx->huge != SHMEM_HUGE_NEVER &&
  3763. !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
  3764. has_transparent_hugepage()))
  3765. goto unsupported_parameter;
  3766. ctx->seen |= SHMEM_SEEN_HUGE;
  3767. break;
  3768. case Opt_mpol:
  3769. if (IS_ENABLED(CONFIG_NUMA)) {
  3770. mpol_put(ctx->mpol);
  3771. ctx->mpol = NULL;
  3772. if (mpol_parse_str(param->string, &ctx->mpol))
  3773. goto bad_value;
  3774. break;
  3775. }
  3776. goto unsupported_parameter;
  3777. case Opt_inode32:
  3778. ctx->full_inums = false;
  3779. ctx->seen |= SHMEM_SEEN_INUMS;
  3780. break;
  3781. case Opt_inode64:
  3782. if (sizeof(ino_t) < 8) {
  3783. return invalfc(fc,
  3784. "Cannot use inode64 with <64bit inums in kernel\n");
  3785. }
  3786. ctx->full_inums = true;
  3787. ctx->seen |= SHMEM_SEEN_INUMS;
  3788. break;
  3789. case Opt_noswap:
  3790. if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
  3791. return invalfc(fc,
  3792. "Turning off swap in unprivileged tmpfs mounts unsupported");
  3793. }
  3794. ctx->noswap = true;
  3795. ctx->seen |= SHMEM_SEEN_NOSWAP;
  3796. break;
  3797. case Opt_quota:
  3798. if (fc->user_ns != &init_user_ns)
  3799. return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
  3800. ctx->seen |= SHMEM_SEEN_QUOTA;
  3801. ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
  3802. break;
  3803. case Opt_usrquota:
  3804. if (fc->user_ns != &init_user_ns)
  3805. return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
  3806. ctx->seen |= SHMEM_SEEN_QUOTA;
  3807. ctx->quota_types |= QTYPE_MASK_USR;
  3808. break;
  3809. case Opt_grpquota:
  3810. if (fc->user_ns != &init_user_ns)
  3811. return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
  3812. ctx->seen |= SHMEM_SEEN_QUOTA;
  3813. ctx->quota_types |= QTYPE_MASK_GRP;
  3814. break;
  3815. case Opt_usrquota_block_hardlimit:
  3816. size = memparse(param->string, &rest);
  3817. if (*rest || !size)
  3818. goto bad_value;
  3819. if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
  3820. return invalfc(fc,
  3821. "User quota block hardlimit too large.");
  3822. ctx->qlimits.usrquota_bhardlimit = size;
  3823. break;
  3824. case Opt_grpquota_block_hardlimit:
  3825. size = memparse(param->string, &rest);
  3826. if (*rest || !size)
  3827. goto bad_value;
  3828. if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
  3829. return invalfc(fc,
  3830. "Group quota block hardlimit too large.");
  3831. ctx->qlimits.grpquota_bhardlimit = size;
  3832. break;
  3833. case Opt_usrquota_inode_hardlimit:
  3834. size = memparse(param->string, &rest);
  3835. if (*rest || !size)
  3836. goto bad_value;
  3837. if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
  3838. return invalfc(fc,
  3839. "User quota inode hardlimit too large.");
  3840. ctx->qlimits.usrquota_ihardlimit = size;
  3841. break;
  3842. case Opt_grpquota_inode_hardlimit:
  3843. size = memparse(param->string, &rest);
  3844. if (*rest || !size)
  3845. goto bad_value;
  3846. if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
  3847. return invalfc(fc,
  3848. "Group quota inode hardlimit too large.");
  3849. ctx->qlimits.grpquota_ihardlimit = size;
  3850. break;
  3851. }
  3852. return 0;
  3853. unsupported_parameter:
  3854. return invalfc(fc, "Unsupported parameter '%s'", param->key);
  3855. bad_value:
  3856. return invalfc(fc, "Bad value for '%s'", param->key);
  3857. }
  3858. static int shmem_parse_options(struct fs_context *fc, void *data)
  3859. {
  3860. char *options = data;
  3861. if (options) {
  3862. int err = security_sb_eat_lsm_opts(options, &fc->security);
  3863. if (err)
  3864. return err;
  3865. }
  3866. while (options != NULL) {
  3867. char *this_char = options;
  3868. for (;;) {
  3869. /*
  3870. * NUL-terminate this option: unfortunately,
  3871. * mount options form a comma-separated list,
  3872. * but mpol's nodelist may also contain commas.
  3873. */
  3874. options = strchr(options, ',');
  3875. if (options == NULL)
  3876. break;
  3877. options++;
  3878. if (!isdigit(*options)) {
  3879. options[-1] = '\0';
  3880. break;
  3881. }
  3882. }
  3883. if (*this_char) {
  3884. char *value = strchr(this_char, '=');
  3885. size_t len = 0;
  3886. int err;
  3887. if (value) {
  3888. *value++ = '\0';
  3889. len = strlen(value);
  3890. }
  3891. err = vfs_parse_fs_string(fc, this_char, value, len);
  3892. if (err < 0)
  3893. return err;
  3894. }
  3895. }
  3896. return 0;
  3897. }
  3898. /*
  3899. * Reconfigure a shmem filesystem.
  3900. */
  3901. static int shmem_reconfigure(struct fs_context *fc)
  3902. {
  3903. struct shmem_options *ctx = fc->fs_private;
  3904. struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
  3905. unsigned long used_isp;
  3906. struct mempolicy *mpol = NULL;
  3907. const char *err;
  3908. raw_spin_lock(&sbinfo->stat_lock);
  3909. used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
  3910. if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
  3911. if (!sbinfo->max_blocks) {
  3912. err = "Cannot retroactively limit size";
  3913. goto out;
  3914. }
  3915. if (percpu_counter_compare(&sbinfo->used_blocks,
  3916. ctx->blocks) > 0) {
  3917. err = "Too small a size for current use";
  3918. goto out;
  3919. }
  3920. }
  3921. if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
  3922. if (!sbinfo->max_inodes) {
  3923. err = "Cannot retroactively limit inodes";
  3924. goto out;
  3925. }
  3926. if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
  3927. err = "Too few inodes for current use";
  3928. goto out;
  3929. }
  3930. }
  3931. if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
  3932. sbinfo->next_ino > UINT_MAX) {
  3933. err = "Current inum too high to switch to 32-bit inums";
  3934. goto out;
  3935. }
  3936. if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
  3937. err = "Cannot disable swap on remount";
  3938. goto out;
  3939. }
  3940. if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
  3941. err = "Cannot enable swap on remount if it was disabled on first mount";
  3942. goto out;
  3943. }
  3944. if (ctx->seen & SHMEM_SEEN_QUOTA &&
  3945. !sb_any_quota_loaded(fc->root->d_sb)) {
  3946. err = "Cannot enable quota on remount";
  3947. goto out;
  3948. }
  3949. #ifdef CONFIG_TMPFS_QUOTA
  3950. #define CHANGED_LIMIT(name) \
  3951. (ctx->qlimits.name## hardlimit && \
  3952. (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
  3953. if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
  3954. CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
  3955. err = "Cannot change global quota limit on remount";
  3956. goto out;
  3957. }
  3958. #endif /* CONFIG_TMPFS_QUOTA */
  3959. if (ctx->seen & SHMEM_SEEN_HUGE)
  3960. sbinfo->huge = ctx->huge;
  3961. if (ctx->seen & SHMEM_SEEN_INUMS)
  3962. sbinfo->full_inums = ctx->full_inums;
  3963. if (ctx->seen & SHMEM_SEEN_BLOCKS)
  3964. sbinfo->max_blocks = ctx->blocks;
  3965. if (ctx->seen & SHMEM_SEEN_INODES) {
  3966. sbinfo->max_inodes = ctx->inodes;
  3967. sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
  3968. }
  3969. /*
  3970. * Preserve previous mempolicy unless mpol remount option was specified.
  3971. */
  3972. if (ctx->mpol) {
  3973. mpol = sbinfo->mpol;
  3974. sbinfo->mpol = ctx->mpol; /* transfers initial ref */
  3975. ctx->mpol = NULL;
  3976. }
  3977. if (ctx->noswap)
  3978. sbinfo->noswap = true;
  3979. raw_spin_unlock(&sbinfo->stat_lock);
  3980. mpol_put(mpol);
  3981. return 0;
  3982. out:
  3983. raw_spin_unlock(&sbinfo->stat_lock);
  3984. return invalfc(fc, "%s", err);
  3985. }
  3986. static int shmem_show_options(struct seq_file *seq, struct dentry *root)
  3987. {
  3988. struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
  3989. struct mempolicy *mpol;
  3990. if (sbinfo->max_blocks != shmem_default_max_blocks())
  3991. seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
  3992. if (sbinfo->max_inodes != shmem_default_max_inodes())
  3993. seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
  3994. if (sbinfo->mode != (0777 | S_ISVTX))
  3995. seq_printf(seq, ",mode=%03ho", sbinfo->mode);
  3996. if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
  3997. seq_printf(seq, ",uid=%u",
  3998. from_kuid_munged(&init_user_ns, sbinfo->uid));
  3999. if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
  4000. seq_printf(seq, ",gid=%u",
  4001. from_kgid_munged(&init_user_ns, sbinfo->gid));
  4002. /*
  4003. * Showing inode{64,32} might be useful even if it's the system default,
  4004. * since then people don't have to resort to checking both here and
  4005. * /proc/config.gz to confirm 64-bit inums were successfully applied
  4006. * (which may not even exist if IKCONFIG_PROC isn't enabled).
  4007. *
  4008. * We hide it when inode64 isn't the default and we are using 32-bit
  4009. * inodes, since that probably just means the feature isn't even under
  4010. * consideration.
  4011. *
  4012. * As such:
  4013. *
4014. *                      +-----------------+-----------------+
4015. *                      | TMPFS_INODE64=y | TMPFS_INODE64=n |
4016. *   +------------------+-----------------+-----------------+
4017. *   | full_inums=true  | show            | show            |
4018. *   | full_inums=false | show            | hide            |
4019. *   +------------------+-----------------+-----------------+
  4020. *
  4021. */
  4022. if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
  4023. seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
  4024. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  4025. /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
  4026. if (sbinfo->huge)
  4027. seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
  4028. #endif
  4029. mpol = shmem_get_sbmpol(sbinfo);
  4030. shmem_show_mpol(seq, mpol);
  4031. mpol_put(mpol);
  4032. if (sbinfo->noswap)
  4033. seq_printf(seq, ",noswap");
  4034. #ifdef CONFIG_TMPFS_QUOTA
  4035. if (sb_has_quota_active(root->d_sb, USRQUOTA))
  4036. seq_printf(seq, ",usrquota");
  4037. if (sb_has_quota_active(root->d_sb, GRPQUOTA))
  4038. seq_printf(seq, ",grpquota");
  4039. if (sbinfo->qlimits.usrquota_bhardlimit)
  4040. seq_printf(seq, ",usrquota_block_hardlimit=%lld",
  4041. sbinfo->qlimits.usrquota_bhardlimit);
  4042. if (sbinfo->qlimits.grpquota_bhardlimit)
  4043. seq_printf(seq, ",grpquota_block_hardlimit=%lld",
  4044. sbinfo->qlimits.grpquota_bhardlimit);
  4045. if (sbinfo->qlimits.usrquota_ihardlimit)
  4046. seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
  4047. sbinfo->qlimits.usrquota_ihardlimit);
  4048. if (sbinfo->qlimits.grpquota_ihardlimit)
  4049. seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
  4050. sbinfo->qlimits.grpquota_ihardlimit);
  4051. #endif
  4052. return 0;
  4053. }
  4054. #endif /* CONFIG_TMPFS */
  4055. static void shmem_put_super(struct super_block *sb)
  4056. {
  4057. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  4058. #ifdef CONFIG_TMPFS_QUOTA
  4059. shmem_disable_quotas(sb);
  4060. #endif
  4061. free_percpu(sbinfo->ino_batch);
  4062. percpu_counter_destroy(&sbinfo->used_blocks);
  4063. mpol_put(sbinfo->mpol);
  4064. kfree(sbinfo);
  4065. sb->s_fs_info = NULL;
  4066. }
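/*
 * Fill in a freshly allocated tmpfs superblock: apply the parsed
 * shmem_options, enable quotas if they were requested, and create the
 * root directory inode owned by the configured uid/gid.
 */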
  4067. static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
  4068. {
  4069. struct shmem_options *ctx = fc->fs_private;
  4070. struct inode *inode;
  4071. struct shmem_sb_info *sbinfo;
  4072. int error = -ENOMEM;
  4073. /* Round up to L1_CACHE_BYTES to resist false sharing */
  4074. sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
  4075. L1_CACHE_BYTES), GFP_KERNEL);
  4076. if (!sbinfo)
  4077. return error;
  4078. sb->s_fs_info = sbinfo;
  4079. #ifdef CONFIG_TMPFS
  4080. /*
4081. * By default we only allow half of the physical RAM per
  4082. * tmpfs instance, limiting inodes to one per page of lowmem;
  4083. * but the internal instance is left unlimited.
  4084. */
  4085. if (!(sb->s_flags & SB_KERNMOUNT)) {
  4086. if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
  4087. ctx->blocks = shmem_default_max_blocks();
  4088. if (!(ctx->seen & SHMEM_SEEN_INODES))
  4089. ctx->inodes = shmem_default_max_inodes();
  4090. if (!(ctx->seen & SHMEM_SEEN_INUMS))
  4091. ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
  4092. sbinfo->noswap = ctx->noswap;
  4093. } else {
  4094. sb->s_flags |= SB_NOUSER;
  4095. }
  4096. sb->s_export_op = &shmem_export_ops;
  4097. sb->s_flags |= SB_NOSEC | SB_I_VERSION;
  4098. #else
  4099. sb->s_flags |= SB_NOUSER;
  4100. #endif
  4101. sbinfo->max_blocks = ctx->blocks;
  4102. sbinfo->max_inodes = ctx->inodes;
  4103. sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
  4104. if (sb->s_flags & SB_KERNMOUNT) {
  4105. sbinfo->ino_batch = alloc_percpu(ino_t);
  4106. if (!sbinfo->ino_batch)
  4107. goto failed;
  4108. }
  4109. sbinfo->uid = ctx->uid;
  4110. sbinfo->gid = ctx->gid;
  4111. sbinfo->full_inums = ctx->full_inums;
  4112. sbinfo->mode = ctx->mode;
  4113. sbinfo->huge = ctx->huge;
  4114. sbinfo->mpol = ctx->mpol;
  4115. ctx->mpol = NULL;
  4116. raw_spin_lock_init(&sbinfo->stat_lock);
  4117. if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
  4118. goto failed;
  4119. spin_lock_init(&sbinfo->shrinklist_lock);
  4120. INIT_LIST_HEAD(&sbinfo->shrinklist);
  4121. sb->s_maxbytes = MAX_LFS_FILESIZE;
  4122. sb->s_blocksize = PAGE_SIZE;
  4123. sb->s_blocksize_bits = PAGE_SHIFT;
  4124. sb->s_magic = TMPFS_MAGIC;
  4125. sb->s_op = &shmem_ops;
  4126. sb->s_time_gran = 1;
  4127. #ifdef CONFIG_TMPFS_XATTR
  4128. sb->s_xattr = shmem_xattr_handlers;
  4129. #endif
  4130. #ifdef CONFIG_TMPFS_POSIX_ACL
  4131. sb->s_flags |= SB_POSIXACL;
  4132. #endif
  4133. uuid_t uuid;
  4134. uuid_gen(&uuid);
  4135. super_set_uuid(sb, uuid.b, sizeof(uuid));
  4136. #ifdef CONFIG_TMPFS_QUOTA
  4137. if (ctx->seen & SHMEM_SEEN_QUOTA) {
  4138. sb->dq_op = &shmem_quota_operations;
  4139. sb->s_qcop = &dquot_quotactl_sysfile_ops;
  4140. sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
  4141. /* Copy the default limits from ctx into sbinfo */
  4142. memcpy(&sbinfo->qlimits, &ctx->qlimits,
  4143. sizeof(struct shmem_quota_limits));
  4144. if (shmem_enable_quotas(sb, ctx->quota_types))
  4145. goto failed;
  4146. }
  4147. #endif /* CONFIG_TMPFS_QUOTA */
  4148. inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
  4149. S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
  4150. if (IS_ERR(inode)) {
  4151. error = PTR_ERR(inode);
  4152. goto failed;
  4153. }
  4154. inode->i_uid = sbinfo->uid;
  4155. inode->i_gid = sbinfo->gid;
  4156. sb->s_root = d_make_root(inode);
  4157. if (!sb->s_root)
  4158. goto failed;
  4159. return 0;
  4160. failed:
  4161. shmem_put_super(sb);
  4162. return error;
  4163. }
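/*
 * Illustrative example (not part of shmem.c): a userspace mount of a
 * size-limited tmpfs instance.  Option parsing sets SHMEM_SEEN_BLOCKS and
 * SHMEM_SEEN_INODES before shmem_fill_super() runs, so the defaults above
 * are not applied.  The mount point "/mnt/scratch" is made up for the
 * example.
 */
#if 0	/* example only (userspace), never compiled */
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* "size", "nr_inodes" and "mode" are ordinary tmpfs mount options */
	if (mount("tmpfs", "/mnt/scratch", "tmpfs", 0,
		  "size=256M,nr_inodes=8192,mode=1777") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}
#endif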
static int shmem_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, shmem_fill_super);
}

static void shmem_free_fc(struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;

	if (ctx) {
		mpol_put(ctx->mpol);
		kfree(ctx);
	}
}

static const struct fs_context_operations shmem_fs_context_ops = {
	.free			= shmem_free_fc,
	.get_tree		= shmem_get_tree,
#ifdef CONFIG_TMPFS
	.parse_monolithic	= shmem_parse_options,
	.parse_param		= shmem_parse_one,
	.reconfigure		= shmem_reconfigure,
#endif
};
static struct kmem_cache *shmem_inode_cachep __ro_after_init;

static struct inode *shmem_alloc_inode(struct super_block *sb)
{
	struct shmem_inode_info *info;

	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
	if (!info)
		return NULL;
	return &info->vfs_inode;
}

static void shmem_free_in_core_inode(struct inode *inode)
{
	if (S_ISLNK(inode->i_mode))
		kfree(inode->i_link);
	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}

static void shmem_destroy_inode(struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
	if (S_ISDIR(inode->i_mode))
		simple_offset_destroy(shmem_get_offset_ctx(inode));
}

static void shmem_init_inode(void *foo)
{
	struct shmem_inode_info *info = foo;

	inode_init_once(&info->vfs_inode);
}

static void __init shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}

static void __init shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}
/* Keep the folio in page cache instead of truncating it */
static int shmem_error_remove_folio(struct address_space *mapping,
				    struct folio *folio)
{
	return 0;
}
static const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
	.error_remove_folio = shmem_error_remove_folio,
};
static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.open		= shmem_file_open,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= shmem_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= shmem_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};

static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
};

static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.getattr	= shmem_getattr,
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
	.get_offset_ctx	= shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};

static const struct inode_operations shmem_special_inode_operations = {
	.getattr	= shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};

static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.free_inode	= shmem_free_in_core_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.show_options	= shmem_show_options,
#endif
#ifdef CONFIG_TMPFS_QUOTA
	.get_dquots	= shmem_get_dquots,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};

static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy	= shmem_set_policy,
	.get_policy	= shmem_get_policy,
#endif
};

static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy	= shmem_set_policy,
	.get_policy	= shmem_get_policy,
#endif
};
int shmem_init_fs_context(struct fs_context *fc)
{
	struct shmem_options *ctx;

	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->mode = 0777 | S_ISVTX;
	ctx->uid = current_fsuid();
	ctx->gid = current_fsgid();

	fc->fs_private = ctx;
	fc->ops = &shmem_fs_context_ops;
	return 0;
}

static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters	= shmem_fs_parameters,
#endif
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
};
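/*
 * Example (illustrative sketch, not part of shmem.c): how a kernel-internal
 * user could create its own tmpfs instance through the fs_context API, which
 * exercises shmem_init_fs_context(), shmem_parse_one() and shmem_fill_super().
 * The "size=64M" option, the helper name and the omitted error handling for
 * option parsing are made up for the example.
 */
#if 0	/* example only, never compiled */
static struct vfsmount *example_private_tmpfs(void)
{
	struct fs_context *fc;
	struct vfsmount *mnt;

	fc = fs_context_for_mount(&shmem_fs_type, SB_KERNMOUNT);
	if (IS_ERR(fc))
		return ERR_CAST(fc);

	/* Option parsing goes through shmem_parse_one() */
	vfs_parse_fs_string(fc, "size", "64M", strlen("64M"));

	mnt = fc_mount(fc);	/* reaches shmem_fill_super() via get_tree_nodev() */
	put_fs_context(fc);
	return mnt;
}
#endif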
void __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
	register_quota_format(&shmem_quota_format);
#endif

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other multi-size THPs.
	 */
	huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
	return;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
	unregister_quota_format(&shmem_quota_format);
#endif
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	static const int values[] = {
		SHMEM_HUGE_ALWAYS,
		SHMEM_HUGE_WITHIN_SIZE,
		SHMEM_HUGE_ADVISE,
		SHMEM_HUGE_NEVER,
		SHMEM_HUGE_DENY,
		SHMEM_HUGE_FORCE,
	};
	int len = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(values); i++) {
		len += sysfs_emit_at(buf, len,
				shmem_huge == values[i] ? "%s[%s]" : "%s%s",
				i ? " " : "", shmem_format_huge(values[i]));
	}
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}
static ssize_t shmem_enabled_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	char tmp[16];
	int huge;

	if (count + 1 > sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, count);
	tmp[count] = '\0';
	if (count && tmp[count - 1] == '\n')
		tmp[count - 1] = '\0';

	huge = shmem_parse_huge(tmp);
	if (huge == -EINVAL)
		return -EINVAL;
	if (!has_transparent_hugepage() &&
			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
		return -EINVAL;

	/* Do not override huge allocation policy with non-PMD sized mTHP */
	if (huge == SHMEM_HUGE_FORCE &&
	    huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
		return -EINVAL;

	shmem_huge = huge;
	if (shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	return count;
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
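/*
 * Illustrative example (not part of shmem.c): reading the global knob lists
 * every mode with the active one bracketed, e.g.
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *	always within_size advise [never] deny force
 *
 * Writing one of those words selects that mode, subject to the checks in
 * shmem_enabled_store() above.
 */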
static DEFINE_SPINLOCK(huge_shmem_orders_lock);

static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_shmem_orders_always))
		output = "[always] inherit within_size advise never";
	else if (test_bit(order, &huge_shmem_orders_inherit))
		output = "always [inherit] within_size advise never";
	else if (test_bit(order, &huge_shmem_orders_within_size))
		output = "always inherit [within_size] advise never";
	else if (test_bit(order, &huge_shmem_orders_madvise))
		output = "always inherit within_size [advise] never";
	else
		output = "always inherit within_size advise [never]";

	return sysfs_emit(buf, "%s\n", output);
}
static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_always);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		/* Do not override huge allocation policy with non-PMD sized mTHP */
		if (shmem_huge == SHMEM_HUGE_FORCE &&
		    order != HPAGE_PMD_ORDER)
			return -EINVAL;

		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_inherit);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "within_size")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		set_bit(order, &huge_shmem_orders_within_size);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "advise")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		clear_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else {
		ret = -EINVAL;
	}

	return ret;
}

struct kobj_attribute thpsize_shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
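/*
 * Illustrative example (not part of shmem.c): each supported mTHP size gets
 * its own per-size knob, e.g. on x86-64 with 4K pages something like
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled
 *	always inherit within_size advise [never]
 *
 * where "inherit" makes that size follow the global shmem_enabled setting
 * shown above.
 */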
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= ramfs_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}
int shmem_unuse(unsigned int type)
{
	return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)

static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
				struct super_block *sb, struct inode *dir,
				umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
	return inode ? inode : ERR_PTR(-ENOSPC);
}

#endif /* CONFIG_SHMEM */
/* common code */

static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
			loff_t size, unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, flags);
	if (IS_ERR(inode)) {
		shmem_unacct_size(flags, size);
		return ERR_CAST(inode);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *	kernel internal. There will be NO LSM permission checks against the
 *	underlying inode. So users of this interface must do LSM checks at a
 *	higher layer. The users are the big_key and shm implementations. LSM
 *	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
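/*
 * Example (illustrative sketch, not part of shmem.c): a kernel user creating
 * an unlinked, kernel-internal tmpfs file and writing into it, in the style
 * of the big_key and shm callers mentioned in the kernel-doc above.  The
 * helper name, buffer and error handling are made up for the example.
 */
#if 0	/* example only, never compiled */
static int example_store_blob(const void *buf, size_t len)
{
	struct file *file;
	loff_t pos = 0;
	ssize_t written;

	file = shmem_kernel_file_setup("example-blob", len, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);

	written = kernel_write(file, buf, len, &pos);
	fput(file);		/* drops the last reference; the file is unlinked */
	return written < 0 ? written : 0;
}
#endif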
/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_anon_vm_ops;

	return 0;
}
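/*
 * Illustrative example (not part of shmem.c): a shared anonymous mapping made
 * from userspace; the mmap() path calls shmem_zero_setup() to back the VMA
 * with an unlinked tmpfs file, so forked children share the same pages.
 */
#if 0	/* example only (userspace), never compiled */
#include <sys/mman.h>
#include <stddef.h>

static void *example_shared_region(size_t len)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	return p == MAP_FAILED ? NULL : p;
}
#endif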
/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the folio's address_space
 * @index:	the folio index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
				   pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
				    gfp, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
					 pgoff_t index, gfp_t gfp)
{
	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
	struct page *page;

	if (IS_ERR(folio))
		return &folio->page;

	page = folio_file_page(folio, index);
	if (PageHWPoison(page)) {
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
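/*
 * Example (illustrative sketch, not part of shmem.c): how a driver might pull
 * pages out of a shmem-backed object, in the spirit of the i915/ttm callers
 * mentioned in the kernel-doc above.  "obj_mapping", "i" and the helper name
 * are placeholders for the example.
 */
#if 0	/* example only, never compiled */
static struct page *example_get_backing_page(struct address_space *obj_mapping,
					     pgoff_t i)
{
	gfp_t gfp = mapping_gfp_mask(obj_mapping) |
		    __GFP_NORETRY | __GFP_NOWARN;

	/* Returns an ERR_PTR() on failure, or a referenced page on success */
	return shmem_read_mapping_page_gfp(obj_mapping, i, gfp);
}
#endif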