shmem.c

/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000 Transmeta Corp.
 *		 2000-2001 Christoph Rohland
 *		 2000-2001 SAP AG
 *		 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include "swap.h"

static struct vfsmount *shm_mnt __ro_after_init;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
#include <linux/rcupdate_wait.h>

#include <linux/uaccess.h>

#include "internal.h"

#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
#define VM_ACCT(size)   (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
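
/*
 * Illustrative arithmetic (assuming an nr_inodes=1000 mount): the superblock
 * starts with roughly 1000 * BOGO_INODE_SIZE bytes of inode space; creating
 * an inode subtracts BOGO_INODE_SIZE from free_ispace (see
 * shmem_reserve_inode() below), and freeing it gives back BOGO_INODE_SIZE
 * plus whatever short-symlink/xattr space that inode held.
 */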

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
	pgoff_t start;		/* start of range currently being fallocated */
	pgoff_t next;		/* the next page offset to be fallocated */
	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
};

struct shmem_options {
	unsigned long long blocks;
	unsigned long long inodes;
	struct mempolicy *mpol;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool full_inums;
	int huge;
	int seen;
	bool noswap;
	unsigned short quota_types;
	struct shmem_quota_limits qlimits;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
};
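
/*
 * For example, a mount with "size=1G,huge=always" would be expected to fill
 * in ->blocks and ->huge and set SHMEM_SEEN_BLOCKS | SHMEM_SEEN_HUGE in
 * ->seen, so that remount can tell which options were given explicitly.
 */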

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
#endif

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
	return totalram_pages() / 2;
}

static unsigned long shmem_default_max_inodes(void)
{
	unsigned long nr_pages = totalram_pages();

	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
			ULONG_MAX / BOGO_INODE_SIZE);
}
#endif
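
/*
 * Worked example: on a 64-bit machine with 8 GiB of RAM and 4 KiB pages
 * (totalram_pages() ~= 2097152, no highmem), the defaults come out to
 * max_blocks ~= 1048576 pages (about 4 GiB of tmpfs data) and
 * max_inodes = min3(2097152, 1048576, ULONG_MAX / 1024) ~= 1048576.
 */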

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
		struct vm_area_struct *vma, vm_fault_t *fault_type);

static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
	return (flags & VM_NORESERVE) ?
		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
	if (!(flags & VM_NORESERVE))
		vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
		loff_t oldsize, loff_t newsize)
{
	if (!(flags & VM_NORESERVE)) {
		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
			return security_vm_enough_memory_mm(current->mm,
					VM_ACCT(newsize) - VM_ACCT(oldsize));
		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
	}
	return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
	if (!(flags & VM_NORESERVE))
		return 0;

	return security_vm_enough_memory_mm(current->mm,
			pages * VM_ACCT(PAGE_SIZE));
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
	if (flags & VM_NORESERVE)
		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
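
/*
 * Worked example (assuming 4 KiB pages): VM_ACCT(10000) =
 * PAGE_ALIGN(10000) >> PAGE_SHIFT = 12288 >> 12 = 3 pages. So a 10000-byte
 * shared object without VM_NORESERVE is charged 3 pages up front by
 * shmem_acct_size(), whereas a tmpfs object (VM_NORESERVE) is charged one
 * page at a time by shmem_acct_blocks() as pages are actually allocated.
 */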

static int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	int err = -ENOSPC;

	if (shmem_acct_blocks(info->flags, pages))
		return err;

	might_sleep();	/* when quotas */
	if (sbinfo->max_blocks) {
		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
						sbinfo->max_blocks, pages))
			goto unacct;

		err = dquot_alloc_block_nodirty(inode, pages);
		if (err) {
			percpu_counter_sub(&sbinfo->used_blocks, pages);
			goto unacct;
		}
	} else {
		err = dquot_alloc_block_nodirty(inode, pages);
		if (err)
			goto unacct;
	}

	return 0;

unacct:
	shmem_unacct_blocks(info->flags, pages);
	return err;
}

static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);

	might_sleep();	/* when quotas */
	dquot_free_block_nodirty(inode, pages);

	if (sbinfo->max_blocks)
		percpu_counter_sub(&sbinfo->used_blocks, pages);
	shmem_unacct_blocks(info->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

bool shmem_mapping(struct address_space *mapping)
{
	return mapping->a_ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);

bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
	return vma->vm_ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

#ifdef CONFIG_TMPFS_QUOTA

static int shmem_enable_quotas(struct super_block *sb,
			       unsigned short quota_types)
{
	int type, err = 0;

	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
		if (!(quota_types & (1 << type)))
			continue;
		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
					  DQUOT_USAGE_ENABLED |
					  DQUOT_LIMITS_ENABLED);
		if (err)
			goto out_err;
	}
	return 0;

out_err:
	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
		type, err);
	for (type--; type >= 0; type--)
		dquot_quota_off(sb, type);
	return err;
}

static void shmem_disable_quotas(struct super_block *sb)
{
	int type;

	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
		dquot_quota_off(sb, type);
}

static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
{
	return SHMEM_I(inode)->i_dquot;
}
#endif /* CONFIG_TMPFS_QUOTA */

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	ino_t ino;

	if (!(sb->s_flags & SB_KERNMOUNT)) {
		raw_spin_lock(&sbinfo->stat_lock);
		if (sbinfo->max_inodes) {
			if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
				raw_spin_unlock(&sbinfo->stat_lock);
				return -ENOSPC;
			}
			sbinfo->free_ispace -= BOGO_INODE_SIZE;
		}
		if (inop) {
			ino = sbinfo->next_ino++;
			if (unlikely(is_zero_ino(ino)))
				ino = sbinfo->next_ino++;
			if (unlikely(!sbinfo->full_inums &&
				     ino > UINT_MAX)) {
				/*
				 * Emulate get_next_ino uint wraparound for
				 * compatibility
				 */
				if (IS_ENABLED(CONFIG_64BIT))
					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
						__func__, MINOR(sb->s_dev));
				sbinfo->next_ino = 1;
				ino = sbinfo->next_ino++;
			}
			*inop = ino;
		}
		raw_spin_unlock(&sbinfo->stat_lock);
	} else if (inop) {
		/*
		 * __shmem_file_setup, one of our callers, is lock-free: it
		 * doesn't hold stat_lock in shmem_reserve_inode since
		 * max_inodes is always 0, and is called from potentially
		 * unknown contexts. As such, use a per-cpu batched allocator
		 * which doesn't require the per-sb stat_lock unless we are at
		 * the batch boundary.
		 *
		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
		 * shmem mounts are not exposed to userspace, so we don't need
		 * to worry about things like glibc compatibility.
		 */
		ino_t *next_ino;

		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
		ino = *next_ino;
		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
			raw_spin_lock(&sbinfo->stat_lock);
			ino = sbinfo->next_ino;
			sbinfo->next_ino += SHMEM_INO_BATCH;
			raw_spin_unlock(&sbinfo->stat_lock);
			if (unlikely(is_zero_ino(ino)))
				ino++;
		}
		*inop = ino;
		*next_ino = ++ino;
		put_cpu();
	}

	return 0;
}
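
/*
 * For example, on the SB_KERNMOUNT path a CPU that reaches a batch boundary
 * takes stat_lock once, claims the next SHMEM_INO_BATCH inode numbers from
 * sbinfo->next_ino (say 2048..3071), and then hands them out locklessly, so
 * the lock is taken roughly once per 1024 allocations per CPU.
 */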

static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (sbinfo->max_inodes) {
		raw_spin_lock(&sbinfo->stat_lock);
		sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
		raw_spin_unlock(&sbinfo->stat_lock);
	}
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 */
static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	spin_lock(&info->lock);
	info->alloced += alloced;
	info->swapped += swapped;
	freed = info->alloced - info->swapped -
		READ_ONCE(inode->i_mapping->nrpages);
	/*
	 * Special case: whereas normally shmem_recalc_inode() is called
	 * after i_mapping->nrpages has already been adjusted (up or down),
	 * shmem_writepage() has to raise swapped before nrpages is lowered -
	 * to stop a racing shmem_recalc_inode() from thinking that a page has
	 * been freed. Compensate here, to avoid the need for a followup call.
	 */
	if (swapped > 0)
		freed += swapped;
	if (freed > 0)
		info->alloced -= freed;
	spin_unlock(&info->lock);

	/* The quota case may block */
	if (freed > 0)
		shmem_inode_unacct_blocks(inode, freed);
}
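
/*
 * Worked example: suppose info->alloced is 10, info->swapped is 2, and the
 * mm has dropped one clean hole page so i_mapping->nrpages is 7. A call
 * with (0, 0) computes freed = 10 - 2 - 7 = 1, trims info->alloced to 9,
 * and returns that one page via shmem_inode_unacct_blocks().
 */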

bool shmem_charge(struct inode *inode, long pages)
{
	struct address_space *mapping = inode->i_mapping;

	if (shmem_inode_acct_blocks(inode, pages))
		return false;

	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
	xa_lock_irq(&mapping->i_pages);
	mapping->nrpages += pages;
	xa_unlock_irq(&mapping->i_pages);

	shmem_recalc_inode(inode, pages, 0);
	return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
	/* pages argument is currently unused: keep it to help debugging */
	/* nrpages adjustment done by __filemap_remove_folio() or caller */

	shmem_recalc_inode(inode, 0, 0);
}

/*
 * Replace item expected in xarray by a new item, while holding xa_lock.
 */
static int shmem_replace_entry(struct address_space *mapping,
			pgoff_t index, void *expected, void *replacement)
{
	XA_STATE(xas, &mapping->i_pages, index);
	void *item;

	VM_BUG_ON(!expected);
	VM_BUG_ON(!replacement);
	item = xas_load(&xas);
	if (item != expected)
		return -ENOENT;
	xas_store(&xas, replacement);
	return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking folio is not enough: by the time a swapcache folio is locked, it
 * might be reused, and again be swapcache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
			       pgoff_t index, swp_entry_t swap)
{
	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *	disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *	enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *	only allocate huge pages if the page will be fully within i_size,
 *	also respect fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *	only allocate huge pages if requested with fadvise()/madvise();
 */
#define SHMEM_HUGE_NEVER	0
#define SHMEM_HUGE_ALWAYS	1
#define SHMEM_HUGE_WITHIN_SIZE	2
#define SHMEM_HUGE_ADVISE	3

/*
 * Special values.
 * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *	disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
 *
 */
#define SHMEM_HUGE_DENY		(-1)
#define SHMEM_HUGE_FORCE	(-2)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
					loff_t write_end, bool shmem_huge_force,
					struct vm_area_struct *vma,
					unsigned long vm_flags)
{
	struct mm_struct *mm = vma ? vma->vm_mm : NULL;
	loff_t i_size;

	if (!S_ISREG(inode->i_mode))
		return false;
	if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
		return false;
	if (shmem_huge == SHMEM_HUGE_DENY)
		return false;
	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
		return true;

	switch (SHMEM_SB(inode->i_sb)->huge) {
	case SHMEM_HUGE_ALWAYS:
		return true;
	case SHMEM_HUGE_WITHIN_SIZE:
		index = round_up(index + 1, HPAGE_PMD_NR);
		i_size = max(write_end, i_size_read(inode));
		i_size = round_up(i_size, PAGE_SIZE);
		if (i_size >> PAGE_SHIFT >= index)
			return true;
		fallthrough;
	case SHMEM_HUGE_ADVISE:
		if (mm && (vm_flags & VM_HUGEPAGE))
			return true;
		fallthrough;
	default:
		return false;
	}
}
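
/*
 * Worked example for SHMEM_HUGE_WITHIN_SIZE (assuming 4 KiB pages and
 * HPAGE_PMD_NR == 512): a fault at index 100 rounds up to index 512, the
 * end of the first PMD-sized extent. With i_size (or the write end) at
 * 3 MiB, 3 MiB >> PAGE_SHIFT = 768 >= 512, so a huge page may be used;
 * at 1 MiB (256 pages) it may not, and only the madvise hint checked in
 * the fallthrough case can still allow it.
 */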

static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
		loff_t write_end, bool shmem_huge_force,
		struct vm_area_struct *vma, unsigned long vm_flags)
{
	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
		return false;

	return __shmem_huge_global_enabled(inode, index, write_end,
					   shmem_huge_force, vma, vm_flags);
}

#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
	if (!strcmp(str, "never"))
		return SHMEM_HUGE_NEVER;
	if (!strcmp(str, "always"))
		return SHMEM_HUGE_ALWAYS;
	if (!strcmp(str, "within_size"))
		return SHMEM_HUGE_WITHIN_SIZE;
	if (!strcmp(str, "advise"))
		return SHMEM_HUGE_ADVISE;
	if (!strcmp(str, "deny"))
		return SHMEM_HUGE_DENY;
	if (!strcmp(str, "force"))
		return SHMEM_HUGE_FORCE;
	return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
	switch (huge) {
	case SHMEM_HUGE_NEVER:
		return "never";
	case SHMEM_HUGE_ALWAYS:
		return "always";
	case SHMEM_HUGE_WITHIN_SIZE:
		return "within_size";
	case SHMEM_HUGE_ADVISE:
		return "advise";
	case SHMEM_HUGE_DENY:
		return "deny";
	case SHMEM_HUGE_FORCE:
		return "force";
	default:
		VM_BUG_ON(1);
		return "bad_val";
	}
}
#endif

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_free)
{
	LIST_HEAD(list), *pos, *next;
	struct inode *inode;
	struct shmem_inode_info *info;
	struct folio *folio;
	unsigned long batch = sc ? sc->nr_to_scan : 128;
	unsigned long split = 0, freed = 0;

	if (list_empty(&sbinfo->shrinklist))
		return SHRINK_STOP;

	spin_lock(&sbinfo->shrinklist_lock);
	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
		info = list_entry(pos, struct shmem_inode_info, shrinklist);

		/* pin the inode */
		inode = igrab(&info->vfs_inode);

		/* inode is about to be evicted */
		if (!inode) {
			list_del_init(&info->shrinklist);
			goto next;
		}

		list_move(&info->shrinklist, &list);
next:
		sbinfo->shrinklist_len--;
		if (!--batch)
			break;
	}
	spin_unlock(&sbinfo->shrinklist_lock);

	list_for_each_safe(pos, next, &list) {
		pgoff_t next, end;
		loff_t i_size;
		int ret;

		info = list_entry(pos, struct shmem_inode_info, shrinklist);
		inode = &info->vfs_inode;

		if (nr_to_free && freed >= nr_to_free)
			goto move_back;

		i_size = i_size_read(inode);
		folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
		if (!folio || xa_is_value(folio))
			goto drop;

		/* No large folio at the end of the file: nothing to split */
		if (!folio_test_large(folio)) {
			folio_put(folio);
			goto drop;
		}

		/* Check if there is anything to gain from splitting */
		next = folio_next_index(folio);
		end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
		if (end <= folio->index || end >= next) {
			folio_put(folio);
			goto drop;
		}

		/*
		 * Move the inode on the list back to shrinklist if we failed
		 * to lock the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!folio_trylock(folio)) {
			folio_put(folio);
			goto move_back;
		}

		ret = split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);

		/* If split failed move the inode on the list back to shrinklist */
		if (ret)
			goto move_back;

		freed += next - end;
		split++;
drop:
		list_del_init(&info->shrinklist);
		goto put;
move_back:
		/*
		 * Make sure the inode is either on the global list or deleted
		 * from any local list before iput() since it could be deleted
		 * in another thread once we put the inode (then the local list
		 * is corrupted).
		 */
		spin_lock(&sbinfo->shrinklist_lock);
		list_move(&info->shrinklist, &sbinfo->shrinklist);
		sbinfo->shrinklist_len++;
		spin_unlock(&sbinfo->shrinklist_lock);
put:
		iput(inode);
	}

	return split;
}

static long shmem_unused_huge_scan(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (!READ_ONCE(sbinfo->shrinklist_len))
		return SHRINK_STOP;

	return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

static long shmem_unused_huge_count(struct super_block *sb,
		struct shrink_control *sc)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

#define shmem_huge SHMEM_HUGE_DENY

static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
		struct shrink_control *sc, unsigned long nr_to_free)
{
	return 0;
}

static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
		loff_t write_end, bool shmem_huge_force,
		struct vm_area_struct *vma, unsigned long vm_flags)
{
	return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static void shmem_update_stats(struct folio *folio, int nr_pages)
{
	if (folio_test_pmd_mappable(folio))
		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
	__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
}

/*
 * Somewhat like filemap_add_folio, but error if expected item has gone.
 */
static int shmem_add_to_page_cache(struct folio *folio,
				   struct address_space *mapping,
				   pgoff_t index, void *expected, gfp_t gfp)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
	long nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = index;

	gfp &= GFP_RECLAIM_MASK;
	folio_throttle_swaprate(folio, gfp);

	do {
		xas_lock_irq(&xas);
		if (expected != xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		if (expected && xas_find_conflict(&xas)) {
			xas_set_err(&xas, -EEXIST);
			goto unlock;
		}
		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;
		shmem_update_stats(folio, nr);
		mapping->nrpages += nr;
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		folio->mapping = NULL;
		folio_ref_sub(folio, nr);
		return xas_error(&xas);
	}

	return 0;
}

/*
 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
	struct address_space *mapping = folio->mapping;
	long nr = folio_nr_pages(folio);
	int error;

	xa_lock_irq(&mapping->i_pages);
	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
	folio->mapping = NULL;
	mapping->nrpages -= nr;
	shmem_update_stats(folio, -nr);
	xa_unlock_irq(&mapping->i_pages);
	folio_put_refs(folio, nr);
	BUG_ON(error);
}

/*
 * Remove swap entry from page cache, free the swap and its page cache. Returns
 * the number of pages being freed. 0 means entry not found in XArray (0 pages
 * being freed).
 */
static long shmem_free_swap(struct address_space *mapping,
			    pgoff_t index, void *radswap)
{
	int order = xa_get_order(&mapping->i_pages, index);
	void *old;

	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
	if (old != radswap)
		return 0;
	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);

	return 1 << order;
}
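
/*
 * For example, if the entry at @index covers an order-2 range (a swapped-out
 * 4-page folio), a successful exchange frees 4 swap slots and returns 4;
 * a lost race (some other value already stored at @index) returns 0.
 */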

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
						pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
	unsigned long swapped = 0;
	unsigned long max = end - 1;

	rcu_read_lock();
	xas_for_each(&xas, page, max) {
		if (xas_retry(&xas, page))
			continue;
		if (xa_is_value(page))
			swapped += 1 << xas_get_order(&xas);
		if (xas.xa_index == max)
			break;
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}
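
/*
 * For example, with 4 KiB pages, three swapped-out pages in [start, end)
 * give swapped = 3 and a return value of 3 << PAGE_SHIFT = 12288 bytes.
 */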
  795. /*
  796. * Determine (in bytes) how many of the shmem object's pages mapped by the
  797. * given vma is swapped out.
  798. *
  799. * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
  800. * as long as the inode doesn't go away and racy results are not a problem.
  801. */
  802. unsigned long shmem_swap_usage(struct vm_area_struct *vma)
  803. {
  804. struct inode *inode = file_inode(vma->vm_file);
  805. struct shmem_inode_info *info = SHMEM_I(inode);
  806. struct address_space *mapping = inode->i_mapping;
  807. unsigned long swapped;
  808. /* Be careful as we don't hold info->lock */
  809. swapped = READ_ONCE(info->swapped);
  810. /*
  811. * The easier cases are when the shmem object has nothing in swap, or
  812. * the vma maps it whole. Then we can simply use the stats that we
  813. * already track.
  814. */
  815. if (!swapped)
  816. return 0;
  817. if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
  818. return swapped << PAGE_SHIFT;
  819. /* Here comes the more involved part */
  820. return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
  821. vma->vm_pgoff + vma_pages(vma));
  822. }
  823. /*
  824. * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
  825. */
  826. void shmem_unlock_mapping(struct address_space *mapping)
  827. {
  828. struct folio_batch fbatch;
  829. pgoff_t index = 0;
  830. folio_batch_init(&fbatch);
  831. /*
  832. * Minor point, but we might as well stop if someone else SHM_LOCKs it.
  833. */
  834. while (!mapping_unevictable(mapping) &&
  835. filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
  836. check_move_unevictable_folios(&fbatch);
  837. folio_batch_release(&fbatch);
  838. cond_resched();
  839. }
  840. }
  841. static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
  842. {
  843. struct folio *folio;
  844. /*
  845. * At first avoid shmem_get_folio(,,,SGP_READ): that fails
  846. * beyond i_size, and reports fallocated folios as holes.
  847. */
  848. folio = filemap_get_entry(inode->i_mapping, index);
  849. if (!folio)
  850. return folio;
  851. if (!xa_is_value(folio)) {
  852. folio_lock(folio);
  853. if (folio->mapping == inode->i_mapping)
  854. return folio;
  855. /* The folio has been swapped out */
  856. folio_unlock(folio);
  857. folio_put(folio);
  858. }
  859. /*
  860. * But read a folio back from swap if any of it is within i_size
  861. * (although in some cases this is just a waste of time).
  862. */
  863. folio = NULL;
  864. shmem_get_folio(inode, index, 0, &folio, SGP_READ);
  865. return folio;
  866. }
  867. /*
  868. * Remove range of pages and swap entries from page cache, and free them.
  869. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
  870. */
  871. static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
  872. bool unfalloc)
  873. {
  874. struct address_space *mapping = inode->i_mapping;
  875. struct shmem_inode_info *info = SHMEM_I(inode);
  876. pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
  877. pgoff_t end = (lend + 1) >> PAGE_SHIFT;
  878. struct folio_batch fbatch;
  879. pgoff_t indices[PAGEVEC_SIZE];
  880. struct folio *folio;
  881. bool same_folio;
  882. long nr_swaps_freed = 0;
  883. pgoff_t index;
  884. int i;
  885. if (lend == -1)
  886. end = -1; /* unsigned, so actually very big */
  887. if (info->fallocend > start && info->fallocend <= end && !unfalloc)
  888. info->fallocend = start;
  889. folio_batch_init(&fbatch);
  890. index = start;
  891. while (index < end && find_lock_entries(mapping, &index, end - 1,
  892. &fbatch, indices)) {
  893. for (i = 0; i < folio_batch_count(&fbatch); i++) {
  894. folio = fbatch.folios[i];
  895. if (xa_is_value(folio)) {
  896. if (unfalloc)
  897. continue;
  898. nr_swaps_freed += shmem_free_swap(mapping,
  899. indices[i], folio);
  900. continue;
  901. }
  902. if (!unfalloc || !folio_test_uptodate(folio))
  903. truncate_inode_folio(mapping, folio);
  904. folio_unlock(folio);
  905. }
  906. folio_batch_remove_exceptionals(&fbatch);
  907. folio_batch_release(&fbatch);
  908. cond_resched();
  909. }
  910. /*
  911. * When undoing a failed fallocate, we want none of the partial folio
  912. * zeroing and splitting below, but shall want to truncate the whole
  913. * folio when !uptodate indicates that it was added by this fallocate,
  914. * even when [lstart, lend] covers only a part of the folio.
  915. */
  916. if (unfalloc)
  917. goto whole_folios;
  918. same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
  919. folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
  920. if (folio) {
  921. same_folio = lend < folio_pos(folio) + folio_size(folio);
  922. folio_mark_dirty(folio);
  923. if (!truncate_inode_partial_folio(folio, lstart, lend)) {
  924. start = folio_next_index(folio);
  925. if (same_folio)
  926. end = folio->index;
  927. }
  928. folio_unlock(folio);
  929. folio_put(folio);
  930. folio = NULL;
  931. }
  932. if (!same_folio)
  933. folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
  934. if (folio) {
  935. folio_mark_dirty(folio);
  936. if (!truncate_inode_partial_folio(folio, lstart, lend))
  937. end = folio->index;
  938. folio_unlock(folio);
  939. folio_put(folio);
  940. }
  941. whole_folios:
  942. index = start;
  943. while (index < end) {
  944. cond_resched();
  945. if (!find_get_entries(mapping, &index, end - 1, &fbatch,
  946. indices)) {
  947. /* If all gone or hole-punch or unfalloc, we're done */
  948. if (index == start || end != -1)
  949. break;
  950. /* But if truncating, restart to make sure all gone */
  951. index = start;
  952. continue;
  953. }
  954. for (i = 0; i < folio_batch_count(&fbatch); i++) {
  955. folio = fbatch.folios[i];
  956. if (xa_is_value(folio)) {
  957. long swaps_freed;
  958. if (unfalloc)
  959. continue;
  960. swaps_freed = shmem_free_swap(mapping, indices[i], folio);
  961. if (!swaps_freed) {
  962. /* Swap was replaced by page: retry */
  963. index = indices[i];
  964. break;
  965. }
  966. nr_swaps_freed += swaps_freed;
  967. continue;
  968. }
  969. folio_lock(folio);
  970. if (!unfalloc || !folio_test_uptodate(folio)) {
  971. if (folio_mapping(folio) != mapping) {
  972. /* Page was replaced by swap: retry */
  973. folio_unlock(folio);
  974. index = indices[i];
  975. break;
  976. }
  977. VM_BUG_ON_FOLIO(folio_test_writeback(folio),
  978. folio);
  979. if (!folio_test_large(folio)) {
  980. truncate_inode_folio(mapping, folio);
  981. } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
  982. /*
  983. * If we split a page, reset the loop so
  984. * that we pick up the new sub pages.
  985. * Otherwise the THP was entirely
  986. * dropped or the target range was
  987. * zeroed, so just continue the loop as
  988. * is.
  989. */
  990. if (!folio_test_large(folio)) {
  991. folio_unlock(folio);
  992. index = start;
  993. break;
  994. }
  995. }
  996. }
  997. folio_unlock(folio);
  998. }
  999. folio_batch_remove_exceptionals(&fbatch);
  1000. folio_batch_release(&fbatch);
  1001. }
  1002. shmem_recalc_inode(inode, 0, -nr_swaps_freed);
  1003. }
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	shmem_undo_range(inode, lstart, lend, false);
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
	inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
  1011. static int shmem_getattr(struct mnt_idmap *idmap,
  1012. const struct path *path, struct kstat *stat,
  1013. u32 request_mask, unsigned int query_flags)
  1014. {
  1015. struct inode *inode = path->dentry->d_inode;
  1016. struct shmem_inode_info *info = SHMEM_I(inode);
  1017. if (info->alloced - info->swapped != inode->i_mapping->nrpages)
  1018. shmem_recalc_inode(inode, 0, 0);
  1019. if (info->fsflags & FS_APPEND_FL)
  1020. stat->attributes |= STATX_ATTR_APPEND;
  1021. if (info->fsflags & FS_IMMUTABLE_FL)
  1022. stat->attributes |= STATX_ATTR_IMMUTABLE;
  1023. if (info->fsflags & FS_NODUMP_FL)
  1024. stat->attributes |= STATX_ATTR_NODUMP;
  1025. stat->attributes_mask |= (STATX_ATTR_APPEND |
  1026. STATX_ATTR_IMMUTABLE |
  1027. STATX_ATTR_NODUMP);
  1028. generic_fillattr(idmap, request_mask, inode, stat);
  1029. if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
  1030. stat->blksize = HPAGE_PMD_SIZE;
  1031. if (request_mask & STATX_BTIME) {
  1032. stat->result_mask |= STATX_BTIME;
  1033. stat->btime.tv_sec = info->i_crtime.tv_sec;
  1034. stat->btime.tv_nsec = info->i_crtime.tv_nsec;
  1035. }
  1036. return 0;
  1037. }
  1038. static int shmem_setattr(struct mnt_idmap *idmap,
  1039. struct dentry *dentry, struct iattr *attr)
  1040. {
  1041. struct inode *inode = d_inode(dentry);
  1042. struct shmem_inode_info *info = SHMEM_I(inode);
  1043. int error;
  1044. bool update_mtime = false;
  1045. bool update_ctime = true;
  1046. error = setattr_prepare(idmap, dentry, attr);
  1047. if (error)
  1048. return error;
  1049. if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
  1050. if ((inode->i_mode ^ attr->ia_mode) & 0111) {
  1051. return -EPERM;
  1052. }
  1053. }
  1054. if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
  1055. loff_t oldsize = inode->i_size;
  1056. loff_t newsize = attr->ia_size;
  1057. /* protected by i_rwsem */
  1058. if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
  1059. (newsize > oldsize && (info->seals & F_SEAL_GROW)))
  1060. return -EPERM;
  1061. if (newsize != oldsize) {
  1062. error = shmem_reacct_size(SHMEM_I(inode)->flags,
  1063. oldsize, newsize);
  1064. if (error)
  1065. return error;
  1066. i_size_write(inode, newsize);
  1067. update_mtime = true;
  1068. } else {
  1069. update_ctime = false;
  1070. }
  1071. if (newsize <= oldsize) {
  1072. loff_t holebegin = round_up(newsize, PAGE_SIZE);
  1073. if (oldsize > holebegin)
  1074. unmap_mapping_range(inode->i_mapping,
  1075. holebegin, 0, 1);
  1076. if (info->alloced)
  1077. shmem_truncate_range(inode,
  1078. newsize, (loff_t)-1);
  1079. /* unmap again to remove racily COWed private pages */
  1080. if (oldsize > holebegin)
  1081. unmap_mapping_range(inode->i_mapping,
  1082. holebegin, 0, 1);
  1083. }
  1084. }
  1085. if (is_quota_modification(idmap, inode, attr)) {
  1086. error = dquot_initialize(inode);
  1087. if (error)
  1088. return error;
  1089. }
  1090. /* Transfer quota accounting */
  1091. if (i_uid_needs_update(idmap, attr, inode) ||
  1092. i_gid_needs_update(idmap, attr, inode)) {
  1093. error = dquot_transfer(idmap, inode, attr);
  1094. if (error)
  1095. return error;
  1096. }
  1097. setattr_copy(idmap, inode, attr);
  1098. if (attr->ia_valid & ATTR_MODE)
  1099. error = posix_acl_chmod(idmap, dentry, inode->i_mode);
  1100. if (!error && update_ctime) {
  1101. inode_set_ctime_current(inode);
  1102. if (update_mtime)
  1103. inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
  1104. inode_inc_iversion(inode);
  1105. }
  1106. return error;
  1107. }
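
/*
 * Final teardown of a shmem inode: drop its size accounting, truncate all
 * of its pages and swap, unhook it from the per-sb shrinklist and the
 * global swaplist (waiting out any shmem_unuse() scan that holds
 * stop_eviction), then release xattrs, the inode reservation and quota.
 */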
  1108. static void shmem_evict_inode(struct inode *inode)
  1109. {
  1110. struct shmem_inode_info *info = SHMEM_I(inode);
  1111. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1112. size_t freed = 0;
  1113. if (shmem_mapping(inode->i_mapping)) {
  1114. shmem_unacct_size(info->flags, inode->i_size);
  1115. inode->i_size = 0;
  1116. mapping_set_exiting(inode->i_mapping);
  1117. shmem_truncate_range(inode, 0, (loff_t)-1);
  1118. if (!list_empty(&info->shrinklist)) {
  1119. spin_lock(&sbinfo->shrinklist_lock);
  1120. if (!list_empty(&info->shrinklist)) {
  1121. list_del_init(&info->shrinklist);
  1122. sbinfo->shrinklist_len--;
  1123. }
  1124. spin_unlock(&sbinfo->shrinklist_lock);
  1125. }
  1126. while (!list_empty(&info->swaplist)) {
  1127. /* Wait while shmem_unuse() is scanning this inode... */
  1128. wait_var_event(&info->stop_eviction,
  1129. !atomic_read(&info->stop_eviction));
  1130. mutex_lock(&shmem_swaplist_mutex);
  1131. /* ...but beware of the race if we peeked too early */
  1132. if (!atomic_read(&info->stop_eviction))
  1133. list_del_init(&info->swaplist);
  1134. mutex_unlock(&shmem_swaplist_mutex);
  1135. }
  1136. }
  1137. simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
  1138. shmem_free_inode(inode->i_sb, freed);
  1139. WARN_ON(inode->i_blocks);
  1140. clear_inode(inode);
  1141. #ifdef CONFIG_TMPFS_QUOTA
  1142. dquot_free_inode(inode);
  1143. dquot_drop(inode);
  1144. #endif
  1145. }
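
/*
 * Scan the mapping from 'start' for value entries that refer to swap
 * device 'type', collecting them and their indices into fbatch/indices
 * until the batch is full. Returns the xarray index where the scan
 * stopped.
 */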
  1146. static int shmem_find_swap_entries(struct address_space *mapping,
  1147. pgoff_t start, struct folio_batch *fbatch,
  1148. pgoff_t *indices, unsigned int type)
  1149. {
  1150. XA_STATE(xas, &mapping->i_pages, start);
  1151. struct folio *folio;
  1152. swp_entry_t entry;
  1153. rcu_read_lock();
  1154. xas_for_each(&xas, folio, ULONG_MAX) {
  1155. if (xas_retry(&xas, folio))
  1156. continue;
  1157. if (!xa_is_value(folio))
  1158. continue;
  1159. entry = radix_to_swp_entry(folio);
  1160. /*
  1161. * swapin error entries can be found in the mapping. But they're
  1162. * deliberately ignored here as we've done everything we can do.
  1163. */
  1164. if (swp_type(entry) != type)
  1165. continue;
  1166. indices[folio_batch_count(fbatch)] = xas.xa_index;
  1167. if (!folio_batch_add(fbatch, folio))
  1168. break;
  1169. if (need_resched()) {
  1170. xas_pause(&xas);
  1171. cond_resched_rcu();
  1172. }
  1173. }
  1174. rcu_read_unlock();
  1175. return xas.xa_index;
  1176. }
  1177. /*
  1178. * Move the swapped pages for an inode to page cache. Returns the count
  1179. * of pages swapped in, or the error in case of failure.
  1180. */
  1181. static int shmem_unuse_swap_entries(struct inode *inode,
  1182. struct folio_batch *fbatch, pgoff_t *indices)
  1183. {
  1184. int i = 0;
  1185. int ret = 0;
  1186. int error = 0;
  1187. struct address_space *mapping = inode->i_mapping;
  1188. for (i = 0; i < folio_batch_count(fbatch); i++) {
  1189. struct folio *folio = fbatch->folios[i];
  1190. if (!xa_is_value(folio))
  1191. continue;
  1192. error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
  1193. mapping_gfp_mask(mapping), NULL, NULL);
  1194. if (error == 0) {
  1195. folio_unlock(folio);
  1196. folio_put(folio);
  1197. ret++;
  1198. }
  1199. if (error == -ENOMEM)
  1200. break;
  1201. error = 0;
  1202. }
  1203. return error ? error : ret;
  1204. }
  1205. /*
  1206. * If swap found in inode, free it and move page from swapcache to filecache.
  1207. */
  1208. static int shmem_unuse_inode(struct inode *inode, unsigned int type)
  1209. {
  1210. struct address_space *mapping = inode->i_mapping;
  1211. pgoff_t start = 0;
  1212. struct folio_batch fbatch;
  1213. pgoff_t indices[PAGEVEC_SIZE];
  1214. int ret = 0;
  1215. do {
  1216. folio_batch_init(&fbatch);
  1217. shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
  1218. if (folio_batch_count(&fbatch) == 0) {
  1219. ret = 0;
  1220. break;
  1221. }
  1222. ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
  1223. if (ret < 0)
  1224. break;
  1225. start = indices[folio_batch_count(&fbatch) - 1];
  1226. } while (true);
  1227. return ret;
  1228. }
/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * swapped off.
 */
  1234. int shmem_unuse(unsigned int type)
  1235. {
  1236. struct shmem_inode_info *info, *next;
  1237. int error = 0;
  1238. if (list_empty(&shmem_swaplist))
  1239. return 0;
  1240. mutex_lock(&shmem_swaplist_mutex);
  1241. list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
  1242. if (!info->swapped) {
  1243. list_del_init(&info->swaplist);
  1244. continue;
  1245. }
  1246. /*
  1247. * Drop the swaplist mutex while searching the inode for swap;
  1248. * but before doing so, make sure shmem_evict_inode() will not
  1249. * remove placeholder inode from swaplist, nor let it be freed
  1250. * (igrab() would protect from unlink, but not from unmount).
  1251. */
  1252. atomic_inc(&info->stop_eviction);
  1253. mutex_unlock(&shmem_swaplist_mutex);
  1254. error = shmem_unuse_inode(&info->vfs_inode, type);
  1255. cond_resched();
  1256. mutex_lock(&shmem_swaplist_mutex);
  1257. next = list_next_entry(info, swaplist);
  1258. if (!info->swapped)
  1259. list_del_init(&info->swaplist);
  1260. if (atomic_dec_and_test(&info->stop_eviction))
  1261. wake_up_var(&info->stop_eviction);
  1262. if (error)
  1263. break;
  1264. }
  1265. mutex_unlock(&shmem_swaplist_mutex);
  1266. return error;
  1267. }
  1268. /*
  1269. * Move the page from the page cache to the swap cache.
  1270. */
  1271. static int shmem_writepage(struct page *page, struct writeback_control *wbc)
  1272. {
  1273. struct folio *folio = page_folio(page);
  1274. struct address_space *mapping = folio->mapping;
  1275. struct inode *inode = mapping->host;
  1276. struct shmem_inode_info *info = SHMEM_I(inode);
  1277. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1278. swp_entry_t swap;
  1279. pgoff_t index;
  1280. int nr_pages;
  1281. bool split = false;
  1282. /*
  1283. * Our capabilities prevent regular writeback or sync from ever calling
  1284. * shmem_writepage; but a stacking filesystem might use ->writepage of
  1285. * its underlying filesystem, in which case tmpfs should write out to
  1286. * swap only in response to memory pressure, and not for the writeback
  1287. * threads or sync.
  1288. */
  1289. if (WARN_ON_ONCE(!wbc->for_reclaim))
  1290. goto redirty;
  1291. if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
  1292. goto redirty;
  1293. if (!total_swap_pages)
  1294. goto redirty;
  1295. /*
  1296. * If CONFIG_THP_SWAP is not enabled, the large folio should be
  1297. * split when swapping.
  1298. *
  1299. * And shrinkage of pages beyond i_size does not split swap, so
  1300. * swapout of a large folio crossing i_size needs to split too
  1301. * (unless fallocate has been used to preallocate beyond EOF).
  1302. */
  1303. if (folio_test_large(folio)) {
  1304. index = shmem_fallocend(inode,
  1305. DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
  1306. if ((index > folio->index && index < folio_next_index(folio)) ||
  1307. !IS_ENABLED(CONFIG_THP_SWAP))
  1308. split = true;
  1309. }
  1310. if (split) {
  1311. try_split:
  1312. /* Ensure the subpages are still dirty */
  1313. folio_test_set_dirty(folio);
  1314. if (split_huge_page_to_list_to_order(page, wbc->list, 0))
  1315. goto redirty;
  1316. folio = page_folio(page);
  1317. folio_clear_dirty(folio);
  1318. }
  1319. index = folio->index;
  1320. nr_pages = folio_nr_pages(folio);
  1321. /*
  1322. * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
  1323. * value into swapfile.c, the only way we can correctly account for a
  1324. * fallocated folio arriving here is now to initialize it and write it.
  1325. *
  1326. * That's okay for a folio already fallocated earlier, but if we have
  1327. * not yet completed the fallocation, then (a) we want to keep track
  1328. * of this folio in case we have to undo it, and (b) it may not be a
  1329. * good idea to continue anyway, once we're pushing into swap. So
  1330. * reactivate the folio, and let shmem_fallocate() quit when too many.
  1331. */
  1332. if (!folio_test_uptodate(folio)) {
  1333. if (inode->i_private) {
  1334. struct shmem_falloc *shmem_falloc;
  1335. spin_lock(&inode->i_lock);
  1336. shmem_falloc = inode->i_private;
  1337. if (shmem_falloc &&
  1338. !shmem_falloc->waitq &&
  1339. index >= shmem_falloc->start &&
  1340. index < shmem_falloc->next)
  1341. shmem_falloc->nr_unswapped += nr_pages;
  1342. else
  1343. shmem_falloc = NULL;
  1344. spin_unlock(&inode->i_lock);
  1345. if (shmem_falloc)
  1346. goto redirty;
  1347. }
  1348. folio_zero_range(folio, 0, folio_size(folio));
  1349. flush_dcache_folio(folio);
  1350. folio_mark_uptodate(folio);
  1351. }
  1352. swap = folio_alloc_swap(folio);
  1353. if (!swap.val) {
  1354. if (nr_pages > 1)
  1355. goto try_split;
  1356. goto redirty;
  1357. }
  1358. /*
  1359. * Add inode to shmem_unuse()'s list of swapped-out inodes,
  1360. * if it's not already there. Do it now before the folio is
  1361. * moved to swap cache, when its pagelock no longer protects
  1362. * the inode from eviction. But don't unlock the mutex until
  1363. * we've incremented swapped, because shmem_unuse_inode() will
  1364. * prune a !swapped inode from the swaplist under this mutex.
  1365. */
  1366. mutex_lock(&shmem_swaplist_mutex);
  1367. if (list_empty(&info->swaplist))
  1368. list_add(&info->swaplist, &shmem_swaplist);
  1369. if (add_to_swap_cache(folio, swap,
  1370. __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
  1371. NULL) == 0) {
  1372. shmem_recalc_inode(inode, 0, nr_pages);
  1373. swap_shmem_alloc(swap, nr_pages);
  1374. shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
  1375. mutex_unlock(&shmem_swaplist_mutex);
  1376. BUG_ON(folio_mapped(folio));
  1377. return swap_writepage(&folio->page, wbc);
  1378. }
  1379. mutex_unlock(&shmem_swaplist_mutex);
  1380. put_swap_folio(folio, swap);
  1381. redirty:
  1382. folio_mark_dirty(folio);
  1383. if (wbc->for_reclaim)
  1384. return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
  1385. folio_unlock(folio);
  1386. return 0;
  1387. }
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
	char buffer[64];

	if (!mpol || mpol->mode == MPOL_DEFAULT)
		return;		/* show nothing */

	mpol_to_str(buffer, sizeof(buffer), mpol);

	seq_printf(seq, ",mpol=%s", buffer);
}

static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	struct mempolicy *mpol = NULL;
	if (sbinfo->mpol) {
		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
		mpol = sbinfo->mpol;
		mpol_get(mpol);
		raw_spin_unlock(&sbinfo->stat_lock);
	}
	return mpol;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}

static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
	return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
			pgoff_t index, unsigned int order, pgoff_t *ilx);

static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
			struct shmem_inode_info *info, pgoff_t index)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
	mpol_cond_put(mpol);

	return folio;
}
/*
 * Make sure huge_gfp is always more limited than limit_gfp.
 * Some of the flags set permissions, while others set limitations.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);

	/* Allow allocations only from the originally specified zones. */
	result |= zoneflags;

	/*
	 * Minimize the result gfp by taking the union with the deny flags,
	 * and the intersection of the allow flags.
	 */
	result |= (limit_gfp & denyflags);
	result |= (huge_gfp & limit_gfp) & allowflags;

	return result;
}
  1450. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
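/*
 * Combine the global sysfs settings (always/within_size/madvise/inherit),
 * the per-mount 'huge' option and the vma flags into the mask of huge
 * page orders that may be used for this inode at this index.
 */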
  1451. unsigned long shmem_allowable_huge_orders(struct inode *inode,
  1452. struct vm_area_struct *vma, pgoff_t index,
  1453. loff_t write_end, bool shmem_huge_force)
  1454. {
  1455. unsigned long mask = READ_ONCE(huge_shmem_orders_always);
  1456. unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
  1457. unsigned long vm_flags = vma ? vma->vm_flags : 0;
  1458. pgoff_t aligned_index;
  1459. bool global_huge;
  1460. loff_t i_size;
  1461. int order;
  1462. if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
  1463. return 0;
  1464. global_huge = shmem_huge_global_enabled(inode, index, write_end,
  1465. shmem_huge_force, vma, vm_flags);
  1466. if (!vma || !vma_is_anon_shmem(vma)) {
		/*
		 * For tmpfs, we currently support only PMD-sized THP when
		 * huge pages are enabled; otherwise fall back to order 0.
		 */
  1471. return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
  1472. }
  1473. /*
  1474. * Following the 'deny' semantics of the top level, force the huge
  1475. * option off from all mounts.
  1476. */
  1477. if (shmem_huge == SHMEM_HUGE_DENY)
  1478. return 0;
	/*
	 * Only allow inherited orders if the top-level value is 'force',
	 * which means non-PMD-sized THP cannot currently override the
	 * 'huge' mount option.
	 */
  1483. if (shmem_huge == SHMEM_HUGE_FORCE)
  1484. return READ_ONCE(huge_shmem_orders_inherit);
  1485. /* Allow mTHP that will be fully within i_size. */
  1486. order = highest_order(within_size_orders);
  1487. while (within_size_orders) {
  1488. aligned_index = round_up(index + 1, 1 << order);
  1489. i_size = round_up(i_size_read(inode), PAGE_SIZE);
  1490. if (i_size >> PAGE_SHIFT >= aligned_index) {
  1491. mask |= within_size_orders;
  1492. break;
  1493. }
  1494. order = next_order(&within_size_orders, order);
  1495. }
  1496. if (vm_flags & VM_HUGEPAGE)
  1497. mask |= READ_ONCE(huge_shmem_orders_madvise);
  1498. if (global_huge)
  1499. mask |= READ_ONCE(huge_shmem_orders_inherit);
  1500. return THP_ORDERS_ALL_FILE_DEFAULT & mask;
  1501. }
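
/*
 * Narrow the allowable orders down to those suitable for the faulting
 * vma (if any), then drop orders until one is found whose naturally
 * aligned extent has no conflicting entry in the page cache.
 */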
  1502. static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
  1503. struct address_space *mapping, pgoff_t index,
  1504. unsigned long orders)
  1505. {
  1506. struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
  1507. pgoff_t aligned_index;
  1508. unsigned long pages;
  1509. int order;
  1510. if (vma) {
  1511. orders = thp_vma_suitable_orders(vma, vmf->address, orders);
  1512. if (!orders)
  1513. return 0;
  1514. }
  1515. /* Find the highest order that can add into the page cache */
  1516. order = highest_order(orders);
  1517. while (orders) {
  1518. pages = 1UL << order;
  1519. aligned_index = round_down(index, pages);
  1520. /*
  1521. * Check for conflict before waiting on a huge allocation.
  1522. * Conflict might be that a huge page has just been allocated
  1523. * and added to page cache by a racing thread, or that there
  1524. * is already at least one small page in the huge extent.
  1525. * Be careful to retry when appropriate, but not forever!
  1526. * Elsewhere -EEXIST would be the right code, but not here.
  1527. */
  1528. if (!xa_find(&mapping->i_pages, &aligned_index,
  1529. aligned_index + pages - 1, XA_PRESENT))
  1530. break;
  1531. order = next_order(&orders, order);
  1532. }
  1533. return orders;
  1534. }
#else
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
					   struct address_space *mapping, pgoff_t index,
					   unsigned long orders)
{
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
		struct shmem_inode_info *info, pgoff_t index)
{
	struct mempolicy *mpol;
	pgoff_t ilx;
	struct folio *folio;

	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
	mpol_cond_put(mpol);

	return folio;
}
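
/*
 * Allocate a folio at the highest order that succeeds from 'orders'
 * (or order 0), charge it to the memcg, insert it into the page cache
 * and account its blocks; on block-accounting failure, try to reclaim
 * space by shrinking large folios beyond i_size before giving up.
 */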
  1554. static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
  1555. gfp_t gfp, struct inode *inode, pgoff_t index,
  1556. struct mm_struct *fault_mm, unsigned long orders)
  1557. {
  1558. struct address_space *mapping = inode->i_mapping;
  1559. struct shmem_inode_info *info = SHMEM_I(inode);
  1560. unsigned long suitable_orders = 0;
  1561. struct folio *folio = NULL;
  1562. long pages;
  1563. int error, order;
  1564. if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
  1565. orders = 0;
  1566. if (orders > 0) {
  1567. suitable_orders = shmem_suitable_orders(inode, vmf,
  1568. mapping, index, orders);
  1569. order = highest_order(suitable_orders);
  1570. while (suitable_orders) {
  1571. pages = 1UL << order;
  1572. index = round_down(index, pages);
  1573. folio = shmem_alloc_folio(gfp, order, info, index);
  1574. if (folio)
  1575. goto allocated;
  1576. if (pages == HPAGE_PMD_NR)
  1577. count_vm_event(THP_FILE_FALLBACK);
  1578. count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
  1579. order = next_order(&suitable_orders, order);
  1580. }
  1581. } else {
  1582. pages = 1;
  1583. folio = shmem_alloc_folio(gfp, 0, info, index);
  1584. }
  1585. if (!folio)
  1586. return ERR_PTR(-ENOMEM);
  1587. allocated:
  1588. __folio_set_locked(folio);
  1589. __folio_set_swapbacked(folio);
  1590. gfp &= GFP_RECLAIM_MASK;
  1591. error = mem_cgroup_charge(folio, fault_mm, gfp);
  1592. if (error) {
  1593. if (xa_find(&mapping->i_pages, &index,
  1594. index + pages - 1, XA_PRESENT)) {
  1595. error = -EEXIST;
  1596. } else if (pages > 1) {
  1597. if (pages == HPAGE_PMD_NR) {
  1598. count_vm_event(THP_FILE_FALLBACK);
  1599. count_vm_event(THP_FILE_FALLBACK_CHARGE);
  1600. }
  1601. count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
  1602. count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
  1603. }
  1604. goto unlock;
  1605. }
  1606. error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
  1607. if (error)
  1608. goto unlock;
  1609. error = shmem_inode_acct_blocks(inode, pages);
  1610. if (error) {
  1611. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  1612. long freed;
  1613. /*
  1614. * Try to reclaim some space by splitting a few
  1615. * large folios beyond i_size on the filesystem.
  1616. */
  1617. shmem_unused_huge_shrink(sbinfo, NULL, pages);
  1618. /*
  1619. * And do a shmem_recalc_inode() to account for freed pages:
  1620. * except our folio is there in cache, so not quite balanced.
  1621. */
  1622. spin_lock(&info->lock);
  1623. freed = pages + info->alloced - info->swapped -
  1624. READ_ONCE(mapping->nrpages);
  1625. if (freed > 0)
  1626. info->alloced -= freed;
  1627. spin_unlock(&info->lock);
  1628. if (freed > 0)
  1629. shmem_inode_unacct_blocks(inode, freed);
  1630. error = shmem_inode_acct_blocks(inode, pages);
  1631. if (error) {
  1632. filemap_remove_folio(folio);
  1633. goto unlock;
  1634. }
  1635. }
  1636. shmem_recalc_inode(inode, pages, 0);
  1637. folio_add_lru(folio);
  1638. return folio;
  1639. unlock:
  1640. folio_unlock(folio);
  1641. folio_put(folio);
  1642. return ERR_PTR(error);
  1643. }
  1644. /*
  1645. * When a page is moved from swapcache to shmem filecache (either by the
  1646. * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
  1647. * shmem_unuse_inode()), it may have been read in earlier from swap, in
  1648. * ignorance of the mapping it belongs to. If that mapping has special
  1649. * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
  1650. * we may need to copy to a suitable page before moving to filecache.
  1651. *
  1652. * In a future release, this may well be extended to respect cpuset and
  1653. * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
  1654. * but for now it is a simple matter of zone.
  1655. */
  1656. static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
  1657. {
  1658. return folio_zonenum(folio) > gfp_zone(gfp);
  1659. }
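
/*
 * Allocate a replacement folio in a zone allowed by gfp, copy the old
 * folio's contents and swap state over, and substitute the new folio
 * for the old one in the swap cache and memcg, dropping the old folio's
 * references.
 */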
  1660. static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
  1661. struct shmem_inode_info *info, pgoff_t index,
  1662. struct vm_area_struct *vma)
  1663. {
  1664. struct folio *new, *old = *foliop;
  1665. swp_entry_t entry = old->swap;
  1666. struct address_space *swap_mapping = swap_address_space(entry);
  1667. pgoff_t swap_index = swap_cache_index(entry);
  1668. XA_STATE(xas, &swap_mapping->i_pages, swap_index);
  1669. int nr_pages = folio_nr_pages(old);
  1670. int error = 0, i;
  1671. /*
  1672. * We have arrived here because our zones are constrained, so don't
  1673. * limit chance of success by further cpuset and node constraints.
  1674. */
  1675. gfp &= ~GFP_CONSTRAINT_MASK;
  1676. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1677. if (nr_pages > 1) {
  1678. gfp_t huge_gfp = vma_thp_gfp_mask(vma);
  1679. gfp = limit_gfp_mask(huge_gfp, gfp);
  1680. }
  1681. #endif
  1682. new = shmem_alloc_folio(gfp, folio_order(old), info, index);
  1683. if (!new)
  1684. return -ENOMEM;
  1685. folio_ref_add(new, nr_pages);
  1686. folio_copy(new, old);
  1687. flush_dcache_folio(new);
  1688. __folio_set_locked(new);
  1689. __folio_set_swapbacked(new);
  1690. folio_mark_uptodate(new);
  1691. new->swap = entry;
  1692. folio_set_swapcache(new);
  1693. /* Swap cache still stores N entries instead of a high-order entry */
  1694. xa_lock_irq(&swap_mapping->i_pages);
  1695. for (i = 0; i < nr_pages; i++) {
  1696. void *item = xas_load(&xas);
  1697. if (item != old) {
  1698. error = -ENOENT;
  1699. break;
  1700. }
  1701. xas_store(&xas, new);
  1702. xas_next(&xas);
  1703. }
  1704. if (!error) {
  1705. mem_cgroup_replace_folio(old, new);
  1706. shmem_update_stats(new, nr_pages);
  1707. shmem_update_stats(old, -nr_pages);
  1708. }
  1709. xa_unlock_irq(&swap_mapping->i_pages);
  1710. if (unlikely(error)) {
		/*
		 * Is this possible?  I think not, now that our callers
		 * check both the swapcache flag and folio->private
		 * after getting the folio lock; but be defensive.
		 * Point 'old' at the new folio, so that the cleanup
		 * below clears and frees the unused copy.
		 */
  1717. old = new;
  1718. } else {
  1719. folio_add_lru(new);
  1720. *foliop = new;
  1721. }
  1722. folio_clear_swapcache(old);
  1723. old->private = NULL;
  1724. folio_unlock(old);
	/*
	 * The old folio is removed from the swap cache: drop its 'nr_pages'
	 * references, as well as the one temporary reference obtained from
	 * the swap cache lookup.
	 */
  1730. folio_put_refs(old, nr_pages + 1);
  1731. return error;
  1732. }
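
/*
 * A swapin read failed: replace the swap entry in the page cache with a
 * poisoned entry (so later faults return -EIO), delete the folio from
 * the swap cache, fix up the inode accounting and free the swap slots.
 */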
  1733. static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
  1734. struct folio *folio, swp_entry_t swap)
  1735. {
  1736. struct address_space *mapping = inode->i_mapping;
  1737. swp_entry_t swapin_error;
  1738. void *old;
  1739. int nr_pages;
  1740. swapin_error = make_poisoned_swp_entry();
  1741. old = xa_cmpxchg_irq(&mapping->i_pages, index,
  1742. swp_to_radix_entry(swap),
  1743. swp_to_radix_entry(swapin_error), 0);
  1744. if (old != swp_to_radix_entry(swap))
  1745. return;
  1746. nr_pages = folio_nr_pages(folio);
  1747. folio_wait_writeback(folio);
  1748. delete_from_swap_cache(folio);
  1749. /*
  1750. * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
  1751. * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
  1752. * in shmem_evict_inode().
  1753. */
  1754. shmem_recalc_inode(inode, -nr_pages, -nr_pages);
  1755. swap_free_nr(swap, nr_pages);
  1756. }
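
/*
 * Split a large swap entry in the page cache into order-0 entries, so
 * that an order-0 swapin can replace just one of them. Returns the
 * order that was split (0 if no split was needed), or a negative error
 * from the xarray.
 */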
  1757. static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
  1758. swp_entry_t swap, gfp_t gfp)
  1759. {
  1760. struct address_space *mapping = inode->i_mapping;
  1761. XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
  1762. void *alloced_shadow = NULL;
  1763. int alloced_order = 0, i;
  1764. /* Convert user data gfp flags to xarray node gfp flags */
  1765. gfp &= GFP_RECLAIM_MASK;
  1766. for (;;) {
  1767. int order = -1, split_order = 0;
  1768. void *old = NULL;
  1769. xas_lock_irq(&xas);
  1770. old = xas_load(&xas);
  1771. if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
  1772. xas_set_err(&xas, -EEXIST);
  1773. goto unlock;
  1774. }
  1775. order = xas_get_order(&xas);
  1776. /* Swap entry may have changed before we re-acquire the lock */
  1777. if (alloced_order &&
  1778. (old != alloced_shadow || order != alloced_order)) {
  1779. xas_destroy(&xas);
  1780. alloced_order = 0;
  1781. }
  1782. /* Try to split large swap entry in pagecache */
  1783. if (order > 0) {
  1784. if (!alloced_order) {
  1785. split_order = order;
  1786. goto unlock;
  1787. }
  1788. xas_split(&xas, old, order);
			/*
			 * Re-store the swap entries after splitting: the swap
			 * offsets within the original large entry are
			 * contiguous, so sub-entry i gets offset + i.
			 */
  1793. for (i = 0; i < 1 << order; i++) {
  1794. pgoff_t aligned_index = round_down(index, 1 << order);
  1795. swp_entry_t tmp;
  1796. tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
  1797. __xa_store(&mapping->i_pages, aligned_index + i,
  1798. swp_to_radix_entry(tmp), 0);
  1799. }
  1800. }
  1801. unlock:
  1802. xas_unlock_irq(&xas);
  1803. /* split needed, alloc here and retry. */
  1804. if (split_order) {
  1805. xas_split_alloc(&xas, old, split_order, gfp);
  1806. if (xas_error(&xas))
  1807. goto error;
  1808. alloced_shadow = old;
  1809. alloced_order = split_order;
  1810. xas_reset(&xas);
  1811. continue;
  1812. }
  1813. if (!xas_nomem(&xas, gfp))
  1814. break;
  1815. }
  1816. error:
  1817. if (xas_error(&xas))
  1818. return xas_error(&xas);
  1819. return alloced_order;
  1820. }
/*
 * Swap in the folio pointed to by *foliop.
 * Caller has to make sure that *foliop contains a valid swapped folio.
 * Returns 0 and the folio in *foliop on success. On failure, returns the
 * error code and NULL in *foliop.
 */
  1827. static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
  1828. struct folio **foliop, enum sgp_type sgp,
  1829. gfp_t gfp, struct vm_area_struct *vma,
  1830. vm_fault_t *fault_type)
  1831. {
  1832. struct address_space *mapping = inode->i_mapping;
  1833. struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
  1834. struct shmem_inode_info *info = SHMEM_I(inode);
  1835. struct swap_info_struct *si;
  1836. struct folio *folio = NULL;
  1837. swp_entry_t swap;
  1838. int error, nr_pages;
  1839. VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
  1840. swap = radix_to_swp_entry(*foliop);
  1841. *foliop = NULL;
  1842. if (is_poisoned_swp_entry(swap))
  1843. return -EIO;
  1844. si = get_swap_device(swap);
  1845. if (!si) {
  1846. if (!shmem_confirm_swap(mapping, index, swap))
  1847. return -EEXIST;
  1848. else
  1849. return -EINVAL;
  1850. }
  1851. /* Look it up and read it in.. */
  1852. folio = swap_cache_get_folio(swap, NULL, 0);
  1853. if (!folio) {
  1854. int split_order;
  1855. /* Or update major stats only when swapin succeeds?? */
  1856. if (fault_type) {
  1857. *fault_type |= VM_FAULT_MAJOR;
  1858. count_vm_event(PGMAJFAULT);
  1859. count_memcg_event_mm(fault_mm, PGMAJFAULT);
  1860. }
		/*
		 * The swap device can currently only swap in order-0
		 * folios, so split any large swap entry stored in the
		 * page cache before starting the swapin.
		 */
  1866. split_order = shmem_split_large_entry(inode, index, swap, gfp);
  1867. if (split_order < 0) {
  1868. error = split_order;
  1869. goto failed;
  1870. }
		/*
		 * If the large swap entry was just split, recalculate the
		 * swap entry for this index from the alignment of the
		 * original large entry.
		 */
  1876. if (split_order > 0) {
  1877. pgoff_t offset = index - round_down(index, 1 << split_order);
  1878. swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
  1879. }
  1880. /* Here we actually start the io */
  1881. folio = shmem_swapin_cluster(swap, gfp, info, index);
  1882. if (!folio) {
  1883. error = -ENOMEM;
  1884. goto failed;
  1885. }
  1886. }
  1887. /* We have to do this with folio locked to prevent races */
  1888. folio_lock(folio);
  1889. if (!folio_test_swapcache(folio) ||
  1890. folio->swap.val != swap.val ||
  1891. !shmem_confirm_swap(mapping, index, swap)) {
  1892. error = -EEXIST;
  1893. goto unlock;
  1894. }
  1895. if (!folio_test_uptodate(folio)) {
  1896. error = -EIO;
  1897. goto failed;
  1898. }
  1899. folio_wait_writeback(folio);
  1900. nr_pages = folio_nr_pages(folio);
  1901. /*
  1902. * Some architectures may have to restore extra metadata to the
  1903. * folio after reading from swap.
  1904. */
  1905. arch_swap_restore(folio_swap(swap, folio), folio);
  1906. if (shmem_should_replace_folio(folio, gfp)) {
  1907. error = shmem_replace_folio(&folio, gfp, info, index, vma);
  1908. if (error)
  1909. goto failed;
  1910. }
  1911. error = shmem_add_to_page_cache(folio, mapping,
  1912. round_down(index, nr_pages),
  1913. swp_to_radix_entry(swap), gfp);
  1914. if (error)
  1915. goto failed;
  1916. shmem_recalc_inode(inode, 0, -nr_pages);
  1917. if (sgp == SGP_WRITE)
  1918. folio_mark_accessed(folio);
  1919. delete_from_swap_cache(folio);
  1920. folio_mark_dirty(folio);
  1921. swap_free_nr(swap, nr_pages);
  1922. put_swap_device(si);
  1923. *foliop = folio;
  1924. return 0;
  1925. failed:
  1926. if (!shmem_confirm_swap(mapping, index, swap))
  1927. error = -EEXIST;
  1928. if (error == -EIO)
  1929. shmem_set_folio_swapin_error(inode, index, folio, swap);
  1930. unlock:
  1931. if (folio) {
  1932. folio_unlock(folio);
  1933. folio_put(folio);
  1934. }
  1935. put_swap_device(si);
  1936. return error;
  1937. }
/*
 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty, since we also free the swap
 * entry: a page cannot live in both the swap cache and the page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
 */
  1947. static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
  1948. loff_t write_end, struct folio **foliop, enum sgp_type sgp,
  1949. gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
  1950. {
  1951. struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
  1952. struct mm_struct *fault_mm;
  1953. struct folio *folio;
  1954. int error;
  1955. bool alloced;
  1956. unsigned long orders = 0;
  1957. if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
  1958. return -EINVAL;
  1959. if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
  1960. return -EFBIG;
  1961. repeat:
  1962. if (sgp <= SGP_CACHE &&
  1963. ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
  1964. return -EINVAL;
  1965. alloced = false;
  1966. fault_mm = vma ? vma->vm_mm : NULL;
  1967. folio = filemap_get_entry(inode->i_mapping, index);
  1968. if (folio && vma && userfaultfd_minor(vma)) {
  1969. if (!xa_is_value(folio))
  1970. folio_put(folio);
  1971. *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
  1972. return 0;
  1973. }
  1974. if (xa_is_value(folio)) {
  1975. error = shmem_swapin_folio(inode, index, &folio,
  1976. sgp, gfp, vma, fault_type);
  1977. if (error == -EEXIST)
  1978. goto repeat;
  1979. *foliop = folio;
  1980. return error;
  1981. }
  1982. if (folio) {
  1983. folio_lock(folio);
  1984. /* Has the folio been truncated or swapped out? */
  1985. if (unlikely(folio->mapping != inode->i_mapping)) {
  1986. folio_unlock(folio);
  1987. folio_put(folio);
  1988. goto repeat;
  1989. }
  1990. if (sgp == SGP_WRITE)
  1991. folio_mark_accessed(folio);
  1992. if (folio_test_uptodate(folio))
  1993. goto out;
  1994. /* fallocated folio */
  1995. if (sgp != SGP_READ)
  1996. goto clear;
  1997. folio_unlock(folio);
  1998. folio_put(folio);
  1999. }
  2000. /*
  2001. * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
  2002. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
  2003. */
  2004. *foliop = NULL;
  2005. if (sgp == SGP_READ)
  2006. return 0;
  2007. if (sgp == SGP_NOALLOC)
  2008. return -ENOENT;
  2009. /*
  2010. * Fast cache lookup and swap lookup did not find it: allocate.
  2011. */
  2012. if (vma && userfaultfd_missing(vma)) {
  2013. *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
  2014. return 0;
  2015. }
  2016. /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
  2017. orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
  2018. if (orders > 0) {
  2019. gfp_t huge_gfp;
  2020. huge_gfp = vma_thp_gfp_mask(vma);
  2021. huge_gfp = limit_gfp_mask(huge_gfp, gfp);
  2022. folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
  2023. inode, index, fault_mm, orders);
  2024. if (!IS_ERR(folio)) {
  2025. if (folio_test_pmd_mappable(folio))
  2026. count_vm_event(THP_FILE_ALLOC);
  2027. count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
  2028. goto alloced;
  2029. }
  2030. if (PTR_ERR(folio) == -EEXIST)
  2031. goto repeat;
  2032. }
  2033. folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
  2034. if (IS_ERR(folio)) {
  2035. error = PTR_ERR(folio);
  2036. if (error == -EEXIST)
  2037. goto repeat;
  2038. folio = NULL;
  2039. goto unlock;
  2040. }
  2041. alloced:
  2042. alloced = true;
  2043. if (folio_test_large(folio) &&
  2044. DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
  2045. folio_next_index(folio)) {
  2046. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  2047. struct shmem_inode_info *info = SHMEM_I(inode);
  2048. /*
  2049. * Part of the large folio is beyond i_size: subject
  2050. * to shrink under memory pressure.
  2051. */
  2052. spin_lock(&sbinfo->shrinklist_lock);
		/*
		 * list_empty_careful() defends against unlocked access to
		 * ->shrinklist in shmem_unused_huge_shrink().
		 */
  2057. if (list_empty_careful(&info->shrinklist)) {
  2058. list_add_tail(&info->shrinklist,
  2059. &sbinfo->shrinklist);
  2060. sbinfo->shrinklist_len++;
  2061. }
  2062. spin_unlock(&sbinfo->shrinklist_lock);
  2063. }
  2064. if (sgp == SGP_WRITE)
  2065. folio_set_referenced(folio);
  2066. /*
  2067. * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
  2068. */
  2069. if (sgp == SGP_FALLOC)
  2070. sgp = SGP_WRITE;
  2071. clear:
  2072. /*
  2073. * Let SGP_WRITE caller clear ends if write does not fill folio;
  2074. * but SGP_FALLOC on a folio fallocated earlier must initialize
  2075. * it now, lest undo on failure cancel our earlier guarantee.
  2076. */
  2077. if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
  2078. long i, n = folio_nr_pages(folio);
  2079. for (i = 0; i < n; i++)
  2080. clear_highpage(folio_page(folio, i));
  2081. flush_dcache_folio(folio);
  2082. folio_mark_uptodate(folio);
  2083. }
  2084. /* Perhaps the file has been truncated since we checked */
  2085. if (sgp <= SGP_CACHE &&
  2086. ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
  2087. error = -EINVAL;
  2088. goto unlock;
  2089. }
  2090. out:
  2091. *foliop = folio;
  2092. return 0;
  2093. /*
  2094. * Error recovery.
  2095. */
  2096. unlock:
  2097. if (alloced)
  2098. filemap_remove_folio(folio);
  2099. shmem_recalc_inode(inode, 0, 0);
  2100. if (folio) {
  2101. folio_unlock(folio);
  2102. folio_put(folio);
  2103. }
  2104. return error;
  2105. }
  2106. /**
  2107. * shmem_get_folio - find, and lock a shmem folio.
  2108. * @inode: inode to search
  2109. * @index: the page index.
  2110. * @write_end: end of a write, could extend inode size
  2111. * @foliop: pointer to the folio if found
  2112. * @sgp: SGP_* flags to control behavior
  2113. *
  2114. * Looks up the page cache entry at @inode & @index. If a folio is
  2115. * present, it is returned locked with an increased refcount.
  2116. *
  2117. * If the caller modifies data in the folio, it must call folio_mark_dirty()
  2118. * before unlocking the folio to ensure that the folio is not reclaimed.
  2119. * There is no need to reserve space before calling folio_mark_dirty().
  2120. *
  2121. * When no folio is found, the behavior depends on @sgp:
  2122. * - for SGP_READ, *@foliop is %NULL and 0 is returned
  2123. * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
  2124. * - for all other flags a new folio is allocated, inserted into the
  2125. * page cache and returned locked in @foliop.
  2126. *
  2127. * Context: May sleep.
  2128. * Return: 0 if successful, else a negative error code.
  2129. */
  2130. int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
  2131. struct folio **foliop, enum sgp_type sgp)
  2132. {
  2133. return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
  2134. mapping_gfp_mask(inode->i_mapping), NULL, NULL);
  2135. }
  2136. EXPORT_SYMBOL_GPL(shmem_get_folio);
/*
 * This is like autoremove_wake_function, but it removes the wait queue
 * entry unconditionally - even if something else had already woken the
 * target.
 */
static int synchronous_wake_function(wait_queue_entry_t *wait,
			unsigned int mode, int sync, void *key)
{
	int ret = default_wake_function(wait, mode, sync, key);

	list_del_init(&wait->entry);
	return ret;
}
  2149. /*
  2150. * Trinity finds that probing a hole which tmpfs is punching can
  2151. * prevent the hole-punch from ever completing: which in turn
  2152. * locks writers out with its hold on i_rwsem. So refrain from
  2153. * faulting pages into the hole while it's being punched. Although
  2154. * shmem_undo_range() does remove the additions, it may be unable to
  2155. * keep up, as each new page needs its own unmap_mapping_range() call,
  2156. * and the i_mmap tree grows ever slower to scan if new vmas are added.
  2157. *
  2158. * It does not matter if we sometimes reach this check just before the
  2159. * hole-punch begins, so that one fault then races with the punch:
  2160. * we just need to make racing faults a rare case.
  2161. *
  2162. * The implementation below would be much simpler if we just used a
  2163. * standard mutex or completion: but we cannot take i_rwsem in fault,
  2164. * and bloating every shmem inode for this unlikely case would be sad.
  2165. */
  2166. static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
  2167. {
  2168. struct shmem_falloc *shmem_falloc;
  2169. struct file *fpin = NULL;
  2170. vm_fault_t ret = 0;
  2171. spin_lock(&inode->i_lock);
  2172. shmem_falloc = inode->i_private;
  2173. if (shmem_falloc &&
  2174. shmem_falloc->waitq &&
  2175. vmf->pgoff >= shmem_falloc->start &&
  2176. vmf->pgoff < shmem_falloc->next) {
  2177. wait_queue_head_t *shmem_falloc_waitq;
  2178. DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
  2179. ret = VM_FAULT_NOPAGE;
  2180. fpin = maybe_unlock_mmap_for_io(vmf, NULL);
  2181. shmem_falloc_waitq = shmem_falloc->waitq;
  2182. prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
  2183. TASK_UNINTERRUPTIBLE);
  2184. spin_unlock(&inode->i_lock);
  2185. schedule();
  2186. /*
  2187. * shmem_falloc_waitq points into the shmem_fallocate()
  2188. * stack of the hole-punching task: shmem_falloc_waitq
  2189. * is usually invalid by the time we reach here, but
  2190. * finish_wait() does not dereference it in that case;
  2191. * though i_lock needed lest racing with wake_up_all().
  2192. */
  2193. spin_lock(&inode->i_lock);
  2194. finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
  2195. }
  2196. spin_unlock(&inode->i_lock);
  2197. if (fpin) {
  2198. fput(fpin);
  2199. ret = VM_FAULT_RETRY;
  2200. }
  2201. return ret;
  2202. }
  2203. static vm_fault_t shmem_fault(struct vm_fault *vmf)
  2204. {
  2205. struct inode *inode = file_inode(vmf->vma->vm_file);
  2206. gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
  2207. struct folio *folio = NULL;
  2208. vm_fault_t ret = 0;
  2209. int err;
  2210. /*
  2211. * Trinity finds that probing a hole which tmpfs is punching can
  2212. * prevent the hole-punch from ever completing: noted in i_private.
  2213. */
  2214. if (unlikely(inode->i_private)) {
  2215. ret = shmem_falloc_wait(vmf, inode);
  2216. if (ret)
  2217. return ret;
  2218. }
  2219. WARN_ON_ONCE(vmf->page != NULL);
  2220. err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
  2221. gfp, vmf, &ret);
  2222. if (err)
  2223. return vmf_error(err);
  2224. if (folio) {
  2225. vmf->page = folio_file_page(folio, vmf->pgoff);
  2226. ret |= VM_FAULT_LOCKED;
  2227. }
  2228. return ret;
  2229. }
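
/*
 * Pick an unmapped area for a shmem mapping. When huge pages may be
 * used, retry with an inflated length so the result can be shifted to
 * an address where (addr & (hpage_size - 1)) matches the file offset,
 * allowing huge page mappings.
 */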
  2230. unsigned long shmem_get_unmapped_area(struct file *file,
  2231. unsigned long uaddr, unsigned long len,
  2232. unsigned long pgoff, unsigned long flags)
  2233. {
  2234. unsigned long addr;
  2235. unsigned long offset;
  2236. unsigned long inflated_len;
  2237. unsigned long inflated_addr;
  2238. unsigned long inflated_offset;
  2239. unsigned long hpage_size;
  2240. if (len > TASK_SIZE)
  2241. return -ENOMEM;
  2242. addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
  2243. flags);
  2244. if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
  2245. return addr;
  2246. if (IS_ERR_VALUE(addr))
  2247. return addr;
  2248. if (addr & ~PAGE_MASK)
  2249. return addr;
  2250. if (addr > TASK_SIZE - len)
  2251. return addr;
  2252. if (shmem_huge == SHMEM_HUGE_DENY)
  2253. return addr;
  2254. if (flags & MAP_FIXED)
  2255. return addr;
  2256. /*
  2257. * Our priority is to support MAP_SHARED mapped hugely;
  2258. * and support MAP_PRIVATE mapped hugely too, until it is COWed.
  2259. * But if caller specified an address hint and we allocated area there
  2260. * successfully, respect that as before.
  2261. */
  2262. if (uaddr == addr)
  2263. return addr;
  2264. hpage_size = HPAGE_PMD_SIZE;
  2265. if (shmem_huge != SHMEM_HUGE_FORCE) {
  2266. struct super_block *sb;
  2267. unsigned long __maybe_unused hpage_orders;
  2268. int order = 0;
  2269. if (file) {
  2270. VM_BUG_ON(file->f_op != &shmem_file_operations);
  2271. sb = file_inode(file)->i_sb;
  2272. } else {
  2273. /*
  2274. * Called directly from mm/mmap.c, or drivers/char/mem.c
  2275. * for "/dev/zero", to create a shared anonymous object.
  2276. */
  2277. if (IS_ERR(shm_mnt))
  2278. return addr;
  2279. sb = shm_mnt->mnt_sb;
  2280. /*
  2281. * Find the highest mTHP order used for anonymous shmem to
  2282. * provide a suitable alignment address.
  2283. */
  2284. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  2285. hpage_orders = READ_ONCE(huge_shmem_orders_always);
  2286. hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
  2287. hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
  2288. if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
  2289. hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
  2290. if (hpage_orders > 0) {
  2291. order = highest_order(hpage_orders);
  2292. hpage_size = PAGE_SIZE << order;
  2293. }
  2294. #endif
  2295. }
  2296. if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
  2297. return addr;
  2298. }
  2299. if (len < hpage_size)
  2300. return addr;
  2301. offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
  2302. if (offset && offset + len < 2 * hpage_size)
  2303. return addr;
  2304. if ((addr & (hpage_size - 1)) == offset)
  2305. return addr;
  2306. inflated_len = len + hpage_size - PAGE_SIZE;
  2307. if (inflated_len > TASK_SIZE)
  2308. return addr;
  2309. if (inflated_len < len)
  2310. return addr;
  2311. inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
  2312. inflated_len, 0, flags);
  2313. if (IS_ERR_VALUE(inflated_addr))
  2314. return addr;
  2315. if (inflated_addr & ~PAGE_MASK)
  2316. return addr;
  2317. inflated_offset = inflated_addr & (hpage_size - 1);
  2318. inflated_addr += offset - inflated_offset;
  2319. if (inflated_offset > offset)
  2320. inflated_addr += hpage_size;
  2321. if (inflated_addr > TASK_SIZE - len)
  2322. return addr;
  2323. return inflated_addr;
  2324. }
  2325. #ifdef CONFIG_NUMA
  2326. static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
  2327. {
  2328. struct inode *inode = file_inode(vma->vm_file);
  2329. return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
  2330. }
  2331. static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
  2332. unsigned long addr, pgoff_t *ilx)
  2333. {
  2334. struct inode *inode = file_inode(vma->vm_file);
  2335. pgoff_t index;
  2336. /*
  2337. * Bias interleave by inode number to distribute better across nodes;
  2338. * but this interface is independent of which page order is used, so
  2339. * supplies only that bias, letting caller apply the offset (adjusted
  2340. * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
  2341. */
  2342. *ilx = inode->i_ino;
  2343. index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  2344. return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
  2345. }
  2346. static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
  2347. pgoff_t index, unsigned int order, pgoff_t *ilx)
  2348. {
  2349. struct mempolicy *mpol;
  2350. /* Bias interleave by inode number to distribute better across nodes */
  2351. *ilx = info->vfs_inode.i_ino + (index >> order);
  2352. mpol = mpol_shared_policy_lookup(&info->policy, index);
  2353. return mpol ? mpol : get_task_policy(current);
  2354. }
#else
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
			pgoff_t index, unsigned int order, pgoff_t *ilx)
{
	*ilx = 0;
	return NULL;
}
#endif /* CONFIG_NUMA */
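
/*
 * SHM_LOCK/SHM_UNLOCK support: charge or uncharge the inode size via
 * user_shm_lock()/user_shm_unlock() and mark the mapping unevictable
 * (or evictable again) accordingly.
 */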
  2363. int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
  2364. {
  2365. struct inode *inode = file_inode(file);
  2366. struct shmem_inode_info *info = SHMEM_I(inode);
  2367. int retval = -ENOMEM;
  2368. /*
  2369. * What serializes the accesses to info->flags?
  2370. * ipc_lock_object() when called from shmctl_do_lock(),
  2371. * no serialization needed when called from shm_destroy().
  2372. */
  2373. if (lock && !(info->flags & VM_LOCKED)) {
  2374. if (!user_shm_lock(inode->i_size, ucounts))
  2375. goto out_nomem;
  2376. info->flags |= VM_LOCKED;
  2377. mapping_set_unevictable(file->f_mapping);
  2378. }
  2379. if (!lock && (info->flags & VM_LOCKED) && ucounts) {
  2380. user_shm_unlock(inode->i_size, ucounts);
  2381. info->flags &= ~VM_LOCKED;
  2382. mapping_clear_unevictable(file->f_mapping);
  2383. }
  2384. retval = 0;
  2385. out_nomem:
  2386. return retval;
  2387. }
  2388. static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
  2389. {
  2390. struct inode *inode = file_inode(file);
  2391. struct shmem_inode_info *info = SHMEM_I(inode);
  2392. int ret;
  2393. ret = seal_check_write(info->seals, vma);
  2394. if (ret)
  2395. return ret;
  2396. file_accessed(file);
  2397. /* This is anonymous shared memory if it is unlinked at the time of mmap */
  2398. if (inode->i_nlink)
  2399. vma->vm_ops = &shmem_vm_ops;
  2400. else
  2401. vma->vm_ops = &shmem_anon_vm_ops;
  2402. return 0;
  2403. }
static int shmem_file_open(struct inode *inode, struct file *file)
{
	file->f_mode |= FMODE_CAN_ODIRECT;
	return generic_file_open(inode, file);
}
  2409. #ifdef CONFIG_TMPFS_XATTR
  2410. static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
  2411. /*
  2412. * chattr's fsflags are unrelated to extended attributes,
  2413. * but tmpfs has chosen to enable them under the same config option.
  2414. */
  2415. static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
  2416. {
  2417. unsigned int i_flags = 0;
  2418. if (fsflags & FS_NOATIME_FL)
  2419. i_flags |= S_NOATIME;
  2420. if (fsflags & FS_APPEND_FL)
  2421. i_flags |= S_APPEND;
  2422. if (fsflags & FS_IMMUTABLE_FL)
  2423. i_flags |= S_IMMUTABLE;
  2424. /*
  2425. * But FS_NODUMP_FL does not require any action in i_flags.
  2426. */
  2427. inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
  2428. }
  2429. #else
  2430. static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
  2431. {
  2432. }
  2433. #define shmem_initxattrs NULL
  2434. #endif
static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
{
	return &SHMEM_I(inode)->dir_offsets;
}
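
/*
 * Common inode allocation for tmpfs/shmem: reserve an inode, set up the
 * shmem_inode_info (seals, flags, mpol, xattrs, lists) and choose the
 * operations according to the file type.
 */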
  2439. static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
  2440. struct super_block *sb,
  2441. struct inode *dir, umode_t mode,
  2442. dev_t dev, unsigned long flags)
  2443. {
  2444. struct inode *inode;
  2445. struct shmem_inode_info *info;
  2446. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  2447. ino_t ino;
  2448. int err;
  2449. err = shmem_reserve_inode(sb, &ino);
  2450. if (err)
  2451. return ERR_PTR(err);
  2452. inode = new_inode(sb);
  2453. if (!inode) {
  2454. shmem_free_inode(sb, 0);
  2455. return ERR_PTR(-ENOSPC);
  2456. }
  2457. inode->i_ino = ino;
  2458. inode_init_owner(idmap, inode, dir, mode);
  2459. inode->i_blocks = 0;
  2460. simple_inode_init_ts(inode);
  2461. inode->i_generation = get_random_u32();
  2462. info = SHMEM_I(inode);
  2463. memset(info, 0, (char *)inode - (char *)info);
  2464. spin_lock_init(&info->lock);
  2465. atomic_set(&info->stop_eviction, 0);
  2466. info->seals = F_SEAL_SEAL;
  2467. info->flags = flags & VM_NORESERVE;
  2468. info->i_crtime = inode_get_mtime(inode);
  2469. info->fsflags = (dir == NULL) ? 0 :
  2470. SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
  2471. if (info->fsflags)
  2472. shmem_set_inode_flags(inode, info->fsflags);
  2473. INIT_LIST_HEAD(&info->shrinklist);
  2474. INIT_LIST_HEAD(&info->swaplist);
  2475. simple_xattrs_init(&info->xattrs);
  2476. cache_no_acl(inode);
  2477. if (sbinfo->noswap)
  2478. mapping_set_unevictable(inode->i_mapping);
  2479. mapping_set_large_folios(inode->i_mapping);
  2480. switch (mode & S_IFMT) {
  2481. default:
  2482. inode->i_op = &shmem_special_inode_operations;
  2483. init_special_inode(inode, mode, dev);
  2484. break;
  2485. case S_IFREG:
  2486. inode->i_mapping->a_ops = &shmem_aops;
  2487. inode->i_op = &shmem_inode_operations;
  2488. inode->i_fop = &shmem_file_operations;
  2489. mpol_shared_policy_init(&info->policy,
  2490. shmem_get_sbmpol(sbinfo));
  2491. break;
  2492. case S_IFDIR:
  2493. inc_nlink(inode);
  2494. /* Some things misbehave if size == 0 on a directory */
  2495. inode->i_size = 2 * BOGO_DIRENT_SIZE;
  2496. inode->i_op = &shmem_dir_inode_operations;
  2497. inode->i_fop = &simple_offset_dir_operations;
  2498. simple_offset_init(shmem_get_offset_ctx(inode));
  2499. break;
  2500. case S_IFLNK:
  2501. /*
  2502. * Must not load anything in the rbtree,
  2503. * mpol_free_shared_policy will not be called.
  2504. */
  2505. mpol_shared_policy_init(&info->policy, NULL);
  2506. break;
  2507. }
  2508. lockdep_annotate_inode_mutex_key(inode);
  2509. return inode;
  2510. }
  2511. #ifdef CONFIG_TMPFS_QUOTA
  2512. static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
  2513. struct super_block *sb, struct inode *dir,
  2514. umode_t mode, dev_t dev, unsigned long flags)
  2515. {
  2516. int err;
  2517. struct inode *inode;
  2518. inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
  2519. if (IS_ERR(inode))
  2520. return inode;
  2521. err = dquot_initialize(inode);
  2522. if (err)
  2523. goto errout;
  2524. err = dquot_alloc_inode(inode);
  2525. if (err) {
  2526. dquot_drop(inode);
  2527. goto errout;
  2528. }
  2529. return inode;
  2530. errout:
  2531. inode->i_flags |= S_NOQUOTA;
  2532. iput(inode);
  2533. return ERR_PTR(err);
  2534. }
  2535. #else
  2536. static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
  2537. struct super_block *sb, struct inode *dir,
  2538. umode_t mode, dev_t dev, unsigned long flags)
  2539. {
  2540. return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
  2541. }
  2542. #endif /* CONFIG_TMPFS_QUOTA */
  2543. #ifdef CONFIG_USERFAULTFD
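/*
 * shmem_mfill_atomic_pte - userfaultfd UFFDIO_COPY / UFFDIO_ZEROPAGE
 * backend for shmem.  Allocates a folio (or reuses *foliop from a
 * previous retry), fills it by copying from userspace or by zeroing,
 * charges it to the memcg, adds it to the page cache and installs the
 * PTE.  Returns -ENOENT when the copy must be retried outside the
 * mmap_lock, with the folio parked in *foliop for the caller.
 */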
  2544. int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
  2545. struct vm_area_struct *dst_vma,
  2546. unsigned long dst_addr,
  2547. unsigned long src_addr,
  2548. uffd_flags_t flags,
  2549. struct folio **foliop)
  2550. {
  2551. struct inode *inode = file_inode(dst_vma->vm_file);
  2552. struct shmem_inode_info *info = SHMEM_I(inode);
  2553. struct address_space *mapping = inode->i_mapping;
  2554. gfp_t gfp = mapping_gfp_mask(mapping);
  2555. pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
  2556. void *page_kaddr;
  2557. struct folio *folio;
  2558. int ret;
  2559. pgoff_t max_off;
  2560. if (shmem_inode_acct_blocks(inode, 1)) {
  2561. /*
  2562. * We may have got a page, returned -ENOENT triggering a retry,
  2563. * and now we find ourselves with -ENOMEM. Release the page, to
  2564. * avoid a BUG_ON in our caller.
  2565. */
  2566. if (unlikely(*foliop)) {
  2567. folio_put(*foliop);
  2568. *foliop = NULL;
  2569. }
  2570. return -ENOMEM;
  2571. }
  2572. if (!*foliop) {
  2573. ret = -ENOMEM;
  2574. folio = shmem_alloc_folio(gfp, 0, info, pgoff);
  2575. if (!folio)
  2576. goto out_unacct_blocks;
  2577. if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
  2578. page_kaddr = kmap_local_folio(folio, 0);
  2579. /*
  2580. * The read mmap_lock is held here. Despite the
  2581. * mmap_lock being read recursive a deadlock is still
  2582. * possible if a writer has taken a lock. For example:
  2583. *
  2584. * process A thread 1 takes read lock on own mmap_lock
  2585. * process A thread 2 calls mmap, blocks taking write lock
  2586. * process B thread 1 takes page fault, read lock on own mmap lock
  2587. * process B thread 2 calls mmap, blocks taking write lock
  2588. * process A thread 1 blocks taking read lock on process B
  2589. * process B thread 1 blocks taking read lock on process A
  2590. *
  2591. * Disable page faults to prevent potential deadlock
  2592. * and retry the copy outside the mmap_lock.
  2593. */
  2594. pagefault_disable();
  2595. ret = copy_from_user(page_kaddr,
  2596. (const void __user *)src_addr,
  2597. PAGE_SIZE);
  2598. pagefault_enable();
  2599. kunmap_local(page_kaddr);
  2600. /* fallback to copy_from_user outside mmap_lock */
  2601. if (unlikely(ret)) {
  2602. *foliop = folio;
  2603. ret = -ENOENT;
  2604. /* don't free the page */
  2605. goto out_unacct_blocks;
  2606. }
  2607. flush_dcache_folio(folio);
  2608. } else { /* ZEROPAGE */
  2609. clear_user_highpage(&folio->page, dst_addr);
  2610. }
  2611. } else {
  2612. folio = *foliop;
  2613. VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
  2614. *foliop = NULL;
  2615. }
  2616. VM_BUG_ON(folio_test_locked(folio));
  2617. VM_BUG_ON(folio_test_swapbacked(folio));
  2618. __folio_set_locked(folio);
  2619. __folio_set_swapbacked(folio);
  2620. __folio_mark_uptodate(folio);
  2621. ret = -EFAULT;
  2622. max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  2623. if (unlikely(pgoff >= max_off))
  2624. goto out_release;
  2625. ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
  2626. if (ret)
  2627. goto out_release;
  2628. ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
  2629. if (ret)
  2630. goto out_release;
  2631. ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
  2632. &folio->page, true, flags);
  2633. if (ret)
  2634. goto out_delete_from_cache;
  2635. shmem_recalc_inode(inode, 1, 0);
  2636. folio_unlock(folio);
  2637. return 0;
  2638. out_delete_from_cache:
  2639. filemap_remove_folio(folio);
  2640. out_release:
  2641. folio_unlock(folio);
  2642. folio_put(folio);
  2643. out_unacct_blocks:
  2644. shmem_inode_unacct_blocks(inode, 1);
  2645. return ret;
  2646. }
  2647. #endif /* CONFIG_USERFAULTFD */
  2648. #ifdef CONFIG_TMPFS
  2649. static const struct inode_operations shmem_symlink_inode_operations;
  2650. static const struct inode_operations shmem_short_symlink_operations;
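/*
 * Write path: shmem_write_begin() enforces memfd seals before handing a
 * folio to generic_perform_write().  F_SEAL_WRITE/F_SEAL_FUTURE_WRITE
 * reject any write, and F_SEAL_GROW rejects writes that would extend
 * i_size.  Illustrative userspace sequence (a sketch, not part of this
 * file):
 *
 *	fd = memfd_create("buf", MFD_ALLOW_SEALING);
 *	ftruncate(fd, size);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW);
 *
 * after which a write() beyond size fails with EPERM.
 */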
  2651. static int
  2652. shmem_write_begin(struct file *file, struct address_space *mapping,
  2653. loff_t pos, unsigned len,
  2654. struct folio **foliop, void **fsdata)
  2655. {
  2656. struct inode *inode = mapping->host;
  2657. struct shmem_inode_info *info = SHMEM_I(inode);
  2658. pgoff_t index = pos >> PAGE_SHIFT;
  2659. struct folio *folio;
  2660. int ret = 0;
  2661. /* i_rwsem is held by caller */
  2662. if (unlikely(info->seals & (F_SEAL_GROW |
  2663. F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
  2664. if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
  2665. return -EPERM;
  2666. if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
  2667. return -EPERM;
  2668. }
  2669. ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
  2670. if (ret)
  2671. return ret;
  2672. if (folio_test_hwpoison(folio) ||
  2673. (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
  2674. folio_unlock(folio);
  2675. folio_put(folio);
  2676. return -EIO;
  2677. }
  2678. *foliop = folio;
  2679. return 0;
  2680. }
  2681. static int
  2682. shmem_write_end(struct file *file, struct address_space *mapping,
  2683. loff_t pos, unsigned len, unsigned copied,
  2684. struct folio *folio, void *fsdata)
  2685. {
  2686. struct inode *inode = mapping->host;
  2687. if (pos + copied > inode->i_size)
  2688. i_size_write(inode, pos + copied);
  2689. if (!folio_test_uptodate(folio)) {
  2690. if (copied < folio_size(folio)) {
  2691. size_t from = offset_in_folio(folio, pos);
  2692. folio_zero_segments(folio, 0, from,
  2693. from + copied, folio_size(folio));
  2694. }
  2695. folio_mark_uptodate(folio);
  2696. }
  2697. folio_mark_dirty(folio);
  2698. folio_unlock(folio);
  2699. folio_put(folio);
  2700. return copied;
  2701. }
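/*
 * shmem_file_read_iter - copy data to userspace page by page.  Holes
 * (ranges with no folio in the page cache) are satisfied from
 * ZERO_PAGE(0) for user-backed iterators, or via iov_iter_zero() for
 * pipes and the like, so sparse tmpfs files never allocate pages just
 * to be read.
 */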
  2702. static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
  2703. {
  2704. struct file *file = iocb->ki_filp;
  2705. struct inode *inode = file_inode(file);
  2706. struct address_space *mapping = inode->i_mapping;
  2707. pgoff_t index;
  2708. unsigned long offset;
  2709. int error = 0;
  2710. ssize_t retval = 0;
  2711. loff_t *ppos = &iocb->ki_pos;
  2712. index = *ppos >> PAGE_SHIFT;
  2713. offset = *ppos & ~PAGE_MASK;
  2714. for (;;) {
  2715. struct folio *folio = NULL;
  2716. struct page *page = NULL;
  2717. pgoff_t end_index;
  2718. unsigned long nr, ret;
  2719. loff_t i_size = i_size_read(inode);
  2720. end_index = i_size >> PAGE_SHIFT;
  2721. if (index > end_index)
  2722. break;
  2723. if (index == end_index) {
  2724. nr = i_size & ~PAGE_MASK;
  2725. if (nr <= offset)
  2726. break;
  2727. }
  2728. error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
  2729. if (error) {
  2730. if (error == -EINVAL)
  2731. error = 0;
  2732. break;
  2733. }
  2734. if (folio) {
  2735. folio_unlock(folio);
  2736. page = folio_file_page(folio, index);
  2737. if (PageHWPoison(page)) {
  2738. folio_put(folio);
  2739. error = -EIO;
  2740. break;
  2741. }
  2742. }
  2743. /*
  2744. * We must evaluate after, since reads (unlike writes)
  2745. * are called without i_rwsem protection against truncate
  2746. */
  2747. nr = PAGE_SIZE;
  2748. i_size = i_size_read(inode);
  2749. end_index = i_size >> PAGE_SHIFT;
  2750. if (index == end_index) {
  2751. nr = i_size & ~PAGE_MASK;
  2752. if (nr <= offset) {
  2753. if (folio)
  2754. folio_put(folio);
  2755. break;
  2756. }
  2757. }
  2758. nr -= offset;
  2759. if (folio) {
  2760. /*
  2761. * If users can be writing to this page using arbitrary
  2762. * virtual addresses, take care about potential aliasing
  2763. * before reading the page on the kernel side.
  2764. */
  2765. if (mapping_writably_mapped(mapping))
  2766. flush_dcache_page(page);
  2767. /*
  2768. * Mark the page accessed if we read the beginning.
  2769. */
  2770. if (!offset)
  2771. folio_mark_accessed(folio);
  2772. /*
  2773. * Ok, we have the page, and it's up-to-date, so
  2774. * now we can copy it to user space...
  2775. */
  2776. ret = copy_page_to_iter(page, offset, nr, to);
  2777. folio_put(folio);
  2778. } else if (user_backed_iter(to)) {
  2779. /*
2780. * Copying to userspace tends to be so well optimized, and
2781. * clear_user() not so much, that it is noticeably
2782. * faster to copy the zero page than to clear the user buffer.
  2783. */
  2784. ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
  2785. } else {
  2786. /*
  2787. * But submitting the same page twice in a row to
  2788. * splice() - or others? - can result in confusion:
  2789. * so don't attempt that optimization on pipes etc.
  2790. */
  2791. ret = iov_iter_zero(nr, to);
  2792. }
  2793. retval += ret;
  2794. offset += ret;
  2795. index += offset >> PAGE_SHIFT;
  2796. offset &= ~PAGE_MASK;
  2797. if (!iov_iter_count(to))
  2798. break;
  2799. if (ret < nr) {
  2800. error = -EFAULT;
  2801. break;
  2802. }
  2803. cond_resched();
  2804. }
  2805. *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
  2806. file_accessed(file);
  2807. return retval ? retval : error;
  2808. }
  2809. static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
  2810. {
  2811. struct file *file = iocb->ki_filp;
  2812. struct inode *inode = file->f_mapping->host;
  2813. ssize_t ret;
  2814. inode_lock(inode);
  2815. ret = generic_write_checks(iocb, from);
  2816. if (ret <= 0)
  2817. goto unlock;
  2818. ret = file_remove_privs(file);
  2819. if (ret)
  2820. goto unlock;
  2821. ret = file_update_time(file);
  2822. if (ret)
  2823. goto unlock;
  2824. ret = generic_perform_write(iocb, from);
  2825. unlock:
  2826. inode_unlock(inode);
  2827. return ret;
  2828. }
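/*
 * Splicing holes: rather than allocating pages for unwritten ranges,
 * splice_zeropage_into_pipe() points a pipe buffer at ZERO_PAGE(0).
 * The pipe_buf_operations below are therefore no-ops: the zero page is
 * never released and must never be stolen.
 */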
  2829. static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
  2830. struct pipe_buffer *buf)
  2831. {
  2832. return true;
  2833. }
  2834. static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
  2835. struct pipe_buffer *buf)
  2836. {
  2837. }
  2838. static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
  2839. struct pipe_buffer *buf)
  2840. {
  2841. return false;
  2842. }
  2843. static const struct pipe_buf_operations zero_pipe_buf_ops = {
  2844. .release = zero_pipe_buf_release,
  2845. .try_steal = zero_pipe_buf_try_steal,
  2846. .get = zero_pipe_buf_get,
  2847. };
  2848. static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
  2849. loff_t fpos, size_t size)
  2850. {
  2851. size_t offset = fpos & ~PAGE_MASK;
  2852. size = min_t(size_t, size, PAGE_SIZE - offset);
  2853. if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
  2854. struct pipe_buffer *buf = pipe_head_buf(pipe);
  2855. *buf = (struct pipe_buffer) {
  2856. .ops = &zero_pipe_buf_ops,
  2857. .page = ZERO_PAGE(0),
  2858. .offset = offset,
  2859. .len = size,
  2860. };
  2861. pipe->head++;
  2862. }
  2863. return size;
  2864. }
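/*
 * shmem_file_splice_read - splice folios straight from the page cache
 * into the pipe; holes are represented by the zero-page helper above.
 */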
  2865. static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
  2866. struct pipe_inode_info *pipe,
  2867. size_t len, unsigned int flags)
  2868. {
  2869. struct inode *inode = file_inode(in);
  2870. struct address_space *mapping = inode->i_mapping;
  2871. struct folio *folio = NULL;
  2872. size_t total_spliced = 0, used, npages, n, part;
  2873. loff_t isize;
  2874. int error = 0;
  2875. /* Work out how much data we can actually add into the pipe */
  2876. used = pipe_occupancy(pipe->head, pipe->tail);
  2877. npages = max_t(ssize_t, pipe->max_usage - used, 0);
  2878. len = min_t(size_t, len, npages * PAGE_SIZE);
  2879. do {
  2880. if (*ppos >= i_size_read(inode))
  2881. break;
  2882. error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
  2883. SGP_READ);
  2884. if (error) {
  2885. if (error == -EINVAL)
  2886. error = 0;
  2887. break;
  2888. }
  2889. if (folio) {
  2890. folio_unlock(folio);
  2891. if (folio_test_hwpoison(folio) ||
  2892. (folio_test_large(folio) &&
  2893. folio_test_has_hwpoisoned(folio))) {
  2894. error = -EIO;
  2895. break;
  2896. }
  2897. }
  2898. /*
  2899. * i_size must be checked after we know the pages are Uptodate.
  2900. *
  2901. * Checking i_size after the check allows us to calculate
2902. * the correct value for "part", which means the zero-filled
  2903. * part of the page is not copied back to userspace (unless
  2904. * another truncate extends the file - this is desired though).
  2905. */
  2906. isize = i_size_read(inode);
  2907. if (unlikely(*ppos >= isize))
  2908. break;
  2909. part = min_t(loff_t, isize - *ppos, len);
  2910. if (folio) {
  2911. /*
  2912. * If users can be writing to this page using arbitrary
  2913. * virtual addresses, take care about potential aliasing
  2914. * before reading the page on the kernel side.
  2915. */
  2916. if (mapping_writably_mapped(mapping))
  2917. flush_dcache_folio(folio);
  2918. folio_mark_accessed(folio);
  2919. /*
  2920. * Ok, we have the page, and it's up-to-date, so we can
  2921. * now splice it into the pipe.
  2922. */
  2923. n = splice_folio_into_pipe(pipe, folio, *ppos, part);
  2924. folio_put(folio);
  2925. folio = NULL;
  2926. } else {
  2927. n = splice_zeropage_into_pipe(pipe, *ppos, part);
  2928. }
  2929. if (!n)
  2930. break;
  2931. len -= n;
  2932. total_spliced += n;
  2933. *ppos += n;
  2934. in->f_ra.prev_pos = *ppos;
  2935. if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
  2936. break;
  2937. cond_resched();
  2938. } while (len);
  2939. if (folio)
  2940. folio_put(folio);
  2941. file_accessed(in);
  2942. return total_spliced ? total_spliced : error;
  2943. }
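/*
 * llseek: SEEK_DATA/SEEK_HOLE are resolved directly from the page cache
 * via mapping_seek_hole_data(); all other whence values fall through to
 * generic_file_llseek_size().
 */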
  2944. static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
  2945. {
  2946. struct address_space *mapping = file->f_mapping;
  2947. struct inode *inode = mapping->host;
  2948. if (whence != SEEK_DATA && whence != SEEK_HOLE)
  2949. return generic_file_llseek_size(file, offset, whence,
  2950. MAX_LFS_FILESIZE, i_size_read(inode));
  2951. if (offset < 0)
  2952. return -ENXIO;
  2953. inode_lock(inode);
  2954. /* We're holding i_rwsem so we can access i_size directly */
  2955. offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
  2956. if (offset >= 0)
  2957. offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
  2958. inode_unlock(inode);
  2959. return offset;
  2960. }
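/*
 * shmem_fallocate - supports only preallocation and hole punching
 * (FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE); any other mode gets
 * -EOPNOTSUPP.  Typical userspace call for punching a hole (a sketch):
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
 *
 * Hole punching unmaps and truncates the affected range; preallocation
 * instantiates !uptodate folios so they can be rolled back on failure.
 */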
  2961. static long shmem_fallocate(struct file *file, int mode, loff_t offset,
  2962. loff_t len)
  2963. {
  2964. struct inode *inode = file_inode(file);
  2965. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  2966. struct shmem_inode_info *info = SHMEM_I(inode);
  2967. struct shmem_falloc shmem_falloc;
  2968. pgoff_t start, index, end, undo_fallocend;
  2969. int error;
  2970. if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
  2971. return -EOPNOTSUPP;
  2972. inode_lock(inode);
  2973. if (mode & FALLOC_FL_PUNCH_HOLE) {
  2974. struct address_space *mapping = file->f_mapping;
  2975. loff_t unmap_start = round_up(offset, PAGE_SIZE);
  2976. loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
  2977. DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
  2978. /* protected by i_rwsem */
  2979. if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
  2980. error = -EPERM;
  2981. goto out;
  2982. }
  2983. shmem_falloc.waitq = &shmem_falloc_waitq;
  2984. shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
  2985. shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
  2986. spin_lock(&inode->i_lock);
  2987. inode->i_private = &shmem_falloc;
  2988. spin_unlock(&inode->i_lock);
  2989. if ((u64)unmap_end > (u64)unmap_start)
  2990. unmap_mapping_range(mapping, unmap_start,
  2991. 1 + unmap_end - unmap_start, 0);
  2992. shmem_truncate_range(inode, offset, offset + len - 1);
  2993. /* No need to unmap again: hole-punching leaves COWed pages */
  2994. spin_lock(&inode->i_lock);
  2995. inode->i_private = NULL;
  2996. wake_up_all(&shmem_falloc_waitq);
  2997. WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
  2998. spin_unlock(&inode->i_lock);
  2999. error = 0;
  3000. goto out;
  3001. }
  3002. /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
  3003. error = inode_newsize_ok(inode, offset + len);
  3004. if (error)
  3005. goto out;
  3006. if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
  3007. error = -EPERM;
  3008. goto out;
  3009. }
  3010. start = offset >> PAGE_SHIFT;
  3011. end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
  3012. /* Try to avoid a swapstorm if len is impossible to satisfy */
  3013. if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
  3014. error = -ENOSPC;
  3015. goto out;
  3016. }
  3017. shmem_falloc.waitq = NULL;
  3018. shmem_falloc.start = start;
  3019. shmem_falloc.next = start;
  3020. shmem_falloc.nr_falloced = 0;
  3021. shmem_falloc.nr_unswapped = 0;
  3022. spin_lock(&inode->i_lock);
  3023. inode->i_private = &shmem_falloc;
  3024. spin_unlock(&inode->i_lock);
  3025. /*
  3026. * info->fallocend is only relevant when huge pages might be
  3027. * involved: to prevent split_huge_page() freeing fallocated
  3028. * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
  3029. */
  3030. undo_fallocend = info->fallocend;
  3031. if (info->fallocend < end)
  3032. info->fallocend = end;
  3033. for (index = start; index < end; ) {
  3034. struct folio *folio;
  3035. /*
  3036. * Check for fatal signal so that we abort early in OOM
  3037. * situations. We don't want to abort in case of non-fatal
  3038. * signals as large fallocate can take noticeable time and
  3039. * e.g. periodic timers may result in fallocate constantly
  3040. * restarting.
  3041. */
  3042. if (fatal_signal_pending(current))
  3043. error = -EINTR;
  3044. else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
  3045. error = -ENOMEM;
  3046. else
  3047. error = shmem_get_folio(inode, index, offset + len,
  3048. &folio, SGP_FALLOC);
  3049. if (error) {
  3050. info->fallocend = undo_fallocend;
  3051. /* Remove the !uptodate folios we added */
  3052. if (index > start) {
  3053. shmem_undo_range(inode,
  3054. (loff_t)start << PAGE_SHIFT,
  3055. ((loff_t)index << PAGE_SHIFT) - 1, true);
  3056. }
  3057. goto undone;
  3058. }
  3059. /*
  3060. * Here is a more important optimization than it appears:
  3061. * a second SGP_FALLOC on the same large folio will clear it,
  3062. * making it uptodate and un-undoable if we fail later.
  3063. */
  3064. index = folio_next_index(folio);
  3065. /* Beware 32-bit wraparound */
  3066. if (!index)
  3067. index--;
  3068. /*
  3069. * Inform shmem_writepage() how far we have reached.
  3070. * No need for lock or barrier: we have the page lock.
  3071. */
  3072. if (!folio_test_uptodate(folio))
  3073. shmem_falloc.nr_falloced += index - shmem_falloc.next;
  3074. shmem_falloc.next = index;
  3075. /*
  3076. * If !uptodate, leave it that way so that freeable folios
  3077. * can be recognized if we need to rollback on error later.
  3078. * But mark it dirty so that memory pressure will swap rather
  3079. * than free the folios we are allocating (and SGP_CACHE folios
  3080. * might still be clean: we now need to mark those dirty too).
  3081. */
  3082. folio_mark_dirty(folio);
  3083. folio_unlock(folio);
  3084. folio_put(folio);
  3085. cond_resched();
  3086. }
  3087. if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
  3088. i_size_write(inode, offset + len);
  3089. undone:
  3090. spin_lock(&inode->i_lock);
  3091. inode->i_private = NULL;
  3092. spin_unlock(&inode->i_lock);
  3093. out:
  3094. if (!error)
  3095. file_modified(file);
  3096. inode_unlock(inode);
  3097. return error;
  3098. }
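/*
 * shmem_statfs - report the configured block and inode limits; on an
 * unlimited mount those fields are left zero, as simple_statfs() would.
 */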
  3099. static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
  3100. {
  3101. struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
  3102. buf->f_type = TMPFS_MAGIC;
  3103. buf->f_bsize = PAGE_SIZE;
  3104. buf->f_namelen = NAME_MAX;
  3105. if (sbinfo->max_blocks) {
  3106. buf->f_blocks = sbinfo->max_blocks;
  3107. buf->f_bavail =
  3108. buf->f_bfree = sbinfo->max_blocks -
  3109. percpu_counter_sum(&sbinfo->used_blocks);
  3110. }
  3111. if (sbinfo->max_inodes) {
  3112. buf->f_files = sbinfo->max_inodes;
  3113. buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
  3114. }
  3115. /* else leave those fields 0 like simple_statfs */
  3116. buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
  3117. return 0;
  3118. }
  3119. /*
3120. * File creation. Allocate an inode, and we're done.
  3121. */
  3122. static int
  3123. shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
  3124. struct dentry *dentry, umode_t mode, dev_t dev)
  3125. {
  3126. struct inode *inode;
  3127. int error;
  3128. inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
  3129. if (IS_ERR(inode))
  3130. return PTR_ERR(inode);
  3131. error = simple_acl_create(dir, inode);
  3132. if (error)
  3133. goto out_iput;
  3134. error = security_inode_init_security(inode, dir, &dentry->d_name,
  3135. shmem_initxattrs, NULL);
  3136. if (error && error != -EOPNOTSUPP)
  3137. goto out_iput;
  3138. error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
  3139. if (error)
  3140. goto out_iput;
  3141. dir->i_size += BOGO_DIRENT_SIZE;
  3142. inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
  3143. inode_inc_iversion(dir);
  3144. d_instantiate(dentry, inode);
  3145. dget(dentry); /* Extra count - pin the dentry in core */
  3146. return error;
  3147. out_iput:
  3148. iput(inode);
  3149. return error;
  3150. }
  3151. static int
  3152. shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
  3153. struct file *file, umode_t mode)
  3154. {
  3155. struct inode *inode;
  3156. int error;
  3157. inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
  3158. if (IS_ERR(inode)) {
  3159. error = PTR_ERR(inode);
  3160. goto err_out;
  3161. }
  3162. error = security_inode_init_security(inode, dir, NULL,
  3163. shmem_initxattrs, NULL);
  3164. if (error && error != -EOPNOTSUPP)
  3165. goto out_iput;
  3166. error = simple_acl_create(dir, inode);
  3167. if (error)
  3168. goto out_iput;
  3169. d_tmpfile(file, inode);
  3170. err_out:
  3171. return finish_open_simple(file, error);
  3172. out_iput:
  3173. iput(inode);
  3174. return error;
  3175. }
  3176. static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
  3177. struct dentry *dentry, umode_t mode)
  3178. {
  3179. int error;
  3180. error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
  3181. if (error)
  3182. return error;
  3183. inc_nlink(dir);
  3184. return 0;
  3185. }
  3186. static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
  3187. struct dentry *dentry, umode_t mode, bool excl)
  3188. {
  3189. return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
  3190. }
  3191. /*
3192. * Link a file.
  3193. */
  3194. static int shmem_link(struct dentry *old_dentry, struct inode *dir,
  3195. struct dentry *dentry)
  3196. {
  3197. struct inode *inode = d_inode(old_dentry);
  3198. int ret = 0;
  3199. /*
  3200. * No ordinary (disk based) filesystem counts links as inodes;
  3201. * but each new link needs a new dentry, pinning lowmem, and
  3202. * tmpfs dentries cannot be pruned until they are unlinked.
  3203. * But if an O_TMPFILE file is linked into the tmpfs, the
  3204. * first link must skip that, to get the accounting right.
  3205. */
  3206. if (inode->i_nlink) {
  3207. ret = shmem_reserve_inode(inode->i_sb, NULL);
  3208. if (ret)
  3209. goto out;
  3210. }
  3211. ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
  3212. if (ret) {
  3213. if (inode->i_nlink)
  3214. shmem_free_inode(inode->i_sb, 0);
  3215. goto out;
  3216. }
  3217. dir->i_size += BOGO_DIRENT_SIZE;
  3218. inode_set_mtime_to_ts(dir,
  3219. inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
  3220. inode_inc_iversion(dir);
  3221. inc_nlink(inode);
  3222. ihold(inode); /* New dentry reference */
  3223. dget(dentry); /* Extra pinning count for the created dentry */
  3224. d_instantiate(dentry, inode);
  3225. out:
  3226. return ret;
  3227. }
  3228. static int shmem_unlink(struct inode *dir, struct dentry *dentry)
  3229. {
  3230. struct inode *inode = d_inode(dentry);
  3231. if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
  3232. shmem_free_inode(inode->i_sb, 0);
  3233. simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  3234. dir->i_size -= BOGO_DIRENT_SIZE;
  3235. inode_set_mtime_to_ts(dir,
  3236. inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
  3237. inode_inc_iversion(dir);
  3238. drop_nlink(inode);
  3239. dput(dentry); /* Undo the count from "create" - does all the work */
  3240. return 0;
  3241. }
  3242. static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
  3243. {
  3244. if (!simple_empty(dentry))
  3245. return -ENOTEMPTY;
  3246. drop_nlink(d_inode(dentry));
  3247. drop_nlink(dir);
  3248. return shmem_unlink(dir, dentry);
  3249. }
  3250. static int shmem_whiteout(struct mnt_idmap *idmap,
  3251. struct inode *old_dir, struct dentry *old_dentry)
  3252. {
  3253. struct dentry *whiteout;
  3254. int error;
  3255. whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
  3256. if (!whiteout)
  3257. return -ENOMEM;
  3258. error = shmem_mknod(idmap, old_dir, whiteout,
  3259. S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
  3260. dput(whiteout);
  3261. if (error)
  3262. return error;
  3263. /*
  3264. * Cheat and hash the whiteout while the old dentry is still in
  3265. * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
  3266. *
  3267. * d_lookup() will consistently find one of them at this point,
  3268. * not sure which one, but that isn't even important.
  3269. */
  3270. d_rehash(whiteout);
  3271. return 0;
  3272. }
  3273. /*
  3274. * The VFS layer already does all the dentry stuff for rename,
  3275. * we just have to decrement the usage count for the target if
3276. it exists so that the VFS layer correctly frees it when it
  3277. * gets overwritten.
  3278. */
  3279. static int shmem_rename2(struct mnt_idmap *idmap,
  3280. struct inode *old_dir, struct dentry *old_dentry,
  3281. struct inode *new_dir, struct dentry *new_dentry,
  3282. unsigned int flags)
  3283. {
  3284. struct inode *inode = d_inode(old_dentry);
  3285. int they_are_dirs = S_ISDIR(inode->i_mode);
  3286. int error;
  3287. if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
  3288. return -EINVAL;
  3289. if (flags & RENAME_EXCHANGE)
  3290. return simple_offset_rename_exchange(old_dir, old_dentry,
  3291. new_dir, new_dentry);
  3292. if (!simple_empty(new_dentry))
  3293. return -ENOTEMPTY;
  3294. if (flags & RENAME_WHITEOUT) {
  3295. error = shmem_whiteout(idmap, old_dir, old_dentry);
  3296. if (error)
  3297. return error;
  3298. }
  3299. error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
  3300. if (error)
  3301. return error;
  3302. if (d_really_is_positive(new_dentry)) {
  3303. (void) shmem_unlink(new_dir, new_dentry);
  3304. if (they_are_dirs) {
  3305. drop_nlink(d_inode(new_dentry));
  3306. drop_nlink(old_dir);
  3307. }
  3308. } else if (they_are_dirs) {
  3309. drop_nlink(old_dir);
  3310. inc_nlink(new_dir);
  3311. }
  3312. old_dir->i_size -= BOGO_DIRENT_SIZE;
  3313. new_dir->i_size += BOGO_DIRENT_SIZE;
  3314. simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
  3315. inode_inc_iversion(old_dir);
  3316. inode_inc_iversion(new_dir);
  3317. return 0;
  3318. }
  3319. static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
  3320. struct dentry *dentry, const char *symname)
  3321. {
  3322. int error;
  3323. int len;
  3324. struct inode *inode;
  3325. struct folio *folio;
  3326. len = strlen(symname) + 1;
  3327. if (len > PAGE_SIZE)
  3328. return -ENAMETOOLONG;
  3329. inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
  3330. VM_NORESERVE);
  3331. if (IS_ERR(inode))
  3332. return PTR_ERR(inode);
  3333. error = security_inode_init_security(inode, dir, &dentry->d_name,
  3334. shmem_initxattrs, NULL);
  3335. if (error && error != -EOPNOTSUPP)
  3336. goto out_iput;
  3337. error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
  3338. if (error)
  3339. goto out_iput;
  3340. inode->i_size = len-1;
  3341. if (len <= SHORT_SYMLINK_LEN) {
  3342. inode->i_link = kmemdup(symname, len, GFP_KERNEL);
  3343. if (!inode->i_link) {
  3344. error = -ENOMEM;
  3345. goto out_remove_offset;
  3346. }
  3347. inode->i_op = &shmem_short_symlink_operations;
  3348. } else {
  3349. inode_nohighmem(inode);
  3350. inode->i_mapping->a_ops = &shmem_aops;
  3351. error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
  3352. if (error)
  3353. goto out_remove_offset;
  3354. inode->i_op = &shmem_symlink_inode_operations;
  3355. memcpy(folio_address(folio), symname, len);
  3356. folio_mark_uptodate(folio);
  3357. folio_mark_dirty(folio);
  3358. folio_unlock(folio);
  3359. folio_put(folio);
  3360. }
  3361. dir->i_size += BOGO_DIRENT_SIZE;
  3362. inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
  3363. inode_inc_iversion(dir);
  3364. d_instantiate(dentry, inode);
  3365. dget(dentry);
  3366. return 0;
  3367. out_remove_offset:
  3368. simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  3369. out_iput:
  3370. iput(inode);
  3371. return error;
  3372. }
  3373. static void shmem_put_link(void *arg)
  3374. {
  3375. folio_mark_accessed(arg);
  3376. folio_put(arg);
  3377. }
  3378. static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
  3379. struct delayed_call *done)
  3380. {
  3381. struct folio *folio = NULL;
  3382. int error;
  3383. if (!dentry) {
  3384. folio = filemap_get_folio(inode->i_mapping, 0);
  3385. if (IS_ERR(folio))
  3386. return ERR_PTR(-ECHILD);
  3387. if (PageHWPoison(folio_page(folio, 0)) ||
  3388. !folio_test_uptodate(folio)) {
  3389. folio_put(folio);
  3390. return ERR_PTR(-ECHILD);
  3391. }
  3392. } else {
  3393. error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
  3394. if (error)
  3395. return ERR_PTR(error);
  3396. if (!folio)
  3397. return ERR_PTR(-ECHILD);
  3398. if (PageHWPoison(folio_page(folio, 0))) {
  3399. folio_unlock(folio);
  3400. folio_put(folio);
  3401. return ERR_PTR(-ECHILD);
  3402. }
  3403. folio_unlock(folio);
  3404. }
  3405. set_delayed_call(done, shmem_put_link, folio);
  3406. return folio_address(folio);
  3407. }
  3408. #ifdef CONFIG_TMPFS_XATTR
  3409. static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
  3410. {
  3411. struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
  3412. fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
  3413. return 0;
  3414. }
  3415. static int shmem_fileattr_set(struct mnt_idmap *idmap,
  3416. struct dentry *dentry, struct fileattr *fa)
  3417. {
  3418. struct inode *inode = d_inode(dentry);
  3419. struct shmem_inode_info *info = SHMEM_I(inode);
  3420. if (fileattr_has_fsx(fa))
  3421. return -EOPNOTSUPP;
  3422. if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
  3423. return -EOPNOTSUPP;
  3424. info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
  3425. (fa->flags & SHMEM_FL_USER_MODIFIABLE);
  3426. shmem_set_inode_flags(inode, info->fsflags);
  3427. inode_set_ctime_current(inode);
  3428. inode_inc_iversion(inode);
  3429. return 0;
  3430. }
  3431. /*
  3432. * Superblocks without xattr inode operations may get some security.* xattr
  3433. * support from the LSM "for free". As soon as we have any other xattrs
  3434. * like ACLs, we also need to implement the security.* handlers at
  3435. * filesystem level, though.
  3436. */
  3437. /*
  3438. * Callback for security_inode_init_security() for acquiring xattrs.
  3439. */
  3440. static int shmem_initxattrs(struct inode *inode,
  3441. const struct xattr *xattr_array, void *fs_info)
  3442. {
  3443. struct shmem_inode_info *info = SHMEM_I(inode);
  3444. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  3445. const struct xattr *xattr;
  3446. struct simple_xattr *new_xattr;
  3447. size_t ispace = 0;
  3448. size_t len;
  3449. if (sbinfo->max_inodes) {
  3450. for (xattr = xattr_array; xattr->name != NULL; xattr++) {
  3451. ispace += simple_xattr_space(xattr->name,
  3452. xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
  3453. }
  3454. if (ispace) {
  3455. raw_spin_lock(&sbinfo->stat_lock);
  3456. if (sbinfo->free_ispace < ispace)
  3457. ispace = 0;
  3458. else
  3459. sbinfo->free_ispace -= ispace;
  3460. raw_spin_unlock(&sbinfo->stat_lock);
  3461. if (!ispace)
  3462. return -ENOSPC;
  3463. }
  3464. }
  3465. for (xattr = xattr_array; xattr->name != NULL; xattr++) {
  3466. new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
  3467. if (!new_xattr)
  3468. break;
  3469. len = strlen(xattr->name) + 1;
  3470. new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
  3471. GFP_KERNEL_ACCOUNT);
  3472. if (!new_xattr->name) {
  3473. kvfree(new_xattr);
  3474. break;
  3475. }
  3476. memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
  3477. XATTR_SECURITY_PREFIX_LEN);
  3478. memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
  3479. xattr->name, len);
  3480. simple_xattr_add(&info->xattrs, new_xattr);
  3481. }
  3482. if (xattr->name != NULL) {
  3483. if (ispace) {
  3484. raw_spin_lock(&sbinfo->stat_lock);
  3485. sbinfo->free_ispace += ispace;
  3486. raw_spin_unlock(&sbinfo->stat_lock);
  3487. }
  3488. simple_xattrs_free(&info->xattrs, NULL);
  3489. return -ENOMEM;
  3490. }
  3491. return 0;
  3492. }
  3493. static int shmem_xattr_handler_get(const struct xattr_handler *handler,
  3494. struct dentry *unused, struct inode *inode,
  3495. const char *name, void *buffer, size_t size)
  3496. {
  3497. struct shmem_inode_info *info = SHMEM_I(inode);
  3498. name = xattr_full_name(handler, name);
  3499. return simple_xattr_get(&info->xattrs, name, buffer, size);
  3500. }
  3501. static int shmem_xattr_handler_set(const struct xattr_handler *handler,
  3502. struct mnt_idmap *idmap,
  3503. struct dentry *unused, struct inode *inode,
  3504. const char *name, const void *value,
  3505. size_t size, int flags)
  3506. {
  3507. struct shmem_inode_info *info = SHMEM_I(inode);
  3508. struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
  3509. struct simple_xattr *old_xattr;
  3510. size_t ispace = 0;
  3511. name = xattr_full_name(handler, name);
  3512. if (value && sbinfo->max_inodes) {
  3513. ispace = simple_xattr_space(name, size);
  3514. raw_spin_lock(&sbinfo->stat_lock);
  3515. if (sbinfo->free_ispace < ispace)
  3516. ispace = 0;
  3517. else
  3518. sbinfo->free_ispace -= ispace;
  3519. raw_spin_unlock(&sbinfo->stat_lock);
  3520. if (!ispace)
  3521. return -ENOSPC;
  3522. }
  3523. old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
  3524. if (!IS_ERR(old_xattr)) {
  3525. ispace = 0;
  3526. if (old_xattr && sbinfo->max_inodes)
  3527. ispace = simple_xattr_space(old_xattr->name,
  3528. old_xattr->size);
  3529. simple_xattr_free(old_xattr);
  3530. old_xattr = NULL;
  3531. inode_set_ctime_current(inode);
  3532. inode_inc_iversion(inode);
  3533. }
  3534. if (ispace) {
  3535. raw_spin_lock(&sbinfo->stat_lock);
  3536. sbinfo->free_ispace += ispace;
  3537. raw_spin_unlock(&sbinfo->stat_lock);
  3538. }
  3539. return PTR_ERR(old_xattr);
  3540. }
  3541. static const struct xattr_handler shmem_security_xattr_handler = {
  3542. .prefix = XATTR_SECURITY_PREFIX,
  3543. .get = shmem_xattr_handler_get,
  3544. .set = shmem_xattr_handler_set,
  3545. };
  3546. static const struct xattr_handler shmem_trusted_xattr_handler = {
  3547. .prefix = XATTR_TRUSTED_PREFIX,
  3548. .get = shmem_xattr_handler_get,
  3549. .set = shmem_xattr_handler_set,
  3550. };
  3551. static const struct xattr_handler shmem_user_xattr_handler = {
  3552. .prefix = XATTR_USER_PREFIX,
  3553. .get = shmem_xattr_handler_get,
  3554. .set = shmem_xattr_handler_set,
  3555. };
  3556. static const struct xattr_handler * const shmem_xattr_handlers[] = {
  3557. &shmem_security_xattr_handler,
  3558. &shmem_trusted_xattr_handler,
  3559. &shmem_user_xattr_handler,
  3560. NULL
  3561. };
  3562. static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
  3563. {
  3564. struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
  3565. return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
  3566. }
  3567. #endif /* CONFIG_TMPFS_XATTR */
  3568. static const struct inode_operations shmem_short_symlink_operations = {
  3569. .getattr = shmem_getattr,
  3570. .setattr = shmem_setattr,
  3571. .get_link = simple_get_link,
  3572. #ifdef CONFIG_TMPFS_XATTR
  3573. .listxattr = shmem_listxattr,
  3574. #endif
  3575. };
  3576. static const struct inode_operations shmem_symlink_inode_operations = {
  3577. .getattr = shmem_getattr,
  3578. .setattr = shmem_setattr,
  3579. .get_link = shmem_get_link,
  3580. #ifdef CONFIG_TMPFS_XATTR
  3581. .listxattr = shmem_listxattr,
  3582. #endif
  3583. };
  3584. static struct dentry *shmem_get_parent(struct dentry *child)
  3585. {
  3586. return ERR_PTR(-ESTALE);
  3587. }
  3588. static int shmem_match(struct inode *ino, void *vfh)
  3589. {
  3590. __u32 *fh = vfh;
  3591. __u64 inum = fh[2];
  3592. inum = (inum << 32) | fh[1];
  3593. return ino->i_ino == inum && fh[0] == ino->i_generation;
  3594. }
  3595. /* Find any alias of inode, but prefer a hashed alias */
  3596. static struct dentry *shmem_find_alias(struct inode *inode)
  3597. {
  3598. struct dentry *alias = d_find_alias(inode);
  3599. return alias ?: d_find_any_alias(inode);
  3600. }
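/*
 * NFS export: the file handle is three 32-bit words - fh[0] is the inode
 * generation, fh[1] and fh[2] the low and high halves of i_ino - matched
 * by shmem_match() against inodes hashed at i_ino + i_generation.
 */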
  3601. static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
  3602. struct fid *fid, int fh_len, int fh_type)
  3603. {
  3604. struct inode *inode;
  3605. struct dentry *dentry = NULL;
  3606. u64 inum;
  3607. if (fh_len < 3)
  3608. return NULL;
  3609. inum = fid->raw[2];
  3610. inum = (inum << 32) | fid->raw[1];
  3611. inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
  3612. shmem_match, fid->raw);
  3613. if (inode) {
  3614. dentry = shmem_find_alias(inode);
  3615. iput(inode);
  3616. }
  3617. return dentry;
  3618. }
  3619. static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
  3620. struct inode *parent)
  3621. {
  3622. if (*len < 3) {
  3623. *len = 3;
  3624. return FILEID_INVALID;
  3625. }
  3626. if (inode_unhashed(inode)) {
  3627. /* Unfortunately insert_inode_hash is not idempotent,
  3628. * so as we hash inodes here rather than at creation
  3629. * time, we need a lock to ensure we only try
  3630. * to do it once
  3631. */
  3632. static DEFINE_SPINLOCK(lock);
  3633. spin_lock(&lock);
  3634. if (inode_unhashed(inode))
  3635. __insert_inode_hash(inode,
  3636. inode->i_ino + inode->i_generation);
  3637. spin_unlock(&lock);
  3638. }
  3639. fh[0] = inode->i_generation;
  3640. fh[1] = inode->i_ino;
  3641. fh[2] = ((__u64)inode->i_ino) >> 32;
  3642. *len = 3;
  3643. return 1;
  3644. }
  3645. static const struct export_operations shmem_export_ops = {
  3646. .get_parent = shmem_get_parent,
  3647. .encode_fh = shmem_encode_fh,
  3648. .fh_to_dentry = shmem_fh_to_dentry,
  3649. };
  3650. enum shmem_param {
  3651. Opt_gid,
  3652. Opt_huge,
  3653. Opt_mode,
  3654. Opt_mpol,
  3655. Opt_nr_blocks,
  3656. Opt_nr_inodes,
  3657. Opt_size,
  3658. Opt_uid,
  3659. Opt_inode32,
  3660. Opt_inode64,
  3661. Opt_noswap,
  3662. Opt_quota,
  3663. Opt_usrquota,
  3664. Opt_grpquota,
  3665. Opt_usrquota_block_hardlimit,
  3666. Opt_usrquota_inode_hardlimit,
  3667. Opt_grpquota_block_hardlimit,
  3668. Opt_grpquota_inode_hardlimit,
  3669. };
  3670. static const struct constant_table shmem_param_enums_huge[] = {
  3671. {"never", SHMEM_HUGE_NEVER },
  3672. {"always", SHMEM_HUGE_ALWAYS },
  3673. {"within_size", SHMEM_HUGE_WITHIN_SIZE },
  3674. {"advise", SHMEM_HUGE_ADVISE },
  3675. {}
  3676. };
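/*
 * Mount parameter table.  An illustrative invocation (not exhaustive):
 *
 *	mount -t tmpfs -o size=50%,nr_inodes=1m,mode=1777,huge=within_size \
 *		tmpfs /mnt
 *
 * "size" accepts k/m/g suffixes or a percentage of RAM, "huge" takes one
 * of the enum values above, and the quota options are only parsed when
 * CONFIG_TMPFS_QUOTA is enabled.
 */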
  3677. const struct fs_parameter_spec shmem_fs_parameters[] = {
  3678. fsparam_gid ("gid", Opt_gid),
  3679. fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge),
  3680. fsparam_u32oct("mode", Opt_mode),
  3681. fsparam_string("mpol", Opt_mpol),
  3682. fsparam_string("nr_blocks", Opt_nr_blocks),
  3683. fsparam_string("nr_inodes", Opt_nr_inodes),
  3684. fsparam_string("size", Opt_size),
  3685. fsparam_uid ("uid", Opt_uid),
  3686. fsparam_flag ("inode32", Opt_inode32),
  3687. fsparam_flag ("inode64", Opt_inode64),
  3688. fsparam_flag ("noswap", Opt_noswap),
  3689. #ifdef CONFIG_TMPFS_QUOTA
  3690. fsparam_flag ("quota", Opt_quota),
  3691. fsparam_flag ("usrquota", Opt_usrquota),
  3692. fsparam_flag ("grpquota", Opt_grpquota),
  3693. fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
  3694. fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
  3695. fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
  3696. fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
  3697. #endif
  3698. {}
  3699. };
  3700. static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
  3701. {
  3702. struct shmem_options *ctx = fc->fs_private;
  3703. struct fs_parse_result result;
  3704. unsigned long long size;
  3705. char *rest;
  3706. int opt;
  3707. kuid_t kuid;
  3708. kgid_t kgid;
  3709. opt = fs_parse(fc, shmem_fs_parameters, param, &result);
  3710. if (opt < 0)
  3711. return opt;
  3712. switch (opt) {
  3713. case Opt_size:
  3714. size = memparse(param->string, &rest);
  3715. if (*rest == '%') {
  3716. size <<= PAGE_SHIFT;
  3717. size *= totalram_pages();
  3718. do_div(size, 100);
  3719. rest++;
  3720. }
  3721. if (*rest)
  3722. goto bad_value;
  3723. ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
  3724. ctx->seen |= SHMEM_SEEN_BLOCKS;
  3725. break;
  3726. case Opt_nr_blocks:
  3727. ctx->blocks = memparse(param->string, &rest);
  3728. if (*rest || ctx->blocks > LONG_MAX)
  3729. goto bad_value;
  3730. ctx->seen |= SHMEM_SEEN_BLOCKS;
  3731. break;
  3732. case Opt_nr_inodes:
  3733. ctx->inodes = memparse(param->string, &rest);
  3734. if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
  3735. goto bad_value;
  3736. ctx->seen |= SHMEM_SEEN_INODES;
  3737. break;
  3738. case Opt_mode:
  3739. ctx->mode = result.uint_32 & 07777;
  3740. break;
  3741. case Opt_uid:
  3742. kuid = result.uid;
  3743. /*
  3744. * The requested uid must be representable in the
  3745. * filesystem's idmapping.
  3746. */
  3747. if (!kuid_has_mapping(fc->user_ns, kuid))
  3748. goto bad_value;
  3749. ctx->uid = kuid;
  3750. break;
  3751. case Opt_gid:
  3752. kgid = result.gid;
  3753. /*
  3754. * The requested gid must be representable in the
  3755. * filesystem's idmapping.
  3756. */
  3757. if (!kgid_has_mapping(fc->user_ns, kgid))
  3758. goto bad_value;
  3759. ctx->gid = kgid;
  3760. break;
  3761. case Opt_huge:
  3762. ctx->huge = result.uint_32;
  3763. if (ctx->huge != SHMEM_HUGE_NEVER &&
  3764. !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
  3765. has_transparent_hugepage()))
  3766. goto unsupported_parameter;
  3767. ctx->seen |= SHMEM_SEEN_HUGE;
  3768. break;
  3769. case Opt_mpol:
  3770. if (IS_ENABLED(CONFIG_NUMA)) {
  3771. mpol_put(ctx->mpol);
  3772. ctx->mpol = NULL;
  3773. if (mpol_parse_str(param->string, &ctx->mpol))
  3774. goto bad_value;
  3775. break;
  3776. }
  3777. goto unsupported_parameter;
  3778. case Opt_inode32:
  3779. ctx->full_inums = false;
  3780. ctx->seen |= SHMEM_SEEN_INUMS;
  3781. break;
  3782. case Opt_inode64:
  3783. if (sizeof(ino_t) < 8) {
  3784. return invalfc(fc,
  3785. "Cannot use inode64 with <64bit inums in kernel\n");
  3786. }
  3787. ctx->full_inums = true;
  3788. ctx->seen |= SHMEM_SEEN_INUMS;
  3789. break;
  3790. case Opt_noswap:
  3791. if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
  3792. return invalfc(fc,
  3793. "Turning off swap in unprivileged tmpfs mounts unsupported");
  3794. }
  3795. ctx->noswap = true;
  3796. ctx->seen |= SHMEM_SEEN_NOSWAP;
  3797. break;
  3798. case Opt_quota:
  3799. if (fc->user_ns != &init_user_ns)
  3800. return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
  3801. ctx->seen |= SHMEM_SEEN_QUOTA;
  3802. ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
  3803. break;
  3804. case Opt_usrquota:
  3805. if (fc->user_ns != &init_user_ns)
  3806. return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
  3807. ctx->seen |= SHMEM_SEEN_QUOTA;
  3808. ctx->quota_types |= QTYPE_MASK_USR;
  3809. break;
  3810. case Opt_grpquota:
  3811. if (fc->user_ns != &init_user_ns)
  3812. return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
  3813. ctx->seen |= SHMEM_SEEN_QUOTA;
  3814. ctx->quota_types |= QTYPE_MASK_GRP;
  3815. break;
  3816. case Opt_usrquota_block_hardlimit:
  3817. size = memparse(param->string, &rest);
  3818. if (*rest || !size)
  3819. goto bad_value;
  3820. if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
  3821. return invalfc(fc,
  3822. "User quota block hardlimit too large.");
  3823. ctx->qlimits.usrquota_bhardlimit = size;
  3824. break;
  3825. case Opt_grpquota_block_hardlimit:
  3826. size = memparse(param->string, &rest);
  3827. if (*rest || !size)
  3828. goto bad_value;
  3829. if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
  3830. return invalfc(fc,
  3831. "Group quota block hardlimit too large.");
  3832. ctx->qlimits.grpquota_bhardlimit = size;
  3833. break;
  3834. case Opt_usrquota_inode_hardlimit:
  3835. size = memparse(param->string, &rest);
  3836. if (*rest || !size)
  3837. goto bad_value;
  3838. if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
  3839. return invalfc(fc,
  3840. "User quota inode hardlimit too large.");
  3841. ctx->qlimits.usrquota_ihardlimit = size;
  3842. break;
  3843. case Opt_grpquota_inode_hardlimit:
  3844. size = memparse(param->string, &rest);
  3845. if (*rest || !size)
  3846. goto bad_value;
  3847. if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
  3848. return invalfc(fc,
  3849. "Group quota inode hardlimit too large.");
  3850. ctx->qlimits.grpquota_ihardlimit = size;
  3851. break;
  3852. }
  3853. return 0;
  3854. unsupported_parameter:
  3855. return invalfc(fc, "Unsupported parameter '%s'", param->key);
  3856. bad_value:
  3857. return invalfc(fc, "Bad value for '%s'", param->key);
  3858. }
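/*
 * Monolithic option parsing for mount(2)-style option strings.  Splitting
 * on ',' must be careful because an mpol nodelist contains commas too,
 * e.g. "size=1g,mpol=interleave:0,2": the digit test below keeps the
 * "0,2" nodelist attached to the mpol= value.
 */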
  3859. static int shmem_parse_options(struct fs_context *fc, void *data)
  3860. {
  3861. char *options = data;
  3862. if (options) {
  3863. int err = security_sb_eat_lsm_opts(options, &fc->security);
  3864. if (err)
  3865. return err;
  3866. }
  3867. while (options != NULL) {
  3868. char *this_char = options;
  3869. for (;;) {
  3870. /*
  3871. * NUL-terminate this option: unfortunately,
  3872. * mount options form a comma-separated list,
  3873. * but mpol's nodelist may also contain commas.
  3874. */
  3875. options = strchr(options, ',');
  3876. if (options == NULL)
  3877. break;
  3878. options++;
  3879. if (!isdigit(*options)) {
  3880. options[-1] = '\0';
  3881. break;
  3882. }
  3883. }
  3884. if (*this_char) {
  3885. char *value = strchr(this_char, '=');
  3886. size_t len = 0;
  3887. int err;
  3888. if (value) {
  3889. *value++ = '\0';
  3890. len = strlen(value);
  3891. }
  3892. err = vfs_parse_fs_string(fc, this_char, value, len);
  3893. if (err < 0)
  3894. return err;
  3895. }
  3896. }
  3897. return 0;
  3898. }
  3899. /*
  3900. * Reconfigure a shmem filesystem.
  3901. */
  3902. static int shmem_reconfigure(struct fs_context *fc)
  3903. {
  3904. struct shmem_options *ctx = fc->fs_private;
  3905. struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
  3906. unsigned long used_isp;
  3907. struct mempolicy *mpol = NULL;
  3908. const char *err;
  3909. raw_spin_lock(&sbinfo->stat_lock);
  3910. used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
  3911. if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
  3912. if (!sbinfo->max_blocks) {
  3913. err = "Cannot retroactively limit size";
  3914. goto out;
  3915. }
  3916. if (percpu_counter_compare(&sbinfo->used_blocks,
  3917. ctx->blocks) > 0) {
  3918. err = "Too small a size for current use";
  3919. goto out;
  3920. }
  3921. }
  3922. if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
  3923. if (!sbinfo->max_inodes) {
  3924. err = "Cannot retroactively limit inodes";
  3925. goto out;
  3926. }
  3927. if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
  3928. err = "Too few inodes for current use";
  3929. goto out;
  3930. }
  3931. }
  3932. if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
  3933. sbinfo->next_ino > UINT_MAX) {
  3934. err = "Current inum too high to switch to 32-bit inums";
  3935. goto out;
  3936. }
  3937. if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
  3938. err = "Cannot disable swap on remount";
  3939. goto out;
  3940. }
  3941. if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
  3942. err = "Cannot enable swap on remount if it was disabled on first mount";
  3943. goto out;
  3944. }
  3945. if (ctx->seen & SHMEM_SEEN_QUOTA &&
  3946. !sb_any_quota_loaded(fc->root->d_sb)) {
  3947. err = "Cannot enable quota on remount";
  3948. goto out;
  3949. }
  3950. #ifdef CONFIG_TMPFS_QUOTA
  3951. #define CHANGED_LIMIT(name) \
  3952. (ctx->qlimits.name## hardlimit && \
  3953. (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
  3954. if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
  3955. CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
  3956. err = "Cannot change global quota limit on remount";
  3957. goto out;
  3958. }
  3959. #endif /* CONFIG_TMPFS_QUOTA */
  3960. if (ctx->seen & SHMEM_SEEN_HUGE)
  3961. sbinfo->huge = ctx->huge;
  3962. if (ctx->seen & SHMEM_SEEN_INUMS)
  3963. sbinfo->full_inums = ctx->full_inums;
  3964. if (ctx->seen & SHMEM_SEEN_BLOCKS)
  3965. sbinfo->max_blocks = ctx->blocks;
  3966. if (ctx->seen & SHMEM_SEEN_INODES) {
  3967. sbinfo->max_inodes = ctx->inodes;
  3968. sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
  3969. }
  3970. /*
  3971. * Preserve previous mempolicy unless mpol remount option was specified.
  3972. */
  3973. if (ctx->mpol) {
  3974. mpol = sbinfo->mpol;
  3975. sbinfo->mpol = ctx->mpol; /* transfers initial ref */
  3976. ctx->mpol = NULL;
  3977. }
  3978. if (ctx->noswap)
  3979. sbinfo->noswap = true;
  3980. raw_spin_unlock(&sbinfo->stat_lock);
  3981. mpol_put(mpol);
  3982. return 0;
  3983. out:
  3984. raw_spin_unlock(&sbinfo->stat_lock);
  3985. return invalfc(fc, "%s", err);
  3986. }
  3987. static int shmem_show_options(struct seq_file *seq, struct dentry *root)
  3988. {
  3989. struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
  3990. struct mempolicy *mpol;
  3991. if (sbinfo->max_blocks != shmem_default_max_blocks())
  3992. seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
  3993. if (sbinfo->max_inodes != shmem_default_max_inodes())
  3994. seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
  3995. if (sbinfo->mode != (0777 | S_ISVTX))
  3996. seq_printf(seq, ",mode=%03ho", sbinfo->mode);
  3997. if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
  3998. seq_printf(seq, ",uid=%u",
  3999. from_kuid_munged(&init_user_ns, sbinfo->uid));
  4000. if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
  4001. seq_printf(seq, ",gid=%u",
  4002. from_kgid_munged(&init_user_ns, sbinfo->gid));
  4003. /*
  4004. * Showing inode{64,32} might be useful even if it's the system default,
  4005. * since then people don't have to resort to checking both here and
  4006. * /proc/config.gz to confirm 64-bit inums were successfully applied
  4007. * (which may not even exist if IKCONFIG_PROC isn't enabled).
  4008. *
  4009. * We hide it when inode64 isn't the default and we are using 32-bit
  4010. * inodes, since that probably just means the feature isn't even under
  4011. * consideration.
  4012. *
  4013. * As such:
  4014. *
4015. *                    +-----------------+-----------------+
4016. *                    | TMPFS_INODE64=y | TMPFS_INODE64=n |
4017. * +------------------+-----------------+-----------------+
4018. * | full_inums=true  | show            | show            |
4019. * | full_inums=false | show            | hide            |
4020. * +------------------+-----------------+-----------------+
  4021. *
  4022. */
  4023. if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
  4024. seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
  4025. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  4026. /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
  4027. if (sbinfo->huge)
  4028. seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
  4029. #endif
  4030. mpol = shmem_get_sbmpol(sbinfo);
  4031. shmem_show_mpol(seq, mpol);
  4032. mpol_put(mpol);
  4033. if (sbinfo->noswap)
  4034. seq_printf(seq, ",noswap");
  4035. #ifdef CONFIG_TMPFS_QUOTA
  4036. if (sb_has_quota_active(root->d_sb, USRQUOTA))
  4037. seq_printf(seq, ",usrquota");
  4038. if (sb_has_quota_active(root->d_sb, GRPQUOTA))
  4039. seq_printf(seq, ",grpquota");
  4040. if (sbinfo->qlimits.usrquota_bhardlimit)
  4041. seq_printf(seq, ",usrquota_block_hardlimit=%lld",
  4042. sbinfo->qlimits.usrquota_bhardlimit);
  4043. if (sbinfo->qlimits.grpquota_bhardlimit)
  4044. seq_printf(seq, ",grpquota_block_hardlimit=%lld",
  4045. sbinfo->qlimits.grpquota_bhardlimit);
  4046. if (sbinfo->qlimits.usrquota_ihardlimit)
  4047. seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
  4048. sbinfo->qlimits.usrquota_ihardlimit);
  4049. if (sbinfo->qlimits.grpquota_ihardlimit)
  4050. seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
  4051. sbinfo->qlimits.grpquota_ihardlimit);
  4052. #endif
  4053. return 0;
  4054. }
  4055. #endif /* CONFIG_TMPFS */
  4056. static void shmem_put_super(struct super_block *sb)
  4057. {
  4058. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  4059. #ifdef CONFIG_TMPFS_QUOTA
  4060. shmem_disable_quotas(sb);
  4061. #endif
  4062. free_percpu(sbinfo->ino_batch);
  4063. percpu_counter_destroy(&sbinfo->used_blocks);
  4064. mpol_put(sbinfo->mpol);
  4065. kfree(sbinfo);
  4066. sb->s_fs_info = NULL;
  4067. }
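/*
 * shmem_fill_super - build a tmpfs superblock from the parsed options.
 * User mounts default to half of RAM and one inode per page of lowmem;
 * the internal kernel mount (SB_KERNMOUNT) is left unlimited and gets a
 * per-cpu inode batch instead.
 */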
  4068. static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
  4069. {
  4070. struct shmem_options *ctx = fc->fs_private;
  4071. struct inode *inode;
  4072. struct shmem_sb_info *sbinfo;
  4073. int error = -ENOMEM;
  4074. /* Round up to L1_CACHE_BYTES to resist false sharing */
  4075. sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
  4076. L1_CACHE_BYTES), GFP_KERNEL);
  4077. if (!sbinfo)
  4078. return error;
  4079. sb->s_fs_info = sbinfo;
  4080. #ifdef CONFIG_TMPFS
  4081. /*
4082. * By default we only allow half of the physical RAM per
  4083. * tmpfs instance, limiting inodes to one per page of lowmem;
  4084. * but the internal instance is left unlimited.
  4085. */
  4086. if (!(sb->s_flags & SB_KERNMOUNT)) {
  4087. if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
  4088. ctx->blocks = shmem_default_max_blocks();
  4089. if (!(ctx->seen & SHMEM_SEEN_INODES))
  4090. ctx->inodes = shmem_default_max_inodes();
  4091. if (!(ctx->seen & SHMEM_SEEN_INUMS))
  4092. ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
  4093. sbinfo->noswap = ctx->noswap;
  4094. } else {
  4095. sb->s_flags |= SB_NOUSER;
  4096. }
  4097. sb->s_export_op = &shmem_export_ops;
  4098. sb->s_flags |= SB_NOSEC | SB_I_VERSION;
  4099. #else
  4100. sb->s_flags |= SB_NOUSER;
  4101. #endif
  4102. sbinfo->max_blocks = ctx->blocks;
  4103. sbinfo->max_inodes = ctx->inodes;
  4104. sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
  4105. if (sb->s_flags & SB_KERNMOUNT) {
  4106. sbinfo->ino_batch = alloc_percpu(ino_t);
  4107. if (!sbinfo->ino_batch)
  4108. goto failed;
  4109. }
  4110. sbinfo->uid = ctx->uid;
  4111. sbinfo->gid = ctx->gid;
  4112. sbinfo->full_inums = ctx->full_inums;
  4113. sbinfo->mode = ctx->mode;
  4114. sbinfo->huge = ctx->huge;
  4115. sbinfo->mpol = ctx->mpol;
  4116. ctx->mpol = NULL;
  4117. raw_spin_lock_init(&sbinfo->stat_lock);
  4118. if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
  4119. goto failed;
  4120. spin_lock_init(&sbinfo->shrinklist_lock);
  4121. INIT_LIST_HEAD(&sbinfo->shrinklist);
  4122. sb->s_maxbytes = MAX_LFS_FILESIZE;
  4123. sb->s_blocksize = PAGE_SIZE;
  4124. sb->s_blocksize_bits = PAGE_SHIFT;
  4125. sb->s_magic = TMPFS_MAGIC;
  4126. sb->s_op = &shmem_ops;
  4127. sb->s_time_gran = 1;
  4128. #ifdef CONFIG_TMPFS_XATTR
  4129. sb->s_xattr = shmem_xattr_handlers;
  4130. #endif
  4131. #ifdef CONFIG_TMPFS_POSIX_ACL
  4132. sb->s_flags |= SB_POSIXACL;
  4133. #endif
  4134. uuid_t uuid;
  4135. uuid_gen(&uuid);
  4136. super_set_uuid(sb, uuid.b, sizeof(uuid));
  4137. #ifdef CONFIG_TMPFS_QUOTA
  4138. if (ctx->seen & SHMEM_SEEN_QUOTA) {
  4139. sb->dq_op = &shmem_quota_operations;
  4140. sb->s_qcop = &dquot_quotactl_sysfile_ops;
  4141. sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
  4142. /* Copy the default limits from ctx into sbinfo */
  4143. memcpy(&sbinfo->qlimits, &ctx->qlimits,
  4144. sizeof(struct shmem_quota_limits));
  4145. if (shmem_enable_quotas(sb, ctx->quota_types))
  4146. goto failed;
  4147. }
  4148. #endif /* CONFIG_TMPFS_QUOTA */
  4149. inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
  4150. S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
  4151. if (IS_ERR(inode)) {
  4152. error = PTR_ERR(inode);
  4153. goto failed;
  4154. }
  4155. inode->i_uid = sbinfo->uid;
  4156. inode->i_gid = sbinfo->gid;
  4157. sb->s_root = d_make_root(inode);
  4158. if (!sb->s_root)
  4159. goto failed;
  4160. return 0;
  4161. failed:
  4162. shmem_put_super(sb);
  4163. return error;
  4164. }
static int shmem_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, shmem_fill_super);
}

static void shmem_free_fc(struct fs_context *fc)
{
	struct shmem_options *ctx = fc->fs_private;

	if (ctx) {
		mpol_put(ctx->mpol);
		kfree(ctx);
	}
}

static const struct fs_context_operations shmem_fs_context_ops = {
	.free			= shmem_free_fc,
	.get_tree		= shmem_get_tree,
#ifdef CONFIG_TMPFS
	.parse_monolithic	= shmem_parse_options,
	.parse_param		= shmem_parse_one,
	.reconfigure		= shmem_reconfigure,
#endif
};

static struct kmem_cache *shmem_inode_cachep __ro_after_init;

static struct inode *shmem_alloc_inode(struct super_block *sb)
{
	struct shmem_inode_info *info;

	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
	if (!info)
		return NULL;
	return &info->vfs_inode;
}

static void shmem_free_in_core_inode(struct inode *inode)
{
	if (S_ISLNK(inode->i_mode))
		kfree(inode->i_link);
	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}

static void shmem_destroy_inode(struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
	if (S_ISDIR(inode->i_mode))
		simple_offset_destroy(shmem_get_offset_ctx(inode));
}

static void shmem_init_inode(void *foo)
{
	struct shmem_inode_info *info = foo;

	inode_init_once(&info->vfs_inode);
}
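
/*
 * The inode cache is created with SLAB_PANIC: failing to set it up during
 * boot is fatal, so shmem_init_inodecache() needs no error path.
 */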
static void __init shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}

static void __init shmem_destroy_inodecache(void)
{
	kmem_cache_destroy(shmem_inode_cachep);
}

/* Keep the page in page cache instead of truncating it */
static int shmem_error_remove_folio(struct address_space *mapping,
				    struct folio *folio)
{
	return 0;
}
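
/*
 * tmpfs folios have no backing store: dirtying needs no writeback tracking,
 * and ->writepage only ever pushes folios out to swap.
 */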
static const struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.dirty_folio	= noop_dirty_folio,
#ifdef CONFIG_TMPFS
	.write_begin	= shmem_write_begin,
	.write_end	= shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
	.migrate_folio	= migrate_folio,
#endif
	.error_remove_folio = shmem_error_remove_folio,
};

static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.open		= shmem_file_open,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= shmem_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= shmem_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};

static const struct inode_operations shmem_inode_operations = {
	.getattr	= shmem_getattr,
	.setattr	= shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.set_acl	= simple_set_acl,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
};

static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.getattr	= shmem_getattr,
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename2,
	.tmpfile	= shmem_tmpfile,
	.get_offset_ctx	= shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
	.fileattr_get	= shmem_fileattr_get,
	.fileattr_set	= shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};

static const struct inode_operations shmem_special_inode_operations = {
	.getattr	= shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
	.listxattr	= shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	.setattr	= shmem_setattr,
	.set_acl	= simple_set_acl,
#endif
};

static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.free_inode	= shmem_free_in_core_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.show_options	= shmem_show_options,
#endif
#ifdef CONFIG_TMPFS_QUOTA
	.get_dquots	= shmem_get_dquots,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};

static const struct vm_operations_struct shmem_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy	= shmem_set_policy,
	.get_policy	= shmem_get_policy,
#endif
};

static const struct vm_operations_struct shmem_anon_vm_ops = {
	.fault		= shmem_fault,
	.map_pages	= filemap_map_pages,
#ifdef CONFIG_NUMA
	.set_policy	= shmem_set_policy,
	.get_policy	= shmem_get_policy,
#endif
};
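
/*
 * Entry point for mounting "tmpfs": allocate the option-parsing context and
 * seed it with the defaults (mode 1777, current fsuid/fsgid) that apply when
 * no mount options override them.
 */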
int shmem_init_fs_context(struct fs_context *fc)
{
	struct shmem_options *ctx;

	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->mode = 0777 | S_ISVTX;
	ctx->uid = current_fsuid();
	ctx->gid = current_fsgid();

	fc->fs_private = ctx;
	fc->ops = &shmem_fs_context_ops;
	return 0;
}

static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
	.parameters	= shmem_fs_parameters,
#endif
	.kill_sb	= kill_litter_super,
	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
};
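
/*
 * Called once during early boot: register tmpfs, create the internal kernel
 * mount (shm_mnt) that backs shmem_file_setup() and SysV shared memory, and
 * pick the initial huge page policy for that mount.
 */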
void __init shmem_init(void)
{
	int error;

	shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
	register_quota_format(&shmem_quota_format);
#endif

	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other multi-size THPs.
	 */
	huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
	return;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
	unregister_quota_format(&shmem_quota_format);
#endif
	shmem_destroy_inodecache();
	shm_mnt = ERR_PTR(error);
}
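
/*
 * /sys/kernel/mm/transparent_hugepage/shmem_enabled controls the huge page
 * policy of the internal shmem mount; the special "deny" and "force" values
 * additionally override the huge= option of every tmpfs mount.
 */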
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	static const int values[] = {
		SHMEM_HUGE_ALWAYS,
		SHMEM_HUGE_WITHIN_SIZE,
		SHMEM_HUGE_ADVISE,
		SHMEM_HUGE_NEVER,
		SHMEM_HUGE_DENY,
		SHMEM_HUGE_FORCE,
	};
	int len = 0;
	int i;

	for (i = 0; i < ARRAY_SIZE(values); i++) {
		len += sysfs_emit_at(buf, len,
				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
				     i ? " " : "", shmem_format_huge(values[i]));
	}
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}

static ssize_t shmem_enabled_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	char tmp[16];
	int huge;

	if (count + 1 > sizeof(tmp))
		return -EINVAL;
	memcpy(tmp, buf, count);
	tmp[count] = '\0';
	if (count && tmp[count - 1] == '\n')
		tmp[count - 1] = '\0';

	huge = shmem_parse_huge(tmp);
	if (huge == -EINVAL)
		return -EINVAL;
	if (!has_transparent_hugepage() &&
			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
		return -EINVAL;

	/* Do not override huge allocation policy with non-PMD sized mTHP */
	if (huge == SHMEM_HUGE_FORCE &&
	    huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
		return -EINVAL;

	shmem_huge = huge;
	if (shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	return count;
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
static DEFINE_SPINLOCK(huge_shmem_orders_lock);
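
/*
 * Per-size counterpart of the global knob, exposed as
 * /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled;
 * each order can be set to always, inherit, within_size, advise or never.
 */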
static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_shmem_orders_always))
		output = "[always] inherit within_size advise never";
	else if (test_bit(order, &huge_shmem_orders_inherit))
		output = "always [inherit] within_size advise never";
	else if (test_bit(order, &huge_shmem_orders_within_size))
		output = "always inherit [within_size] advise never";
	else if (test_bit(order, &huge_shmem_orders_madvise))
		output = "always inherit within_size [advise] never";
	else
		output = "always inherit within_size advise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_always);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		/* Do not override huge allocation policy with non-PMD sized mTHP */
		if (shmem_huge == SHMEM_HUGE_FORCE &&
		    order != HPAGE_PMD_ORDER)
			return -EINVAL;

		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_inherit);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "within_size")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		set_bit(order, &huge_shmem_orders_within_size);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "advise")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		clear_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else {
		ret = -EINVAL;
	}

	return ret;
}

struct kobj_attribute thpsize_shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * its complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

static struct file_system_type shmem_fs_type = {
	.name		= "tmpfs",
	.init_fs_context = ramfs_init_fs_context,
	.parameters	= ramfs_fs_parameters,
	.kill_sb	= ramfs_kill_sb,
	.fs_flags	= FS_USERNS_MOUNT,
};

void __init shmem_init(void)
{
	BUG_ON(register_filesystem(&shmem_fs_type) != 0);

	shm_mnt = kern_mount(&shmem_fs_type);
	BUG_ON(IS_ERR(shm_mnt));
}

int shmem_unuse(unsigned int type)
{
	return 0;
}

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
	return 0;
}

void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
unsigned long shmem_get_unmapped_area(struct file *file,
				      unsigned long addr, unsigned long len,
				      unsigned long pgoff, unsigned long flags)
{
	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#endif

void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

#define shmem_vm_ops				generic_file_vm_ops
#define shmem_anon_vm_ops			generic_file_vm_ops
#define shmem_file_operations			ramfs_file_operations
#define shmem_acct_size(flags, size)		0
#define shmem_unacct_size(flags, size)		do {} while (0)

static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
				struct super_block *sb, struct inode *dir,
				umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
	return inode ? inode : ERR_PTR(-ENOSPC);
}
#endif /* CONFIG_SHMEM */

/* common code */
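
/*
 * Create an unlinked file of the given size on a tmpfs mount.  The i_flags
 * argument lets callers such as shmem_kernel_file_setup() mark the inode
 * S_PRIVATE so that LSMs skip permission checks on it.
 */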
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
			loff_t size, unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *res;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	if (is_idmapped_mnt(mnt))
		return ERR_PTR(-EINVAL);

	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
				S_IFREG | S_IRWXUGO, 0, flags);
	if (IS_ERR(inode)) {
		shmem_unacct_size(flags, size);
		return ERR_CAST(inode);
	}
	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (!IS_ERR(res))
		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				&shmem_file_operations);
	if (IS_ERR(res))
		iput(inode);
	return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *	kernel internal.  There will be NO LSM permission checks against the
 *	underlying inode, so users of this interface must do LSM checks at a
 *	higher layer.  The users are the big_key and shm implementations; LSM
 *	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
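
/*
 * Illustrative use only (the names below are not taken from this file):
 * a driver wanting an unlinked, swap-backed object of size bytes could do
 *
 *	struct file *filp = shmem_file_setup("driver-buf", size, VM_NORESERVE);
 *
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *
 * and later release it with fput(filp).
 */
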
/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
				       loff_t size, unsigned long flags)
{
	return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_anon_vm_ops;

	return 0;
}

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping: the folio's address_space
 * @index: the folio index
 * @gfp: the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method, which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
				   pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
	struct inode *inode = mapping->host;
	struct folio *folio;
	int error;

	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
				    gfp, NULL, NULL);
	if (error)
		return ERR_PTR(error);

	folio_unlock(folio);
	return folio;
#else
	/*
	 * The tiny !SHMEM case uses ramfs without swap
	 */
	return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
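
/*
 * Page-based wrapper around shmem_read_folio_gfp() for callers that still
 * work in terms of struct page; hwpoisoned pages are reported as -EIO.
 */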
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
					 pgoff_t index, gfp_t gfp)
{
	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
	struct page *page;

	if (IS_ERR(folio))
		return &folio->page;	/* ERR_PTR is preserved: page lives at offset 0 of struct folio */

	page = folio_file_page(folio, index);
	if (PageHWPoison(page)) {
		folio_put(folio);
		return ERR_PTR(-EIO);
	}

	return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);