  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * dlmglue.c
  4. *
  5. * Code which implements an OCFS2 specific interface to our DLM.
  6. *
  7. * Copyright (C) 2003, 2004 Oracle. All rights reserved.
  8. */
  9. #include <linux/types.h>
  10. #include <linux/slab.h>
  11. #include <linux/highmem.h>
  12. #include <linux/mm.h>
  13. #include <linux/kthread.h>
  14. #include <linux/pagemap.h>
  15. #include <linux/debugfs.h>
  16. #include <linux/seq_file.h>
  17. #include <linux/time.h>
  18. #include <linux/delay.h>
  19. #include <linux/quotaops.h>
  20. #include <linux/sched/signal.h>
  21. #define MLOG_MASK_PREFIX ML_DLM_GLUE
  22. #include <cluster/masklog.h>
  23. #include "ocfs2.h"
  24. #include "ocfs2_lockingver.h"
  25. #include "alloc.h"
  26. #include "dcache.h"
  27. #include "dlmglue.h"
  28. #include "extent_map.h"
  29. #include "file.h"
  30. #include "heartbeat.h"
  31. #include "inode.h"
  32. #include "journal.h"
  33. #include "stackglue.h"
  34. #include "slot_map.h"
  35. #include "super.h"
  36. #include "uptodate.h"
  37. #include "quota.h"
  38. #include "refcounttree.h"
  39. #include "acl.h"
  40. #include "buffer_head_io.h"
/*
 * A mask waiter lets a task sleep until a lockres reaches a desired
 * state; waiters are queued on lockres->l_mask_waiters (see
 * ocfs2_track_lock_wait()).  Presumably the wait completes when
 * (l_flags & mw_mask) == mw_goal -- confirm against the wait helpers
 * later in this file.
 */
struct ocfs2_mask_waiter {
	struct list_head	mw_item;	/* entry on l_mask_waiters */
	int			mw_status;	/* result handed back to waiter */
	struct completion	mw_complete;	/* signalled when wait is over */
	unsigned long		mw_mask;	/* l_flags bits of interest */
	unsigned long		mw_goal;	/* required value of those bits */
#ifdef CONFIG_OCFS2_FS_STATS
	ktime_t			mw_lock_start;	/* monotonic start of wait (stats) */
#endif
};
  51. static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
  52. static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
  53. static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
  54. static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
  55. /*
  56. * Return value from ->downconvert_worker functions.
  57. *
  58. * These control the precise actions of ocfs2_unblock_lock()
  59. * and ocfs2_process_blocked_lock()
  60. *
  61. */
/*
 * Return value from ->downconvert_worker functions; tells
 * ocfs2_unblock_lock()/ocfs2_process_blocked_lock() how to proceed.
 */
enum ocfs2_unblock_action {
	UNBLOCK_CONTINUE	= 0, /* Continue downconvert */
	UNBLOCK_CONTINUE_POST	= 1, /* Continue downconvert, fire
				      * ->post_unlock callback */
	UNBLOCK_STOP_POST	= 2, /* Do not downconvert, fire
				      * ->post_unlock() callback. */
};
/* Outcome of one unblock attempt on a blocked lock resource. */
struct ocfs2_unblock_ctl {
	int requeue;	/* nonzero: retry this downconvert later */
	enum ocfs2_unblock_action unblock_action;
};
  73. /* Lockdep class keys */
  74. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  75. static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
  76. #endif
  77. static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
  78. int new_level);
  79. static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
  80. static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
  81. int blocking);
  82. static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
  83. int blocking);
  84. static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
  85. struct ocfs2_lock_res *lockres);
  86. static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
  87. static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
  88. int new_level);
  89. static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
  90. int blocking);
  91. #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
  92. /* This aids in debugging situations where a bad LVB might be involved. */
/*
 * Dump every field of a meta-lock LVB at the given log mask.  Called
 * via the mlog_meta_lvb() macro so the message records the call site.
 * All LVB fields are stored big-endian and decoded here for the log.
 */
static void ocfs2_dump_meta_lvb_info(u64 level,
				     const char *function,
				     unsigned int line,
				     struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);

	mlog(level, "LVB information for %s (called from %s:%u):\n",
	     lockres->l_name, function, line);
	mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
	     lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
	     be32_to_cpu(lvb->lvb_igeneration));
	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
	     be16_to_cpu(lvb->lvb_imode));
	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
	     be32_to_cpu(lvb->lvb_iattr));
}
  115. /*
  116. * OCFS2 Lock Resource Operations
  117. *
  118. * These fine tune the behavior of the generic dlmglue locking infrastructure.
  119. *
  120. * The most basic of lock types can point ->l_priv to their respective
  121. * struct ocfs2_super and allow the default actions to manage things.
  122. *
  123. * Right now, each lock type also needs to implement an init function,
  124. * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres()
  125. * should be called when the lock is no longer needed (i.e., object
  126. * destruction time).
  127. */
/* Per-lock-type callback table; see the block comment above. */
struct ocfs2_lock_res_ops {
	/*
	 * Translate an ocfs2_lock_res * into an ocfs2_super *.  Define
	 * this callback if ->l_priv is not an ocfs2_super pointer;
	 * otherwise ocfs2_get_lockres_osb() casts ->l_priv directly.
	 */
	struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);

	/*
	 * Optionally called in the downconvert thread after a
	 * successful downconvert.  The lockres will not be referenced
	 * after this callback is called, so it is safe to free
	 * memory, etc.
	 *
	 * The exact semantics of when this is called are controlled
	 * by ->downconvert_worker().
	 */
	void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *);

	/*
	 * Allow a lock type to add checks to determine whether it is
	 * safe to downconvert a lock.  Return 0 to re-queue the
	 * downconvert at a later time, nonzero to continue.
	 *
	 * For most locks, the default checks that there are no
	 * incompatible holders are sufficient.
	 *
	 * Called with the lockres spinlock held.
	 */
	int (*check_downconvert)(struct ocfs2_lock_res *, int);

	/*
	 * Allows a lock type to populate the lock value block.  This
	 * is called on downconvert, and when we drop a lock.
	 *
	 * Locks that want to use this should set LOCK_TYPE_USES_LVB
	 * in the flags field.
	 *
	 * Called with the lockres spinlock held.
	 */
	void (*set_lvb)(struct ocfs2_lock_res *);

	/*
	 * Called from the downconvert thread when it is determined
	 * that a lock will be downconverted.  This is called without
	 * any locks held so the function can do work that might
	 * schedule (syncing out data, etc).
	 *
	 * This should return any one of the ocfs2_unblock_action
	 * values, depending on what it wants the thread to do.
	 */
	int (*downconvert_worker)(struct ocfs2_lock_res *, int);

	/*
	 * LOCK_TYPE_* flags which describe the specific requirements
	 * of a lock type.  Descriptions of each individual flag follow.
	 */
	int flags;
};
  181. /*
  182. * Some locks want to "refresh" potentially stale data when a
  183. * meaningful (PRMODE or EXMODE) lock level is first obtained. If this
  184. * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
  185. * individual lockres l_flags member from the ast function. It is
  186. * expected that the locking wrapper will clear the
  187. * OCFS2_LOCK_NEEDS_REFRESH flag when done.
  188. */
  189. #define LOCK_TYPE_REQUIRES_REFRESH 0x1
  190. /*
  191. * Indicate that a lock type makes use of the lock value block. The
  192. * ->set_lvb lock type callback must be defined.
  193. */
  194. #define LOCK_TYPE_USES_LVB 0x2
/*
 * Per-lock-type operation tables.  Only the callbacks a type needs are
 * filled in; absent callbacks fall back to dlmglue's default handling.
 */
static const struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.flags		= 0,
};

/* Inode metadata lock: refreshes from and publishes to the LVB. */
static const struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.check_downconvert = ocfs2_check_meta_downconvert,
	.set_lvb	= ocfs2_set_meta_lvb,
	.downconvert_worker = ocfs2_data_convert_worker,
	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};

static const struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.flags		= LOCK_TYPE_REQUIRES_REFRESH,
};

static const struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.flags		= 0,
};

static const struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
	.flags		= 0,
};

static const struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};

static const struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
};

/* Dentry locks need cleanup after the final downconvert (post_unlock). */
static const struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
	.get_osb	= ocfs2_get_dentry_osb,
	.post_unlock	= ocfs2_dentry_post_unlock,
	.downconvert_worker = ocfs2_dentry_convert_worker,
	.flags		= 0,
};

static const struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
	.get_osb	= ocfs2_get_inode_osb,
	.flags		= 0,
};

static const struct ocfs2_lock_res_ops ocfs2_flock_lops = {
	.get_osb	= ocfs2_get_file_osb,
	.flags		= 0,
};

static const struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
	.set_lvb	= ocfs2_set_qinfo_lvb,
	.get_osb	= ocfs2_get_qinfo_osb,
	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
};

static const struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
	.check_downconvert = ocfs2_check_refcount_downconvert,
	.downconvert_worker = ocfs2_refcount_convert_worker,
	.flags		= 0,
};
  245. static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
  246. {
  247. return lockres->l_type == OCFS2_LOCK_TYPE_META ||
  248. lockres->l_type == OCFS2_LOCK_TYPE_RW ||
  249. lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
  250. }
/* Recover the owning lockres from its embedded lksb (used by DLM callbacks). */
static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
{
	return container_of(lksb, struct ocfs2_lock_res, l_lksb);
}
/* ->l_priv is the inode for META/RW/OPEN locks; BUG on any other type. */
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
{
	BUG_ON(!ocfs2_is_inode_lock(lockres));

	return (struct inode *) lockres->l_priv;
}
/* ->l_priv is the dentry lock for DENTRY-type lockres; BUG otherwise. */
static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres)
{
	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);

	return (struct ocfs2_dentry_lock *)lockres->l_priv;
}
/* ->l_priv is the quota info for QINFO-type lockres; BUG otherwise. */
static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
{
	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);

	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
}
/* Refcount-tree lockres is embedded in its tree; recover the container. */
static inline struct ocfs2_refcount_tree *
ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
{
	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
}
  275. static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
  276. {
  277. if (lockres->l_ops->get_osb)
  278. return lockres->l_ops->get_osb(lockres);
  279. return (struct ocfs2_super *)lockres->l_priv;
  280. }
  281. static int ocfs2_lock_create(struct ocfs2_super *osb,
  282. struct ocfs2_lock_res *lockres,
  283. int level,
  284. u32 dlm_flags);
  285. static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
  286. int wanted);
  287. static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
  288. struct ocfs2_lock_res *lockres,
  289. int level, unsigned long caller_ip);
/* Unlock wrapper that records the caller's return address for tracing. */
static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres,
					int level)
{
	__ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
}
  296. static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
  297. static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
  298. static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
  299. static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
  300. static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
  301. struct ocfs2_lock_res *lockres);
  302. static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
  303. int convert);
/*
 * Log a DLM error against a lock resource.  Dentry lock names are not
 * fully printable: only the first OCFS2_DENTRY_LOCK_INO_START - 1 bytes
 * are text, so print that prefix plus the value from
 * ocfs2_get_dentry_lock_ino() (presumably the inode number encoded in
 * the name -- confirm against dcache.c).
 */
#define ocfs2_log_dlm_error(_func, _err, _lockres) do {					\
	if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY)				\
		mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n",	\
		     _err, _func, _lockres->l_name);					\
	else										\
		mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n",	\
		     _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name,	\
		     (unsigned int)ocfs2_get_dentry_lock_ino(_lockres));		\
} while (0)
  313. static int ocfs2_downconvert_thread(void *arg);
  314. static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
  315. struct ocfs2_lock_res *lockres);
  316. static int ocfs2_inode_lock_update(struct inode *inode,
  317. struct buffer_head **bh);
  318. static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
  319. static inline int ocfs2_highest_compat_lock_level(int level);
  320. static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
  321. int new_level);
  322. static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
  323. struct ocfs2_lock_res *lockres,
  324. int new_level,
  325. int lvb,
  326. unsigned int generation);
  327. static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
  328. struct ocfs2_lock_res *lockres);
  329. static int ocfs2_cancel_convert(struct ocfs2_super *osb,
  330. struct ocfs2_lock_res *lockres);
  331. static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
  332. u64 blkno,
  333. u32 generation,
  334. char *name)
  335. {
  336. int len;
  337. BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
  338. len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
  339. ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
  340. (long long)blkno, generation);
  341. BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
  342. mlog(0, "built lock resource with name: %s\n", name);
  343. }
  344. static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
/* Put a lockres on the per-mount debug tracking list (debugfs dump). */
static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
				       struct ocfs2_dlm_debug *dlm_debug)
{
	mlog(0, "Add tracking for lockres %s\n", res->l_name);

	spin_lock(&ocfs2_dlm_tracking_lock);
	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}
/*
 * Take a lockres off the tracking list.  The list_empty() check plus
 * list_del_init() make this safe to call repeatedly on the same res.
 */
static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
{
	spin_lock(&ocfs2_dlm_tracking_lock);
	if (!list_empty(&res->l_debug_list))
		list_del_init(&res->l_debug_list);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}
#ifdef CONFIG_OCFS2_FS_STATS
/* Reset all per-lockres statistics counters to zero. */
static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
	res->l_lock_refresh = 0;
	res->l_lock_wait = 0;
	memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
	memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
}

/*
 * Fold one completed lock acquire into the PR or EX statistics bucket:
 * elapsed time since @mw->mw_lock_start, the running total and count,
 * the maximum latency seen, a failure count when @ret is non-zero, and
 * the wall-clock time of this acquire.  Any level other than PR/EX is
 * silently ignored.
 */
static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
				    struct ocfs2_mask_waiter *mw, int ret)
{
	u32 usec;
	ktime_t kt;
	struct ocfs2_lock_stats *stats;

	if (level == LKM_PRMODE)
		stats = &res->l_lock_prmode;
	else if (level == LKM_EXMODE)
		stats = &res->l_lock_exmode;
	else
		return;

	kt = ktime_sub(ktime_get(), mw->mw_lock_start);
	usec = ktime_to_us(kt);

	stats->ls_gets++;
	stats->ls_total += ktime_to_ns(kt);
	/* overflow: restart both counters so the average stays meaningful */
	if (unlikely(stats->ls_gets == 0)) {
		stats->ls_gets++;
		stats->ls_total = ktime_to_ns(kt);
	}

	if (stats->ls_max < usec)
		stats->ls_max = usec;

	if (ret)
		stats->ls_fail++;

	stats->ls_last = ktime_to_us(ktime_get_real());
}

/* Count one refresh performed on behalf of this lockres. */
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
{
	lockres->l_lock_refresh++;
}

/*
 * Publish (in l_lock_wait) the start time of the oldest mask waiter,
 * converted from the monotonic to the realtime clock, or 0 when nobody
 * is waiting.  Callers update this whenever l_mask_waiters changes.
 */
static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
{
	struct ocfs2_mask_waiter *mw;

	if (list_empty(&lockres->l_mask_waiters)) {
		lockres->l_lock_wait = 0;
		return;
	}

	mw = list_first_entry(&lockres->l_mask_waiters,
			      struct ocfs2_mask_waiter, mw_item);
	lockres->l_lock_wait =
		ktime_to_us(ktime_mono_to_real(mw->mw_lock_start));
}

/* Stamp the waiter with the monotonic time its wait began. */
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
	mw->mw_lock_start = ktime_get();
}
#else
/* Stats disabled: all tracking hooks compile away to nothing. */
static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
}
static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
					   int level, struct ocfs2_mask_waiter *mw, int ret)
{
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
{
}
static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres)
{
}
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
}
#endif	/* CONFIG_OCFS2_FS_STATS */
/*
 * Common initialization for a freshly named lockres: record its type,
 * callback ops and private payload, reset all DLM levels and staged
 * actions to their invalid/IV starting values, and hook the lockres
 * into the debugfs tracking list.
 */
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *res,
				       enum ocfs2_lock_type type,
				       const struct ocfs2_lock_res_ops *ops,
				       void *priv)
{
	res->l_type = type;
	res->l_ops = ops;
	res->l_priv = priv;

	/* Nothing is held yet: level/requested/blocking all start at IV. */
	res->l_level = DLM_LOCK_IV;
	res->l_requested = DLM_LOCK_IV;
	res->l_blocking = DLM_LOCK_IV;
	res->l_action = OCFS2_AST_INVALID;
	res->l_unlock_action = OCFS2_UNLOCK_INVALID;

	res->l_flags = OCFS2_LOCK_INITIALIZED;

	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);

	ocfs2_init_lock_stats(res);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* The open lock gets no lockdep class (NULL key) — it is excluded
	 * from lockdep tracking. */
	if (type != OCFS2_LOCK_TYPE_OPEN)
		lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
				 &lockdep_keys[type], 0);
	else
		res->l_lockdep_map.key = NULL;
#endif
}
  458. void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
  459. {
  460. /* This also clears out the lock status block */
  461. memset(res, 0, sizeof(struct ocfs2_lock_res));
  462. spin_lock_init(&res->l_lock);
  463. init_waitqueue_head(&res->l_event);
  464. INIT_LIST_HEAD(&res->l_blocked_list);
  465. INIT_LIST_HEAD(&res->l_mask_waiters);
  466. INIT_LIST_HEAD(&res->l_holders);
  467. }
/*
 * Initialize one of an inode's lock resources (rw, meta or open),
 * naming it after the inode's block number and @generation.  Any
 * other lock type here is a caller bug.
 */
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
			       enum ocfs2_lock_type type,
			       unsigned int generation,
			       struct inode *inode)
{
	const struct ocfs2_lock_res_ops *ops;

	switch(type) {
	case OCFS2_LOCK_TYPE_RW:
		ops = &ocfs2_inode_rw_lops;
		break;
	case OCFS2_LOCK_TYPE_META:
		ops = &ocfs2_inode_inode_lops;
		break;
	case OCFS2_LOCK_TYPE_OPEN:
		ops = &ocfs2_inode_open_lops;
		break;
	default:
		mlog_bug_on_msg(1, "type: %d\n", type);
		ops = NULL; /* thanks, gcc */
		break;
	}

	ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
			      generation, res->l_name);
	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
}
  493. static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
  494. {
  495. struct inode *inode = ocfs2_lock_res_inode(lockres);
  496. return OCFS2_SB(inode->i_sb);
  497. }
  498. static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
  499. {
  500. struct ocfs2_mem_dqinfo *info = lockres->l_priv;
  501. return OCFS2_SB(info->dqi_gi.dqi_sb);
  502. }
  503. static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
  504. {
  505. struct ocfs2_file_private *fp = lockres->l_priv;
  506. return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
  507. }
  508. static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
  509. {
  510. __be64 inode_blkno_be;
  511. memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
  512. sizeof(__be64));
  513. return be64_to_cpu(inode_blkno_be);
  514. }
  515. static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres)
  516. {
  517. struct ocfs2_dentry_lock *dl = lockres->l_priv;
  518. return OCFS2_SB(dl->dl_inode->i_sb);
  519. }
/*
 * Initialize a dentry lockres.  The name encodes both the parent
 * directory block (@parent, stringified) and the target inode block
 * number (binary, big-endian) — see the comment below for why.
 */
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
				u64 parent, struct inode *inode)
{
	int len;
	u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
	__be64 inode_blkno_be = cpu_to_be64(inode_blkno);
	struct ocfs2_lock_res *lockres = &dl->dl_lockres;

	ocfs2_lock_res_init_once(lockres);

	/*
	 * Unfortunately, the standard lock naming scheme won't work
	 * here because we have two 16 byte values to use. Instead,
	 * we'll stuff the inode number as a binary value. We still
	 * want error prints to show something without garbling the
	 * display, so drop a null byte in there before the inode
	 * number. A future version of OCFS2 will likely use all
	 * binary lock names. The stringified names have been a
	 * tremendous aid in debugging, but now that the debugfs
	 * interface exists, we can mangle things there if need be.
	 *
	 * NOTE: We also drop the standard "pad" value (the total lock
	 * name size stays the same though - the last part is all
	 * zeros due to the memset in ocfs2_lock_res_init_once()
	 */
	len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
		       "%c%016llx",
		       ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
		       (long long)parent);
	BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));

	memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
	       sizeof(__be64));

	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
				   OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
				   dl);
}
  554. static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
  555. struct ocfs2_super *osb)
  556. {
  557. /* Superblock lockres doesn't come from a slab so we call init
  558. * once on it manually. */
  559. ocfs2_lock_res_init_once(res);
  560. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
  561. 0, res->l_name);
  562. ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
  563. &ocfs2_super_lops, osb);
  564. }
  565. static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
  566. struct ocfs2_super *osb)
  567. {
  568. /* Rename lockres doesn't come from a slab so we call init
  569. * once on it manually. */
  570. ocfs2_lock_res_init_once(res);
  571. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
  572. ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
  573. &ocfs2_rename_lops, osb);
  574. }
  575. static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
  576. struct ocfs2_super *osb)
  577. {
  578. /* nfs_sync lockres doesn't come from a slab so we call init
  579. * once on it manually. */
  580. ocfs2_lock_res_init_once(res);
  581. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
  582. ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
  583. &ocfs2_nfs_sync_lops, osb);
  584. }
  585. static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
  586. {
  587. ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
  588. init_rwsem(&osb->nfs_sync_rwlock);
  589. }
/*
 * Prepare the trim_fs lockres.  Takes obs_trim_fs_mutex and holds it
 * until ocfs2_trim_fs_lock_res_uninit() releases it.
 */
void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;

	/* Only one trimfs thread is allowed to work at the same time. */
	mutex_lock(&osb->obs_trim_fs_mutex);

	ocfs2_lock_res_init_once(lockres);
	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
				   &ocfs2_trim_fs_lops, osb);
}
  600. void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
  601. {
  602. struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
  603. ocfs2_simple_drop_lockres(osb, lockres);
  604. ocfs2_lock_res_free(lockres);
  605. mutex_unlock(&osb->obs_trim_fs_mutex);
  606. }
  607. static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
  608. struct ocfs2_super *osb)
  609. {
  610. ocfs2_lock_res_init_once(res);
  611. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
  612. ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
  613. &ocfs2_orphan_scan_lops, osb);
  614. }
  615. void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
  616. struct ocfs2_file_private *fp)
  617. {
  618. struct inode *inode = fp->fp_file->f_mapping->host;
  619. struct ocfs2_inode_info *oi = OCFS2_I(inode);
  620. ocfs2_lock_res_init_once(lockres);
  621. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
  622. inode->i_generation, lockres->l_name);
  623. ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
  624. OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
  625. fp);
  626. lockres->l_flags |= OCFS2_LOCK_NOCACHE;
  627. }
  628. void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
  629. struct ocfs2_mem_dqinfo *info)
  630. {
  631. ocfs2_lock_res_init_once(lockres);
  632. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
  633. 0, lockres->l_name);
  634. ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
  635. OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
  636. info);
  637. }
  638. void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
  639. struct ocfs2_super *osb, u64 ref_blkno,
  640. unsigned int generation)
  641. {
  642. ocfs2_lock_res_init_once(lockres);
  643. ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
  644. generation, lockres->l_name);
  645. ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
  646. &ocfs2_refcount_block_lops, osb);
  647. }
/*
 * Final teardown of a lockres.  A no-op unless the lockres was
 * actually initialized.  Loudly BUGs if it is still in use in any way
 * (queued for downconvert, has waiters or holders, or its spinlock is
 * held), then clears the lock status block and all flags.
 */
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
		return;

	ocfs2_remove_lockres_tracking(res);

	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
			"Lockres %s is on the blocked list\n",
			res->l_name);
	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
			"Lockres %s has mask waiters pending\n",
			res->l_name);
	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
			"Lockres %s is locked\n",
			res->l_name);
	mlog_bug_on_msg(res->l_ro_holders,
			"Lockres %s has %u ro holders\n",
			res->l_name, res->l_ro_holders);
	mlog_bug_on_msg(res->l_ex_holders,
			"Lockres %s has %u ex holders\n",
			res->l_name, res->l_ex_holders);

	/* Need to clear out the lock status block for the dlm */
	memset(&res->l_lksb, 0, sizeof(res->l_lksb));

	res->l_flags = 0UL;
}
  672. /*
  673. * Keep a list of processes who have interest in a lockres.
 * Note: this is now only used for checking recursive cluster locking.
  675. */
  676. static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
  677. struct ocfs2_lock_holder *oh)
  678. {
  679. INIT_LIST_HEAD(&oh->oh_list);
  680. oh->oh_owner_pid = get_pid(task_pid(current));
  681. spin_lock(&lockres->l_lock);
  682. list_add_tail(&oh->oh_list, &lockres->l_holders);
  683. spin_unlock(&lockres->l_lock);
  684. }
  685. static struct ocfs2_lock_holder *
  686. ocfs2_pid_holder(struct ocfs2_lock_res *lockres,
  687. struct pid *pid)
  688. {
  689. struct ocfs2_lock_holder *oh;
  690. spin_lock(&lockres->l_lock);
  691. list_for_each_entry(oh, &lockres->l_holders, oh_list) {
  692. if (oh->oh_owner_pid == pid) {
  693. spin_unlock(&lockres->l_lock);
  694. return oh;
  695. }
  696. }
  697. spin_unlock(&lockres->l_lock);
  698. return NULL;
  699. }
  700. static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
  701. struct ocfs2_lock_holder *oh)
  702. {
  703. spin_lock(&lockres->l_lock);
  704. list_del(&oh->oh_list);
  705. spin_unlock(&lockres->l_lock);
  706. put_pid(oh->oh_owner_pid);
  707. }
  708. static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
  709. int level)
  710. {
  711. BUG_ON(!lockres);
  712. switch(level) {
  713. case DLM_LOCK_EX:
  714. lockres->l_ex_holders++;
  715. break;
  716. case DLM_LOCK_PR:
  717. lockres->l_ro_holders++;
  718. break;
  719. default:
  720. BUG();
  721. }
  722. }
  723. static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
  724. int level)
  725. {
  726. BUG_ON(!lockres);
  727. switch(level) {
  728. case DLM_LOCK_EX:
  729. BUG_ON(!lockres->l_ex_holders);
  730. lockres->l_ex_holders--;
  731. break;
  732. case DLM_LOCK_PR:
  733. BUG_ON(!lockres->l_ro_holders);
  734. lockres->l_ro_holders--;
  735. break;
  736. default:
  737. BUG();
  738. }
  739. }
  740. /* WARNING: This function lives in a world where the only three lock
  741. * levels are EX, PR, and NL. It *will* have to be adjusted when more
  742. * lock types are added. */
  743. static inline int ocfs2_highest_compat_lock_level(int level)
  744. {
  745. int new_level = DLM_LOCK_EX;
  746. if (level == DLM_LOCK_EX)
  747. new_level = DLM_LOCK_NL;
  748. else if (level == DLM_LOCK_PR)
  749. new_level = DLM_LOCK_PR;
  750. return new_level;
  751. }
/*
 * Install a new l_flags word and complete every mask waiter whose
 * (flags & mask) == goal condition is now satisfied.  Satisfied
 * waiters are unlinked, marked successful (status 0) and completed;
 * the wait-time statistic is refreshed after each removal so it always
 * reflects the new head of the list.  Caller must hold l_lock.
 */
static void lockres_set_flags(struct ocfs2_lock_res *lockres,
			      unsigned long newflags)
{
	struct ocfs2_mask_waiter *mw, *tmp;

	assert_spin_locked(&lockres->l_lock);

	lockres->l_flags = newflags;

	list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
			continue;

		list_del_init(&mw->mw_item);
		mw->mw_status = 0;
		complete(&mw->mw_complete);
		ocfs2_track_lock_wait(lockres);
	}
}
  767. static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
  768. {
  769. lockres_set_flags(lockres, lockres->l_flags | or);
  770. }
  771. static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
  772. unsigned long clear)
  773. {
  774. lockres_set_flags(lockres, lockres->l_flags & ~clear);
  775. }
/*
 * AST handler for a downconvert we requested: adopt the requested
 * level, and if that level no longer conflicts with the level we are
 * blocking (l_blocking), leave the BLOCKED state as well.  Always ends
 * by clearing BUSY.  Caller must hold l_lock.
 */
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);

	lockres->l_level = lockres->l_requested;

	/* Downconverted far enough to satisfy the blocker? Then we are
	 * no longer blocked. */
	if (lockres->l_level <=
	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
		lockres->l_blocking = DLM_LOCK_NL;
		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
/*
 * AST handler for an upconvert we requested: adopt the requested
 * level, flag a needed refresh when coming up from NL, and manage the
 * UPCONVERT_FINISHING / NONBLOCK_FINISHED handshake with the
 * downconvert thread before clearing BUSY.  Caller must hold l_lock.
 */
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));

	/* Convert from RO to EX doesn't really need anything as our
	 * information is already up to date. Convert from NL to
	 * *anything* however should mark ourselves as needing an
	 * update */
	if (lockres->l_level == DLM_LOCK_NL &&
	    lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	lockres->l_level = lockres->l_requested;

	/*
	 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
	 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
	 * downconverting the lock before the upconvert has fully completed.
	 * Do not prevent the dc thread from downconverting if NONBLOCK lock
	 * had already returned.
	 */
	if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED))
		lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
	else
		lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED);

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
/*
 * AST handler for a brand-new lock attach: adopt the granted level and
 * mark the lockres ATTACHED.  A grant above NL on a non-LOCAL lock
 * whose type requires refresh is flagged NEEDS_REFRESH.  Ends by
 * clearing BUSY.  Caller must hold l_lock.
 */
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
{
	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);

	if (lockres->l_requested > DLM_LOCK_NL &&
	    !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
	    lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	lockres->l_level = lockres->l_requested;
	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
}
/*
 * Handle a blocking AST at @level.  Records the highest blocking level
 * seen in l_blocking, and decides whether a (further) downconvert must
 * be scheduled: only when this request's compatible ceiling is lower
 * than the one we already planned for — which also swallows duplicate
 * BASTs.  Sets OCFS2_LOCK_BLOCKED when a downconvert is needed and
 * returns 1 in that case, 0 otherwise.  Caller must hold l_lock.
 */
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
				     int level)
{
	int needs_downconvert = 0;

	assert_spin_locked(&lockres->l_lock);

	if (level > lockres->l_blocking) {
		/* only schedule a downconvert if we haven't already scheduled
		 * one that goes low enough to satisfy the level we're
		 * blocking. this also catches the case where we get
		 * duplicate BASTs */
		if (ocfs2_highest_compat_lock_level(level) <
		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
			needs_downconvert = 1;

		lockres->l_blocking = level;
	}

	mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
	     lockres->l_name, level, lockres->l_level, lockres->l_blocking,
	     needs_downconvert);

	if (needs_downconvert)
		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
	mlog(0, "needs_downconvert = %d\n", needs_downconvert);
	return needs_downconvert;
}
  850. /*
  851. * OCFS2_LOCK_PENDING and l_pending_gen.
  852. *
  853. * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
  854. * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
  855. * for more details on the race.
  856. *
  857. * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
  858. * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
  859. * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
  860. * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
  861. * the caller is going to try to clear PENDING again. If nothing else is
  862. * happening, __lockres_clear_pending() sees PENDING is unset and does
  863. * nothing.
  864. *
  865. * But what if another path (eg downconvert thread) has just started a
  866. * new locking action? The other path has re-set PENDING. Our path
  867. * cannot clear PENDING, because that will re-open the original race
  868. * window.
  869. *
  870. * [Example]
  871. *
  872. * ocfs2_meta_lock()
  873. * ocfs2_cluster_lock()
  874. * set BUSY
  875. * set PENDING
  876. * drop l_lock
  877. * ocfs2_dlm_lock()
  878. * ocfs2_locking_ast() ocfs2_downconvert_thread()
  879. * clear PENDING ocfs2_unblock_lock()
  880. * take_l_lock
  881. * !BUSY
  882. * ocfs2_prepare_downconvert()
  883. * set BUSY
  884. * set PENDING
  885. * drop l_lock
  886. * take l_lock
  887. * clear PENDING
  888. * drop l_lock
  889. * <window>
  890. * ocfs2_dlm_lock()
  891. *
  892. * So as you can see, we now have a window where l_lock is not held,
  893. * PENDING is not set, and ocfs2_dlm_lock() has not been called.
  894. *
  895. * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
  896. * set by ocfs2_prepare_downconvert(). That wasn't nice.
  897. *
  898. * To solve this we introduce l_pending_gen. A call to
  899. * lockres_clear_pending() will only do so when it is passed a generation
  900. * number that matches the lockres. lockres_set_pending() will return the
  901. * current generation number. When ocfs2_cluster_lock() goes to clear
  902. * PENDING, it passes the generation it got from set_pending(). In our
  903. * example above, the generation numbers will *not* match. Thus,
  904. * ocfs2_cluster_lock() will not clear the PENDING set by
  905. * ocfs2_prepare_downconvert().
  906. */
/* Unlocked version for ocfs2_locking_ast() */
/*
 * Clear OCFS2_LOCK_PENDING, but only if @generation still matches
 * l_pending_gen — otherwise a newer locking action owns the PENDING
 * bit and must not be disturbed (see the long comment above).  Bumps
 * the generation on success and pokes the downconvert thread, which
 * may have skipped this lockres while it was PENDING.  Caller must
 * hold l_lock.
 */
static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
				    unsigned int generation,
				    struct ocfs2_super *osb)
{
	assert_spin_locked(&lockres->l_lock);

	/*
	 * The ast and locking functions can race us here. The winner
	 * will clear pending, the loser will not.
	 */
	if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
	    (lockres->l_pending_gen != generation))
		return;

	lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
	lockres->l_pending_gen++;

	/*
	 * The downconvert thread may have skipped us because we
	 * were PENDING. Wake it up.
	 */
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
		ocfs2_wake_downconvert_thread(osb);
}
  929. /* Locked version for callers of ocfs2_dlm_lock() */
  930. static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
  931. unsigned int generation,
  932. struct ocfs2_super *osb)
  933. {
  934. unsigned long flags;
  935. spin_lock_irqsave(&lockres->l_lock, flags);
  936. __lockres_clear_pending(lockres, generation, osb);
  937. spin_unlock_irqrestore(&lockres->l_lock, flags);
  938. }
/*
 * Mark a BUSY lockres PENDING before dropping l_lock to call
 * ocfs2_dlm_lock().  Returns the current l_pending_gen, which the
 * caller later hands to lockres_clear_pending() — see the comment
 * above __lockres_clear_pending() for the generation protocol.
 * Caller must hold l_lock.
 */
static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));

	lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
	return lockres->l_pending_gen;
}
/*
 * Blocking AST callback from the dlm: another node wants a lock that
 * conflicts with ours at @level.  Records the blocking request and, if
 * a downconvert is needed, queues the lockres and wakes the
 * downconvert thread.  NOCACHE locks skip all of this — they are
 * dropped at the earliest possible time anyway.
 */
static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
{
	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
	int needs_downconvert;
	unsigned long flags;

	BUG_ON(level <= DLM_LOCK_NL);

	mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
	     "type %s\n", lockres->l_name, level, lockres->l_level,
	     ocfs2_lock_type_string(lockres->l_type));

	/*
	 * We can skip the bast for locks which don't enable caching -
	 * they'll be dropped at the earliest possible time anyway.
	 */
	if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
		return;

	spin_lock_irqsave(&lockres->l_lock, flags);
	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
	if (needs_downconvert)
		ocfs2_schedule_blocked_lock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	ocfs2_wake_downconvert_thread(osb);
}
/*
 * Locking AST callback from the dlm: the attach/convert/downconvert we
 * staged in l_action has completed (or failed with -EAGAIN).  Routes
 * the result to the matching generic handler, resets the staged
 * action, clears PENDING for this generation, and wakes waiters.
 */
static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
{
	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
	unsigned long flags;
	int status;

	spin_lock_irqsave(&lockres->l_lock, flags);

	status = ocfs2_dlm_lock_status(&lockres->l_lksb);

	/* -EAGAIN: the request did not go through; just drop BUSY and
	 * fall into the common cleanup below. */
	if (status == -EAGAIN) {
		lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
		goto out;
	}

	if (status) {
		mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
		     lockres->l_name, status);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
	     "level %d => %d\n", lockres->l_name, lockres->l_action,
	     lockres->l_unlock_action, lockres->l_level, lockres->l_requested);

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
		     "flags 0x%lx, unlock: %u\n",
		     lockres->l_name, lockres->l_action, lockres->l_flags,
		     lockres->l_unlock_action);
		BUG();
	}

out:
	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;

	/* Did we try to cancel this lock?  Clear that state */
	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;

	/*
	 * We may have beaten the locking functions here.  We certainly
	 * know that dlm_lock() has been called :-)
	 * Because we can't have two lock calls in flight at once, we
	 * can use lockres->l_pending_gen.
	 */
	__lockres_clear_pending(lockres, lockres->l_pending_gen, osb);

	wake_up(&lockres->l_event);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
/*
 * Unlock AST callback from the dlm: the cancel-convert or drop we
 * staged in l_unlock_action has finished.  On a successful cancel the
 * staged l_action is invalidated (and the downconvert thread rewoken
 * if the lock is still BLOCKED); on a drop, l_level falls back to IV.
 * A non-zero @error leaves all state untouched apart from logging.
 */
static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
{
	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
	unsigned long flags;

	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
	     lockres->l_name, lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (error) {
		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
		     "unlock_action %d\n", error, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		lockres->l_action = OCFS2_AST_INVALID;
		/* Downconvert thread may have requeued this lock, we
		 * need to wake it. */
		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		lockres->l_level = DLM_LOCK_IV;
		break;
	default:
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	wake_up(&lockres->l_event);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
/*
 * This is the filesystem locking protocol.  It provides the lock handling
 * hooks for the underlying DLM.  It has a maximum version number.
 * The version number allows interoperability with systems running at
 * the same major number and an equal or smaller minor number.
 *
 * Whenever the filesystem does new things with locks (adds or removes a
 * lock, orders them differently, does different things underneath a lock),
 * the version must be changed.  The protocol is negotiated when joining
 * the dlm domain.  A node may join the domain if its major version is
 * identical to all other nodes and its minor version is greater than
 * or equal to all other nodes.  When its minor version is greater than
 * the other nodes, it will run at the minor version specified by the
 * other nodes.
 *
 * If a locking change is made that will not be compatible with older
 * versions, the major number must be increased and the minor version set
 * to zero.  If a change merely adds a behavior that can be disabled when
 * speaking to older versions, the minor version must be increased.  If a
 * change adds a fully backwards compatible change (eg, LVB changes that
 * are just ignored by older versions), the version does not need to be
 * updated.
 */
/* The three ASTs above are wired into the stack glue through this table. */
static struct ocfs2_locking_protocol lproto = {
	.lp_max_version = {
		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
	},
	.lp_lock_ast = ocfs2_locking_ast,
	.lp_blocking_ast = ocfs2_blocking_ast,
	.lp_unlock_ast = ocfs2_unlock_ast,
};
  1092. void ocfs2_set_locking_protocol(void)
  1093. {
  1094. ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
  1095. }
/*
 * Reset a lockres after ocfs2_dlm_lock()/ocfs2_dlm_unlock() failed
 * synchronously: clear BUSY (and any half-finished upconvert state),
 * invalidate whichever action was staged — l_action for a failed
 * lock/convert (@convert != 0), l_unlock_action otherwise — and wake
 * anyone waiting on the lockres.
 */
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert)
{
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
	if (convert)
		lockres->l_action = OCFS2_AST_INVALID;
	else
		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
}
/* Note: If we detect another process working on the lock (i.e.,
 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
 * to do the right thing in that case.
 */
/*
 * Issue the initial dlm lock request (attach) for @lockres at @level.
 * Stages OCFS2_AST_ATTACH, marks the lockres BUSY and PENDING, then
 * calls into the dlm; the PENDING generation dance is explained above
 * __lockres_clear_pending().  Returns 0 on success or if the lockres
 * is already attached/busy, else the dlm error.
 */
static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     u32 dlm_flags)
{
	int ret = 0;
	unsigned long flags;
	unsigned int gen;

	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
	     dlm_flags);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	lockres->l_action = OCFS2_AST_ATTACH;
	lockres->l_requested = level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	gen = lockres_set_pending(lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_dlm_lock(osb->cconn,
			     level,
			     &lockres->l_lksb,
			     dlm_flags,
			     lockres->l_name,
			     OCFS2_LOCK_ID_MAX_LEN - 1);
	lockres_clear_pending(lockres, gen, osb);
	if (ret) {
		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
		ocfs2_recover_from_dlm_error(lockres, 1);
	}

	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);

bail:
	return ret;
}
  1150. static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
  1151. int flag)
  1152. {
  1153. unsigned long flags;
  1154. int ret;
  1155. spin_lock_irqsave(&lockres->l_lock, flags);
  1156. ret = lockres->l_flags & flag;
  1157. spin_unlock_irqrestore(&lockres->l_lock, flags);
  1158. return ret;
  1159. }
/* Sleep on l_event until OCFS2_LOCK_BUSY clears. */
static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
}
/* Sleep on l_event until OCFS2_LOCK_REFRESHING clears (another thread
 * finished or abandoned its refresh of this lock resource). */
static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
}
/* predict what lock level we'll be dropping down to on behalf
 * of another node, and return true if the currently wanted
 * level will be compatible with it.
 *
 * Reads l_flags/l_blocking, so callers hold l_lock (see the call in
 * __ocfs2_cluster_lock). */
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted)
{
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
}
/* Prepare a mask waiter for queueing: empty list linkage, un-fired
 * completion, and a recorded wait start time. */
static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
	INIT_LIST_HEAD(&mw->mw_item);
	init_completion(&mw->mw_complete);
	ocfs2_init_start_time(mw);
}
/* Block (uninterruptibly) until @mw is completed, then return its
 * status.  The completion is re-armed so the same waiter can be
 * queued and waited on again. */
static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
	wait_for_completion(&mw->mw_complete);
	/* Re-arm the completion in case we want to wait on it again */
	reinit_completion(&mw->mw_complete);
	return mw->mw_status;
}
/* Queue @mw on the lockres so it gets completed once
 * (l_flags & mask) == goal.  Caller must hold l_lock
 * (asserted below). */
static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
				    struct ocfs2_mask_waiter *mw,
				    unsigned long mask,
				    unsigned long goal)
{
	BUG_ON(!list_empty(&mw->mw_item));

	assert_spin_locked(&lockres->l_lock);

	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
	mw->mw_mask = mask;
	mw->mw_goal = goal;

	/* update lock-wait tracking now that the waiter list changed */
	ocfs2_track_lock_wait(lockres);
}
/* returns 0 if the mw that was removed was already satisfied, -EBUSY
 * if the mask still hadn't reached its goal */
static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
					struct ocfs2_mask_waiter *mw)
{
	int ret = 0;

	assert_spin_locked(&lockres->l_lock);
	/* An empty mw_item means the waiter already fired and was
	 * dequeued - nothing to do, report success. */
	if (!list_empty(&mw->mw_item)) {
		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
			ret = -EBUSY;

		list_del_init(&mw->mw_item);
		init_completion(&mw->mw_complete);
		/* re-evaluate lock-wait tracking after the list change */
		ocfs2_track_lock_wait(lockres);
	}

	return ret;
}
  1220. static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
  1221. struct ocfs2_mask_waiter *mw)
  1222. {
  1223. unsigned long flags;
  1224. int ret = 0;
  1225. spin_lock_irqsave(&lockres->l_lock, flags);
  1226. ret = __lockres_remove_mask_waiter(lockres, mw);
  1227. spin_unlock_irqrestore(&lockres->l_lock, flags);
  1228. return ret;
  1229. }
/* Interruptible variant of ocfs2_wait_for_mask().  On signal
 * (wait_for_completion_interruptible() returns nonzero) the waiter is
 * removed from the lockres and the -ERESTARTSYS is passed through;
 * otherwise the waiter's status is returned. */
static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
					     struct ocfs2_lock_res *lockres)
{
	int ret;

	ret = wait_for_completion_interruptible(&mw->mw_complete);
	if (ret)
		lockres_remove_mask_waiter(lockres, mw);
	else
		ret = mw->mw_status;
	/* Re-arm the completion in case we want to wait on it again */
	reinit_completion(&mw->mw_complete);
	return ret;
}
/*
 * Core cluster-lock acquisition.  Loops ("again:") taking l_lock,
 * examining the lockres state, and either:
 *   - incrementing the holder count when the granted level already
 *     covers @level,
 *   - issuing a dlm lock/convert and retrying once it completes, or
 *   - queueing a mask waiter and sleeping until BUSY/BLOCKED clears
 *     (or returning -EAGAIN for OCFS2_LOCK_NONBLOCK callers).
 *
 * Returns 0 with a holder reference taken, or a negative errno
 * (-EINVAL, -ERESTARTSYS, -EAGAIN, or a dlm error).
 */
static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres,
				int level,
				u32 lkm_flags,
				int arg_flags,
				int l_subclass,
				unsigned long caller_ip)
{
	struct ocfs2_mask_waiter mw;
	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
	unsigned long flags;
	unsigned int gen;
	int noqueue_attempted = 0;
	int dlm_locked = 0;
	int kick_dc = 0;

	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	ocfs2_init_mask_waiter(&mw);

	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
		lkm_flags |= DLM_LKF_VALBLK;

again:
	wait = 0;

	spin_lock_irqsave(&lockres->l_lock, flags);

	/* Only honor signals until we've actually gone to the dlm -
	 * catch_signals is cleared below after a successful dlm call. */
	if (catch_signals && signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto unlock;
	}

	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
			"Cluster lock called on freeing lockres %s! flags "
			"0x%lx\n", lockres->l_name, lockres->l_flags);

	/* We only compare against the currently granted level
	 * here. If the lock is blocked waiting on a downconvert,
	 * we'll get caught below. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
	    level > lockres->l_level) {
		/* is someone sitting in dlm_lock? If so, wait on
		 * them. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		wait = 1;
		goto unlock;
	}

	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
		/*
		 * We've upconverted. If the lock now has a level we can
		 * work with, we take it. If, however, the lock is not at the
		 * required level, we go thru the full cycle. One way this could
		 * happen is if a process requesting an upconvert to PR is
		 * closely followed by another requesting upconvert to an EX.
		 * If the process requesting EX lands here, we want it to
		 * continue attempting to upconvert and let the process
		 * requesting PR take the lock.
		 * If multiple processes request upconvert to PR, the first one
		 * here will take the lock. The others will have to go thru the
		 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
		 * downconvert request.
		 */
		if (level <= lockres->l_level)
			goto update_holders;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* is the lock is currently blocked on behalf of
		 * another node */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		/* A NOQUEUE request that already made one dlm attempt
		 * must not loop forever - fail with -EAGAIN. */
		if (noqueue_attempted > 0) {
			ret = -EAGAIN;
			goto unlock;
		}
		if (lkm_flags & DLM_LKF_NOQUEUE)
			noqueue_attempted = 1;

		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		/* First lock on this resource is an attach; otherwise
		 * it's a convert of the existing dlm lock. */
		if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
			lockres->l_action = OCFS2_AST_ATTACH;
			lkm_flags &= ~DLM_LKF_CONVERT;
		} else {
			lockres->l_action = OCFS2_AST_CONVERT;
			lkm_flags |= DLM_LKF_CONVERT;
		}

		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		gen = lockres_set_pending(lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == DLM_LOCK_IV);
		BUG_ON(level == DLM_LOCK_NL);

		mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		ret = ocfs2_dlm_lock(osb->cconn,
				     level,
				     &lockres->l_lksb,
				     lkm_flags,
				     lockres->l_name,
				     OCFS2_LOCK_ID_MAX_LEN - 1);
		lockres_clear_pending(lockres, gen, osb);
		if (ret) {
			/* -EAGAIN on a NOQUEUE attempt is expected, not
			 * an error worth logging. */
			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
			    (ret != -EAGAIN)) {
				ocfs2_log_dlm_error("ocfs2_dlm_lock",
						    ret, lockres);
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}
		dlm_locked = 1;

		mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

update_holders:
	/* Ok, if we get here then we're good to go. */
	ocfs2_inc_holders(lockres, level);

	ret = 0;
unlock:
	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);

	/* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
	kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);

	spin_unlock_irqrestore(&lockres->l_lock, flags);
	if (kick_dc)
		ocfs2_wake_downconvert_thread(osb);
out:
	/*
	 * This is helping work around a lock inversion between the page lock
	 * and dlm locks. One path holds the page lock while calling aops
	 * which block acquiring dlm locks. The voting thread holds dlm
	 * locks while acquiring page locks while down converting data locks.
	 * This block is helping an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
		wait = 0;
		spin_lock_irqsave(&lockres->l_lock, flags);
		if (__lockres_remove_mask_waiter(lockres, &mw)) {
			/* Waiter was still pending: report -EAGAIN so
			 * the caller can drop its page lock and retry. */
			if (dlm_locked)
				lockres_or_flags(lockres,
					OCFS2_LOCK_NONBLOCK_FINISHED);
			spin_unlock_irqrestore(&lockres->l_lock, flags);
			ret = -EAGAIN;
		} else {
			/* Waiter already fired - the condition we were
			 * waiting for is satisfied, so just retry. */
			spin_unlock_irqrestore(&lockres->l_lock, flags);
			goto again;
		}
	}
	if (wait) {
		ret = ocfs2_wait_for_mask(&mw);
		if (ret == 0)
			goto again;
		mlog_errno(ret);
	}
	ocfs2_update_lock_stats(lockres, level, &mw, ret);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (!ret && lockres->l_lockdep_map.key != NULL) {
		if (level == DLM_LOCK_PR)
			rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
				caller_ip);
		else
			rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
				caller_ip);
	}
#endif
	return ret;
}
/* Convenience wrapper: __ocfs2_cluster_lock() with lockdep subclass 0
 * and the immediate caller as the lockdep acquisition site. */
static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
				     struct ocfs2_lock_res *lockres,
				     int level,
				     u32 lkm_flags,
				     int arg_flags)
{
	return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
				    0, _RET_IP_);
}
/* Drop one holder reference at @level and, if that leaves the lockres
 * in a release condition while another node is blocked on it, kick the
 * downconvert thread (via ocfs2_downconvert_on_unlock).  Also releases
 * the lockdep annotation taken in __ocfs2_cluster_lock(). */
static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
				   struct ocfs2_lock_res *lockres,
				   int level,
				   unsigned long caller_ip)
{
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);
	ocfs2_dec_holders(lockres, level);
	ocfs2_downconvert_on_unlock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (lockres->l_lockdep_map.key != NULL)
		rwsem_release(&lockres->l_lockdep_map, caller_ip);
#endif
}
/* Create a lock on a brand-new (never attached) lockres at EX or PR.
 * Marks the lockres OCFS2_LOCK_LOCAL and, when @local is set, passes
 * DLM_LKF_LOCAL so the dlm can skip the cluster-wide lookup. */
static int ocfs2_create_new_lock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int ex,
				 int local)
{
	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	unsigned long flags;
	u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* Only valid on a lockres that has never been attached. */
	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ocfs2_lock_create(osb, lockres, level, lkm_flags);
}
/* Grants us an EX lock on the data and metadata resources, skipping
 * the normal cluster directory lookup. Use this ONLY on newly created
 * inodes which other nodes can't possibly see, and which haven't been
 * hashed in the inode hash yet. This can give us a good performance
 * increase as it'll skip the network broadcast normally associated
 * with creating a new lock resource. */
int ocfs2_create_new_inode_locks(struct inode *inode)
{
	int ret;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	BUG_ON(!ocfs2_inode_is_new(inode));

	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);

	/* NOTE: That we don't increment any of the holder counts, nor
	 * do we add anything to a journal handle. Since this is
	 * supposed to be a new inode which the cluster doesn't know
	 * about yet, there is no need to. As far as the LVB handling
	 * is concerned, this is basically like acquiring an EX lock
	 * on a resource which has an invalid one -- we'll set it
	 * valid when we release the EX. */

	/* rw lock: EX, and local since the name includes a generation */
	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	/*
	 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
	 * don't use a generation in their lock names.
	 */
	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	/* open lock: PR only, matching ocfs2_open_lock() */
	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
	if (ret)
		mlog_errno(ret);

bail:
	return ret;
}
  1497. int ocfs2_rw_lock(struct inode *inode, int write)
  1498. {
  1499. int status, level;
  1500. struct ocfs2_lock_res *lockres;
  1501. struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  1502. mlog(0, "inode %llu take %s RW lock\n",
  1503. (unsigned long long)OCFS2_I(inode)->ip_blkno,
  1504. write ? "EXMODE" : "PRMODE");
  1505. if (ocfs2_mount_local(osb))
  1506. return 0;
  1507. lockres = &OCFS2_I(inode)->ip_rw_lockres;
  1508. level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
  1509. status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
  1510. if (status < 0)
  1511. mlog_errno(status);
  1512. return status;
  1513. }
  1514. int ocfs2_try_rw_lock(struct inode *inode, int write)
  1515. {
  1516. int status, level;
  1517. struct ocfs2_lock_res *lockres;
  1518. struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  1519. mlog(0, "inode %llu try to take %s RW lock\n",
  1520. (unsigned long long)OCFS2_I(inode)->ip_blkno,
  1521. write ? "EXMODE" : "PRMODE");
  1522. if (ocfs2_mount_local(osb))
  1523. return 0;
  1524. lockres = &OCFS2_I(inode)->ip_rw_lockres;
  1525. level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
  1526. status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
  1527. return status;
  1528. }
  1529. void ocfs2_rw_unlock(struct inode *inode, int write)
  1530. {
  1531. int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
  1532. struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
  1533. struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  1534. mlog(0, "inode %llu drop %s RW lock\n",
  1535. (unsigned long long)OCFS2_I(inode)->ip_blkno,
  1536. write ? "EXMODE" : "PRMODE");
  1537. if (!ocfs2_mount_local(osb))
  1538. ocfs2_cluster_unlock(osb, lockres, level);
  1539. }
/*
 * ocfs2_open_lock always get PR mode lock.
 */
int ocfs2_open_lock(struct inode *inode)
{
	int status = 0;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog(0, "inode %llu take PRMODE open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	/* Hard-readonly and local mounts skip the cluster lock. */
	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
	if (status < 0)
		mlog_errno(status);

out:
	return status;
}
/* Try to take the open lock at EX (for @write) or PR without queueing.
 * -EAGAIN means some node still holds the inode open; -EROFS is
 * returned for EX attempts on a hard-readonly mount. */
int ocfs2_try_open_lock(struct inode *inode, int write)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog(0, "inode %llu try to take %s open lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (ocfs2_is_hard_readonly(osb)) {
		if (write)
			status = -EROFS;
		goto out;
	}

	if (ocfs2_mount_local(osb))
		goto out;

	lockres = &OCFS2_I(inode)->ip_open_lockres;

	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;

	/*
	 * The file system may already holding a PRMODE/EXMODE open lock.
	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
	 * other nodes and the -EAGAIN will indicate to the caller that
	 * this inode is still in use.
	 */
	status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);

out:
	return status;
}
  1586. /*
  1587. * ocfs2_open_unlock unlock PR and EX mode open locks.
  1588. */
  1589. void ocfs2_open_unlock(struct inode *inode)
  1590. {
  1591. struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
  1592. struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  1593. mlog(0, "inode %llu drop open lock\n",
  1594. (unsigned long long)OCFS2_I(inode)->ip_blkno);
  1595. if (ocfs2_mount_local(osb))
  1596. goto out;
  1597. if(lockres->l_ro_holders)
  1598. ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
  1599. if(lockres->l_ex_holders)
  1600. ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
  1601. out:
  1602. return;
  1603. }
/* A signal interrupted a flock upconvert.  Cancel the in-flight
 * convert (retrying until the lockres is no longer BUSY), then decide
 * the return: 0 if the lock was granted at @level anyway,
 * -ERESTARTSYS otherwise so the syscall can be restarted. */
static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
				     int level)
{
	int ret;
	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
	unsigned long flags;
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

retry_cancel:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		if (ret) {
			/* A cancel is possible - issue it, then loop to
			 * re-check the lockres state. */
			spin_unlock_irqrestore(&lockres->l_lock, flags);
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0) {
				mlog_errno(ret);
				goto out;
			}
			goto retry_cancel;
		}
		/* Nothing to cancel right now - wait for BUSY to clear
		 * and re-evaluate. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_for_mask(&mw);
		goto retry_cancel;
	}

	ret = -ERESTARTSYS;
	/*
	 * We may still have gotten the lock, in which case there's no
	 * point to restarting the syscall.
	 */
	if (lockres->l_level == level)
		ret = 0;

	mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
	     lockres->l_flags, lockres->l_level, lockres->l_action);

	spin_unlock_irqrestore(&lockres->l_lock, flags);

out:
	return ret;
}
/*
 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
 * flock() calls. The locking approach this requires is sufficiently
 * different from all other cluster lock types that we implement a
 * separate path to the "low-level" dlm calls. In particular:
 *
 * - No optimization of lock levels is done - we take at exactly
 *   what's been requested.
 *
 * - No lock caching is employed. We immediately downconvert to
 *   no-lock at unlock time. This also means flock locks never go on
 *   the blocking list.
 *
 * - Since userspace can trivially deadlock itself with flock, we make
 *   sure to allow cancellation of a misbehaving applications flock()
 *   request.
 *
 * - Access to any flock lockres doesn't require concurrency, so we
 *   can simplify the code by requiring the caller to guarantee
 *   serialization of dlmglue flock calls.
 */
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
	int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
	unsigned long flags;
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_lock_res *lockres = &fp->fp_flock;
	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

	/* Caller-guaranteed serialization means the lockres must be
	 * idle (not busy, at NL) when we get here. */
	if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
	    (lockres->l_level > DLM_LOCK_NL)) {
		mlog(ML_ERROR,
		     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
		     "level: %u\n", lockres->l_name, lockres->l_flags,
		     lockres->l_level);
		return -EINVAL;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/*
		 * Get the lock at NLMODE to start - that way we
		 * can cancel the upconvert request if need be.
		 */
		ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_wait_for_mask(&mw);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	/* Now convert from NL up to the requested level.  The mask
	 * waiter is queued before calling into the dlm so we can wait
	 * for the AST (or cancel on signal) below. */
	lockres->l_action = OCFS2_AST_CONVERT;
	lkm_flags |= DLM_LKF_CONVERT;
	lockres->l_requested = level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);

	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
	if (ret) {
		if (!trylock || (ret != -EAGAIN)) {
			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
			ret = -EINVAL;
		}

		ocfs2_recover_from_dlm_error(lockres, 1);
		lockres_remove_mask_waiter(lockres, &mw);
		goto out;
	}

	ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
	if (ret == -ERESTARTSYS) {
		/*
		 * Userspace can cause deadlock itself with
		 * flock(). Current behavior locally is to allow the
		 * deadlock, but abort the system call if a signal is
		 * received. We follow this example, otherwise a
		 * poorly written program could sit in kernel until
		 * reboot.
		 *
		 * Handling this is a bit more complicated for Ocfs2
		 * though. We can't exit this function with an
		 * outstanding lock request, so a cancel convert is
		 * required. We intentionally overwrite 'ret' - if the
		 * cancel fails and the lock was granted, it's easier
		 * to just bubble success back up to the user.
		 */
		ret = ocfs2_flock_handle_signal(lockres, level);
	} else if (!ret && (level > lockres->l_level)) {
		/* Trylock failed asynchronously */
		BUG_ON(!trylock);
		ret = -EAGAIN;
	}

out:
	mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
	     lockres->l_name, ex, trylock, ret);
	return ret;
}
/* Drop a flock by downconverting straight to NL.  We fake a blocking
 * ast (OCFS2_LOCK_BLOCKED + l_blocking = EX) so the shared downconvert
 * machinery does the work, then wait for BUSY to clear. */
void ocfs2_file_unlock(struct file *file)
{
	int ret;
	unsigned int gen;
	unsigned long flags;
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_lock_res *lockres = &fp->fp_flock;
	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

	/* Never locked (or already at NL) - nothing to drop. */
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
		return;

	if (lockres->l_level == DLM_LOCK_NL)
		return;

	mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
	     lockres->l_name, lockres->l_flags, lockres->l_level,
	     lockres->l_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/*
	 * Fake a blocking ast for the downconvert code.
	 */
	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
	lockres->l_blocking = DLM_LOCK_EX;

	gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
	if (ret) {
		mlog_errno(ret);
		return;
	}

	ret = ocfs2_wait_for_mask(&mw);
	if (ret)
		mlog_errno(ret);
}
/* Called with l_lock held (see __ocfs2_cluster_unlock).  If another
 * node is blocked on this lock and the remaining holder counts no
 * longer conflict with the blocking level, wake the downconvert
 * thread so it can release the lock promptly. */
static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	int kick = 0;

	/* If we know that another node is waiting on our lock, kick
	 * the downconvert thread * pre-emptively when we reach a release
	 * condition. */
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
		switch(lockres->l_blocking) {
		case DLM_LOCK_EX:
			/* EX elsewhere conflicts with any holder here */
			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
				kick = 1;
			break;
		case DLM_LOCK_PR:
			/* PR elsewhere only conflicts with our EX holders */
			if (!lockres->l_ex_holders)
				kick = 1;
			break;
		default:
			BUG();
		}
	}

	if (kick)
		ocfs2_wake_downconvert_thread(osb);
}
/* Packed-time layout: 34 bits of seconds in the high part of a u64,
 * 30 bits of nanoseconds in the low part. */
#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)

/* LVB only has room for 64 bits of time here so we pack it for
 * now. */
static u64 ocfs2_pack_timespec(struct timespec64 *spec)
{
	u64 res;
	/* Seconds are clamped to the 34-bit range the packing can hold. */
	u64 sec = clamp_t(time64_t, spec->tv_sec, 0, 0x3ffffffffull);
	u32 nsec = spec->tv_nsec;

	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);

	return res;
}
/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_inode_lock right now.
 *
 * Serializes the inode's cached metadata (size, uid/gid, mode, nlink,
 * times, attrs, generation) into the lock value block so other nodes
 * can pick it up without a disk read. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_meta_lvb *lvb;
	struct timespec64 ts;

	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);

	/*
	 * Invalidate the LVB of a deleted inode - this way other
	 * nodes are forced to go to disk and discover the new inode
	 * status.
	 */
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		lvb->lvb_version = 0;
		goto out;
	}

	lvb->lvb_version   = OCFS2_LVB_VERSION;
	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid      = cpu_to_be32(i_uid_read(inode));
	lvb->lvb_igid      = cpu_to_be32(i_gid_read(inode));
	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
	ts = inode_get_atime(inode);
	lvb->lvb_iatime_packed  = cpu_to_be64(ocfs2_pack_timespec(&ts));
	ts = inode_get_ctime(inode);
	lvb->lvb_ictime_packed  = cpu_to_be64(ocfs2_pack_timespec(&ts));
	ts = inode_get_mtime(inode);
	lvb->lvb_imtime_packed  = cpu_to_be64(ocfs2_pack_timespec(&ts));
	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);

out:
	mlog_meta_lvb(0, lockres);
}
  1857. static void ocfs2_unpack_timespec(struct timespec64 *spec,
  1858. u64 packed_time)
  1859. {
  1860. spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
  1861. spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
  1862. }
/* Populate the in-memory inode from the metadata carried in the lock
 * value block, avoiding a disk read.  Returns -ESTALE if the LVB's
 * mode disagrees with the inode's type (generation reuse). */
static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_meta_lvb *lvb;
	struct timespec64 ts;

	mlog_meta_lvb(0, lockres);

	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
	if (inode_wrong_type(inode, be16_to_cpu(lvb->lvb_imode)))
		return -ESTALE;

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
	oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
	ocfs2_set_inode_flags(inode);

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks = ocfs2_inode_sector_count(inode);

	i_uid_write(inode, be32_to_cpu(lvb->lvb_iuid));
	i_gid_write(inode, be32_to_cpu(lvb->lvb_igid));
	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
	set_nlink(inode, be16_to_cpu(lvb->lvb_inlink));
	ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed));
	inode_set_atime_to_ts(inode, ts);
	ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed));
	inode_set_mtime_to_ts(inode, ts);
	ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed));
	inode_set_ctime_to_ts(inode, ts);
	spin_unlock(&oi->ip_lock);
	return 0;
}
  1898. static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
  1899. struct ocfs2_lock_res *lockres)
  1900. {
  1901. struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
  1902. if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
  1903. && lvb->lvb_version == OCFS2_LVB_VERSION
  1904. && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
  1905. return 1;
  1906. return 0;
  1907. }
/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 * 0 means no refresh needed.
 *
 * > 0 means you need to refresh this and you MUST call
 * ocfs2_complete_lock_res_refresh afterwards. */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	/* Someone else is refreshing - wait for them to finish, then
	 * re-check: they may fail and leave NEEDS_REFRESH set. */
	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog(0, "status %d\n", status);
	return status;
}
/* If status is non zero, I'll mark it as not being in refresh
 * anymore, but I won't clear the needs refresh flag - a later caller
 * of ocfs2_should_refresh_lock_res() will then retry the refresh. */
static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
						   int status)
{
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
	if (!status)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* Wake anyone parked in ocfs2_wait_on_refreshing_lock(). */
	wake_up(&lockres->l_event);
}
/* may or may not return a bh if it went to disk. */
static int ocfs2_inode_lock_update(struct inode *inode,
				   struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_dinode *fe;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/* Local mounts have no other nodes, so nothing can go stale. */
	if (ocfs2_mount_local(osb))
		goto bail;

	/* The inode may have been wiped by another node while we
	 * waited for the lock; refuse to touch it in that case. */
	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	/* Nonzero means we won the right to refresh and MUST call
	 * ocfs2_complete_lock_res_refresh() on the way out. */
	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(INODE_CACHE(inode));

	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
		/* Fast path: refresh the in-memory inode from the lock
		 * value block, without any disk I/O. */
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		status = ocfs2_refresh_inode_from_lvb(inode);
		goto bail_refresh;
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_inode_block(inode, bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;
		if (inode_wrong_type(inode, le16_to_cpu(fe->i_mode))) {
			status = -ESTALE;
			goto bail_refresh;
		}

		/* This is a good chance to make sure we're not
		 * locking an invalid object. ocfs2_read_inode_block()
		 * already checked that the inode block is sane.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
		ocfs2_track_lock_refresh(lockres);
	}

	status = 0;
bail_refresh:
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	return status;
}
  2027. static int ocfs2_assign_bh(struct inode *inode,
  2028. struct buffer_head **ret_bh,
  2029. struct buffer_head *passed_bh)
  2030. {
  2031. int status;
  2032. if (passed_bh) {
  2033. /* Ok, the update went to disk for us, use the
  2034. * returned bh. */
  2035. *ret_bh = passed_bh;
  2036. get_bh(*ret_bh);
  2037. return 0;
  2038. }
  2039. status = ocfs2_read_inode_block(inode, ret_bh);
  2040. if (status < 0)
  2041. mlog_errno(status);
  2042. return status;
  2043. }
/*
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
int ocfs2_inode_lock_full_nested(struct inode *inode,
				 struct buffer_head **ret_bh,
				 int ex,
				 int arg_flags,
				 int subclass)
{
	int status, level, acquired;
	u32 dlm_flags;
	struct ocfs2_lock_res *lockres = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *local_bh = NULL;

	mlog(0, "inode %llu, take %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	status = 0;
	acquired = 0;
	/* We'll allow faking a readonly metadata lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto getbh;
	}

	/* GETBH callers already hold the cluster lock and only want the
	 * buffer_head; local mounts have no cluster locks at all.
	 * Either way, skip the DLM and go straight to the update. */
	if ((arg_flags & OCFS2_META_LOCK_GETBH) ||
	    ocfs2_mount_local(osb))
		goto update;

	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		ocfs2_wait_for_recovery(osb);

	lockres = &OCFS2_I(inode)->ip_inode_lockres;
	level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	dlm_flags = 0;
	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
		dlm_flags |= DLM_LKF_NOQUEUE;

	status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
				      arg_flags, subclass, _RET_IP_);
	if (status < 0) {
		/* -EAGAIN is an expected NOQUEUE outcome, not an error. */
		if (status != -EAGAIN)
			mlog_errno(status);
		goto bail;
	}

	/* Notify the error cleanup path to drop the cluster lock. */
	acquired = 1;

	/* We wait twice because a node may have died while we were in
	 * the lower dlm layers. The second time though, we've
	 * committed to owning this lock so we don't allow signals to
	 * abort the operation. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		ocfs2_wait_for_recovery(osb);
update:
	/*
	 * We only see this flag if we're being called from
	 * ocfs2_read_locked_inode(). It means we're locking an inode
	 * which hasn't been populated yet, so clear the refresh flag
	 * and let the caller handle it.
	 */
	if (inode->i_state & I_NEW) {
		status = 0;
		if (lockres)
			ocfs2_complete_lock_res_refresh(lockres, 0);
		goto bail;
	}

	/* This is fun. The caller may want a bh back, or it may
	 * not. ocfs2_inode_lock_update definitely wants one in, but
	 * may or may not read one, depending on what's in the
	 * LVB. The result of all of this is that we've *only* gone to
	 * disk if we have to, so the complexity is worthwhile. */
	status = ocfs2_inode_lock_update(inode, &local_bh);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}
getbh:
	if (ret_bh) {
		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (status < 0) {
		/* Error path: release anything handed to the caller and
		 * drop the cluster lock if we took it. */
		if (ret_bh && (*ret_bh)) {
			brelse(*ret_bh);
			*ret_bh = NULL;
		}
		if (acquired)
			ocfs2_inode_unlock(inode, ex);
	}

	brelse(local_bh);
	return status;
}
  2140. /*
  2141. * This is working around a lock inversion between tasks acquiring DLM
  2142. * locks while holding a page lock and the downconvert thread which
  2143. * blocks dlm lock acquiry while acquiring page locks.
  2144. *
 * ** These _with_page variants are only intended to be called from aop
  2146. * methods that hold page locks and return a very specific *positive* error
  2147. * code that aop methods pass up to the VFS -- test for errors with != 0. **
  2148. *
  2149. * The DLM is called such that it returns -EAGAIN if it would have
  2150. * blocked waiting for the downconvert thread. In that case we unlock
  2151. * our page so the downconvert thread can make progress. Once we've
  2152. * done this we have to return AOP_TRUNCATED_PAGE so the aop method
  2153. * that called us can bubble that back up into the VFS who will then
  2154. * immediately retry the aop call.
  2155. */
  2156. int ocfs2_inode_lock_with_page(struct inode *inode,
  2157. struct buffer_head **ret_bh,
  2158. int ex,
  2159. struct page *page)
  2160. {
  2161. int ret;
  2162. ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
  2163. if (ret == -EAGAIN) {
  2164. unlock_page(page);
  2165. /*
  2166. * If we can't get inode lock immediately, we should not return
  2167. * directly here, since this will lead to a softlockup problem.
  2168. * The method is to get a blocking lock and immediately unlock
  2169. * before returning, this can avoid CPU resource waste due to
  2170. * lots of retries, and benefits fairness in getting lock.
  2171. */
  2172. if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
  2173. ocfs2_inode_unlock(inode, ex);
  2174. ret = AOP_TRUNCATED_PAGE;
  2175. }
  2176. return ret;
  2177. }
/*
 * Take an inode lock for a read path, upgrading from PR to EX only if
 * atime needs updating.  *level records the level actually held on a
 * successful return (0 == PR, 1 == EX) so the caller unlocks at the
 * right level.  With wait == 0 the attempts are trylocks and may fail
 * with -EAGAIN.
 */
int ocfs2_inode_lock_atime(struct inode *inode,
			   struct vfsmount *vfsmnt,
			   int *level, int wait)
{
	int ret;

	if (wait)
		ret = ocfs2_inode_lock(inode, NULL, 0);
	else
		ret = ocfs2_try_inode_lock(inode, NULL, 0);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlog_errno(ret);
		return ret;
	}

	/*
	 * If we should update atime, we will get EX lock,
	 * otherwise we just get PR lock.
	 */
	if (ocfs2_should_update_atime(inode, vfsmnt)) {
		struct buffer_head *bh = NULL;

		/* Drop PR, retake at EX.  The condition is re-checked
		 * after relocking — presumably the state can change in
		 * the unlocked window. */
		ocfs2_inode_unlock(inode, 0);
		if (wait)
			ret = ocfs2_inode_lock(inode, &bh, 1);
		else
			ret = ocfs2_try_inode_lock(inode, &bh, 1);

		if (ret < 0) {
			if (ret != -EAGAIN)
				mlog_errno(ret);
			return ret;
		}
		*level = 1;
		if (ocfs2_should_update_atime(inode, vfsmnt))
			ocfs2_update_inode_atime(inode, bh);
		brelse(bh);
	} else
		*level = 0;

	return ret;
}
  2216. void ocfs2_inode_unlock(struct inode *inode,
  2217. int ex)
  2218. {
  2219. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  2220. struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
  2221. struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  2222. mlog(0, "inode %llu drop %s META lock\n",
  2223. (unsigned long long)OCFS2_I(inode)->ip_blkno,
  2224. ex ? "EXMODE" : "PRMODE");
  2225. if (!ocfs2_is_hard_readonly(osb) &&
  2226. !ocfs2_mount_local(osb))
  2227. ocfs2_cluster_unlock(osb, lockres, level);
  2228. }
  2229. /*
 * These _tracker variants are introduced to deal with the recursive cluster
  2231. * locking issue. The idea is to keep track of a lock holder on the stack of
  2232. * the current process. If there's a lock holder on the stack, we know the
  2233. * task context is already protected by cluster locking. Currently, they're
  2234. * used in some VFS entry routines.
  2235. *
  2236. * return < 0 on error, return == 0 if there's no lock holder on the stack
  2237. * before this call, return == 1 if this call would be a recursive locking.
  2238. * return == -1 if this lock attempt will cause an upgrade which is forbidden.
  2239. *
 * When taking lock levels into account, we face some different situations.
  2241. *
  2242. * 1. no lock is held
  2243. * In this case, just lock the inode as requested and return 0
  2244. *
  2245. * 2. We are holding a lock
  2246. * For this situation, things diverges into several cases
  2247. *
  2248. * wanted holding what to do
  2249. * ex ex see 2.1 below
  2250. * ex pr see 2.2 below
  2251. * pr ex see 2.1 below
  2252. * pr pr see 2.1 below
  2253. *
 * 2.1 lock level that is being held is compatible
 * with the wanted level, so no lock action will be taken.
  2256. *
  2257. * 2.2 Otherwise, an upgrade is needed, but it is forbidden.
  2258. *
  2259. * Reason why upgrade within a process is forbidden is that
  2260. * lock upgrade may cause dead lock. The following illustrates
  2261. * how it happens.
  2262. *
  2263. * thread on node1 thread on node2
  2264. * ocfs2_inode_lock_tracker(ex=0)
  2265. *
  2266. * <====== ocfs2_inode_lock_tracker(ex=1)
  2267. *
  2268. * ocfs2_inode_lock_tracker(ex=1)
  2269. */
int ocfs2_inode_lock_tracker(struct inode *inode,
			     struct buffer_head **ret_bh,
			     int ex,
			     struct ocfs2_lock_holder *oh)
{
	int status = 0;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_lock_holder *tmp_oh;
	struct pid *pid = task_pid(current);

	lockres = &OCFS2_I(inode)->ip_inode_lockres;
	/* Is the current task already recorded as a holder? */
	tmp_oh = ocfs2_pid_holder(lockres, pid);

	if (!tmp_oh) {
		/*
		 * This corresponds to the case 1.
		 * We haven't got any lock before.
		 */
		status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0);
		if (status < 0) {
			if (status != -ENOENT)
				mlog_errno(status);
			return status;
		}

		/* Register ourselves so a recursive call from this
		 * task is detected by the lookup above. */
		oh->oh_ex = ex;
		ocfs2_add_holder(lockres, oh);
		return 0;
	}

	if (unlikely(ex && !tmp_oh->oh_ex)) {
		/*
		 * case 2.2 upgrade may cause dead lock, forbid it.
		 */
		mlog(ML_ERROR, "Recursive locking is not permitted to "
		     "upgrade to EX level from PR level.\n");
		dump_stack();
		return -EINVAL;
	}

	/*
	 * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full
	 * ignore the lock level and just update it.
	 */
	if (ret_bh) {
		status = ocfs2_inode_lock_full(inode, ret_bh, ex,
					       OCFS2_META_LOCK_GETBH);
		if (status < 0) {
			if (status != -ENOENT)
				mlog_errno(status);
			return status;
		}
	}
	/* 1 tells the caller this was a recursive (already-held) lock. */
	return 1;
}
void ocfs2_inode_unlock_tracker(struct inode *inode,
				int ex,
				struct ocfs2_lock_holder *oh,
				int had_lock)
{
	struct ocfs2_lock_res *lockres;

	lockres = &OCFS2_I(inode)->ip_inode_lockres;
	/* had_lock means that the current process already took the cluster
	 * lock previously.
	 * If had_lock is 1, we have nothing to do here.
	 * If had_lock is 0, we will release the lock.
	 */
	if (!had_lock) {
		/* Unlock at the level we recorded at lock time (oh->oh_ex),
		 * not the 'ex' argument. */
		ocfs2_inode_unlock(inode, oh->oh_ex);
		ocfs2_remove_holder(lockres, oh);
	}
}
  2337. int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
  2338. {
  2339. struct ocfs2_lock_res *lockres;
  2340. struct ocfs2_orphan_scan_lvb *lvb;
  2341. int status = 0;
  2342. if (ocfs2_is_hard_readonly(osb))
  2343. return -EROFS;
  2344. if (ocfs2_mount_local(osb))
  2345. return 0;
  2346. lockres = &osb->osb_orphan_scan.os_lockres;
  2347. status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
  2348. if (status < 0)
  2349. return status;
  2350. lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
  2351. if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
  2352. lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
  2353. *seqno = be32_to_cpu(lvb->lvb_os_seqno);
  2354. else
  2355. *seqno = osb->osb_orphan_scan.os_seqno + 1;
  2356. return status;
  2357. }
  2358. void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
  2359. {
  2360. struct ocfs2_lock_res *lockres;
  2361. struct ocfs2_orphan_scan_lvb *lvb;
  2362. if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
  2363. lockres = &osb->osb_orphan_scan.os_lockres;
  2364. lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
  2365. lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
  2366. lvb->lvb_os_seqno = cpu_to_be32(seqno);
  2367. ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
  2368. }
  2369. }
int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status = 0;
	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	/* Local mounts take no cluster lock. */
	if (ocfs2_mount_local(osb))
		goto bail;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here. Of course, making sense of
	 * everything is up to the caller :) */
	status = ocfs2_should_refresh_lock_res(lockres);
	if (status) {
		status = ocfs2_refresh_slot_info(osb);

		ocfs2_complete_lock_res_refresh(lockres, status);

		if (status < 0) {
			/* On refresh failure we drop the just-taken
			 * cluster lock ourselves and return the error;
			 * the caller must not unlock again. */
			ocfs2_cluster_unlock(osb, lockres, level);
			mlog_errno(status);
		}
		ocfs2_track_lock_refresh(lockres);
	}
bail:
	return status;
}
  2402. void ocfs2_super_unlock(struct ocfs2_super *osb,
  2403. int ex)
  2404. {
  2405. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  2406. struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
  2407. if (!ocfs2_mount_local(osb))
  2408. ocfs2_cluster_unlock(osb, lockres, level);
  2409. }
  2410. int ocfs2_rename_lock(struct ocfs2_super *osb)
  2411. {
  2412. int status;
  2413. struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
  2414. if (ocfs2_is_hard_readonly(osb))
  2415. return -EROFS;
  2416. if (ocfs2_mount_local(osb))
  2417. return 0;
  2418. status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
  2419. if (status < 0)
  2420. mlog_errno(status);
  2421. return status;
  2422. }
  2423. void ocfs2_rename_unlock(struct ocfs2_super *osb)
  2424. {
  2425. struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
  2426. if (!ocfs2_mount_local(osb))
  2427. ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
  2428. }
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
{
	int status;
	struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	/* Take the local rwsem first; it is held across a successful
	 * return and released by ocfs2_nfs_sync_unlock(). */
	if (ex)
		down_write(&osb->nfs_sync_rwlock);
	else
		down_read(&osb->nfs_sync_rwlock);

	/* Local mounts hold only the rwsem, no cluster lock. */
	if (ocfs2_mount_local(osb))
		return 0;

	status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
				    0, 0);
	if (status < 0) {
		mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);

		/* Back out the rwsem taken above on cluster-lock failure. */
		if (ex)
			up_write(&osb->nfs_sync_rwlock);
		else
			up_read(&osb->nfs_sync_rwlock);
	}

	return status;
}
  2452. void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
  2453. {
  2454. struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
  2455. if (!ocfs2_mount_local(osb))
  2456. ocfs2_cluster_unlock(osb, lockres,
  2457. ex ? LKM_EXMODE : LKM_PRMODE);
  2458. if (ex)
  2459. up_write(&osb->nfs_sync_rwlock);
  2460. else
  2461. up_read(&osb->nfs_sync_rwlock);
  2462. }
/*
 * Take the global trim-fs lock at EX.  With trylock != 0 the request
 * uses DLM_LKF_NOQUEUE and may fail with -EAGAIN.  On success, if
 * 'info' is non-NULL and the LVB is valid and of the expected version,
 * fill 'info' with the previous trim run's results and set tf_valid.
 */
int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
		       struct ocfs2_trim_fs_info *info, int trylock)
{
	int status;
	struct ocfs2_trim_fs_lvb *lvb;
	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;

	if (info)
		info->tf_valid = 0;

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	if (ocfs2_mount_local(osb))
		return 0;

	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
				    trylock ? DLM_LKF_NOQUEUE : 0, 0);
	if (status < 0) {
		if (status != -EAGAIN)
			mlog_errno(status);
		return status;
	}

	if (info) {
		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
		if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
		    lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
			info->tf_valid = 1;
			info->tf_success = lvb->lvb_success;
			info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
			info->tf_start = be64_to_cpu(lvb->lvb_start);
			info->tf_len = be64_to_cpu(lvb->lvb_len);
			info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
			info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
		}
	}

	return status;
}
  2497. void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
  2498. struct ocfs2_trim_fs_info *info)
  2499. {
  2500. struct ocfs2_trim_fs_lvb *lvb;
  2501. struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
  2502. if (ocfs2_mount_local(osb))
  2503. return;
  2504. if (info) {
  2505. lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
  2506. lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
  2507. lvb->lvb_success = info->tf_success;
  2508. lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
  2509. lvb->lvb_start = cpu_to_be64(info->tf_start);
  2510. lvb->lvb_len = cpu_to_be64(info->tf_len);
  2511. lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
  2512. lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
  2513. }
  2514. ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
  2515. }
  2516. int ocfs2_dentry_lock(struct dentry *dentry, int ex)
  2517. {
  2518. int ret;
  2519. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  2520. struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
  2521. struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
  2522. BUG_ON(!dl);
  2523. if (ocfs2_is_hard_readonly(osb)) {
  2524. if (ex)
  2525. return -EROFS;
  2526. return 0;
  2527. }
  2528. if (ocfs2_mount_local(osb))
  2529. return 0;
  2530. ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
  2531. if (ret < 0)
  2532. mlog_errno(ret);
  2533. return ret;
  2534. }
  2535. void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
  2536. {
  2537. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  2538. struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
  2539. struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
  2540. if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
  2541. ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
  2542. }
  2543. /* Reference counting of the dlm debug structure. We want this because
  2544. * open references on the debug inodes can live on after a mount, so
  2545. * we can't rely on the ocfs2_super to always exist. */
  2546. static void ocfs2_dlm_debug_free(struct kref *kref)
  2547. {
  2548. struct ocfs2_dlm_debug *dlm_debug;
  2549. dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
  2550. kfree(dlm_debug);
  2551. }
  2552. void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
  2553. {
  2554. if (dlm_debug)
  2555. kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
  2556. }
  2557. static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
  2558. {
  2559. kref_get(&debug->d_refcnt);
  2560. }
  2561. struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
  2562. {
  2563. struct ocfs2_dlm_debug *dlm_debug;
  2564. dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
  2565. if (!dlm_debug) {
  2566. mlog_errno(-ENOMEM);
  2567. goto out;
  2568. }
  2569. kref_init(&dlm_debug->d_refcnt);
  2570. INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
  2571. dlm_debug->d_filter_secs = 0;
  2572. out:
  2573. return dlm_debug;
  2574. }
/* Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	struct ocfs2_dlm_debug *p_dlm_debug;	/* refcounted debug state for this mount */
	struct ocfs2_lock_res p_iter_res;	/* dummy lockres marking our list position */
	struct ocfs2_lock_res p_tmp_res;	/* snapshot of the lockres being shown */
};
/*
 * Return the next real lockres after 'start' on the global tracking
 * list, or NULL if the end of the list is reached.  Caller must hold
 * ocfs2_dlm_tracking_lock.
 */
static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
						 struct ocfs2_dlm_seq_priv *priv)
{
	struct ocfs2_lock_res *iter, *ret = NULL;
	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;

	assert_spin_locked(&ocfs2_dlm_tracking_lock);

	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
		/* discover the head of the list */
		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
			mlog(0, "End of list found, %p\n", ret);
			break;
		}

		/* We track our "dummy" iteration lockres' by a NULL
		 * l_ops field. */
		if (iter->l_ops != NULL) {
			ret = iter;
			break;
		}
	}

	return ret;
}
/* seq_file start: resume from wherever our dummy cursor sits. */
static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
	if (iter) {
		/* Since lockres' have the lifetime of their container
		 * (which can be inodes, ocfs2_supers, etc) we want to
		 * copy this out to a temporary lockres while still
		 * under the spinlock. Obviously after this we can't
		 * trust any pointers on the copy returned, but that's
		 * ok as the information we want isn't typically held
		 * in them. */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
/* Nothing to release; iteration state lives in the private struct. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter = v;
	struct ocfs2_lock_res *dummy = &priv->p_iter_res;

	(*pos)++;
	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(iter, priv);
	/* Re-anchor the dummy cursor after the element we're about to
	 * show so the next call resumes from the right spot even if
	 * lockres' come and go in between. */
	list_del_init(&dummy->l_debug_list);
	if (iter) {
		list_add(&dummy->l_debug_list, &iter->l_debug_list);
		/* Snapshot under the lock; see ocfs2_dlm_seq_start(). */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
  2642. /*
  2643. * Version is used by debugfs.ocfs2 to determine the format being used
  2644. *
  2645. * New in version 2
  2646. * - Lock stats printed
  2647. * New in version 3
  2648. * - Max time in lock stats is in usecs (instead of nsecs)
  2649. * New in version 4
  2650. * - Add last pr/ex unlock times and first lock wait time in usecs
  2651. */
  2652. #define OCFS2_DLM_DEBUG_STR_VERSION 4
/* Emit one tab-separated record for a lockres snapshot (format version
 * OCFS2_DLM_DEBUG_STR_VERSION, consumed by debugfs.ocfs2). */
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
	int i;
	char *lvb;
	struct ocfs2_lock_res *lockres = v;
#ifdef CONFIG_OCFS2_FS_STATS
	u64 now, last;
	struct ocfs2_dlm_debug *dlm_debug =
			((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug;
#endif

	if (!lockres)
		return -EINVAL;

#ifdef CONFIG_OCFS2_FS_STATS
	if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) {
		now = ktime_to_us(ktime_get_real());
		last = max(lockres->l_lock_prmode.ls_last,
			   lockres->l_lock_exmode.ls_last);
		/*
		 * Use d_filter_secs field to filter lock resources dump,
		 * the default d_filter_secs(0) value filters nothing,
		 * otherwise, only dump the last N seconds active lock
		 * resources.
		 */
		if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs)
			return 0;
	}
#endif

	seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);

	/* Dentry lock names embed a binary inode number; print the
	 * printable prefix and the inode number separately. */
	if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
		seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
			   lockres->l_name,
			   (unsigned int)ocfs2_get_dentry_lock_ino(lockres));
	else
		seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);

	seq_printf(m, "%d\t"
		   "0x%lx\t"
		   "0x%x\t"
		   "0x%x\t"
		   "%u\t"
		   "%u\t"
		   "%d\t"
		   "%d\t",
		   lockres->l_level,
		   lockres->l_flags,
		   lockres->l_action,
		   lockres->l_unlock_action,
		   lockres->l_ro_holders,
		   lockres->l_ex_holders,
		   lockres->l_requested,
		   lockres->l_blocking);

	/* Dump the raw LVB */
	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
	for(i = 0; i < DLM_LVB_LEN; i++)
		seq_printf(m, "0x%x\t", lvb[i]);

	/* With stats disabled, emit zeroes so the column layout is
	 * identical either way. */
#ifdef CONFIG_OCFS2_FS_STATS
# define lock_num_prmode(_l)		((_l)->l_lock_prmode.ls_gets)
# define lock_num_exmode(_l)		((_l)->l_lock_exmode.ls_gets)
# define lock_num_prmode_failed(_l)	((_l)->l_lock_prmode.ls_fail)
# define lock_num_exmode_failed(_l)	((_l)->l_lock_exmode.ls_fail)
# define lock_total_prmode(_l)		((_l)->l_lock_prmode.ls_total)
# define lock_total_exmode(_l)		((_l)->l_lock_exmode.ls_total)
# define lock_max_prmode(_l)		((_l)->l_lock_prmode.ls_max)
# define lock_max_exmode(_l)		((_l)->l_lock_exmode.ls_max)
# define lock_refresh(_l)		((_l)->l_lock_refresh)
# define lock_last_prmode(_l)		((_l)->l_lock_prmode.ls_last)
# define lock_last_exmode(_l)		((_l)->l_lock_exmode.ls_last)
# define lock_wait(_l)			((_l)->l_lock_wait)
#else
# define lock_num_prmode(_l)		(0)
# define lock_num_exmode(_l)		(0)
# define lock_num_prmode_failed(_l)	(0)
# define lock_num_exmode_failed(_l)	(0)
# define lock_total_prmode(_l)		(0ULL)
# define lock_total_exmode(_l)		(0ULL)
# define lock_max_prmode(_l)		(0)
# define lock_max_exmode(_l)		(0)
# define lock_refresh(_l)		(0)
# define lock_last_prmode(_l)		(0ULL)
# define lock_last_exmode(_l)		(0ULL)
# define lock_wait(_l)			(0ULL)
#endif
	/* The following seq_print was added in version 2 of this output */
	seq_printf(m, "%u\t"
		   "%u\t"
		   "%u\t"
		   "%u\t"
		   "%llu\t"
		   "%llu\t"
		   "%u\t"
		   "%u\t"
		   "%u\t"
		   "%llu\t"
		   "%llu\t"
		   "%llu\t",
		   lock_num_prmode(lockres),
		   lock_num_exmode(lockres),
		   lock_num_prmode_failed(lockres),
		   lock_num_exmode_failed(lockres),
		   lock_total_prmode(lockres),
		   lock_total_exmode(lockres),
		   lock_max_prmode(lockres),
		   lock_max_exmode(lockres),
		   lock_refresh(lockres),
		   lock_last_prmode(lockres),
		   lock_last_exmode(lockres),
		   lock_wait(lockres));

	/* End the line */
	seq_printf(m, "\n");
	return 0;
}
/* seq_file iterator over the global lockres tracking list. */
static const struct seq_operations ocfs2_dlm_seq_ops = {
	.start =	ocfs2_dlm_seq_start,
	.stop =		ocfs2_dlm_seq_stop,
	.next =		ocfs2_dlm_seq_next,
	.show =		ocfs2_dlm_seq_show,
};
  2769. static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
  2770. {
  2771. struct seq_file *seq = file->private_data;
  2772. struct ocfs2_dlm_seq_priv *priv = seq->private;
  2773. struct ocfs2_lock_res *res = &priv->p_iter_res;
  2774. ocfs2_remove_lockres_tracking(res);
  2775. ocfs2_put_dlm_debug(priv->p_dlm_debug);
  2776. return seq_release_private(inode, file);
  2777. }
  2778. static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
  2779. {
  2780. struct ocfs2_dlm_seq_priv *priv;
  2781. struct ocfs2_super *osb;
  2782. priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
  2783. if (!priv) {
  2784. mlog_errno(-ENOMEM);
  2785. return -ENOMEM;
  2786. }
  2787. osb = inode->i_private;
  2788. ocfs2_get_dlm_debug(osb->osb_dlm_debug);
  2789. priv->p_dlm_debug = osb->osb_dlm_debug;
  2790. INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
  2791. ocfs2_add_lockres_tracking(&priv->p_iter_res,
  2792. priv->p_dlm_debug);
  2793. return 0;
  2794. }
/* debugfs "locking_state" file: one record per tracked lockres. */
static const struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};
/* Create this mount's debugfs entries and take a debug reference,
 * released by ocfs2_dlm_shutdown_debug(). */
static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
			    osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);

	/* Exposes d_filter_secs directly as a read/write u32. */
	debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
			   &dlm_debug->d_filter_secs);
	ocfs2_get_dlm_debug(dlm_debug);
}
  2810. static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
  2811. {
  2812. struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
  2813. if (dlm_debug)
  2814. ocfs2_put_dlm_debug(dlm_debug);
  2815. }
/*
 * Connect this mount to the cluster stack: start the downconvert
 * thread, establish the cluster connection (domain == uuid), discover
 * our node number, and initialize the per-superblock lock resources.
 * Local mounts skip the cluster pieces entirely.
 */
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
	int status = 0;
	struct ocfs2_cluster_connection *conn = NULL;

	if (ocfs2_mount_local(osb)) {
		/* No cluster stack; node 0 stands in for us. */
		osb->node_num = 0;
		goto local;
	}

	ocfs2_dlm_init_debug(osb);

	/* launch downconvert thread */
	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
			osb->uuid_str);
	if (IS_ERR(osb->dc_task)) {
		status = PTR_ERR(osb->dc_task);
		osb->dc_task = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* for now, uuid == domain */
	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
				       osb->osb_cluster_name,
				       strlen(osb->osb_cluster_name),
				       osb->uuid_str,
				       strlen(osb->uuid_str),
				       &lproto, ocfs2_do_node_down, osb,
				       &conn);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_cluster_this_node(conn, &osb->node_num);
	if (status < 0) {
		mlog_errno(status);
		mlog(ML_ERROR,
		     "could not find this host's node number\n");
		ocfs2_cluster_disconnect(conn, 0);
		goto bail;
	}

local:
	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
	ocfs2_nfs_sync_lock_init(osb);
	ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);

	osb->cconn = conn;
bail:
	if (status < 0) {
		/* Error path: tear down whatever was set up above. */
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->dc_task)
			kthread_stop(osb->dc_task);
	}

	return status;
}
/*
 * Tear down cluster locking for this superblock: drop the global
 * (per-sb) locks, stop the downconvert thread, free the lock
 * resources and disconnect from the cluster.  @hangup_pending is
 * forwarded to the disconnect so the stack knows whether a userspace
 * hangup is still outstanding.
 */
void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
			int hangup_pending)
{
	ocfs2_drop_osb_locks(osb);

	/*
	 * Now that we have dropped all locks and ocfs2_dismount_volume()
	 * has disabled recovery, the DLM won't be talking to us. It's
	 * safe to tear things down before disconnecting the cluster.
	 */

	if (osb->dc_task) {
		kthread_stop(osb->dc_task);
		osb->dc_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);
	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
	ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);

	if (osb->cconn) {
		ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
		osb->cconn = NULL;

		/* Debug state is only torn down once we know the DLM
		 * can no longer call back into us. */
		ocfs2_dlm_shutdown_debug(osb);
	}
}
/*
 * Drop a lockres for good: wait until it is no longer BUSY, publish
 * the LVB if we hold a valid EX, then issue the DLM unlock and wait
 * for the unlock AST.  The caller must already have marked the
 * lockres OCFS2_LOCK_FREEING.  Always returns 0; an unlock failure at
 * this point is fatal (BUG).
 */
static int ocfs2_drop_lock(struct ocfs2_super *osb,
			   struct ocfs2_lock_res *lockres)
{
	int ret;
	unsigned long flags;
	u32 lkm_flags = 0;

	/* We didn't get anywhere near actually using this lockres. */
	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
		goto out;

	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
		lkm_flags |= DLM_LKF_VALBLK;

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
			"lockres %s, flags 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
		     "%u, unlock_action = %u\n",
		     lockres->l_name, lockres->l_flags, lockres->l_action,
		     lockres->l_unlock_action);

		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* XXX: Today we just wait on any busy
		 * locks... Perhaps we need to cancel converts in the
		 * future? */
		ocfs2_wait_on_busy_lock(lockres);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
		/* Only a fully-refreshed EX holder has an LVB worth
		 * writing back to the DLM. */
		if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
		    lockres->l_level == DLM_LOCK_EX &&
		    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
			lockres->l_ops->set_lvb(lockres);
	}

	if (lockres->l_flags & OCFS2_LOCK_BUSY)
		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
		     lockres->l_name);
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		/* Never attached to the DLM -- nothing to unlock. */
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto out;
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);

	/* make sure we never get here while waiting for an ast to
	 * fire. */
	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);

	/* is this necessary? */
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog(0, "lock %s\n", lockres->l_name);

	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
	if (ret) {
		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
		ocfs2_dlm_dump_lksb(&lockres->l_lksb);
		BUG();
	}
	mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
	     lockres->l_name);

	/* Wait for the unlock AST to clear BUSY. */
	ocfs2_wait_on_busy_lock(lockres);
out:
	return 0;
}
  2955. static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
  2956. struct ocfs2_lock_res *lockres);
/* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
 * being dequeued from the downconvert thread before we can consider
 * it safe to drop.
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore. */
void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags, flags2;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
		/*
		 * We know the downconvert is queued but not in progress
		 * because we are the downconvert thread and processing
		 * different lock. So we can just remove the lock from the
		 * queue. This is not only an optimization but also a way
		 * to avoid the following deadlock:
		 *   ocfs2_dentry_post_unlock()
		 *     ocfs2_dentry_lock_put()
		 *       ocfs2_drop_dentry_lock()
		 *         iput()
		 *           ocfs2_evict_inode()
		 *             ocfs2_clear_inode()
		 *               ocfs2_mark_lockres_freeing()
		 *                 ... blocks waiting for OCFS2_LOCK_QUEUED
		 *                 since we are the downconvert thread which
		 *                 should clear the flag.
		 */
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		spin_lock_irqsave(&osb->dc_task_lock, flags2);
		list_del_init(&lockres->l_blocked_list);
		osb->blocked_lock_count--;
		spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
		/*
		 * Warn if we recurse into another post_unlock call.  Strictly
		 * speaking it isn't a problem but we need to be careful if
		 * that happens (stack overflow, deadlocks, ...) so warn if
		 * ocfs2 grows a path for which this can happen.
		 */
		WARN_ON_ONCE(lockres->l_ops->post_unlock);
		/* Since the lock is freeing we don't do much in the fn below */
		ocfs2_process_blocked_lock(osb, lockres);
		return;
	}
	/* Otherwise wait for the downconvert thread to dequeue it:
	 * re-arm a mask waiter each time around until QUEUED clears. */
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
  3017. void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
  3018. struct ocfs2_lock_res *lockres)
  3019. {
  3020. int ret;
  3021. ocfs2_mark_lockres_freeing(osb, lockres);
  3022. ret = ocfs2_drop_lock(osb, lockres);
  3023. if (ret)
  3024. mlog_errno(ret);
  3025. }
  3026. static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
  3027. {
  3028. ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
  3029. ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
  3030. ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
  3031. ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
  3032. }
  3033. int ocfs2_drop_inode_locks(struct inode *inode)
  3034. {
  3035. int status, err;
  3036. /* No need to call ocfs2_mark_lockres_freeing here -
  3037. * ocfs2_clear_inode has done it for us. */
  3038. err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
  3039. &OCFS2_I(inode)->ip_open_lockres);
  3040. if (err < 0)
  3041. mlog_errno(err);
  3042. status = err;
  3043. err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
  3044. &OCFS2_I(inode)->ip_inode_lockres);
  3045. if (err < 0)
  3046. mlog_errno(err);
  3047. if (err < 0 && !status)
  3048. status = err;
  3049. err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
  3050. &OCFS2_I(inode)->ip_rw_lockres);
  3051. if (err < 0)
  3052. mlog_errno(err);
  3053. if (err < 0 && !status)
  3054. status = err;
  3055. return status;
  3056. }
/*
 * Stage a downconvert of @lockres to @new_level.  Records the AST
 * action and requested level, marks the lock BUSY, and returns the
 * "pending" generation to pass to lockres_clear_pending() after the
 * DLM call is issued.  Caller must hold l_lock.
 */
static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
					      int new_level)
{
	assert_spin_locked(&lockres->l_lock);

	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);

	if (lockres->l_level <= new_level) {
		/* A downconvert must strictly lower the level; dump
		 * full lockres state and die otherwise. */
		mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
		     "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
		     "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
		     new_level, list_empty(&lockres->l_blocked_list),
		     list_empty(&lockres->l_mask_waiters), lockres->l_type,
		     lockres->l_flags, lockres->l_ro_holders,
		     lockres->l_ex_holders, lockres->l_action,
		     lockres->l_unlock_action, lockres->l_requested,
		     lockres->l_blocking, lockres->l_pending_gen);
		BUG();
	}

	mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
	     lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);

	lockres->l_action = OCFS2_AST_DOWNCONVERT;
	lockres->l_requested = new_level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	return lockres_set_pending(lockres);
}
/*
 * Issue the DLM convert for a downconvert staged by
 * ocfs2_prepare_downconvert().  @lvb asks the DLM to write our lock
 * value block; @generation pairs with lockres_clear_pending() to
 * resolve the BUSY/PENDING race documented in ocfs2_unblock_lock().
 * Returns 0 or a negative errno from the DLM.
 */
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				  struct ocfs2_lock_res *lockres,
				  int new_level,
				  int lvb,
				  unsigned int generation)
{
	int ret;
	u32 dlm_flags = DLM_LKF_CONVERT;

	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
	     lockres->l_level, new_level);

	/*
	 * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
	 * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
	 * we can recover correctly from node failure. Otherwise, we may get
	 * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
	 */
	if (ocfs2_userspace_stack(osb) &&
	    lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
		lvb = 1;

	if (lvb)
		dlm_flags |= DLM_LKF_VALBLK;

	ret = ocfs2_dlm_lock(osb->cconn,
			     new_level,
			     &lockres->l_lksb,
			     dlm_flags,
			     lockres->l_name,
			     OCFS2_LOCK_ID_MAX_LEN - 1);
	/* Clear PENDING now that dlm_lock() has actually been called. */
	lockres_clear_pending(lockres, generation, osb);
	if (ret) {
		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
		ocfs2_recover_from_dlm_error(lockres, 1);
		goto bail;
	}

	ret = 0;
bail:
	return ret;
}
/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		/* If we're already trying to cancel a lock conversion
		 * then just drop the spinlock and allow the caller to
		 * requeue this lock. */
		mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
		return 0;
	}

	/* were we in a convert when we got the bast fire? */
	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
	/* set things up for the unlockast to know to just
	 * clear out the ast_action and unset busy, etc. */
	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;

	/* A convert in flight implies BUSY is set. */
	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
			"lock %s, invalid flags: 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);

	return 1;
}
  3142. static int ocfs2_cancel_convert(struct ocfs2_super *osb,
  3143. struct ocfs2_lock_res *lockres)
  3144. {
  3145. int ret;
  3146. ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
  3147. DLM_LKF_CANCEL);
  3148. if (ret) {
  3149. ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
  3150. ocfs2_recover_from_dlm_error(lockres, 0);
  3151. }
  3152. mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
  3153. return ret;
  3154. }
/*
 * Core of the downconvert thread's per-lockres processing.  Decide
 * whether a BLOCKED lockres can be downconverted right now, whether
 * an in-flight convert must be cancelled first, or whether the caller
 * should requeue it (ctl->requeue) and try again later.
 *
 * Returns 0 on success (possibly with ctl->requeue set) or a negative
 * errno from the DLM.
 */
static int ocfs2_unblock_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      struct ocfs2_unblock_ctl *ctl)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int level;
	int ret = 0;
	int set_lvb = 0;
	unsigned int gen;

	spin_lock_irqsave(&lockres->l_lock, flags);

recheck:
	/*
	 * Is it still blocking? If not, we have no more work to do.
	 */
	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
		BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = 0;
		goto leave;
	}

	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		/* XXX
		 * This is a *big* race.  The OCFS2_LOCK_PENDING flag
		 * exists entirely for one reason - another thread has set
		 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
		 *
		 * If we do ocfs2_cancel_convert() before the other thread
		 * calls dlm_lock(), our cancel will do nothing.  We will
		 * get no ast, and we will have no way of knowing the
		 * cancel failed.  Meanwhile, the other thread will call
		 * into dlm_lock() and wait...forever.
		 *
		 * Why forever?  Because another node has asked for the
		 * lock first; that's why we're here in unblock_lock().
		 *
		 * The solution is OCFS2_LOCK_PENDING.  When PENDING is
		 * set, we just requeue the unblock.  Only when the other
		 * thread has called dlm_lock() and cleared PENDING will
		 * we then cancel their request.
		 *
		 * All callers of dlm_lock() must set OCFS2_DLM_PENDING
		 * at the same time they set OCFS2_DLM_BUSY.  They must
		 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
		 */
		if (lockres->l_flags & OCFS2_LOCK_PENDING) {
			mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
			     lockres->l_name);
			goto leave_requeue;
		}

		ctl->requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/*
	 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
	 * set when the ast is received for an upconvert just before the
	 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
	 * on the heels of the ast, we want to delay the downconvert just
	 * enough to allow the up requestor to do its task. Because this
	 * lock is in the blocked queue, the lock will be downconverted
	 * as soon as the requestor is done with the lock.
	 */
	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
		goto leave_requeue;

	/*
	 * How can we block and yet be at NL?  We were trying to upconvert
	 * from NL and got canceled.  The code comes back here, and now
	 * we notice and clear BLOCKING.
	 */
	if (lockres->l_level == DLM_LOCK_NL) {
		BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
		mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
		lockres->l_blocking = DLM_LOCK_NL;
		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue. */
	if ((lockres->l_blocking == DLM_LOCK_EX)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
		     lockres->l_name, lockres->l_ex_holders,
		     lockres->l_ro_holders);
		goto leave_requeue;
	}

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == DLM_LOCK_PR &&
	    lockres->l_ex_holders) {
		mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
		     lockres->l_name, lockres->l_ex_holders);
		goto leave_requeue;
	}

	/*
	 * Can we get a lock in this state if the holder counts are
	 * zero? The meta data unblock code used to check this.
	 */
	if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
		mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
		     lockres->l_name);
		goto leave_requeue;
	}

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	if (lockres->l_ops->check_downconvert
	    && !lockres->l_ops->check_downconvert(lockres, new_level)) {
		mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
		     lockres->l_name);
		goto leave_requeue;
	}

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!lockres->l_ops->downconvert_worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	level = lockres->l_level;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);

	if (ctl->unblock_action == UNBLOCK_STOP_POST) {
		mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
		     lockres->l_name);
		goto leave;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);
	if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
		/* If this changed underneath us, then we can't drop
		 * it just yet. */
		mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
		     "Recheck\n", lockres->l_name, blocking,
		     lockres->l_blocking, level, lockres->l_level);
		goto recheck;
	}

downconvert:
	ctl->requeue = 0;

	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
		if (lockres->l_level == DLM_LOCK_EX)
			set_lvb = 1;

		/*
		 * We only set the lvb if the lock has been fully
		 * refreshed - otherwise we risk setting stale
		 * data. Otherwise, there's no need to actually clear
		 * out the lvb here as it's value is still valid.
		 */
		if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
			lockres->l_ops->set_lvb(lockres);
	}

	gen = ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
				     gen);
	/* The dlm lock convert is being cancelled in background,
	 * ocfs2_cancel_convert() is asynchronous in fs/dlm,
	 * requeue it, try again later.
	 */
	if (ret == -EBUSY) {
		ctl->requeue = 1;
		mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
		     lockres->l_name);
		ret = 0;
		msleep(20);
	}

leave:
	if (ret)
		mlog_errno(ret);
	return ret;

leave_requeue:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ctl->requeue = 1;

	return 0;
}
/*
 * Downconvert worker for inode data locks: flush and (on EX block)
 * invalidate this node's cached data before another node takes the
 * lock.  Directories only bump a generation counter; non-regular
 * files need no page-cache work at all.
 */
static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				     int blocking)
{
	struct inode *inode;
	struct address_space *mapping;
	struct ocfs2_inode_info *oi;

	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (S_ISDIR(inode->i_mode)) {
		/* Directories carry no mapped pages; invalidating the
		 * dir lock generation is sufficient. */
		oi = OCFS2_I(inode);
		oi->ip_dir_lock_gen++;
		mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
		goto out_forget;
	}

	if (!S_ISREG(inode->i_mode))
		goto out;

	/*
	 * We need this before the filemap_fdatawrite() so that it can
	 * transfer the dirty bit from the PTE to the
	 * page. Unfortunately this means that even for EX->PR
	 * downconverts, we'll lose our mappings and have to build
	 * them up again.
	 */
	unmap_mapping_range(mapping, 0, 0, 0);

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == DLM_LOCK_EX) {
		truncate_inode_pages(mapping, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case. */
		filemap_fdatawait(mapping);
	}

out_forget:
	forget_all_cached_acls(inode);

out:
	return UNBLOCK_CONTINUE;
}
  3383. static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
  3384. struct ocfs2_lock_res *lockres,
  3385. int new_level)
  3386. {
  3387. int checkpointed = ocfs2_ci_fully_checkpointed(ci);
  3388. BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
  3389. BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
  3390. if (checkpointed)
  3391. return 1;
  3392. ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
  3393. return 0;
  3394. }
  3395. static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
  3396. int new_level)
  3397. {
  3398. struct inode *inode = ocfs2_lock_res_inode(lockres);
  3399. return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
  3400. }
  3401. static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
  3402. {
  3403. struct inode *inode = ocfs2_lock_res_inode(lockres);
  3404. __ocfs2_stuff_meta_lvb(inode);
  3405. }
  3406. /*
  3407. * Does the final reference drop on our dentry lock. Right now this
  3408. * happens in the downconvert thread, but we could choose to simplify the
  3409. * dlmglue API and push these off to the ocfs2_wq in the future.
  3410. */
  3411. static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
  3412. struct ocfs2_lock_res *lockres)
  3413. {
  3414. struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
  3415. ocfs2_dentry_lock_put(osb, dl);
  3416. }
/*
 * d_delete() matching dentries before the lock downconvert.
 *
 * At this point, any process waiting to destroy the
 * dentry_lock due to last ref count is stopped by the
 * OCFS2_LOCK_QUEUED flag.
 *
 * We have two potential problems
 *
 * 1) If we do the last reference drop on our dentry_lock (via dput)
 *    we'll wind up in ocfs2_release_dentry_lock(), waiting on
 *    the downconvert to finish. Instead we take an elevated
 *    reference and push the drop until after we've completed our
 *    unblock processing.
 *
 * 2) There might be another process with a final reference,
 *    waiting on us to finish processing. If this is the case, we
 *    detect it and exit out - there's no more dentries anyway.
 */
static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
				       int blocking)
{
	struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
	struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
	struct dentry *dentry;
	unsigned long flags;
	int extra_ref = 0;

	/*
	 * This node is blocking another node from getting a read
	 * lock. This happens when we've renamed within a
	 * directory. We've forced the other nodes to d_delete(), but
	 * we never actually dropped our lock because it's still
	 * valid. The downconvert code will retain a PR for this node,
	 * so there's no further work to do.
	 */
	if (blocking == DLM_LOCK_PR)
		return UNBLOCK_CONTINUE;

	/*
	 * Mark this inode as potentially orphaned. The code in
	 * ocfs2_delete_inode() will figure out whether it actually
	 * needs to be freed or not.
	 */
	spin_lock(&oi->ip_lock);
	oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
	spin_unlock(&oi->ip_lock);

	/*
	 * Yuck. We need to make sure however that the check of
	 * OCFS2_LOCK_FREEING and the extra reference are atomic with
	 * respect to a reference decrement or the setting of that
	 * flag.
	 */
	spin_lock_irqsave(&lockres->l_lock, flags);
	spin_lock(&dentry_attach_lock);
	if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
	    && dl->dl_count) {
		dl->dl_count++;
		extra_ref = 1;
	}
	spin_unlock(&dentry_attach_lock);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog(0, "extra_ref = %d\n", extra_ref);

	/*
	 * We have a process waiting on us in ocfs2_dentry_iput(),
	 * which means we can't have any more outstanding
	 * aliases. There's no need to do any more work.
	 */
	if (!extra_ref)
		return UNBLOCK_CONTINUE;

	/* Walk and d_delete() every local alias of the inode under
	 * this parent; drop dentry_attach_lock around the dcache
	 * calls since they may sleep/iput. */
	spin_lock(&dentry_attach_lock);
	while (1) {
		dentry = ocfs2_find_local_alias(dl->dl_inode,
						dl->dl_parent_blkno, 1);
		if (!dentry)
			break;
		spin_unlock(&dentry_attach_lock);

		if (S_ISDIR(dl->dl_inode->i_mode))
			shrink_dcache_parent(dentry);

		mlog(0, "d_delete(%pd);\n", dentry);

		/*
		 * The following dcache calls may do an
		 * iput(). Normally we don't want that from the
		 * downconverting thread, but in this case it's ok
		 * because the requesting node already has an
		 * exclusive lock on the inode, so it can't be queued
		 * for a downconvert.
		 */
		d_delete(dentry);
		dput(dentry);

		spin_lock(&dentry_attach_lock);
	}
	spin_unlock(&dentry_attach_lock);

	/*
	 * If we are the last holder of this dentry lock, there is no
	 * reason to downconvert so skip straight to the unlock.
	 */
	if (dl->dl_count == 1)
		return UNBLOCK_STOP_POST;

	return UNBLOCK_CONTINUE_POST;
}
  3516. static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
  3517. int new_level)
  3518. {
  3519. struct ocfs2_refcount_tree *tree =
  3520. ocfs2_lock_res_refcount_tree(lockres);
  3521. return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
  3522. }
  3523. static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
  3524. int blocking)
  3525. {
  3526. struct ocfs2_refcount_tree *tree =
  3527. ocfs2_lock_res_refcount_tree(lockres);
  3528. ocfs2_metadata_cache_purge(&tree->rf_ci);
  3529. return UNBLOCK_CONTINUE;
  3530. }
  3531. static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
  3532. {
  3533. struct ocfs2_qinfo_lvb *lvb;
  3534. struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
  3535. struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
  3536. oinfo->dqi_gi.dqi_type);
  3537. lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
  3538. lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
  3539. lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
  3540. lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
  3541. lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
  3542. lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
  3543. lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
  3544. lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
  3545. }
  3546. void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
  3547. {
  3548. struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
  3549. struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
  3550. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  3551. if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
  3552. ocfs2_cluster_unlock(osb, lockres, level);
  3553. }
/*
 * Refresh the in-core global quota info: from the lock value block
 * when the DLM reports it valid and the version matches, otherwise
 * from the on-disk global quota file.  Returns 0 or a negative errno
 * from the disk read.
 */
static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
{
	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
					    oinfo->dqi_gi.dqi_type);
	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
	struct buffer_head *bh = NULL;
	struct ocfs2_global_disk_dqinfo *gdinfo;
	int status = 0;

	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
	    lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
		/* Fast path: trust the LVB (big-endian on the wire). */
		info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
		info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
		oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
		oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
		oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
		oinfo->dqi_gi.dqi_free_entry =
					be32_to_cpu(lvb->lvb_free_entry);
	} else {
		/* Slow path: read the global info block from disk
		 * (little-endian on disk). */
		status = ocfs2_read_quota_phys_block(oinfo->dqi_gqinode,
						     oinfo->dqi_giblk, &bh);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
		gdinfo = (struct ocfs2_global_disk_dqinfo *)
					(bh->b_data + OCFS2_GLOBAL_INFO_OFF);
		info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
		info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
		oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
		oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
		oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
		oinfo->dqi_gi.dqi_free_entry =
					le32_to_cpu(gdinfo->dqi_free_entry);
		brelse(bh);
		ocfs2_track_lock_refresh(lockres);
	}

bail:
	return status;
}
/* Lock quota info, this function expects at least shared lock on the quota
 * file so that we can safely refresh quota info from disk. */
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	int status = 0;

	/* On RO devices, locking really isn't needed... */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}
	if (ocfs2_mount_local(osb))
		goto bail;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;
	/* OK, we have the lock but we need to refresh the quota info */
	status = ocfs2_refresh_qinfo(oinfo);
	if (status)
		/* Refresh failed; don't return to the caller holding
		 * the cluster lock. */
		ocfs2_qinfo_unlock(oinfo, ex);
	/* Wake any other waiters serialized on the refresh. */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	return status;
}
  3625. int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
  3626. {
  3627. int status;
  3628. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  3629. struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
  3630. struct ocfs2_super *osb = lockres->l_priv;
  3631. if (ocfs2_is_hard_readonly(osb))
  3632. return -EROFS;
  3633. if (ocfs2_mount_local(osb))
  3634. return 0;
  3635. status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
  3636. if (status < 0)
  3637. mlog_errno(status);
  3638. return status;
  3639. }
  3640. void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
  3641. {
  3642. int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
  3643. struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
  3644. struct ocfs2_super *osb = lockres->l_priv;
  3645. if (!ocfs2_mount_local(osb))
  3646. ocfs2_cluster_unlock(osb, lockres, level);
  3647. }
/* Downconvert-thread worker: attempt to unblock one queued lock resource.
 * Called without l_lock held; takes and drops lockres->l_lock internally.
 * On exit the lockres is either dequeued (OCFS2_LOCK_QUEUED cleared) or
 * requeued via ocfs2_schedule_blocked_lock() for another pass. */
static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_unblock_ctl ctl = {0, 0,};
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */
	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);

	mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the downconvert thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some
	 * performance. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;	/* jump with l_lock still held */
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = ocfs2_unblock_lock(osb, lockres, &ctl);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	/* Re-check FREEING here: it may have been set while l_lock was
	 * dropped around ocfs2_unblock_lock() above. */
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) {
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
	     ctl.requeue ? "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* Run the per-type post-unlock hook (if any) outside l_lock. */
	if (ctl.unblock_action != UNBLOCK_CONTINUE
	    && lockres->l_ops->post_unlock)
		lockres->l_ops->post_unlock(osb, lockres);
}
/* Queue a lockres on osb->blocked_lock_list for the downconvert thread.
 * Must be called with lockres->l_lock held (asserted below); nests
 * osb->dc_task_lock inside it. */
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	unsigned long flags;

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	spin_lock_irqsave(&osb->dc_task_lock, flags);
	/* list_empty() guards against double-queueing a lockres that is
	 * already on the blocked list. */
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
/* One pass of the downconvert thread: drain up to the current number of
 * entries from osb->blocked_lock_list, running each lockres through
 * ocfs2_process_blocked_lock().  dc_task_lock is dropped around the
 * per-lock processing, so the list may change concurrently. */
static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
{
	unsigned long processed;
	unsigned long flags;
	struct ocfs2_lock_res *lockres;

	spin_lock_irqsave(&osb->dc_task_lock, flags);
	/* grab this early so we know to try again if a state change and
	 * wake happens part-way through our work */
	osb->dc_work_sequence = osb->dc_wake_sequence;

	processed = osb->blocked_lock_count;
	/*
	 * blocked lock processing in this loop might call iput which can
	 * remove items off osb->blocked_lock_list. Downconvert up to
	 * 'processed' number of locks, but stop short if we had some
	 * removed in ocfs2_mark_lockres_freeing when downconverting.
	 */
	while (processed && !list_empty(&osb->blocked_lock_list)) {
		lockres = list_entry(osb->blocked_lock_list.next,
				     struct ocfs2_lock_res, l_blocked_list);
		list_del_init(&lockres->l_blocked_list);
		osb->blocked_lock_count--;
		/* Drop dc_task_lock: processing a lock may sleep and may
		 * itself requeue onto the list. */
		spin_unlock_irqrestore(&osb->dc_task_lock, flags);

		BUG_ON(!processed);
		processed--;

		ocfs2_process_blocked_lock(osb, lockres);

		spin_lock_irqsave(&osb->dc_task_lock, flags);
	}
	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
}
  3736. static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
  3737. {
  3738. int empty = 0;
  3739. unsigned long flags;
  3740. spin_lock_irqsave(&osb->dc_task_lock, flags);
  3741. if (list_empty(&osb->blocked_lock_list))
  3742. empty = 1;
  3743. spin_unlock_irqrestore(&osb->dc_task_lock, flags);
  3744. return empty;
  3745. }
  3746. static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
  3747. {
  3748. int should_wake = 0;
  3749. unsigned long flags;
  3750. spin_lock_irqsave(&osb->dc_task_lock, flags);
  3751. if (osb->dc_work_sequence != osb->dc_wake_sequence)
  3752. should_wake = 1;
  3753. spin_unlock_irqrestore(&osb->dc_task_lock, flags);
  3754. return should_wake;
  3755. }
  3756. static int ocfs2_downconvert_thread(void *arg)
  3757. {
  3758. struct ocfs2_super *osb = arg;
  3759. /* only quit once we've been asked to stop and there is no more
  3760. * work available */
  3761. while (!(kthread_should_stop() &&
  3762. ocfs2_downconvert_thread_lists_empty(osb))) {
  3763. wait_event_interruptible(osb->dc_event,
  3764. ocfs2_downconvert_thread_should_wake(osb) ||
  3765. kthread_should_stop());
  3766. mlog(0, "downconvert_thread: awoken\n");
  3767. ocfs2_downconvert_thread_do_work(osb);
  3768. }
  3769. osb->dc_task = NULL;
  3770. return 0;
  3771. }
  3772. void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
  3773. {
  3774. unsigned long flags;
  3775. spin_lock_irqsave(&osb->dc_task_lock, flags);
  3776. /* make sure the voting thread gets a swipe at whatever changes
  3777. * the caller may have made to the voting state */
  3778. osb->dc_wake_sequence++;
  3779. spin_unlock_irqrestore(&osb->dc_task_lock, flags);
  3780. wake_up(&osb->dc_event);
  3781. }