// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/striper.h>

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/splice.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "io.h"
#include "metric.h"

static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags)
{
	struct ceph_client *cl = mdsc->fsc->client;
	u32 wire_flags = 0;

	switch (flags & O_ACCMODE) {
	case O_RDONLY:
		wire_flags |= CEPH_O_RDONLY;
		break;
	case O_WRONLY:
		wire_flags |= CEPH_O_WRONLY;
		break;
	case O_RDWR:
		wire_flags |= CEPH_O_RDWR;
		break;
	}

	flags &= ~O_ACCMODE;

#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }

	ceph_sys2wire(O_CREAT);
	ceph_sys2wire(O_EXCL);
	ceph_sys2wire(O_TRUNC);
	ceph_sys2wire(O_DIRECTORY);
	ceph_sys2wire(O_NOFOLLOW);

#undef ceph_sys2wire

	if (flags)
		doutc(cl, "unused open flags: %x\n", flags);

	return cpu_to_le32(wire_flags);
}
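
/*
 * Illustrative note (added for clarity, not from the original source):
 * for O_CREAT, the ceph_sys2wire() macro used above expands to
 *
 *	if (flags & O_CREAT) { wire_flags |= CEPH_O_CREAT; flags &= ~O_CREAT; }
 *
 * i.e. each recognized POSIX open flag is translated to its CEPH_O_*
 * wire counterpart and cleared, so whatever is left in @flags afterwards
 * is an open flag the wire protocol does not encode (and is only logged).
 */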

/*
 * Ceph file operations
 *
 * Implement basic open/close functionality, and implement
 * read/write.
 *
 * We implement three modes of file I/O:
 *  - buffered uses the generic_file_aio_{read,write} helpers
 *
 *  - synchronous is used when there is multi-client read/write
 *    sharing, avoids the page cache, and synchronously waits for an
 *    ack from the OSD.
 *
 *  - direct io takes the variant of the sync path that references
 *    user pages directly.
 *
 * fsync() flushes and waits on dirty pages, but just queues metadata
 * for writeback: since the MDS can recover size and mtime there is no
 * need to wait for MDS acknowledgement.
 */

/*
 * How many pages to get in one call to iov_iter_get_pages().  This
 * determines the size of the on-stack array used as a buffer.
 */
#define ITER_GET_BVECS_PAGES	64
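
/*
 * Sizing note (added for clarity, not from the original source;
 * assumes 4 KiB pages and a 64-bit kernel): a 64-entry batch covers up
 * to 64 * 4 KiB = 256 KiB per iteration of the loop below, while the
 * on-stack pages[] array costs 64 * sizeof(struct page *) = 512 bytes.
 */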

static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
				struct bio_vec *bvecs)
{
	size_t size = 0;
	int bvec_idx = 0;

	if (maxsize > iov_iter_count(iter))
		maxsize = iov_iter_count(iter);

	while (size < maxsize) {
		struct page *pages[ITER_GET_BVECS_PAGES];
		ssize_t bytes;
		size_t start;
		int idx = 0;

		bytes = iov_iter_get_pages2(iter, pages, maxsize - size,
					    ITER_GET_BVECS_PAGES, &start);
		if (bytes < 0)
			return size ?: bytes;

		size += bytes;

		for ( ; bytes; idx++, bvec_idx++) {
			int len = min_t(int, bytes, PAGE_SIZE - start);

			bvec_set_page(&bvecs[bvec_idx], pages[idx], len, start);
			bytes -= len;
			start = 0;
		}
	}

	return size;
}
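
/*
 * Worked example (added for clarity, not from the original source):
 * for a 9 KiB request starting 1 KiB into a page, if
 * iov_iter_get_pages2() pins the whole range in one call it returns
 * three pages with start = 1K, and the inner loop above emits bvecs of
 * 3 KiB (page 0, offset 1K), 4 KiB (page 1, offset 0) and 2 KiB
 * (page 2, offset 0), resetting start to zero after the first partial
 * page.
 */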

/*
 * iov_iter_get_pages() only considers one iov_iter segment, no matter
 * what maxsize or maxpages are given.  For ITER_BVEC that is a single
 * page.
 *
 * Attempt to get up to @maxsize bytes worth of pages from @iter.
 * Return the number of bytes in the created bio_vec array, or an error.
 */
static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
				    struct bio_vec **bvecs, int *num_bvecs)
{
	struct bio_vec *bv;
	size_t orig_count = iov_iter_count(iter);
	ssize_t bytes;
	int npages;

	iov_iter_truncate(iter, maxsize);
	npages = iov_iter_npages(iter, INT_MAX);
	iov_iter_reexpand(iter, orig_count);

	/*
	 * __iter_get_bvecs() may populate only part of the array -- zero it
	 * out.
	 */
	bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
	if (!bv)
		return -ENOMEM;

	bytes = __iter_get_bvecs(iter, maxsize, bv);
	if (bytes < 0) {
		/*
		 * No pages were pinned -- just free the array.
		 */
		kvfree(bv);
		return bytes;
	}

	*bvecs = bv;
	*num_bvecs = npages;
	return bytes;
}

static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
{
	int i;

	for (i = 0; i < num_bvecs; i++) {
		if (bvecs[i].bv_page) {
			if (should_dirty)
				set_page_dirty_lock(bvecs[i].bv_page);
			put_page(bvecs[i].bv_page);
		}
	}
	kvfree(bvecs);
}
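
/*
 * Usage sketch (added for clarity, not from the original source):
 * callers pair the two helpers above, e.g.
 *
 *	struct bio_vec *bvecs;
 *	int num_bvecs;
 *	ssize_t len;
 *
 *	len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_bvecs);
 *	...issue an OSD request against bvecs...
 *	put_bvecs(bvecs, num_bvecs, should_dirty);
 *
 * put_bvecs() both unpins the pages (dirtying them first when the data
 * landed in user memory) and frees the array, so it must run exactly
 * once per successful iter_get_bvecs_alloc().
 */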

/*
 * Prepare an open request.  Preallocate ceph_cap to avoid an
 * inopportune ENOMEM later.
 */
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
	struct ceph_mds_request *req;
	int want_auth = USE_ANY_MDS;
	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;

	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
		want_auth = USE_AUTH_MDS;

	req = ceph_mdsc_create_request(mdsc, op, want_auth);
	if (IS_ERR(req))
		goto out;
	req->r_fmode = ceph_flags_to_mode(flags);
	req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags);
	req->r_args.open.mode = cpu_to_le32(create_mode);
out:
	return req;
}

static int ceph_init_file_info(struct inode *inode, struct file *file,
			       int fmode, bool isdir)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mount_options *opt =
		ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_file_info *fi;
	int ret;

	doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode),
	      file, inode->i_mode, isdir ? "dir" : "regular");
	BUG_ON(inode->i_fop->release != ceph_release);

	if (isdir) {
		struct ceph_dir_file_info *dfi =
			kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
		if (!dfi)
			return -ENOMEM;

		file->private_data = dfi;
		fi = &dfi->file_info;
		dfi->next_offset = 2;
		dfi->readdir_cache_idx = -1;
	} else {
		fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
		if (!fi)
			return -ENOMEM;

		if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
			fi->flags |= CEPH_F_SYNC;

		file->private_data = fi;
	}

	ceph_get_fmode(ci, fmode, 1);
	fi->fmode = fmode;

	spin_lock_init(&fi->rw_contexts_lock);
	INIT_LIST_HEAD(&fi->rw_contexts);
	fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen);

	if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
		ret = ceph_uninline_data(file);
		if (ret < 0)
			goto error;
	}

	return 0;

error:
	ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
	ceph_put_fmode(ci, fi->fmode, 1);
	kmem_cache_free(ceph_file_cachep, fi);
	/* wake up anyone waiting for caps on this inode */
	wake_up_all(&ci->i_cap_wq);
	return ret;
}

/*
 * initialize private struct file data.
 * if we fail, clean up by dropping fmode reference on the ceph_inode
 */
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int ret = 0;

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE);
		fallthrough;
	case S_IFDIR:
		ret = ceph_init_file_info(inode, file, fmode,
					  S_ISDIR(inode->i_mode));
		break;

	case S_IFLNK:
		doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode,
		      ceph_vinop(inode), file, inode->i_mode);
		break;

	default:
		doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode,
		      ceph_vinop(inode), file, inode->i_mode);
		/*
		 * we need to drop the open ref now, since we don't
		 * have .release set to ceph_release.
		 */
		BUG_ON(inode->i_fop->release == ceph_release);

		/* call the proper open fop */
		ret = inode->i_fop->open(inode, file);
	}
	return ret;
}

/*
 * try renew caps after session gets killed.
 */
int ceph_renew_caps(struct inode *inode, int fmode)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_request *req;
	int err, flags, wanted;

	spin_lock(&ci->i_ceph_lock);
	__ceph_touch_fmode(ci, mdsc, fmode);
	wanted = __ceph_caps_file_wanted(ci);
	if (__ceph_is_any_real_caps(ci) &&
	    (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
		int issued = __ceph_caps_issued(ci, NULL);

		spin_unlock(&ci->i_ceph_lock);
		doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n",
		      inode, ceph_vinop(inode), ceph_cap_string(wanted),
		      ceph_cap_string(issued));
		ceph_check_caps(ci, 0);
		return 0;
	}
	spin_unlock(&ci->i_ceph_lock);

	flags = 0;
	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
		flags = O_RDWR;
	else if (wanted & CEPH_CAP_FILE_RD)
		flags = O_RDONLY;
	else if (wanted & CEPH_CAP_FILE_WR)
		flags = O_WRONLY;
#ifdef O_LAZY
	if (wanted & CEPH_CAP_FILE_LAZYIO)
		flags |= O_LAZY;
#endif

	req = prepare_open_request(inode->i_sb, flags, 0);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	ceph_mdsc_put_request(req);
out:
	doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode),
	      err);
	return err < 0 ? err : 0;
}

/*
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
 * if our wanted caps set expands.
 */
int ceph_open(struct inode *inode, struct file *file)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb);
	struct ceph_client *cl = fsc->client;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct ceph_file_info *fi = file->private_data;
	int err;
	int flags, fmode, wanted;
	struct dentry *dentry;
	char *path;
	bool do_sync = false;
	int mask = MAY_READ;

	if (fi) {
		doutc(cl, "file %p is already opened\n", file);
		return 0;
	}

	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
	flags = file->f_flags & ~(O_CREAT|O_EXCL);
	if (S_ISDIR(inode->i_mode)) {
		flags = O_DIRECTORY;  /* mds likes to know */
	} else if (S_ISREG(inode->i_mode)) {
		err = fscrypt_file_open(inode, file);
		if (err)
			return err;
	}

	doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode,
	      ceph_vinop(inode), file, flags, file->f_flags);
	fmode = ceph_flags_to_mode(flags);
	wanted = ceph_caps_for_mode(fmode);

	if (fmode & CEPH_FILE_MODE_WR)
		mask |= MAY_WRITE;
	dentry = d_find_alias(inode);
	if (!dentry) {
		do_sync = true;
	} else {
		struct ceph_path_info path_info;

		path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
		if (IS_ERR(path)) {
			do_sync = true;
			err = 0;
		} else {
			err = ceph_mds_check_access(mdsc, path, mask);
		}
		ceph_mdsc_free_path_info(&path_info);
		dput(dentry);

		/* For non-EACCES errors, let the MDS do the auth check */
		if (err == -EACCES) {
			return err;
		} else if (err < 0) {
			do_sync = true;
			err = 0;
		}
	}

	/* snapped files are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
		return -EROFS;

	/* trivially open snapdir */
	if (ceph_snap(inode) == CEPH_SNAPDIR) {
		return ceph_init_file(inode, file, fmode);
	}

	/*
	 * No need to block if we have caps on the auth MDS (for
	 * write) or any MDS (for read).  Update wanted set
	 * asynchronously.
	 */
	spin_lock(&ci->i_ceph_lock);
	if (!do_sync && __ceph_is_any_real_caps(ci) &&
	    (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
		int mds_wanted = __ceph_caps_mds_wanted(ci, true);
		int issued = __ceph_caps_issued(ci, NULL);

		doutc(cl, "open %p fmode %d want %s issued %s using existing\n",
		      inode, fmode, ceph_cap_string(wanted),
		      ceph_cap_string(issued));
		__ceph_touch_fmode(ci, mdsc, fmode);
		spin_unlock(&ci->i_ceph_lock);

		/* adjust wanted? */
		if ((issued & wanted) != wanted &&
		    (mds_wanted & wanted) != wanted &&
		    ceph_snap(inode) != CEPH_SNAPDIR)
			ceph_check_caps(ci, 0);

		return ceph_init_file(inode, file, fmode);
	} else if (!do_sync && ceph_snap(inode) != CEPH_NOSNAP &&
		   (ci->i_snap_caps & wanted) == wanted) {
		__ceph_touch_fmode(ci, mdsc, fmode);
		spin_unlock(&ci->i_ceph_lock);
		return ceph_init_file(inode, file, fmode);
	}

	spin_unlock(&ci->i_ceph_lock);

	doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
	req = prepare_open_request(inode->i_sb, flags, 0);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_inode = inode;
	ihold(inode);
	req->r_num_caps = 1;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (!err)
		err = ceph_init_file(inode, file, req->r_fmode);
	ceph_mdsc_put_request(req);
	doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
	return err;
}

/* Clone the layout from a synchronous create, if the dir now has Dc caps */
static void
cache_file_layout(struct inode *dst, struct inode *src)
{
	struct ceph_inode_info *cdst = ceph_inode(dst);
	struct ceph_inode_info *csrc = ceph_inode(src);

	spin_lock(&cdst->i_ceph_lock);
	if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
	    !ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
		memcpy(&cdst->i_cached_layout, &csrc->i_layout,
		       sizeof(cdst->i_cached_layout));
		rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
				   ceph_try_get_string(csrc->i_layout.pool_ns));
	}
	spin_unlock(&cdst->i_ceph_lock);
}

/*
 * Try to set up an async create.  We need caps, a file layout, and inode
 * number, and either a lease on the dentry or complete dir info.  If any
 * of those criteria are not satisfied, then return false and the caller
 * can go synchronous.
 */
static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
				 struct ceph_file_layout *lo, u64 *pino)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
	u64 ino;

	spin_lock(&ci->i_ceph_lock);
	/* No auth cap means no chance for Dc caps */
	if (!ci->i_auth_cap)
		goto no_async;

	/* Any delegated inos? */
	if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
		goto no_async;

	if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
		goto no_async;

	if ((__ceph_caps_issued(ci, NULL) & want) != want)
		goto no_async;

	if (d_in_lookup(dentry)) {
		if (!__ceph_dir_is_complete(ci))
			goto no_async;
		spin_lock(&dentry->d_lock);
		di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
		spin_unlock(&dentry->d_lock);
	} else if (atomic_read(&ci->i_shared_gen) !=
		   READ_ONCE(di->lease_shared_gen)) {
		goto no_async;
	}

	ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
	if (!ino)
		goto no_async;

	*pino = ino;
	ceph_take_cap_refs(ci, want, false);
	memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
	rcu_assign_pointer(lo->pool_ns,
			   ceph_try_get_string(ci->i_cached_layout.pool_ns));
	got = want;
no_async:
	spin_unlock(&ci->i_ceph_lock);
	return got;
}

static void restore_deleg_ino(struct inode *dir, u64 ino)
{
	struct ceph_client *cl = ceph_inode_to_client(dir);
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_mds_session *s = NULL;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_auth_cap)
		s = ceph_get_mds_session(ci->i_auth_cap->session);
	spin_unlock(&ci->i_ceph_lock);
	if (s) {
		int err = ceph_restore_deleg_ino(s, ino);

		if (err)
			pr_warn_client(cl,
				"unable to restore delegated ino 0x%llx to session: %d\n",
				ino, err);
		ceph_put_mds_session(s);
	}
}

static void wake_async_create_waiters(struct inode *inode,
				      struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	bool check_cap = false;

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
		ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
		wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);

		if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
			ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
			check_cap = true;
		}
	}
	ceph_kick_flushing_inode_caps(session, ci);
	spin_unlock(&ci->i_ceph_lock);

	if (check_cap)
		ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}

static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
				 struct ceph_mds_request *req)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct dentry *dentry = req->r_dentry;
	struct inode *dinode = d_inode(dentry);
	struct inode *tinode = req->r_target_inode;
	int result = req->r_err ? req->r_err :
			le32_to_cpu(req->r_reply_info.head->result);

	WARN_ON_ONCE(dinode && tinode && dinode != tinode);

	/* MDS changed -- caller must resubmit */
	if (result == -EJUKEBOX)
		goto out;

	mapping_set_error(req->r_parent->i_mapping, result);

	if (result) {
		struct ceph_path_info path_info = {0};
		char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);

		pr_warn_client(cl,
			"async create failure path=(%llx)%s result=%d!\n",
			path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
		ceph_mdsc_free_path_info(&path_info);

		ceph_dir_clear_complete(req->r_parent);
		if (!d_unhashed(dentry))
			d_drop(dentry);

		if (dinode) {
			mapping_set_error(dinode->i_mapping, result);
			ceph_inode_shutdown(dinode);
			wake_async_create_waiters(dinode, req->r_session);
		}
	}

	if (tinode) {
		u64 ino = ceph_vino(tinode).ino;

		if (req->r_deleg_ino != ino)
			pr_warn_client(cl,
				"inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
				req->r_err, req->r_deleg_ino, ino);

		mapping_set_error(tinode->i_mapping, result);
		wake_async_create_waiters(tinode, req->r_session);
	} else if (!result) {
		pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n",
			       req->r_deleg_ino);
	}
out:
	ceph_mdsc_release_dir_caps(req);
}
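
/*
 * Note (added for clarity, not from the original source): -EJUKEBOX
 * from the MDS means the authoritative MDS changed and the delegated
 * inode number cannot be used here.  The callback above only releases
 * the dir caps in that case; ceph_atomic_open() handles the same error
 * by handing the delegated ino back via restore_deleg_ino() and
 * retrying the create synchronously (see its retry: label).
 */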

static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
				    struct dentry *dentry,
				    struct file *file, umode_t mode,
				    struct ceph_mds_request *req,
				    struct ceph_acl_sec_ctx *as_ctx,
				    struct ceph_file_layout *lo)
{
	int ret;
	char xattr_buf[4];
	struct ceph_mds_reply_inode in = { };
	struct ceph_mds_reply_info_in iinfo = { .in = &in };
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	struct timespec64 now;
	struct ceph_string *pool_ns;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_vino vino = { .ino = req->r_deleg_ino,
				  .snap = CEPH_NOSNAP };

	ktime_get_real_ts64(&now);

	iinfo.inline_version = CEPH_INLINE_NONE;
	iinfo.change_attr = 1;
	ceph_encode_timespec64(&iinfo.btime, &now);

	if (req->r_pagelist) {
		iinfo.xattr_len = req->r_pagelist->length;
		iinfo.xattr_data = req->r_pagelist->mapped_tail;
	} else {
		/* fake it */
		iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
		iinfo.xattr_data = xattr_buf;
		memset(iinfo.xattr_data, 0, iinfo.xattr_len);
	}

	in.ino = cpu_to_le64(vino.ino);
	in.snapid = cpu_to_le64(CEPH_NOSNAP);
	in.version = cpu_to_le64(1);	// ???
	in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
	in.cap.cap_id = cpu_to_le64(1);
	in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
	in.cap.flags = CEPH_CAP_FLAG_AUTH;
	in.ctime = in.mtime = in.atime = iinfo.btime;
	in.truncate_seq = cpu_to_le32(1);
	in.truncate_size = cpu_to_le64(-1ULL);
	in.xattr_version = cpu_to_le64(1);
	in.uid = cpu_to_le32(from_kuid(&init_user_ns,
				       mapped_fsuid(req->r_mnt_idmap,
						    &init_user_ns)));
	if (dir->i_mode & S_ISGID) {
		in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));

		/* Directories always inherit the setgid bit. */
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else {
		in.gid = cpu_to_le32(from_kgid(&init_user_ns,
				     mapped_fsgid(req->r_mnt_idmap,
						  &init_user_ns)));
	}
	in.mode = cpu_to_le32((u32)mode);

	in.nlink = cpu_to_le32(1);
	in.max_size = cpu_to_le64(lo->stripe_unit);

	ceph_file_layout_to_legacy(lo, &in.layout);
	/* lo is private, so pool_ns can't change */
	pool_ns = rcu_dereference_raw(lo->pool_ns);
	if (pool_ns) {
		iinfo.pool_ns_len = pool_ns->len;
		iinfo.pool_ns_data = pool_ns->str;
	}

	down_read(&mdsc->snap_rwsem);
	ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
			      req->r_fmode, NULL);
	up_read(&mdsc->snap_rwsem);
	if (ret) {
		doutc(cl, "failed to fill inode: %d\n", ret);
		ceph_dir_clear_complete(dir);
		if (!d_unhashed(dentry))
			d_drop(dentry);
		discard_new_inode(inode);
	} else {
		struct dentry *dn;

		doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n",
		      vino.ino, ceph_ino(dir), dentry->d_name.name);
		ceph_dir_clear_ordered(dir);
		ceph_init_inode_acls(inode, as_ctx);
		if (inode->i_state & I_NEW) {
			/*
			 * If it's not I_NEW, then someone created this before
			 * we got here. Assume the server is aware of it at
			 * that point and don't worry about setting
			 * CEPH_I_ASYNC_CREATE.
			 */
			ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
			unlock_new_inode(inode);
		}
		if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
			if (!d_unhashed(dentry))
				d_drop(dentry);
			dn = d_splice_alias(inode, dentry);
			WARN_ON_ONCE(dn && dn != dentry);
		}

		file->f_mode |= FMODE_CREATED;
		ret = finish_open(file, dentry, ceph_open);
	}

	spin_lock(&dentry->d_lock);
	di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
	wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
	spin_unlock(&dentry->d_lock);

	return ret;
}

/*
 * Do a lookup + open with a single request.  If we get a non-existent
 * file or symlink, return 1 so the VFS can retry.
 */
int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
		     struct file *file, unsigned flags, umode_t mode)
{
	struct mnt_idmap *idmap = file_mnt_idmap(file);
	struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb);
	struct ceph_client *cl = fsc->client;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_mds_request *req;
	struct inode *new_inode = NULL;
	struct dentry *dn;
	struct ceph_acl_sec_ctx as_ctx = {};
	bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
	int mask;
	int err;
	char *path;

	doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
	      dir, ceph_vinop(dir), dentry, dentry,
	      d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);

	if (dentry->d_name.len > NAME_MAX)
		return -ENAMETOOLONG;

	err = ceph_wait_on_conflict_unlink(dentry);
	if (err)
		return err;
	/*
	 * Do not truncate the file, since atomic_open is called before the
	 * permission check. The caller will do the truncation afterward.
	 */
	flags &= ~O_TRUNC;

	dn = d_find_alias(dir);
	if (!dn) {
		try_async = false;
	} else {
		struct ceph_path_info path_info;

		path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
		if (IS_ERR(path)) {
			try_async = false;
			err = 0;
		} else {
			int fmode = ceph_flags_to_mode(flags);

			mask = MAY_READ;
			if (fmode & CEPH_FILE_MODE_WR)
				mask |= MAY_WRITE;
			err = ceph_mds_check_access(mdsc, path, mask);
		}
		ceph_mdsc_free_path_info(&path_info);
		dput(dn);

		/* For non-EACCES errors, let the MDS do the auth check */
		if (err == -EACCES) {
			return err;
		} else if (err < 0) {
			try_async = false;
			err = 0;
		}
	}

retry:
	if (flags & O_CREAT) {
		if (ceph_quota_is_max_files_exceeded(dir))
			return -EDQUOT;

		new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
		if (IS_ERR(new_inode)) {
			err = PTR_ERR(new_inode);
			goto out_ctx;
		}
		/* Async create can't handle more than a page of xattrs */
		if (as_ctx.pagelist &&
		    !list_is_singular(&as_ctx.pagelist->head))
			try_async = false;
	} else if (!d_in_lookup(dentry)) {
		/* If it's not being looked up, it's negative */
		return -ENOENT;
	}

	/* do the open */
	req = prepare_open_request(dir->i_sb, flags, mode);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out_ctx;
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
	if (ceph_security_xattr_wanted(dir))
		mask |= CEPH_CAP_XATTR_SHARED;
	req->r_args.open.mask = cpu_to_le32(mask);
	req->r_parent = dir;
	if (req->r_op == CEPH_MDS_OP_CREATE)
		req->r_mnt_idmap = mnt_idmap_get(idmap);
	ihold(dir);
	if (IS_ENCRYPTED(dir)) {
		set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
		err = fscrypt_prepare_lookup_partial(dir, dentry);
		if (err < 0)
			goto out_req;
	}

	if (flags & O_CREAT) {
		struct ceph_file_layout lo;

		req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
				     CEPH_CAP_XATTR_EXCL;
		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
		ceph_as_ctx_to_req(req, &as_ctx);

		if (try_async && (req->r_dir_caps =
				  try_prep_async_create(dir, dentry, &lo,
							&req->r_deleg_ino))) {
			struct ceph_vino vino = { .ino = req->r_deleg_ino,
						  .snap = CEPH_NOSNAP };
			struct ceph_dentry_info *di = ceph_dentry(dentry);

			set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
			req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
			req->r_callback = ceph_async_create_cb;

			/* Hash inode before RPC */
			new_inode = ceph_get_inode(dir->i_sb, vino, new_inode);
			if (IS_ERR(new_inode)) {
				err = PTR_ERR(new_inode);
				new_inode = NULL;
				goto out_req;
			}
			WARN_ON_ONCE(!(new_inode->i_state & I_NEW));

			spin_lock(&dentry->d_lock);
			di->flags |= CEPH_DENTRY_ASYNC_CREATE;
			spin_unlock(&dentry->d_lock);

			err = ceph_mdsc_submit_request(mdsc, dir, req);
			if (!err) {
				err = ceph_finish_async_create(dir, new_inode,
							       dentry, file,
							       mode, req,
							       &as_ctx, &lo);
				new_inode = NULL;
			} else if (err == -EJUKEBOX) {
				restore_deleg_ino(dir, req->r_deleg_ino);
				ceph_mdsc_put_request(req);
				discard_new_inode(new_inode);
				ceph_release_acl_sec_ctx(&as_ctx);
				memset(&as_ctx, 0, sizeof(as_ctx));
				new_inode = NULL;
				try_async = false;
				ceph_put_string(rcu_dereference_raw(lo.pool_ns));
				goto retry;
			}
			ceph_put_string(rcu_dereference_raw(lo.pool_ns));
			goto out_req;
		}
	}

	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
	req->r_new_inode = new_inode;
	new_inode = NULL;
	err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
	if (err == -ENOENT) {
		dentry = ceph_handle_snapdir(req, dentry);
		if (IS_ERR(dentry)) {
			err = PTR_ERR(dentry);
			goto out_req;
		}
		err = 0;
	}

	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);

	if (d_in_lookup(dentry)) {
		dn = ceph_finish_lookup(req, dentry, err);
		if (IS_ERR(dn))
			err = PTR_ERR(dn);
	} else {
		/* we were given a hashed negative dentry */
		dn = NULL;
	}
	if (err)
		goto out_req;
	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
		/* make vfs retry on splice, ENOENT, or symlink */
		doutc(cl, "finish_no_open on dn %p\n", dn);
		err = finish_no_open(file, dn);
	} else {
		if (IS_ENCRYPTED(dir) &&
		    !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
			pr_warn_client(cl,
				"Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
				ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
			goto out_req;
		}

		doutc(cl, "finish_open on dn %p\n", dn);
		if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
			struct inode *newino = d_inode(dentry);

			cache_file_layout(dir, newino);
			ceph_init_inode_acls(newino, &as_ctx);
			file->f_mode |= FMODE_CREATED;
		}
		err = finish_open(file, dentry, ceph_open);
	}
out_req:
	ceph_mdsc_put_request(req);
	iput(new_inode);
out_ctx:
	ceph_release_acl_sec_ctx(&as_ctx);
	doutc(cl, "result=%d\n", err);
	return err;
}

int ceph_release(struct inode *inode, struct file *file)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (S_ISDIR(inode->i_mode)) {
		struct ceph_dir_file_info *dfi = file->private_data;

		doutc(cl, "%p %llx.%llx dir file %p\n", inode,
		      ceph_vinop(inode), file);
		WARN_ON(!list_empty(&dfi->file_info.rw_contexts));

		ceph_put_fmode(ci, dfi->file_info.fmode, 1);

		if (dfi->last_readdir)
			ceph_mdsc_put_request(dfi->last_readdir);
		kfree(dfi->last_name);
		kfree(dfi->dir_info);
		kmem_cache_free(ceph_dir_file_cachep, dfi);
	} else {
		struct ceph_file_info *fi = file->private_data;

		doutc(cl, "%p %llx.%llx regular file %p\n", inode,
		      ceph_vinop(inode), file);
		WARN_ON(!list_empty(&fi->rw_contexts));

		ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
		ceph_put_fmode(ci, fi->fmode, 1);
		kmem_cache_free(ceph_file_cachep, fi);
	}

	/* wake up anyone waiting for caps on this inode */
	wake_up_all(&ci->i_cap_wq);
	return 0;
}

enum {
	HAVE_RETRIED = 1,
	CHECK_EOF = 2,
	READ_INLINE = 3,
};
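
/*
 * Note (added for clarity, not from the original source): these values
 * travel back to the caller through the *retry_op out-parameter of the
 * sync read path.  __ceph_sync_read() below sets CHECK_EOF when a read
 * stops at i_size; HAVE_RETRIED and READ_INLINE are used by callers of
 * this path elsewhere in this file.
 */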

/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.  (That's not
 * atomic, but good enough for now.)
 *
 * If we get a short result from the OSD, check against i_size; we need to
 * only return a short read to the caller if we hit EOF.
 */
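
/*
 * Worked example (added for clarity, not from the original source):
 * with i_size = 10000, a 4096-byte read at offset 8192 crosses EOF even
 * if the OSD returns it in full, so the copy loop below stops at
 * i_size, the caller is handed 10000 - 8192 = 1808 bytes, and *retry_op
 * is set to CHECK_EOF.  A short OSD result that is still below i_size
 * is instead zero-filled up to min(request length, i_size); see the
 * "zero gap" handling below.
 */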

ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
			 struct iov_iter *to, int *retry_op,
			 u64 *last_objver)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	ssize_t ret;
	u64 off = *ki_pos;
	u64 len = iov_iter_count(to);
	u64 i_size = i_size_read(inode);
	bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
	u64 objver = 0;

	doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode,
	      ceph_vinop(inode), *ki_pos, len);

	if (ceph_inode_is_shutdown(inode))
		return -EIO;

	if (!len || !i_size)
		return 0;
	/*
	 * flush any page cache pages in this range.  this
	 * will make concurrent normal and sync io slow,
	 * but it will at least behave sensibly when they are
	 * in sequence.
	 */
	ret = filemap_write_and_wait_range(inode->i_mapping,
					   off, off + len - 1);
	if (ret < 0)
		return ret;

	ret = 0;
	while ((len = iov_iter_count(to)) > 0) {
		struct ceph_osd_request *req;
		struct page **pages;
		int num_pages;
		size_t page_off;
		bool more;
		int idx = 0;
		size_t left;
		struct ceph_osd_req_op *op;
		u64 read_off = off;
		u64 read_len = len;
		int extent_cnt;

		/* determine new offset/length if encrypted */
		ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);

		doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len,
		      read_off, read_len);

		req = ceph_osdc_new_request(osdc, &ci->i_layout,
					    ci->i_vino, read_off, &read_len, 0, 1,
					    sparse ? CEPH_OSD_OP_SPARSE_READ :
						     CEPH_OSD_OP_READ,
					    CEPH_OSD_FLAG_READ,
					    NULL, ci->i_truncate_seq,
					    ci->i_truncate_size, false);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		/* adjust len downward if the request truncated the len */
		if (off + len > read_off + read_len)
			len = read_off + read_len - off;
		more = len < iov_iter_count(to);

		op = &req->r_ops[0];
		if (sparse) {
			extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
			ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
			if (ret) {
				ceph_osdc_put_request(req);
				break;
			}
		}

		num_pages = calc_pages_for(read_off, read_len);
		page_off = offset_in_page(off);
		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
		if (IS_ERR(pages)) {
			ceph_osdc_put_request(req);
			ret = PTR_ERR(pages);
			break;
		}

		osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
						 offset_in_page(read_off),
						 false, true);

		ceph_osdc_start_request(osdc, req);
		ret = ceph_osdc_wait_request(osdc, req);

		ceph_update_read_metrics(&fsc->mdsc->metric,
					 req->r_start_latency,
					 req->r_end_latency,
					 read_len, ret);

		if (ret > 0)
			objver = req->r_version;

		i_size = i_size_read(inode);
		doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len,
		      ret, i_size, (more ? " MORE" : ""));

		/* Fix it to go to end of extent map */
		if (sparse && ret >= 0)
			ret = ceph_sparse_ext_map_end(op);
		else if (ret == -ENOENT)
			ret = 0;

		if (ret < 0) {
			ceph_osdc_put_request(req);
			if (ret == -EBLOCKLISTED)
				fsc->blocklisted = true;
			break;
		}

		if (IS_ENCRYPTED(inode)) {
			int fret;

			fret = ceph_fscrypt_decrypt_extents(inode, pages,
					read_off, op->extent.sparse_ext,
					op->extent.sparse_ext_cnt);
			if (fret < 0) {
				ret = fret;
				ceph_osdc_put_request(req);
				break;
			}

			/* account for any partial block at the beginning */
			fret -= (off - read_off);

			/*
			 * Short read after big offset adjustment?
			 * Nothing is usable, just call it a zero
			 * len read.
			 */
			fret = max(fret, 0);

			/* account for partial block at the end */
			ret = min_t(ssize_t, fret, len);
		}

		/* Short read but not EOF?  Zero out the remainder. */
		if (ret < len && (off + ret < i_size)) {
			int zlen = min(len - ret, i_size - off - ret);
			int zoff = page_off + ret;

			doutc(cl, "zero gap %llu~%llu\n", off + ret,
			      off + ret + zlen);
			ceph_zero_page_vector_range(zoff, zlen, pages);
			ret += zlen;
		}

		if (off + ret > i_size)
			left = (i_size > off) ? i_size - off : 0;
		else
			left = ret;

		while (left > 0) {
			size_t plen, copied;

			plen = min_t(size_t, left, PAGE_SIZE - page_off);
			SetPageUptodate(pages[idx]);
			copied = copy_page_to_iter(pages[idx++],
						   page_off, plen, to);
			off += copied;
			left -= copied;
			page_off = 0;
			if (copied < plen) {
				ret = -EFAULT;
				break;
			}
		}

		ceph_osdc_put_request(req);

		if (off >= i_size || !more)
			break;
	}

	if (ret > 0) {
		if (off >= i_size) {
			*retry_op = CHECK_EOF;
			ret = i_size - *ki_pos;
			*ki_pos = i_size;
		} else {
			ret = off - *ki_pos;
			*ki_pos = off;
		}

		if (last_objver)
			*last_objver = objver;
	}
	doutc(cl, "result %zd retry_op %d\n", ret, *retry_op);
	return ret;
}

static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
			      int *retry_op)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct ceph_client *cl = ceph_inode_to_client(inode);

	doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos,
	      iov_iter_count(to),
	      (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

	return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL);
}

struct ceph_aio_request {
	struct kiocb *iocb;
	size_t total_len;
	bool write;
	bool should_dirty;
	int error;
	struct list_head osd_reqs;
	unsigned num_reqs;
	atomic_t pending_reqs;
	struct timespec64 mtime;
	struct ceph_cap_flush *prealloc_cf;
};
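
/*
 * Note (added for clarity, not from the original source): one
 * ceph_aio_request fans out into num_reqs OSD requests.  pending_reqs
 * counts the ones still in flight; the first failure is recorded in
 * ->error via cmpxchg() in ceph_aio_complete_req(), and the iocb is
 * completed only when the last OSD request drops the counter to zero
 * in ceph_aio_complete().
 */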

struct ceph_aio_work {
	struct work_struct work;
	struct ceph_osd_request *req;
};

static void ceph_aio_retry_work(struct work_struct *work);

static void ceph_aio_complete(struct inode *inode,
			      struct ceph_aio_request *aio_req)
{
	struct ceph_client *cl = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	if (!atomic_dec_and_test(&aio_req->pending_reqs))
		return;

	if (aio_req->iocb->ki_flags & IOCB_DIRECT)
		inode_dio_end(inode);

	ret = aio_req->error;
	if (!ret)
		ret = aio_req->total_len;

	doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret);

	if (ret >= 0 && aio_req->write) {
		int dirty;
		loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;

		if (endoff > i_size_read(inode)) {
			if (ceph_inode_set_size(inode, endoff))
				ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
		}

		spin_lock(&ci->i_ceph_lock);
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &aio_req->prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}

	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
						CEPH_CAP_FILE_RD));

	aio_req->iocb->ki_complete(aio_req->iocb, ret);

	ceph_free_cap_flush(aio_req->prealloc_cf);
	kfree(aio_req);
}

static void ceph_aio_complete_req(struct ceph_osd_request *req)
{
	int rc = req->r_result;
	struct inode *inode = req->r_inode;
	struct ceph_aio_request *aio_req = req->r_priv;
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	struct ceph_osd_req_op *op = &req->r_ops[0];
	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
	unsigned int len = osd_data->bvec_pos.iter.bi_size;
	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
	struct ceph_client *cl = ceph_inode_to_client(inode);

	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
	BUG_ON(!osd_data->num_bvecs);

	doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req,
	      inode, ceph_vinop(inode), rc, len);

	if (rc == -EOLDSNAPC) {
		struct ceph_aio_work *aio_work;

		BUG_ON(!aio_req->write);

		aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
		if (aio_work) {
			INIT_WORK(&aio_work->work, ceph_aio_retry_work);
			aio_work->req = req;
			queue_work(ceph_inode_to_fs_client(inode)->inode_wq,
				   &aio_work->work);
			return;
		}
		rc = -ENOMEM;
	} else if (!aio_req->write) {
		if (sparse && rc >= 0)
			rc = ceph_sparse_ext_map_end(op);
		if (rc == -ENOENT)
			rc = 0;
		if (rc >= 0 && len > rc) {
			struct iov_iter i;
			int zlen = len - rc;

			/*
			 * If the read was satisfied by a single OSD request,
			 * it may extend past EOF.  Otherwise the read is
			 * within i_size.
			 */
			if (aio_req->num_reqs == 1) {
				loff_t i_size = i_size_read(inode);
				loff_t endoff = aio_req->iocb->ki_pos + rc;

				if (endoff < i_size)
					zlen = min_t(size_t, zlen,
						     i_size - endoff);
				aio_req->total_len = rc + zlen;
			}

			iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs,
				      osd_data->num_bvecs, len);
			iov_iter_advance(&i, rc);
			iov_iter_zero(zlen, &i);
		}
	}

	/* r_start_latency == 0 means the request was not submitted */
	if (req->r_start_latency) {
		if (aio_req->write)
			ceph_update_write_metrics(metric, req->r_start_latency,
						  req->r_end_latency, len, rc);
		else
			ceph_update_read_metrics(metric, req->r_start_latency,
						 req->r_end_latency, len, rc);
	}

	put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
		  aio_req->should_dirty);
	ceph_osdc_put_request(req);

	if (rc < 0)
		cmpxchg(&aio_req->error, 0, rc);

	ceph_aio_complete(inode, aio_req);
	return;
}
static void ceph_aio_retry_work(struct work_struct *work)
{
	struct ceph_aio_work *aio_work =
		container_of(work, struct ceph_aio_work, work);
	struct ceph_osd_request *orig_req = aio_work->req;
	struct ceph_aio_request *aio_req = orig_req->r_priv;
	struct inode *inode = orig_req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;
	struct ceph_osd_request *req;
	int ret;

	spin_lock(&ci->i_ceph_lock);
	if (__ceph_have_pending_cap_snap(ci)) {
		struct ceph_cap_snap *capsnap =
			list_last_entry(&ci->i_cap_snaps,
					struct ceph_cap_snap,
					ci_item);
		snapc = ceph_get_snap_context(capsnap->context);
	} else {
		BUG_ON(!ci->i_head_snapc);
		snapc = ceph_get_snap_context(ci->i_head_snapc);
	}
	spin_unlock(&ci->i_ceph_lock);

	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1,
				      false, GFP_NOFS);
	if (!req) {
		ret = -ENOMEM;
		req = orig_req;
		goto out;
	}

	req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
	ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
	ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);

	req->r_ops[0] = orig_req->r_ops[0];

	req->r_mtime = aio_req->mtime;
	req->r_data_offset = req->r_ops[0].extent.offset;

	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
	if (ret) {
		ceph_osdc_put_request(req);
		req = orig_req;
		goto out;
	}

	ceph_osdc_put_request(orig_req);

	req->r_callback = ceph_aio_complete_req;
	req->r_inode = inode;
	req->r_priv = aio_req;

	ceph_osdc_start_request(req->r_osdc, req);
out:
	if (ret < 0) {
		req->r_result = ret;
		ceph_aio_complete_req(req);
	}

	ceph_put_snap_context(snapc);
	kfree(aio_work);
}
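
/*
 * Do a synchronous or asynchronous O_DIRECT read or write, sending one
 * OSD request per object span. When the kiocb is async and the I/O fits
 * within i_size (or a single request), the requests are queued with
 * ceph_aio_complete_req() as the completion callback and -EIOCBQUEUED
 * is returned.
 */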
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
		       struct ceph_snap_context *snapc,
		       struct ceph_cap_flush **pcf)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_client_metric *metric = &fsc->mdsc->metric;
	struct ceph_vino vino;
	struct ceph_osd_request *req;
	struct bio_vec *bvecs;
	struct ceph_aio_request *aio_req = NULL;
	int num_pages = 0;
	int flags;
	int ret = 0;
	struct timespec64 mtime = current_time(inode);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos;
	bool write = iov_iter_rw(iter) == WRITE;
	bool should_dirty = !write && user_backed_iter(iter);
	bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);

	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
		return -EROFS;

	doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
	      (write ? "write" : "read"), file, pos, (unsigned)count,
	      snapc, snapc ? snapc->seq : 0);

	if (write) {
		int ret2;

		ceph_fscache_invalidate(inode, true);

		ret2 = invalidate_inode_pages2_range(inode->i_mapping,
					pos >> PAGE_SHIFT,
					(pos + count - 1) >> PAGE_SHIFT);
		if (ret2 < 0)
			doutc(cl, "invalidate_inode_pages2_range returned %d\n",
			      ret2);

		flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
	} else {
		flags = CEPH_OSD_FLAG_READ;
	}

	while (iov_iter_count(iter) > 0) {
		u64 size = iov_iter_count(iter);
		ssize_t len;
		struct ceph_osd_req_op *op;
		int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
		int extent_cnt;

		if (write)
			size = min_t(u64, size, fsc->mount_options->wsize);
		else
			size = min_t(u64, size, fsc->mount_options->rsize);

		vino = ceph_vino(inode);
		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
					    vino, pos, &size, 0,
					    1,
					    write ? CEPH_OSD_OP_WRITE : readop,
					    flags, snapc,
					    ci->i_truncate_seq,
					    ci->i_truncate_size,
					    false);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		op = &req->r_ops[0];
		if (!write && sparse) {
			extent_cnt = __ceph_sparse_read_ext_count(inode, size);
			ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
			if (ret) {
				ceph_osdc_put_request(req);
				break;
			}
		}

		len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
		if (len < 0) {
			ceph_osdc_put_request(req);
			ret = len;
			break;
		}
		if (len != size)
			osd_req_op_extent_update(req, 0, len);

		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);

		/*
		 * To simplify error handling, allow AIO when IO within i_size
		 * or IO can be satisfied by single OSD request.
		 */
		if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
		    (len == count || pos + count <= i_size_read(inode))) {
			aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
			if (aio_req) {
				aio_req->iocb = iocb;
				aio_req->write = write;
				aio_req->should_dirty = should_dirty;
				INIT_LIST_HEAD(&aio_req->osd_reqs);
				if (write) {
					aio_req->mtime = mtime;
					swap(aio_req->prealloc_cf, *pcf);
				}
			}
			/* ignore error */
		}

		if (write) {
			/*
			 * throw out any page cache pages in this range. this
			 * may block.
			 */
			truncate_inode_pages_range(inode->i_mapping, pos,
						   PAGE_ALIGN(pos + len) - 1);

			req->r_mtime = mtime;
		}

		if (aio_req) {
			aio_req->total_len += len;
			aio_req->num_reqs++;
			atomic_inc(&aio_req->pending_reqs);

			req->r_callback = ceph_aio_complete_req;
			req->r_inode = inode;
			req->r_priv = aio_req;
			list_add_tail(&req->r_private_item, &aio_req->osd_reqs);

			pos += len;
			continue;
		}

		ceph_osdc_start_request(req->r_osdc, req);
		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);

		if (write)
			ceph_update_write_metrics(metric, req->r_start_latency,
						  req->r_end_latency, len, ret);
		else
			ceph_update_read_metrics(metric, req->r_start_latency,
						 req->r_end_latency, len, ret);

		size = i_size_read(inode);
		if (!write) {
			if (sparse && ret >= 0)
				ret = ceph_sparse_ext_map_end(op);
			else if (ret == -ENOENT)
				ret = 0;

			if (ret >= 0 && ret < len && pos + ret < size) {
				struct iov_iter i;
				int zlen = min_t(size_t, len - ret,
						 size - pos - ret);

				iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len);
				iov_iter_advance(&i, ret);
				iov_iter_zero(zlen, &i);
				ret += zlen;
			}
			if (ret >= 0)
				len = ret;
		}

		put_bvecs(bvecs, num_pages, should_dirty);
		ceph_osdc_put_request(req);
		if (ret < 0)
			break;

		pos += len;
		if (!write && pos >= size)
			break;

		if (write && pos > size) {
			if (ceph_inode_set_size(inode, pos))
				ceph_check_caps(ceph_inode(inode),
						CHECK_CAPS_AUTHONLY);
		}
	}

	if (aio_req) {
		LIST_HEAD(osd_reqs);

		if (aio_req->num_reqs == 0) {
			kfree(aio_req);
			return ret;
		}

		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
					      CEPH_CAP_FILE_RD);

		list_splice(&aio_req->osd_reqs, &osd_reqs);
		inode_dio_begin(inode);
		while (!list_empty(&osd_reqs)) {
			req = list_first_entry(&osd_reqs,
					       struct ceph_osd_request,
					       r_private_item);
			list_del_init(&req->r_private_item);
			if (ret >= 0)
				ceph_osdc_start_request(req->r_osdc, req);
			if (ret < 0) {
				req->r_result = ret;
				ceph_aio_complete_req(req);
			}
		}
		return -EIOCBQUEUED;
	}

	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
		ret = pos - iocb->ki_pos;
		iocb->ki_pos = pos;
	}
	return ret;
}

/*
 * Synchronous write, straight from __user pointer or user pages.
 *
 * If write spans object boundary, just do multiple writes.  (For a
 * correct atomic write, we should e.g. take write locks on all
 * objects, rollback on failure, etc.)
 */
static ssize_t
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
		struct ceph_snap_context *snapc)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	u64 len;
	int num_pages;
	int written = 0;
	int ret;
	bool check_caps = false;
	struct timespec64 mtime = current_time(inode);
	size_t count = iov_iter_count(from);

	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
		return -EROFS;

	doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos,
	      (unsigned)count, snapc, snapc->seq);

	ret = filemap_write_and_wait_range(inode->i_mapping,
					   pos, pos + count - 1);
	if (ret < 0)
		return ret;

	ceph_fscache_invalidate(inode, false);

	while ((len = iov_iter_count(from)) > 0) {
		size_t left;
		int n;
		u64 write_pos = pos;
		u64 write_len = len;
		u64 objnum, objoff;
		u32 xlen;
		u64 assert_ver = 0;
		bool rmw;
		bool first, last;
		struct iov_iter saved_iter = *from;
		size_t off;

		ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len);

		/* clamp the length to the end of first object */
		ceph_calc_file_object_mapping(&ci->i_layout, write_pos,
					      write_len, &objnum, &objoff,
					      &xlen);
		write_len = xlen;

		/* adjust len downward if it goes beyond current object */
		if (pos + len > write_pos + write_len)
			len = write_pos + write_len - pos;

		/*
		 * If we had to adjust the length or position to align with a
		 * crypto block, then we must do a read/modify/write cycle. We
		 * use a version assertion to redrive the thing if something
		 * changes in between.
		 */
		first = pos != write_pos;
		last = (pos + len) != (write_pos + write_len);
		rmw = first || last;

		doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
		      ci->i_vino.ino, pos, len, write_pos, write_len,
		      rmw ? "" : "no ");

		/*
		 * The data is emplaced into the page as it would be if it were
		 * in an array of pagecache pages.
		 */
		num_pages = calc_pages_for(write_pos, write_len);
		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
		if (IS_ERR(pages)) {
			ret = PTR_ERR(pages);
			break;
		}

		/* Do we need to preload the pages? */
		if (rmw) {
			u64 first_pos = write_pos;
			u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE;
			u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
			struct ceph_osd_req_op *op;

			/* We should only need to do this for encrypted inodes */
			WARN_ON_ONCE(!IS_ENCRYPTED(inode));

			/* No need to do two reads if first and last blocks are same */
			if (first && last_pos == first_pos)
				last = false;

			/*
			 * Allocate a read request for one or two extents,
			 * depending on how the request was aligned.
			 */
			req = ceph_osdc_new_request(osdc, &ci->i_layout,
					ci->i_vino, first ? first_pos : last_pos,
					&read_len, 0, (first && last) ? 2 : 1,
					CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
					NULL, ci->i_truncate_seq,
					ci->i_truncate_size, false);
			if (IS_ERR(req)) {
				ceph_release_page_vector(pages, num_pages);
				ret = PTR_ERR(req);
				break;
			}

			/* Something is misaligned! */
			if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
				ceph_osdc_put_request(req);
				ceph_release_page_vector(pages, num_pages);
				ret = -EIO;
				break;
			}

			/* Add extent for first block? */
			op = &req->r_ops[0];

			if (first) {
				osd_req_op_extent_osd_data_pages(req, 0, pages,
							CEPH_FSCRYPT_BLOCK_SIZE,
							offset_in_page(first_pos),
							false, false);
				/* We only expect a single extent here */
				ret = __ceph_alloc_sparse_ext_map(op, 1);
				if (ret) {
					ceph_osdc_put_request(req);
					ceph_release_page_vector(pages, num_pages);
					break;
				}
			}

			/* Add extent for last block */
			if (last) {
				/* Init the other extent if first extent has been used */
				if (first) {
					op = &req->r_ops[1];
					osd_req_op_extent_init(req, 1,
							CEPH_OSD_OP_SPARSE_READ,
							last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
							ci->i_truncate_size,
							ci->i_truncate_seq);
				}

				ret = __ceph_alloc_sparse_ext_map(op, 1);
				if (ret) {
					ceph_osdc_put_request(req);
					ceph_release_page_vector(pages, num_pages);
					break;
				}

				osd_req_op_extent_osd_data_pages(req, first ? 1 : 0,
							&pages[num_pages - 1],
							CEPH_FSCRYPT_BLOCK_SIZE,
							offset_in_page(last_pos),
							false, false);
			}

			ceph_osdc_start_request(osdc, req);
			ret = ceph_osdc_wait_request(osdc, req);

			/* FIXME: length field is wrong if there are 2 extents */
			ceph_update_read_metrics(&fsc->mdsc->metric,
						 req->r_start_latency,
						 req->r_end_latency,
						 read_len, ret);

			/* Ok if object is not already present */
			if (ret == -ENOENT) {
				/*
				 * If there is no object, then we can't assert
				 * on its version. Set it to 0, and we'll use an
				 * exclusive create instead.
				 */
				ceph_osdc_put_request(req);
				ret = 0;

				/*
				 * zero out the soon-to-be uncopied parts of the
				 * first and last pages.
				 */
				if (first)
					zero_user_segment(pages[0], 0,
							  offset_in_page(first_pos));
				if (last)
					zero_user_segment(pages[num_pages - 1],
							  offset_in_page(last_pos),
							  PAGE_SIZE);
			} else {
				if (ret < 0) {
					ceph_osdc_put_request(req);
					ceph_release_page_vector(pages, num_pages);
					break;
				}

				op = &req->r_ops[0];
				if (op->extent.sparse_ext_cnt == 0) {
					if (first)
						zero_user_segment(pages[0], 0,
								  offset_in_page(first_pos));
					else
						zero_user_segment(pages[num_pages - 1],
								  offset_in_page(last_pos),
								  PAGE_SIZE);
				} else if (op->extent.sparse_ext_cnt != 1 ||
					   ceph_sparse_ext_map_end(op) !=
						CEPH_FSCRYPT_BLOCK_SIZE) {
					ret = -EIO;
					ceph_osdc_put_request(req);
					ceph_release_page_vector(pages, num_pages);
					break;
				}

				if (first && last) {
					op = &req->r_ops[1];
					if (op->extent.sparse_ext_cnt == 0) {
						zero_user_segment(pages[num_pages - 1],
								  offset_in_page(last_pos),
								  PAGE_SIZE);
					} else if (op->extent.sparse_ext_cnt != 1 ||
						   ceph_sparse_ext_map_end(op) !=
							CEPH_FSCRYPT_BLOCK_SIZE) {
						ret = -EIO;
						ceph_osdc_put_request(req);
						ceph_release_page_vector(pages, num_pages);
						break;
					}
				}

				/* Grab assert version. It must be non-zero. */
				assert_ver = req->r_version;
				WARN_ON_ONCE(ret > 0 && assert_ver == 0);

				ceph_osdc_put_request(req);
				if (first) {
					ret = ceph_fscrypt_decrypt_block_inplace(inode,
							pages[0], CEPH_FSCRYPT_BLOCK_SIZE,
							offset_in_page(first_pos),
							first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
					if (ret < 0) {
						ceph_release_page_vector(pages, num_pages);
						break;
					}
				}
				if (last) {
					ret = ceph_fscrypt_decrypt_block_inplace(inode,
							pages[num_pages - 1],
							CEPH_FSCRYPT_BLOCK_SIZE,
							offset_in_page(last_pos),
							last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
					if (ret < 0) {
						ceph_release_page_vector(pages, num_pages);
						break;
					}
				}
			}
		}

		left = len;
		off = offset_in_page(pos);
		for (n = 0; n < num_pages; n++) {
			size_t plen = min_t(size_t, left, PAGE_SIZE - off);

			/* copy the data */
			ret = copy_page_from_iter(pages[n], off, plen, from);
			if (ret != plen) {
				ret = -EFAULT;
				break;
			}
			off = 0;
			left -= ret;
		}
		if (ret < 0) {
			doutc(cl, "write failed with %d\n", ret);
			ceph_release_page_vector(pages, num_pages);
			break;
		}

		if (IS_ENCRYPTED(inode)) {
			ret = ceph_fscrypt_encrypt_pages(inode, pages,
							 write_pos, write_len,
							 GFP_KERNEL);
			if (ret < 0) {
				doutc(cl, "encryption failed with %d\n", ret);
				ceph_release_page_vector(pages, num_pages);
				break;
			}
		}

		req = ceph_osdc_new_request(osdc, &ci->i_layout,
					    ci->i_vino, write_pos, &write_len,
					    rmw ? 1 : 0, rmw ? 2 : 1,
					    CEPH_OSD_OP_WRITE,
					    CEPH_OSD_FLAG_WRITE,
					    snapc, ci->i_truncate_seq,
					    ci->i_truncate_size, false);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			ceph_release_page_vector(pages, num_pages);
			break;
		}

		doutc(cl, "write op %lld~%llu\n", write_pos, write_len);
		osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
						 offset_in_page(write_pos), false,
						 true);
		req->r_inode = inode;
		req->r_mtime = mtime;

		/* Set up the assertion */
		if (rmw) {
			/*
			 * Set up the assertion. If we don't have a version
			 * number, then the object doesn't exist yet. Use an
			 * exclusive create instead of a version assertion in
			 * that case.
			 */
			if (assert_ver) {
				osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0);
				req->r_ops[0].assert_ver.ver = assert_ver;
			} else {
				osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE,
						CEPH_OSD_OP_FLAG_EXCL);
			}
		}

		ceph_osdc_start_request(osdc, req);
		ret = ceph_osdc_wait_request(osdc, req);

		ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
					  req->r_end_latency, len, ret);
		ceph_osdc_put_request(req);
		if (ret != 0) {
			doutc(cl, "osd write returned %d\n", ret);
			/* Version changed! Must re-do the rmw cycle */
			if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
			    (!assert_ver && ret == -EEXIST)) {
				/* We should only ever see this on a rmw */
				WARN_ON_ONCE(!rmw);

				/* The version should never go backward */
				WARN_ON_ONCE(ret == -EOVERFLOW);

				*from = saved_iter;

				/* FIXME: limit number of times we loop? */
				continue;
			}
			ceph_set_error_write(ci);
			break;
		}

		ceph_clear_error_write(ci);

		/*
		 * We successfully wrote to a range of the file. Declare
		 * that region of the pagecache invalid.
		 */
		ret = invalidate_inode_pages2_range(
				inode->i_mapping,
				pos >> PAGE_SHIFT,
				(pos + len - 1) >> PAGE_SHIFT);
		if (ret < 0) {
			doutc(cl, "invalidate_inode_pages2_range returned %d\n",
			      ret);
			ret = 0;
		}
		pos += len;
		written += len;
		doutc(cl, "written %d\n", written);

		if (pos > i_size_read(inode)) {
			check_caps = ceph_inode_set_size(inode, pos);
			if (check_caps)
				ceph_check_caps(ceph_inode(inode),
						CHECK_CAPS_AUTHONLY);
		}
	}

	if (ret != -EOLDSNAPC && written > 0) {
		ret = written;
		iocb->ki_pos = pos;
	}
	doutc(cl, "returning %d\n", ret);
	return ret;
}

/*
 * Wrap generic_file_read_iter with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 *
 * Hmm, the sync read case isn't actually async... should it be?
 */
static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *filp = iocb->ki_filp;
	struct ceph_file_info *fi = filp->private_data;
	size_t len = iov_iter_count(to);
	struct inode *inode = file_inode(filp);
	struct ceph_inode_info *ci = ceph_inode(inode);
	bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	ssize_t ret;
	int want = 0, got = 0;
	int retry_op = 0, read = 0;

again:
	doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n",
	      iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode));

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	if (direct_lock)
		ceph_start_io_direct(inode);
	else
		ceph_start_io_read(inode);

	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
		want |= CEPH_CAP_FILE_CACHE;
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want |= CEPH_CAP_FILE_LAZYIO;

	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got);
	if (ret < 0) {
		if (direct_lock)
			ceph_end_io_direct(inode);
		else
			ceph_end_io_read(inode);
		return ret;
	}

	if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
	    (iocb->ki_flags & IOCB_DIRECT) ||
	    (fi->flags & CEPH_F_SYNC)) {
		doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n",
		      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
		      ceph_cap_string(got));

		if (!ceph_has_inline_data(ci)) {
			if (!retry_op &&
			    (iocb->ki_flags & IOCB_DIRECT) &&
			    !IS_ENCRYPTED(inode)) {
				ret = ceph_direct_read_write(iocb, to,
							     NULL, NULL);
				if (ret >= 0 && ret < len)
					retry_op = CHECK_EOF;
			} else {
				ret = ceph_sync_read(iocb, to, &retry_op);
			}
		} else {
			retry_op = READ_INLINE;
		}
	} else {
		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
		doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n",
		      inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
		      ceph_cap_string(got));
		ceph_add_rw_context(fi, &rw_ctx);
		ret = generic_file_read_iter(iocb, to);
		ceph_del_rw_context(fi, &rw_ctx);
	}

	doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n",
	      inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
	ceph_put_cap_refs(ci, got);

	if (direct_lock)
		ceph_end_io_direct(inode);
	else
		ceph_end_io_read(inode);

	if (retry_op > HAVE_RETRIED && ret >= 0) {
		int statret;
		struct page *page = NULL;
		loff_t i_size;
		int mask = CEPH_STAT_CAP_SIZE;

		if (retry_op == READ_INLINE) {
			page = __page_cache_alloc(GFP_KERNEL);
			if (!page)
				return -ENOMEM;

			mask = CEPH_STAT_CAP_INLINE_DATA;
		}

		statret = __ceph_do_getattr(inode, page, mask, !!page);
		if (statret < 0) {
			if (page)
				__free_page(page);
			if (statret == -ENODATA) {
				BUG_ON(retry_op != READ_INLINE);
				goto again;
			}
			return statret;
		}

		i_size = i_size_read(inode);
		if (retry_op == READ_INLINE) {
			BUG_ON(ret > 0 || read > 0);
			if (iocb->ki_pos < i_size &&
			    iocb->ki_pos < PAGE_SIZE) {
				loff_t end = min_t(loff_t, i_size,
						   iocb->ki_pos + len);
				end = min_t(loff_t, end, PAGE_SIZE);
				if (statret < end)
					zero_user_segment(page, statret, end);
				ret = copy_page_to_iter(page,
						iocb->ki_pos & ~PAGE_MASK,
						end - iocb->ki_pos, to);
				iocb->ki_pos += ret;
				read += ret;
			}
			if (iocb->ki_pos < i_size && read < len) {
				size_t zlen = min_t(size_t, len - read,
						    i_size - iocb->ki_pos);
				ret = iov_iter_zero(zlen, to);
				iocb->ki_pos += ret;
				read += ret;
			}
			__free_pages(page, 0);
			return read;
		}

		/* hit EOF or hole? */
		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
		    ret < len) {
			doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n",
			      iocb->ki_pos, i_size);

			read += ret;
			len -= ret;
			retry_op = HAVE_RETRIED;
			goto again;
		}
	}

	if (ret >= 0)
		ret += read;

	return ret;
}

/*
 * Wrap filemap_splice_read with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 */
static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,
				struct pipe_inode_info *pipe,
				size_t len, unsigned int flags)
{
	struct ceph_file_info *fi = in->private_data;
	struct inode *inode = file_inode(in);
	struct ceph_inode_info *ci = ceph_inode(inode);
	ssize_t ret;
	int want = 0, got = 0;
	CEPH_DEFINE_RW_CONTEXT(rw_ctx, 0);

	dout("splice_read %p %llx.%llx %llu~%zu trying to get caps on %p\n",
	     inode, ceph_vinop(inode), *ppos, len, inode);

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	if (ceph_has_inline_data(ci) ||
	    (fi->flags & CEPH_F_SYNC))
		return copy_splice_read(in, ppos, pipe, len, flags);

	ceph_start_io_read(inode);

	want = CEPH_CAP_FILE_CACHE;
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want |= CEPH_CAP_FILE_LAZYIO;

	ret = ceph_get_caps(in, CEPH_CAP_FILE_RD, want, -1, &got);
	if (ret < 0)
		goto out_end;

	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) == 0) {
		dout("splice_read/sync %p %llx.%llx %llu~%zu got cap refs on %s\n",
		     inode, ceph_vinop(inode), *ppos, len,
		     ceph_cap_string(got));

		ceph_put_cap_refs(ci, got);
		ceph_end_io_read(inode);
		return copy_splice_read(in, ppos, pipe, len, flags);
	}

	dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n",
	     inode, ceph_vinop(inode), *ppos, len, ceph_cap_string(got));

	rw_ctx.caps = got;
	ceph_add_rw_context(fi, &rw_ctx);
	ret = filemap_splice_read(in, ppos, pipe, len, flags);
	ceph_del_rw_context(fi, &rw_ctx);

	dout("splice_read %p %llx.%llx dropping cap refs on %s = %zd\n",
	     inode, ceph_vinop(inode), ceph_cap_string(got), ret);
	ceph_put_cap_refs(ci, got);
out_end:
	ceph_end_io_read(inode);
	return ret;
}

/*
 * Take cap references to avoid releasing caps to MDS mid-write.
 *
 * If we are synchronous, and write with an old snap context, the OSD
 * may return EOLDSNAPC. In that case, retry the write, _after_
 * dropping our cap refs and allowing the pending snap to logically
 * complete _before_ this write occurs.
 *
 * If we are near ENOSPC, write synchronously.
 */
static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_cap_flush *prealloc_cf;
	ssize_t count, written = 0;
	int err, want = 0, got;
	bool direct_lock = false;
	u32 map_flags;
	u64 pool_flags;
	loff_t pos;
	loff_t limit = max(i_size_read(inode), fsc->max_file_size);

	if (ceph_inode_is_shutdown(inode))
		return -ESTALE;

	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EROFS;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT)
		direct_lock = true;

retry_snap:
	if (direct_lock)
		ceph_start_io_direct(inode);
	else
		ceph_start_io_write(inode);

	if (iocb->ki_flags & IOCB_APPEND) {
		err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
		if (err < 0)
			goto out;
	}

	err = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	pos = iocb->ki_pos;
	if (unlikely(pos >= limit)) {
		err = -EFBIG;
		goto out;
	} else {
		iov_iter_truncate(from, limit - pos);
	}

	count = iov_iter_count(from);
	if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) {
		err = -EDQUOT;
		goto out;
	}

	down_read(&osdc->lock);
	map_flags = osdc->osdmap->flags;
	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
	up_read(&osdc->lock);
	if ((map_flags & CEPH_OSDMAP_FULL) ||
	    (pool_flags & CEPH_POOL_FLAG_FULL)) {
		err = -ENOSPC;
		goto out;
	}

	err = file_remove_privs(file);
	if (err)
		goto out;

	doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
	      inode, ceph_vinop(inode), pos, count,
	      i_size_read(inode));
	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
		want |= CEPH_CAP_FILE_BUFFER;
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want |= CEPH_CAP_FILE_LAZYIO;

	got = 0;
	err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got);
	if (err < 0)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out_caps;

	inode_inc_iversion_raw(inode);

	doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n",
	      inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));

	if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
	    (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
	    (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
		struct ceph_snap_context *snapc;
		struct iov_iter data;

		spin_lock(&ci->i_ceph_lock);
		if (__ceph_have_pending_cap_snap(ci)) {
			struct ceph_cap_snap *capsnap =
					list_last_entry(&ci->i_cap_snaps,
							struct ceph_cap_snap,
							ci_item);
			snapc = ceph_get_snap_context(capsnap->context);
		} else {
			BUG_ON(!ci->i_head_snapc);
			snapc = ceph_get_snap_context(ci->i_head_snapc);
		}
		spin_unlock(&ci->i_ceph_lock);

		/* we might need to revert back to that point */
		data = *from;
		if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode))
			written = ceph_direct_read_write(iocb, &data, snapc,
							 &prealloc_cf);
		else
			written = ceph_sync_write(iocb, &data, pos, snapc);
		if (direct_lock)
			ceph_end_io_direct(inode);
		else
			ceph_end_io_write(inode);
		if (written > 0)
			iov_iter_advance(from, written);
		ceph_put_snap_context(snapc);
	} else {
		/*
		 * No need to acquire the i_truncate_mutex. Because
		 * the MDS revokes Fwb caps before sending truncate
		 * message to us. We can't get Fwb cap while there
		 * are pending vmtruncate. So write and vmtruncate
		 * can not run at the same time
		 */
		written = generic_perform_write(iocb, from);
		ceph_end_io_write(inode);
	}

	if (written >= 0) {
		int dirty;

		spin_lock(&ci->i_ceph_lock);
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
		if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
			ceph_check_caps(ci, CHECK_CAPS_FLUSH);
	}

	doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n",
	      inode, ceph_vinop(inode), pos, (unsigned)count,
	      ceph_cap_string(got));
	ceph_put_cap_refs(ci, got);

	if (written == -EOLDSNAPC) {
		doutc(cl, "%p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
		      inode, ceph_vinop(inode), pos, (unsigned)count);
		goto retry_snap;
	}

	if (written >= 0) {
		if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
		    (pool_flags & CEPH_POOL_FLAG_NEARFULL))
			iocb->ki_flags |= IOCB_DSYNC;
		written = generic_write_sync(iocb, written);
	}

	goto out_unlocked;
out_caps:
	ceph_put_cap_refs(ci, got);
out:
	if (direct_lock)
		ceph_end_io_direct(inode);
	else
		ceph_end_io_write(inode);
out_unlocked:
	ceph_free_cap_flush(prealloc_cf);
	return written ? written : err;
}

/*
 * llseek.  be sure to verify file size on SEEK_END.
 */
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *inode = file_inode(file);
		int ret;

		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
		if (ret < 0)
			return ret;
	}
	return generic_file_llseek(file, offset, whence);
}
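
/* Zero a sub-page range through the pagecache, if the page is resident. */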
static inline void ceph_zero_partial_page(
	struct inode *inode, loff_t offset, unsigned size)
{
	struct page *page;
	pgoff_t index = offset >> PAGE_SHIFT;

	page = find_lock_page(inode->i_mapping, index);
	if (page) {
		wait_on_page_writeback(page);
		zero_user(page, offset & (PAGE_SIZE - 1), size);
		unlock_page(page);
		put_page(page);
	}
}
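
/*
 * Zero a pagecache range: a partial head page first, then whole pages
 * (which can simply be truncated from the cache), then a partial tail page.
 */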
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
				      loff_t length)
{
	loff_t nearly = round_up(offset, PAGE_SIZE);

	if (offset < nearly) {
		loff_t size = nearly - offset;
		if (length < size)
			size = length;
		ceph_zero_partial_page(inode, offset, size);
		offset += size;
		length -= size;
	}
	if (length >= PAGE_SIZE) {
		loff_t size = round_down(length, PAGE_SIZE);
		truncate_pagecache_range(inode, offset, offset + size - 1);
		offset += size;
		length -= size;
	}
	if (length)
		ceph_zero_partial_page(inode, offset, length);
}
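
/*
 * Zero a range within a single RADOS object. With a NULL length the
 * whole object is dropped (TRUNCATE at file offset zero, DELETE
 * otherwise); on return *length holds the number of bytes handled.
 */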
static int ceph_zero_partial_object(struct inode *inode,
				    loff_t offset, loff_t *length)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_osd_request *req;
	int ret = 0;
	loff_t zero = 0;
	int op;

	if (ceph_inode_is_shutdown(inode))
		return -EIO;

	if (!length) {
		op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
		length = &zero;
	} else {
		op = CEPH_OSD_OP_ZERO;
	}

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode),
				    offset, length,
				    0, 1, op,
				    CEPH_OSD_FLAG_WRITE,
				    NULL, 0, 0, false);
	if (IS_ERR(req)) {
		ret = PTR_ERR(req);
		goto out;
	}

	req->r_mtime = inode_get_mtime(inode);
	ceph_osdc_start_request(&fsc->client->osdc, req);
	ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
	if (ret == -ENOENT)
		ret = 0;
	ceph_osdc_put_request(req);

out:
	return ret;
}
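
/*
 * Zero a file range spanning multiple objects: partial objects up to the
 * next stripe-period boundary, whole object sets in the middle, then any
 * partial remainder.
 */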
static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{
	int ret = 0;
	struct ceph_inode_info *ci = ceph_inode(inode);
	s32 stripe_unit = ci->i_layout.stripe_unit;
	s32 stripe_count = ci->i_layout.stripe_count;
	s32 object_size = ci->i_layout.object_size;
	u64 object_set_size = (u64) object_size * stripe_count;
	u64 nearly, t;

	/* round offset up to next period boundary */
	nearly = offset + object_set_size - 1;
	t = nearly;
	nearly -= do_div(t, object_set_size);

	while (length && offset < nearly) {
		loff_t size = length;
		ret = ceph_zero_partial_object(inode, offset, &size);
		if (ret < 0)
			return ret;
		offset += size;
		length -= size;
	}
	while (length >= object_set_size) {
		int i;
		loff_t pos = offset;
		for (i = 0; i < stripe_count; ++i) {
			ret = ceph_zero_partial_object(inode, pos, NULL);
			if (ret < 0)
				return ret;
			pos += stripe_unit;
		}
		offset += object_set_size;
		length -= object_set_size;
	}
	while (length) {
		loff_t size = length;
		ret = ceph_zero_partial_object(inode, offset, &size);
		if (ret < 0)
			return ret;
		offset += size;
		length -= size;
	}
	return ret;
}
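
/*
 * fallocate: only hole punching is supported (FALLOC_FL_PUNCH_HOLE with
 * FALLOC_FL_KEEP_SIZE), and only on regular, non-encrypted, non-snapshot
 * inodes.
 */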
static long ceph_fallocate(struct file *file, int mode,
			   loff_t offset, loff_t length)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap_flush *prealloc_cf;
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int want, got = 0;
	int dirty;
	int ret = 0;
	loff_t endoff = 0;
	loff_t size;

	doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n",
	      inode, ceph_vinop(inode), mode, offset, length);

	if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (!S_ISREG(inode->i_mode))
		return -EOPNOTSUPP;

	if (IS_ENCRYPTED(inode))
		return -EOPNOTSUPP;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	inode_lock(inode);

	if (ceph_snap(inode) != CEPH_NOSNAP) {
		ret = -EROFS;
		goto unlock;
	}

	size = i_size_read(inode);

	/* Are we punching a hole beyond EOF? */
	if (offset >= size)
		goto unlock;
	if ((offset + length) > size)
		length = size - offset;

	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_BUFFER;

	ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got);
	if (ret < 0)
		goto unlock;

	ret = file_modified(file);
	if (ret)
		goto put_caps;

	filemap_invalidate_lock(inode->i_mapping);
	ceph_fscache_invalidate(inode, false);
	ceph_zero_pagecache_range(inode, offset, length);
	ret = ceph_zero_objects(inode, offset, length);

	if (!ret) {
		spin_lock(&ci->i_ceph_lock);
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}
	filemap_invalidate_unlock(inode->i_mapping);

put_caps:
	ceph_put_cap_refs(ci, got);
unlock:
	inode_unlock(inode);
	ceph_free_cap_flush(prealloc_cf);
	return ret;
}

/*
 * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
 * src_ci.  Two attempts are made to obtain both caps, and an error is
 * returned if this fails; zero is returned on success.
 */
static int get_rd_wr_caps(struct file *src_filp, int *src_got,
			  struct file *dst_filp,
			  loff_t dst_endoff, int *dst_got)
{
	int ret = 0;
	bool retrying = false;

retry_caps:
	ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
			    dst_endoff, dst_got);
	if (ret < 0)
		return ret;

	/*
	 * Since we're already holding the FILE_WR capability for the dst file,
	 * we would risk a deadlock by using ceph_get_caps.  Thus, we'll do some
	 * retry dance instead to try to get both capabilities.
	 */
	ret = ceph_try_get_caps(file_inode(src_filp),
				CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
				false, src_got);
	if (ret <= 0) {
		/* Start by dropping dst_ci caps and getting src_ci caps */
		ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
		if (retrying) {
			if (!ret)
				/* ceph_try_get_caps masks EAGAIN */
				ret = -EAGAIN;
			return ret;
		}
		ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
				    CEPH_CAP_FILE_SHARED, -1, src_got);
		if (ret < 0)
			return ret;
		/* ... drop src_ci caps too, and retry */
		ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
		retrying = true;
		goto retry_caps;
	}
	return ret;
}

static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
			   struct ceph_inode_info *dst_ci, int dst_got)
{
	ceph_put_cap_refs(src_ci, src_got);
	ceph_put_cap_refs(dst_ci, dst_got);
}

/*
 * This function does several size-related checks, returning an error if:
 * - source file is smaller than off+len
 * - destination file size is not OK (inode_newsize_ok())
 * - the max bytes quota is exceeded
 */
static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
			   loff_t src_off, loff_t dst_off, size_t len)
{
	struct ceph_client *cl = ceph_inode_to_client(src_inode);
	loff_t size, endoff;

	size = i_size_read(src_inode);
	/*
	 * Don't copy beyond source file EOF.  Instead of simply setting length
	 * to (size - src_off), just drop to VFS default implementation, as the
	 * local i_size may be stale due to other clients writing to the source
	 * inode.
	 */
	if (src_off + len > size) {
		doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off,
		      len, size);
		return -EOPNOTSUPP;
	}
	size = i_size_read(dst_inode);

	endoff = dst_off + len;
	if (inode_newsize_ok(dst_inode, endoff))
		return -EOPNOTSUPP;

	if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff))
		return -EDQUOT;

	return 0;
}
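
/*
 * Allocate and set up a copy-from2 OSD request that copies a whole object
 * from src to dst on the OSDs, without the data passing through the client.
 */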
static struct ceph_osd_request *
ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
			    u64 src_snapid,
			    struct ceph_object_id *src_oid,
			    struct ceph_object_locator *src_oloc,
			    struct ceph_object_id *dst_oid,
			    struct ceph_object_locator *dst_oloc,
			    u32 truncate_seq, u64 truncate_size)
{
	struct ceph_osd_request *req;
	int ret;
	u32 src_fadvise_flags =
		CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
		CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
	u32 dst_fadvise_flags =
		CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
		CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return ERR_PTR(-ENOMEM);

	req->r_flags = CEPH_OSD_FLAG_WRITE;

	ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
	ceph_oid_copy(&req->r_t.base_oid, dst_oid);

	ret = osd_req_op_copy_from_init(req, src_snapid, 0,
					src_oid, src_oloc,
					src_fadvise_flags,
					dst_fadvise_flags,
					truncate_seq,
					truncate_size,
					CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
	if (ret)
		goto out;

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out;

	return req;

out:
	ceph_osdc_put_request(req);
	return ERR_PTR(ret);
}
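
/*
 * Copy len bytes (a whole number of objects) from src to dst using OSD
 * copy-from2 requests, advancing *src_off and *dst_off as objects complete.
 * Returns the number of bytes copied, or a negative error if nothing was.
 */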
static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
				    struct ceph_inode_info *dst_ci, u64 *dst_off,
				    struct ceph_fs_client *fsc,
				    size_t len, unsigned int flags)
{
	struct ceph_object_locator src_oloc, dst_oloc;
	struct ceph_object_id src_oid, dst_oid;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *req;
	size_t bytes = 0;
	u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
	u32 src_objlen, dst_objlen;
	u32 object_size = src_ci->i_layout.object_size;
	struct ceph_client *cl = fsc->client;
	int ret;

	src_oloc.pool = src_ci->i_layout.pool_id;
	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
	dst_oloc.pool = dst_ci->i_layout.pool_id;
	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
	osdc = &fsc->client->osdc;

	while (len >= object_size) {
		ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
					      object_size, &src_objnum,
					      &src_objoff, &src_objlen);
		ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
					      object_size, &dst_objnum,
					      &dst_objoff, &dst_objlen);
		ceph_oid_init(&src_oid);
		ceph_oid_printf(&src_oid, "%llx.%08llx",
				src_ci->i_vino.ino, src_objnum);
		ceph_oid_init(&dst_oid);
		ceph_oid_printf(&dst_oid, "%llx.%08llx",
				dst_ci->i_vino.ino, dst_objnum);
		/* Do an object remote copy */
		req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap,
						  &src_oid, &src_oloc,
						  &dst_oid, &dst_oloc,
						  dst_ci->i_truncate_seq,
						  dst_ci->i_truncate_size);
		if (IS_ERR(req))
			ret = PTR_ERR(req);
		else {
			ceph_osdc_start_request(osdc, req);
			ret = ceph_osdc_wait_request(osdc, req);
			ceph_update_copyfrom_metrics(&fsc->mdsc->metric,
						     req->r_start_latency,
						     req->r_end_latency,
						     object_size, ret);
			ceph_osdc_put_request(req);
		}
		if (ret) {
			if (ret == -EOPNOTSUPP) {
				fsc->have_copy_from2 = false;
				pr_notice_client(cl,
					"OSDs don't support copy-from2; disabling copy offload\n");
			}
			doutc(cl, "returned %d\n", ret);
			if (!bytes)
				bytes = ret;
			goto out;
		}
		len -= object_size;
		bytes += object_size;
		*src_off += object_size;
		*dst_off += object_size;
	}

out:
	ceph_oloc_destroy(&src_oloc);
	ceph_oloc_destroy(&dst_oloc);
	return bytes;
}
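
/*
 * Offloaded copy_file_range: splice any unaligned head range manually,
 * have the OSDs copy full objects, then splice whatever tail remains.
 * Returns -EOPNOTSUPP wherever the generic VFS fallback should be used.
 */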
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
				      struct file *dst_file, loff_t dst_off,
				      size_t len, unsigned int flags)
{
	struct inode *src_inode = file_inode(src_file);
	struct inode *dst_inode = file_inode(dst_file);
	struct ceph_inode_info *src_ci = ceph_inode(src_inode);
	struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
	struct ceph_cap_flush *prealloc_cf;
	struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode);
	struct ceph_client *cl = src_fsc->client;
	loff_t size;
	ssize_t ret = -EIO, bytes;
	u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
	u32 src_objlen, dst_objlen;
	int src_got = 0, dst_got = 0, err, dirty;

	if (src_inode->i_sb != dst_inode->i_sb) {
		struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode);

		if (ceph_fsid_compare(&src_fsc->client->fsid,
				      &dst_fsc->client->fsid)) {
			dout("Copying files across clusters: src: %pU dst: %pU\n",
			     &src_fsc->client->fsid, &dst_fsc->client->fsid);
			return -EXDEV;
		}
	}
	if (ceph_snap(dst_inode) != CEPH_NOSNAP)
		return -EROFS;

	/*
	 * Some of the checks below will return -EOPNOTSUPP, which will force a
	 * fallback to the default VFS copy_file_range implementation.  This is
	 * desirable in several cases (for example, when 'len' is smaller than
	 * the size of the objects, or in cases where that would be more
	 * efficient).
	 */

	if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
		return -EOPNOTSUPP;

	if (!src_fsc->have_copy_from2)
		return -EOPNOTSUPP;

	/*
	 * Striped file layouts require that we copy partial objects, but the
	 * OSD copy-from operation only supports full-object copies.  Limit
	 * this to non-striped file layouts for now.
	 */
	if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
	    (src_ci->i_layout.stripe_count != 1) ||
	    (dst_ci->i_layout.stripe_count != 1) ||
	    (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
		doutc(cl, "Invalid src/dst files layout\n");
		return -EOPNOTSUPP;
	}

	/* Every encrypted inode gets its own key, so we can't offload them */
	if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode))
		return -EOPNOTSUPP;

	if (len < src_ci->i_layout.object_size)
		return -EOPNOTSUPP; /* no remote copy will be done */

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	/* Start by sync'ing the source and destination files */
	ret = file_write_and_wait_range(src_file, src_off, (src_off + len));
	if (ret < 0) {
		doutc(cl, "failed to write src file (%zd)\n", ret);
		goto out;
	}
	ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len));
	if (ret < 0) {
		doutc(cl, "failed to write dst file (%zd)\n", ret);
		goto out;
	}

	/*
	 * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
	 * clients may have dirty data in their caches.  And OSDs know nothing
	 * about caps, so they can't safely do the remote object copies.
	 */
	err = get_rd_wr_caps(src_file, &src_got,
			     dst_file, (dst_off + len), &dst_got);
	if (err < 0) {
		doutc(cl, "get_rd_wr_caps returned %d\n", err);
		ret = -EOPNOTSUPP;
		goto out;
	}

	ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
	if (ret < 0)
		goto out_caps;

	/* Drop dst file cached pages */
	ceph_fscache_invalidate(dst_inode, false);
	ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
					    dst_off >> PAGE_SHIFT,
					    (dst_off + len) >> PAGE_SHIFT);
	if (ret < 0) {
		doutc(cl, "Failed to invalidate inode pages (%zd)\n",
		      ret);
		ret = 0; /* XXX */
	}
	ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
				      src_ci->i_layout.object_size,
				      &src_objnum, &src_objoff, &src_objlen);
	ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
				      dst_ci->i_layout.object_size,
				      &dst_objnum, &dst_objoff, &dst_objlen);
	/* object-level offsets need to be the same */
	if (src_objoff != dst_objoff) {
		ret = -EOPNOTSUPP;
		goto out_caps;
	}

	/*
	 * Do a manual copy if the object offset isn't object aligned.
	 * 'src_objlen' contains the bytes left until the end of the object,
	 * starting at the src_off
	 */
	if (src_objoff) {
		doutc(cl, "Initial partial copy of %u bytes\n", src_objlen);

		/*
		 * we need to temporarily drop all caps as we'll be calling
		 * {read,write}_iter, which will get caps again.
		 */
		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
		ret = splice_file_range(src_file, &src_off, dst_file, &dst_off,
					src_objlen);

		/* Abort on short copies or on error */
		if (ret < (long)src_objlen) {
			doutc(cl, "Failed partial copy (%zd)\n", ret);
			goto out;
		}
		len -= ret;
		err = get_rd_wr_caps(src_file, &src_got,
				     dst_file, (dst_off + len), &dst_got);
		if (err < 0)
			goto out;
		err = is_file_size_ok(src_inode, dst_inode,
				      src_off, dst_off, len);
		if (err < 0)
			goto out_caps;
	}

	size = i_size_read(dst_inode);
	bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
				     src_fsc, len, flags);
	if (bytes <= 0) {
		if (!ret)
			ret = bytes;
		goto out_caps;
	}
	doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len);
	len -= bytes;
	ret += bytes;

	file_update_time(dst_file);
	inode_inc_iversion_raw(dst_inode);

	if (dst_off > size) {
		/* Let the MDS know about dst file size change */
		if (ceph_inode_set_size(dst_inode, dst_off) ||
		    ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
			ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH);
	}
	/* Mark Fw dirty */
	spin_lock(&dst_ci->i_ceph_lock);
	dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
	spin_unlock(&dst_ci->i_ceph_lock);
	if (dirty)
		__mark_inode_dirty(dst_inode, dirty);

out_caps:
	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);

	/*
	 * Do the final manual copy if we still have some bytes left, unless
	 * there were errors in remote object copies (len >= object_size).
	 */
	if (len && (len < src_ci->i_layout.object_size)) {
		doutc(cl, "Final partial copy of %zu bytes\n", len);
		bytes = splice_file_range(src_file, &src_off, dst_file,
					  &dst_off, len);
		if (bytes > 0)
			ret += bytes;
		else
			doutc(cl, "Failed partial copy (%zd)\n", bytes);
	}

out:
	ceph_free_cap_flush(prealloc_cf);

	return ret;
}
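
/* Fall back to the VFS splice-based copy when offloading isn't possible. */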
static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
				    struct file *dst_file, loff_t dst_off,
				    size_t len, unsigned int flags)
{
	ssize_t ret;

	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
				     len, flags);

	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = splice_copy_file_range(src_file, src_off, dst_file,
					     dst_off, len);
	return ret;
}

const struct file_operations ceph_file_fops = {
	.open = ceph_open,
	.release = ceph_release,
	.llseek = ceph_llseek,
	.read_iter = ceph_read_iter,
	.write_iter = ceph_write_iter,
	.mmap = ceph_mmap,
	.fsync = ceph_fsync,
	.lock = ceph_lock,
	.setlease = simple_nosetlease,
	.flock = ceph_flock,
	.splice_read = ceph_splice_read,
	.splice_write = iter_file_splice_write,
	.unlocked_ioctl = ceph_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.fallocate = ceph_fallocate,
	.copy_file_range = ceph_copy_file_range,
};