xfs_bmap_util.c 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  4. * Copyright (c) 2012 Red Hat, Inc.
  5. * All Rights Reserved.
  6. */
  7. #include "xfs.h"
  8. #include "xfs_fs.h"
  9. #include "xfs_shared.h"
  10. #include "xfs_format.h"
  11. #include "xfs_log_format.h"
  12. #include "xfs_trans_resv.h"
  13. #include "xfs_bit.h"
  14. #include "xfs_mount.h"
  15. #include "xfs_defer.h"
  16. #include "xfs_inode.h"
  17. #include "xfs_btree.h"
  18. #include "xfs_trans.h"
  19. #include "xfs_alloc.h"
  20. #include "xfs_bmap.h"
  21. #include "xfs_bmap_util.h"
  22. #include "xfs_bmap_btree.h"
  23. #include "xfs_rtalloc.h"
  24. #include "xfs_error.h"
  25. #include "xfs_quota.h"
  26. #include "xfs_trans_space.h"
  27. #include "xfs_trace.h"
  28. #include "xfs_icache.h"
  29. #include "xfs_iomap.h"
  30. #include "xfs_reflink.h"
  31. #include "xfs_rtbitmap.h"
  32. /* Kernel only BMAP related definitions and functions */
  33. /*
  34. * Convert the given file system block to a disk block. We have to treat it
  35. * differently based on whether the file is a real time file or not, because the
  36. * bmap code does.
  37. */
  38. xfs_daddr_t
  39. xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  40. {
  41. if (XFS_IS_REALTIME_INODE(ip))
  42. return XFS_FSB_TO_BB(ip->i_mount, fsb);
  43. return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
  44. }
/*
 * Routine to zero an extent on disk allocated to the specific inode.
 *
 * The VFS functions take a linearised filesystem block offset, so we have to
 * convert the sparse xfs fsb to the right format first.
 * VFS types are real funky, too.
 */
int
xfs_zero_extent(
	struct xfs_inode	*ip,
	xfs_fsblock_t		start_fsb,
	xfs_off_t		count_fsb)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	/* Sparse fsb -> device daddr, then back to a linear fsb index. */
	xfs_daddr_t		sector = xfs_fsb_to_db(ip, start_fsb);
	sector_t		block = XFS_BB_TO_FSBT(mp, sector);

	/*
	 * blkdev_issue_zeroout() wants 512-byte sectors, hence the shift by
	 * (blocksize_bits - 9) to convert fsblocks to sectors.
	 */
	return blkdev_issue_zeroout(target->bt_bdev,
		block << (mp->m_super->s_blocksize_bits - 9),
		count_fsb << (mp->m_super->s_blocksize_bits - 9),
		GFP_KERNEL, 0);
}
  67. /*
  68. * Extent tree block counting routines.
  69. */
  70. /*
  71. * Count leaf blocks given a range of extent records. Delayed allocation
  72. * extents are not counted towards the totals.
  73. */
  74. xfs_extnum_t
  75. xfs_bmap_count_leaves(
  76. struct xfs_ifork *ifp,
  77. xfs_filblks_t *count)
  78. {
  79. struct xfs_iext_cursor icur;
  80. struct xfs_bmbt_irec got;
  81. xfs_extnum_t numrecs = 0;
  82. for_each_xfs_iext(ifp, &icur, &got) {
  83. if (!isnullstartblock(got.br_startblock)) {
  84. *count += got.br_blockcount;
  85. numrecs++;
  86. }
  87. }
  88. return numrecs;
  89. }
/*
 * Count fsblocks of the given fork.  Delayed allocation extents are
 * not counted towards the totals.
 *
 * @nextents returns the number of real (non-delalloc) extent records;
 * @count returns the number of allocated disk blocks, including any
 * bmbt blocks for btree-format forks.
 */
int
xfs_bmap_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_extnum_t		*nextents,
	xfs_filblks_t		*count)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		btblocks = 0;
	int			error;

	*nextents = 0;
	*count = 0;

	/* No fork at all (e.g. no attr fork) means nothing to count. */
	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iread_extents(tp, ip, whichfork);
		if (error)
			return error;

		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
		error = xfs_btree_count_blocks(cur, &btblocks);
		xfs_btree_del_cursor(cur, error);
		if (error)
			return error;

		/*
		 * xfs_btree_count_blocks includes the root block contained in
		 * the inode fork in @btblocks, so subtract one because we're
		 * only interested in allocated disk blocks.
		 */
		*count += btblocks - 1;

		/* Fall through to also count the leaf extent records. */
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		*nextents = xfs_bmap_count_leaves(ifp, count);
		break;
	}

	return 0;
}
/*
 * Format one mapping record for the getbmap ioctl output and advance the
 * caller's query window (bmv_offset/bmv_length) past it.  Returns 0 on
 * success (including the case where a delalloc extent is deliberately
 * skipped) or a negative error from the reflink trim.
 */
static int
xfs_getbmap_report_one(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,
	struct kgetbmap		*out,
	int64_t			bmv_end,
	struct xfs_bmbt_irec	*got)
{
	struct kgetbmap		*p = out + bmv->bmv_entries;
	bool			shared = false;
	int			error;

	/* Clip @got so it covers only one shared/unshared region. */
	error = xfs_reflink_trim_around_shared(ip, got, &shared);
	if (error)
		return error;

	if (isnullstartblock(got->br_startblock) ||
	    got->br_startblock == DELAYSTARTBLOCK) {
		/*
		 * Take the flush completion as being a point-in-time snapshot
		 * where there are no delalloc extents, and if any new ones
		 * have been created racily, just skip them as being 'after'
		 * the flush and so don't get reported.
		 */
		if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
			return 0;

		p->bmv_oflags |= BMV_OF_DELALLOC;
		p->bmv_block = -2;
	} else {
		p->bmv_block = xfs_fsb_to_db(ip, got->br_startblock);
	}

	if (got->br_state == XFS_EXT_UNWRITTEN &&
	    (bmv->bmv_iflags & BMV_IF_PREALLOC))
		p->bmv_oflags |= BMV_OF_PREALLOC;

	if (shared)
		p->bmv_oflags |= BMV_OF_SHARED;

	/* getbmap speaks in 512-byte basic blocks, not fsblocks. */
	p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, got->br_startoff);
	p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, got->br_blockcount);

	bmv->bmv_offset = p->bmv_offset + p->bmv_length;
	bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
	bmv->bmv_entries++;
	return 0;
}
  175. static void
  176. xfs_getbmap_report_hole(
  177. struct xfs_inode *ip,
  178. struct getbmapx *bmv,
  179. struct kgetbmap *out,
  180. int64_t bmv_end,
  181. xfs_fileoff_t bno,
  182. xfs_fileoff_t end)
  183. {
  184. struct kgetbmap *p = out + bmv->bmv_entries;
  185. if (bmv->bmv_iflags & BMV_IF_NO_HOLES)
  186. return;
  187. p->bmv_block = -1;
  188. p->bmv_offset = XFS_FSB_TO_BB(ip->i_mount, bno);
  189. p->bmv_length = XFS_FSB_TO_BB(ip->i_mount, end - bno);
  190. bmv->bmv_offset = p->bmv_offset + p->bmv_length;
  191. bmv->bmv_length = max(0LL, bmv_end - bmv->bmv_offset);
  192. bmv->bmv_entries++;
  193. }
  194. static inline bool
  195. xfs_getbmap_full(
  196. struct getbmapx *bmv)
  197. {
  198. return bmv->bmv_length == 0 || bmv->bmv_entries >= bmv->bmv_count - 1;
  199. }
  200. static bool
  201. xfs_getbmap_next_rec(
  202. struct xfs_bmbt_irec *rec,
  203. xfs_fileoff_t total_end)
  204. {
  205. xfs_fileoff_t end = rec->br_startoff + rec->br_blockcount;
  206. if (end == total_end)
  207. return false;
  208. rec->br_startoff += rec->br_blockcount;
  209. if (!isnullstartblock(rec->br_startblock) &&
  210. rec->br_startblock != DELAYSTARTBLOCK)
  211. rec->br_startblock += rec->br_blockcount;
  212. rec->br_blockcount = total_end - end;
  213. return true;
  214. }
/*
 * Get inode's extents as described in bmv, and format for output.
 * Calls formatter to fill the user's buffer until all extents
 * are mapped, until the passed-in bmv->bmv_count slots have
 * been filled, or until the formatter short-circuits the loop,
 * if it is tracking filled-in extents on its own.
 */
int						/* error code */
xfs_getbmap(
	struct xfs_inode	*ip,
	struct getbmapx		*bmv,		/* user bmap structure */
	struct kgetbmap		*out)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			iflags = bmv->bmv_iflags;
	int			whichfork, lock, error = 0;
	int64_t			bmv_end, max_len;
	xfs_fileoff_t		bno, first_bno;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_irec	got, rec;
	xfs_filblks_t		len;
	struct xfs_iext_cursor	icur;

	if (bmv->bmv_iflags & ~BMV_IF_VALID)
		return -EINVAL;
#ifndef DEBUG
	/* Only allow CoW fork queries if we're debugging. */
	if (iflags & BMV_IF_COWFORK)
		return -EINVAL;
#endif
	/* Attr-fork and CoW-fork queries are mutually exclusive. */
	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
		return -EINVAL;

	/* bmv_length of -1 means "to end of file"; other negatives invalid */
	if (bmv->bmv_length < -1)
		return -EINVAL;
	bmv->bmv_entries = 0;
	if (bmv->bmv_length == 0)
		return 0;

	if (iflags & BMV_IF_ATTRFORK)
		whichfork = XFS_ATTR_FORK;
	else if (iflags & BMV_IF_COWFORK)
		whichfork = XFS_COW_FORK;
	else
		whichfork = XFS_DATA_FORK;

	/* IOLOCK first, then the fork-appropriate ILOCK flavour. */
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	switch (whichfork) {
	case XFS_ATTR_FORK:
		lock = xfs_ilock_attr_map_shared(ip);
		if (!xfs_inode_has_attr_fork(ip))
			goto out_unlock_ilock;

		max_len = 1LL << 32;
		break;
	case XFS_COW_FORK:
		lock = XFS_ILOCK_SHARED;
		xfs_ilock(ip, lock);

		/* No CoW fork? Just return */
		if (!xfs_ifork_ptr(ip, whichfork))
			goto out_unlock_ilock;

		if (xfs_get_cowextsz_hint(ip))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);
		break;
	case XFS_DATA_FORK:
		/*
		 * Unless the caller wants delalloc extents reported, flush
		 * dirty data first so the mappings reflect what is on disk.
		 */
		if (!(iflags & BMV_IF_DELALLOC) &&
		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_disk_size)) {
			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (error)
				goto out_unlock_iolock;

			/*
			 * Even after flushing the inode, there can still be
			 * delalloc blocks on the inode beyond EOF due to
			 * speculative preallocation. These are not removed
			 * until the release function is called or the inode
			 * is inactivated. Hence we cannot assert here that
			 * ip->i_delayed_blks == 0.
			 */
		}

		if (xfs_get_extsz_hint(ip) ||
		    (ip->i_diflags & XFS_DIFLAG_PREALLOC))
			max_len = mp->m_super->s_maxbytes;
		else
			max_len = XFS_ISIZE(ip);

		lock = xfs_ilock_data_map_shared(ip);
		break;
	}

	ifp = xfs_ifork_ptr(ip, whichfork);

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_EXTENTS:
	case XFS_DINODE_FMT_BTREE:
		break;
	case XFS_DINODE_FMT_LOCAL:
		/* Local format inode forks report no extents. */
		goto out_unlock_ilock;
	default:
		error = -EINVAL;
		goto out_unlock_ilock;
	}

	/* Expand a "whole file" request to the fork's maximum length. */
	if (bmv->bmv_length == -1) {
		max_len = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, max_len));
		bmv->bmv_length = max(0LL, max_len - bmv->bmv_offset);
	}

	bmv_end = bmv->bmv_offset + bmv->bmv_length;

	/* Basic blocks (512 bytes) -> fsblocks for the extent walk. */
	first_bno = bno = XFS_BB_TO_FSBT(mp, bmv->bmv_offset);
	len = XFS_BB_TO_FSB(mp, bmv->bmv_length);

	error = xfs_iread_extents(NULL, ip, whichfork);
	if (error)
		goto out_unlock_ilock;

	if (!xfs_iext_lookup_extent(ip, ifp, bno, &icur, &got)) {
		/*
		 * Report a whole-file hole if the delalloc flag is set to
		 * stay compatible with the old implementation.
		 */
		if (iflags & BMV_IF_DELALLOC)
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
		goto out_unlock_ilock;
	}

	while (!xfs_getbmap_full(bmv)) {
		xfs_trim_extent(&got, first_bno, len);

		/*
		 * Report an entry for a hole if this extent doesn't directly
		 * follow the previous one.
		 */
		if (got.br_startoff > bno) {
			xfs_getbmap_report_hole(ip, bmv, out, bmv_end, bno,
					got.br_startoff);
			if (xfs_getbmap_full(bmv))
				break;
		}

		/*
		 * In order to report shared extents accurately, we report each
		 * distinct shared / unshared part of a single bmbt record with
		 * an individual getbmapx record.
		 */
		bno = got.br_startoff + got.br_blockcount;
		rec = got;
		do {
			error = xfs_getbmap_report_one(ip, bmv, out, bmv_end,
					&rec);
			if (error || xfs_getbmap_full(bmv))
				goto out_unlock_ilock;
		} while (xfs_getbmap_next_rec(&rec, bno));

		if (!xfs_iext_next_extent(ifp, &icur, &got)) {
			xfs_fileoff_t	end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));

			/* Mark the final record so userspace knows to stop. */
			if (bmv->bmv_entries > 0)
				out[bmv->bmv_entries - 1].bmv_oflags |=
								BMV_OF_LAST;

			if (whichfork != XFS_ATTR_FORK && bno < end &&
			    !xfs_getbmap_full(bmv)) {
				xfs_getbmap_report_hole(ip, bmv, out, bmv_end,
						bno, end);
			}
			break;
		}

		if (bno >= first_bno + len)
			break;
	}

out_unlock_ilock:
	xfs_iunlock(ip, lock);
out_unlock_iolock:
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
	return error;
}
/*
 * Dead simple method of punching delayed allocation blocks from a range in
 * the inode.  This will always punch out both the start and end blocks, even
 * if the ranges only partially overlap them, so it is up to the caller to
 * ensure that partial blocks are not passed in.
 */
void
xfs_bmap_punch_delalloc_range(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_off_t		start_byte,
	xfs_off_t		end_byte)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, start_byte);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, end_byte);
	struct xfs_bmbt_irec	got, del;
	struct xfs_iext_cursor	icur;

	/* Delalloc extents imply an in-core extent list already exists. */
	ASSERT(!xfs_need_iread_extents(ifp));

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
		goto out_unlock;

	/* Walk backwards from the end of the range towards start_fsb. */
	while (got.br_startoff + got.br_blockcount > start_fsb) {
		del = got;
		xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);

		/*
		 * A delete can push the cursor forward. Step back to the
		 * previous extent on non-delalloc or extents outside the
		 * target range.
		 */
		if (!del.br_blockcount ||
		    !isnullstartblock(del.br_startblock)) {
			if (!xfs_iext_prev_extent(ifp, &icur, &got))
				break;
			continue;
		}

		xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
		if (!xfs_iext_get_extent(ifp, &icur, &got))
			break;
	}

	/* Clear the CoW tag once the CoW fork has been fully emptied. */
	if (whichfork == XFS_COW_FORK && !ifp->if_bytes)
		xfs_inode_clear_cowblocks_tag(ip);

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
/*
 * Test whether it is appropriate to check an inode for and free post EOF
 * blocks.
 */
bool
xfs_can_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			found_blocks = false;
	xfs_fileoff_t		end_fsb;
	xfs_fileoff_t		last_fsb;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;

	/*
	 * Caller must either hold the exclusive io lock; or be inactivating
	 * the inode, which guarantees there are no other users of the inode.
	 */
	if (!(VFS_I(ip)->i_state & I_FREEING))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);

	/* prealloc/delalloc exists only on regular files */
	if (!S_ISREG(VFS_I(ip)->i_mode))
		return false;

	/*
	 * Zero sized files with no cached pages and delalloc blocks will not
	 * have speculative prealloc/delalloc blocks to remove.
	 */
	if (VFS_I(ip)->i_size == 0 &&
	    VFS_I(ip)->i_mapping->nrpages == 0 &&
	    ip->i_delayed_blks == 0)
		return false;

	/* If we haven't read in the extent list, then don't do it now. */
	if (xfs_need_iread_extents(&ip->i_df))
		return false;

	/*
	 * Do not free real extents in preallocated files unless the file has
	 * delalloc blocks and we are forced to remove them.
	 */
	if ((ip->i_diflags & XFS_DIFLAG_PREALLOC) && !ip->i_delayed_blks)
		return false;

	/*
	 * Do not try to free post-EOF blocks if EOF is beyond the end of the
	 * range supported by the page cache, because the truncation will loop
	 * forever.
	 */
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
	if (xfs_inode_has_bigrtalloc(ip))
		end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb);
	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (last_fsb <= end_fsb)
		return false;

	/*
	 * Check if there is a post-EOF extent to free.  If there are any
	 * delalloc blocks attached to the inode (data fork delalloc
	 * reservations or CoW extents of any kind), we need to free them so
	 * that inactivation doesn't fail to erase them.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (ip->i_delayed_blks ||
	    xfs_iext_lookup_extent(ip, &ip->i_df, end_fsb, &icur, &imap))
		found_blocks = true;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return found_blocks;
}
/*
 * This is called to free any blocks beyond eof. The caller must hold
 * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
 * reference to the inode.
 */
int
xfs_free_eofblocks(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/* Attach the dquots to the inode up front. */
	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	/* Wait on dio to ensure i_size has settled. */
	inode_dio_wait(VFS_I(ip));

	/*
	 * For preallocated files only free delayed allocations.
	 *
	 * Note that this means we also leave speculative preallocations in
	 * place for preallocated files.
	 */
	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
		if (ip->i_delayed_blks) {
			/* Punch everything from EOF (block-aligned up) on. */
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
				round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
				LLONG_MAX);
		}
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Do not update the on-disk file size. If we update the on-disk file
	 * size and then the system crashes before the contents of the file are
	 * flushed to disk then the files may be full of holes (ie NULL files
	 * bug).
	 */
	error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
				XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
	if (error)
		goto err_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_unlock;

	xfs_inode_clear_eofblocks_tag(ip);
	goto out_unlock;

err_cancel:
	/*
	 * If we get an error at this point we simply don't
	 * bother truncating the file.
	 */
	xfs_trans_cancel(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * Preallocate disk space for the byte range [offset, offset + len) of the
 * file.  Allocations are done as unwritten extents (XFS_BMAPI_PREALLOC) in
 * one or more transactions; the inode gains XFS_DIFLAG_PREALLOC.  Returns 0
 * or a negative errno.
 */
int
xfs_alloc_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	xfs_mount_t		*mp = ip->i_mount;
	xfs_off_t		count;
	xfs_filblks_t		allocatesize_fsb;
	xfs_extlen_t		extsz, temp;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			rt;
	xfs_trans_t		*tp;
	xfs_bmbt_irec_t		imaps[1], *imapp;
	int			error;

	/* always-CoW inodes never preallocate; writes go via the CoW fork */
	if (xfs_is_always_cow_inode(ip))
		return 0;

	trace_xfs_alloc_file_space(ip);

	if (xfs_is_shutdown(mp))
		return -EIO;

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)
		return -EINVAL;

	rt = XFS_IS_REALTIME_INODE(ip);
	extsz = xfs_get_extsz_hint(ip);

	count = len;
	imapp = &imaps[0];
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
	allocatesize_fsb = endoffset_fsb - startoffset_fsb;

	/*
	 * Allocate file space until done or until there is an error
	 */
	while (allocatesize_fsb && !error) {
		xfs_fileoff_t	s, e;
		unsigned int	dblocks, rblocks, resblks;
		int		nimaps = 1;

		/*
		 * Determine space reservations for data/realtime.
		 * With an extent size hint, widen [s, e) to hint-aligned
		 * boundaries so the reservation covers the aligned range.
		 */
		if (unlikely(extsz)) {
			s = startoffset_fsb;
			do_div(s, extsz);
			s *= extsz;
			e = startoffset_fsb + allocatesize_fsb;
			div_u64_rem(startoffset_fsb, extsz, &temp);
			if (temp)
				e += temp;
			div_u64_rem(e, extsz, &temp);
			if (temp)
				e += extsz - temp;
		} else {
			s = 0;
			e = allocatesize_fsb;
		}

		/*
		 * The transaction reservation is limited to a 32-bit block
		 * count, hence we need to limit the number of blocks we are
		 * trying to reserve to avoid an overflow. We can't allocate
		 * more than @nimaps extents, and an extent is limited on disk
		 * to XFS_BMBT_MAX_EXTLEN (21 bits), so use that to enforce the
		 * limit.
		 */
		resblks = min_t(xfs_fileoff_t, (e - s),
				(XFS_MAX_BMBT_EXTLEN * nimaps));
		if (unlikely(rt)) {
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			rblocks = resblks;
		} else {
			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
			rblocks = 0;
		}

		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
				dblocks, rblocks, false, &tp);
		if (error)
			break;

		error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
				XFS_IEXT_ADD_NOSPLIT_CNT);
		if (error)
			goto error;

		/*
		 * If the allocator cannot find a single free extent large
		 * enough to cover the start block of the requested range,
		 * xfs_bmapi_write will return -ENOSR.
		 *
		 * In that case we simply need to keep looping with the same
		 * startoffset_fsb so that one of the following allocations
		 * will eventually reach the requested range.
		 */
		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
				&nimaps);
		if (error) {
			if (error != -ENOSR)
				goto error;
			error = 0;
		} else {
			startoffset_fsb += imapp->br_blockcount;
			allocatesize_fsb -= imapp->br_blockcount;
		}

		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

		error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

	return error;

error:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * Unmap up to @len_fsb blocks of the data fork starting at @startoffset_fsb
 * in a single transaction.  @done is set by xfs_bunmapi when the whole range
 * has been unmapped; callers loop until then.
 */
static int
xfs_unmap_extent(
	struct xfs_inode	*ip,
	xfs_fileoff_t		startoffset_fsb,
	xfs_filblks_t		len_fsb,
	int			*done)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	int			error;

	/* xfs_trans_alloc_inode returns with the ILOCK held on success. */
	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
			false, &tp);
	if (error)
		return error;

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error)
		goto out_trans_cancel;

	/* nexts == 2 caps how many extents one transaction may remove. */
	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, done);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock;
}
  696. /* Caller must first wait for the completion of any pending DIOs if required. */
  697. int
  698. xfs_flush_unmap_range(
  699. struct xfs_inode *ip,
  700. xfs_off_t offset,
  701. xfs_off_t len)
  702. {
  703. struct inode *inode = VFS_I(ip);
  704. xfs_off_t rounding, start, end;
  705. int error;
  706. /*
  707. * Make sure we extend the flush out to extent alignment
  708. * boundaries so any extent range overlapping the start/end
  709. * of the modification we are about to do is clean and idle.
  710. */
  711. rounding = max_t(xfs_off_t, xfs_inode_alloc_unitsize(ip), PAGE_SIZE);
  712. start = rounddown_64(offset, rounding);
  713. end = roundup_64(offset + len, rounding) - 1;
  714. error = filemap_write_and_wait_range(inode->i_mapping, start, end);
  715. if (error)
  716. return error;
  717. truncate_pagecache_range(inode, start, end);
  718. return 0;
  719. }
/*
 * Punch a hole in the file: unmap all whole blocks in the byte range
 * [offset, offset + len) and zero any partial blocks at the edges.
 * Returns 0 or a negative errno.
 */
int
xfs_free_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		startoffset_fsb;
	xfs_fileoff_t		endoffset_fsb;
	int			done = 0, error;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip);
	if (error)
		return error;

	if (len <= 0)	/* if nothing being freed */
		return 0;

	/*
	 * Now AIO and DIO has drained we flush and (if necessary) invalidate
	 * the cached range over the first operation we are about to run.
	 */
	error = xfs_flush_unmap_range(ip, offset, len);
	if (error)
		return error;

	/* Round inward so only fully-covered blocks are unmapped. */
	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	/* We can only free complete realtime extents. */
	if (xfs_inode_has_bigrtalloc(ip)) {
		startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb);
		endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb);
	}

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 */
	if (endoffset_fsb > startoffset_fsb) {
		while (!done) {
			error = xfs_unmap_extent(ip, startoffset_fsb,
					endoffset_fsb - startoffset_fsb,
					&done);
			if (error)
				return error;
		}
	}

	/*
	 * Now that we've unmapped all full blocks we'll have to zero out any
	 * partial block at the beginning and/or end.  xfs_zero_range is smart
	 * enough to skip any holes, including those we just created, but we
	 * must take care not to zero beyond EOF and enlarge i_size.
	 */
	if (offset >= XFS_ISIZE(ip))
		return 0;
	if (offset + len > XFS_ISIZE(ip))
		len = XFS_ISIZE(ip) - offset;
	error = xfs_zero_range(ip, offset, len, NULL);
	if (error)
		return error;

	/*
	 * If we zeroed right up to EOF and EOF straddles a page boundary we
	 * must make sure that the post-EOF area is also zeroed because the
	 * page could be mmap'd and xfs_zero_range doesn't do that for us.
	 * Writeback of the eof page will do this, albeit clumsily.
	 */
	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
	}

	return error;
}
/*
 * Prepare the file for an extent shift (collapse/insert range): trim post-EOF
 * preallocations, write back and invalidate the page cache from just before
 * @offset through EOF, and cancel any lingering COW fork extents over that
 * range.  Order matters here — see the comments on each step.
 *
 * Returns 0 on success or a negative errno.
 */
static int
xfs_prepare_shift(
	struct xfs_inode	*ip,
	loff_t			offset)	/* byte offset the shift starts at */
{
	unsigned int		rounding;
	int			error;

	/*
	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
	 * into the accessible region of the file.
	 */
	if (xfs_can_free_eofblocks(ip)) {
		error = xfs_free_eofblocks(ip);
		if (error)
			return error;
	}

	/*
	 * Shift operations must stabilize the start block offset boundary along
	 * with the full range of the operation. If we don't, a COW writeback
	 * completion could race with an insert, front merge with the start
	 * extent (after split) during the shift and corrupt the file. Start
	 * with the allocation unit just prior to the start to stabilize the
	 * boundary.
	 */
	rounding = xfs_inode_alloc_unitsize(ip);
	offset = rounddown_64(offset, rounding);
	if (offset)
		offset -= rounding;

	/*
	 * Writeback and invalidate cache for the remainder of the file as we're
	 * about to shift down every extent from offset to EOF.
	 */
	error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
	if (error)
		return error;

	/*
	 * Clean out anything hanging around in the cow fork now that
	 * we've flushed all the dirty data out to disk to avoid having
	 * CoW extents at the wrong offsets.
	 */
	if (xfs_inode_has_cow_data(ip)) {
		error = xfs_reflink_cancel_cow_range(ip, offset, NULLFILEOFF,
				true);
		if (error)
			return error;
	}

	return 0;
}
/*
 * xfs_collapse_file_space()
 *	This routine frees disk space and shifts extents for the given file.
 *	The first thing we do is free the data blocks in the specified range
 *	by calling xfs_free_file_space(), which also syncs dirty data and
 *	invalidates the page cache over the region the collapse range is
 *	working on.  Extent records are then shifted to the left to cover
 *	the hole just created.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
int
xfs_collapse_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,		/* byte offset of range to remove */
	xfs_off_t		len)		/* byte length of range to remove */
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		next_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	trace_xfs_collapse_file_space(ip);

	/* Punch out the blocks backing the range being collapsed. */
	error = xfs_free_file_space(ip, offset, len);
	if (error)
		return error;

	/* Flush, invalidate and stabilize everything from offset to EOF. */
	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/* Shift extents left in batches until the whole range is done. */
	while (!done) {
		error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
				&done);
		if (error)
			goto out_trans_cancel;
		if (done)
			break;

		/* finish any deferred frees and roll the transaction */
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;
	}

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * xfs_insert_file_space()
 *	This routine creates a hole in the file by shifting extents to the
 *	right.  The first thing we do is sync dirty data and invalidate the
 *	page cache over the region the insert range is working on.  We then
 *	split an extent into two at the given offset by calling
 *	xfs_bmap_split_extent(), and shift all extent records lying between
 *	[offset, last allocated extent] to the right to open up the hole.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
int
xfs_insert_file_space(
	struct xfs_inode	*ip,
	loff_t			offset,		/* byte offset of the new hole */
	loff_t			len)		/* byte length of the new hole */
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;
	xfs_fileoff_t		stop_fsb = XFS_B_TO_FSB(mp, offset);
	xfs_fileoff_t		next_fsb = NULLFSBLOCK;
	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
	bool			done = false;

	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	trace_xfs_insert_file_space(ip);

	/* Check the shift is legal before any modifications are made. */
	error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
	if (error)
		return error;

	/* Flush, invalidate and stabilize everything from offset to EOF. */
	error = xfs_prepare_shift(ip, offset);
	if (error)
		return error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
			XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_PUNCH_HOLE_CNT);
	if (error)
		goto out_trans_cancel;

	/*
	 * The extent shifting code works on extent granularity. So, if stop_fsb
	 * is not the starting block of extent, we need to split the extent at
	 * stop_fsb.
	 */
	error = xfs_bmap_split_extent(tp, ip, stop_fsb);
	if (error)
		goto out_trans_cancel;

	/* Shift extents right in batches, rolling the transaction as we go. */
	do {
		error = xfs_defer_finish(&tp);
		if (error)
			goto out_trans_cancel;

		error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
				&done, stop_fsb);
		if (error)
			goto out_trans_cancel;
	} while (!done);

	error = xfs_trans_commit(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
/*
 * We need to check that the format of the data fork in the temporary inode is
 * valid for the target inode before doing the swap. This is not a problem with
 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
 * data fork depending on the space the attribute fork is taking so we can get
 * invalid formats on the target inode.
 *
 * E.g. target has space for 7 extents in extent format, temp inode only has
 * space for 6.  If we defragment down to 7 extents, then the tmp format is a
 * btree, but when swapped it needs to be in extent format. Hence we can't just
 * blindly swap data forks on attr2 filesystems.
 *
 * Note that we check the swap in both directions so that we don't end up with
 * a corrupt temporary inode, either.
 *
 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
 * inode will prevent this situation from occurring, so all we do here is
 * reject and log the attempt. basically we are putting the responsibility on
 * userspace to get this right.
 */
static int
xfs_swap_extents_check_format(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip)	/* tmp inode */
{
	struct xfs_ifork	*ifp = &ip->i_df;
	struct xfs_ifork	*tifp = &tip->i_df;

	/* User/group/project quota ids must match if quotas are enforced. */
	if (XFS_IS_QUOTA_ON(ip->i_mount) &&
	    (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
	     !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
	     ip->i_projid != tip->i_projid))
		return -EINVAL;

	/* Should never get a local format */
	if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
	    tifp->if_format == XFS_DINODE_FMT_LOCAL)
		return -EINVAL;

	/*
	 * if the target inode has fewer extents than the temporary inode then
	 * why did userspace call us?
	 */
	if (ifp->if_nextents < tifp->if_nextents)
		return -EINVAL;

	/*
	 * If we have to use the (expensive) rmap swap method, we can
	 * handle any number of extents and any format.
	 */
	if (xfs_has_rmapbt(ip->i_mount))
		return 0;

	/*
	 * if the target inode is in extent form and the temp inode is in btree
	 * form then we will end up with the target inode in the wrong format
	 * as we already know there are less extents in the temp inode.
	 */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_format == XFS_DINODE_FMT_BTREE)
		return -EINVAL;

	/* Check temp in extent form to max in target */
	if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
		return -EINVAL;

	/* Check target in extent form to max in temp */
	if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
	    ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
		return -EINVAL;

	/*
	 * If we are in a btree format, check that the temp root block will fit
	 * in the target and that it has enough extents to be in btree format
	 * in the target.
	 *
	 * Note that we have to be careful to allow btree->extent conversions
	 * (a common defrag case) which will occur when the temp inode is in
	 * extent format...
	 */
	if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(ip) &&
		    xfs_bmap_bmdr_space(tifp->if_broot) > xfs_inode_fork_boff(ip))
			return -EINVAL;
		if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
			return -EINVAL;
	}

	/* Reciprocal target->temp btree format checks */
	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
		if (xfs_inode_has_attr_fork(tip) &&
		    xfs_bmap_bmdr_space(ip->i_df.if_broot) > xfs_inode_fork_boff(tip))
			return -EINVAL;
		if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
			return -EINVAL;
	}

	return 0;
}
  1050. static int
  1051. xfs_swap_extent_flush(
  1052. struct xfs_inode *ip)
  1053. {
  1054. int error;
  1055. error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
  1056. if (error)
  1057. return error;
  1058. truncate_pagecache_range(VFS_I(ip), 0, -1);
  1059. /* Verify O_DIRECT for ftmp */
  1060. if (VFS_I(ip)->i_mapping->nrpages)
  1061. return -EINVAL;
  1062. return 0;
  1063. }
/*
 * Move extents from one file to another, when rmap is enabled.
 */
STATIC int
xfs_swap_extent_rmap(
	struct xfs_trans	**tpp,	/* in/out: transaction, may be rolled */
	struct xfs_inode	*ip,	/* source inode */
	struct xfs_inode	*tip)	/* donor (tmp) inode */
{
	struct xfs_trans	*tp = *tpp;
	struct xfs_bmbt_irec	irec;	/* current source extent */
	struct xfs_bmbt_irec	uirec;	/* donor extent trimmed to irec */
	struct xfs_bmbt_irec	tirec;	/* current donor extent */
	xfs_fileoff_t		offset_fsb;
	xfs_fileoff_t		end_fsb;
	xfs_filblks_t		count_fsb;
	int			error;
	xfs_filblks_t		ilen;
	xfs_filblks_t		rlen;
	int			nimaps;
	uint64_t		tip_flags2;

	/*
	 * If the source file has shared blocks, we must flag the donor
	 * file as having shared blocks so that we get the shared-block
	 * rmap functions when we go to fix up the rmaps. The flags
	 * will be switched for real later.
	 */
	tip_flags2 = tip->i_diflags2;
	if (ip->i_diflags2 & XFS_DIFLAG2_REFLINK)
		tip->i_diflags2 |= XFS_DIFLAG2_REFLINK;

	offset_fsb = 0;
	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);

	/* Walk the source file one donor extent at a time. */
	while (count_fsb) {
		/* Read extent from the donor file */
		nimaps = 1;
		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
				&nimaps, 0);
		if (error)
			goto out;
		ASSERT(nimaps == 1);
		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);

		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
		ilen = tirec.br_blockcount;

		/* Unmap the old blocks in the source file. */
		while (tirec.br_blockcount) {
			ASSERT(tp->t_highest_agno == NULLAGNUMBER);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);

			/* Read extent from the source file */
			nimaps = 1;
			error = xfs_bmapi_read(ip, tirec.br_startoff,
					tirec.br_blockcount, &irec,
					&nimaps, 0);
			if (error)
				goto out;
			ASSERT(nimaps == 1);
			ASSERT(tirec.br_startoff == irec.br_startoff);
			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);

			/* Trim the extent. */
			uirec = tirec;
			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
					tirec.br_blockcount,
					irec.br_blockcount);
			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);

			/* Reserve extent-count headroom before remapping. */
			if (xfs_bmap_is_real_extent(&uirec)) {
				error = xfs_iext_count_extend(tp, ip,
						XFS_DATA_FORK,
						XFS_IEXT_SWAP_RMAP_CNT);
				if (error)
					goto out;
			}

			if (xfs_bmap_is_real_extent(&irec)) {
				error = xfs_iext_count_extend(tp, tip,
						XFS_DATA_FORK,
						XFS_IEXT_SWAP_RMAP_CNT);
				if (error)
					goto out;
			}

			/* Remove the mapping from the donor file. */
			xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);

			/* Remove the mapping from the source file. */
			xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);

			/* Map the donor file's blocks into the source file. */
			xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);

			/* Map the source file's blocks into the donor file. */
			xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);

			/* Finishing the deferred work may roll *tpp. */
			error = xfs_defer_finish(tpp);
			tp = *tpp;
			if (error)
				goto out;

			tirec.br_startoff += rlen;
			if (tirec.br_startblock != HOLESTARTBLOCK &&
			    tirec.br_startblock != DELAYSTARTBLOCK)
				tirec.br_startblock += rlen;
			tirec.br_blockcount -= rlen;
		}

		/* Roll on... */
		count_fsb -= ilen;
		offset_fsb += ilen;
	}

	tip->i_diflags2 = tip_flags2;
	return 0;

out:
	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
	/* Restore the donor's original flags on failure. */
	tip->i_diflags2 = tip_flags2;
	return error;
}
/* Swap the extents of two files by swapping data forks. */
STATIC int
xfs_swap_extent_forks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip,	/* tmp inode */
	int			*src_log_flags,	   /* in/out: XFS_ILOG_* for ip */
	int			*target_log_flags) /* in/out: XFS_ILOG_* for tip */
{
	xfs_filblks_t		aforkblks = 0;
	xfs_filblks_t		taforkblks = 0;
	xfs_extnum_t		junk;
	uint64_t		tmp;
	int			error;

	/*
	 * Count the number of extended attribute blocks
	 */
	if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 &&
	    ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
				&aforkblks);
		if (error)
			return error;
	}
	if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 &&
	    tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
				&taforkblks);
		if (error)
			return error;
	}

	/*
	 * Btree format (v3) inodes have the inode number stamped in the bmbt
	 * block headers. We can't start changing the bmbt blocks until the
	 * inode owner change is logged so recovery does the right thing in the
	 * event of a crash. Set the owner change log flags now and leave the
	 * bmbt scan as the last step.
	 */
	if (xfs_has_v3inodes(ip->i_mount)) {
		if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			(*target_log_flags) |= XFS_ILOG_DOWNER;
		if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
			(*src_log_flags) |= XFS_ILOG_DOWNER;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	swap(ip->i_df, tip->i_df);

	/*
	 * Fix the on-disk inode values.  Attr fork blocks stay with their
	 * inode, so adjust each i_nblocks by the difference in attr blocks.
	 */
	tmp = (uint64_t)ip->i_nblocks;
	ip->i_nblocks = tip->i_nblocks - taforkblks + aforkblks;
	tip->i_nblocks = tmp + taforkblks - aforkblks;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	/* Log the post-swap data fork format of each inode. */
	switch (ip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*src_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (*src_log_flags & XFS_ILOG_DOWNER));
		(*src_log_flags) |= XFS_ILOG_DBROOT;
		break;
	}

	switch (tip->i_df.if_format) {
	case XFS_DINODE_FMT_EXTENTS:
		(*target_log_flags) |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		(*target_log_flags) |= XFS_ILOG_DBROOT;
		ASSERT(!xfs_has_v3inodes(ip->i_mount) ||
		       (*target_log_flags & XFS_ILOG_DOWNER));
		break;
	}

	return 0;
}
  1259. /*
  1260. * Fix up the owners of the bmbt blocks to refer to the current inode. The
  1261. * change owner scan attempts to order all modified buffers in the current
  1262. * transaction. In the event of ordered buffer failure, the offending buffer is
  1263. * physically logged as a fallback and the scan returns -EAGAIN. We must roll
  1264. * the transaction in this case to replenish the fallback log reservation and
  1265. * restart the scan. This process repeats until the scan completes.
  1266. */
  1267. static int
  1268. xfs_swap_change_owner(
  1269. struct xfs_trans **tpp,
  1270. struct xfs_inode *ip,
  1271. struct xfs_inode *tmpip)
  1272. {
  1273. int error;
  1274. struct xfs_trans *tp = *tpp;
  1275. do {
  1276. error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
  1277. NULL);
  1278. /* success or fatal error */
  1279. if (error != -EAGAIN)
  1280. break;
  1281. error = xfs_trans_roll(tpp);
  1282. if (error)
  1283. break;
  1284. tp = *tpp;
  1285. /*
  1286. * Redirty both inodes so they can relog and keep the log tail
  1287. * moving forward.
  1288. */
  1289. xfs_trans_ijoin(tp, ip, 0);
  1290. xfs_trans_ijoin(tp, tmpip, 0);
  1291. xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  1292. xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
  1293. } while (true);
  1294. return error;
  1295. }
/*
 * Swap the block mappings of two inodes (the XFS_IOC_SWAPEXT path used by
 * xfs_fsr defragmentation).  Validates that the files are compatible and
 * unchanged since the caller sampled them, then exchanges data forks (or
 * remaps extent by extent on rmapbt filesystems), reflink flags and COW
 * forks, fixing bmbt block ownership afterwards where needed.
 *
 * Returns 0 on success or a negative errno.
 */
int
xfs_swap_extents(
	struct xfs_inode	*ip,	/* target inode */
	struct xfs_inode	*tip,	/* tmp inode */
	struct xfs_swapext	*sxp)	/* userspace swap request/stat snapshot */
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_bstat	*sbp = &sxp->sx_stat;
	int			src_log_flags, target_log_flags;
	int			error = 0;
	uint64_t		f;
	int			resblks = 0;
	unsigned int		flags = 0;
	struct timespec64	ctime, mtime;

	/*
	 * Lock the inodes against other IO, page faults and truncate to
	 * begin with. Then we can ensure the inodes are flushed and have no
	 * page cache safely. Once we have done this we can take the ilocks and
	 * do the rest of the checks.
	 */
	lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	filemap_invalidate_lock_two(VFS_I(ip)->i_mapping,
				    VFS_I(tip)->i_mapping);

	/* Verify that both files have the same format */
	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = -EINVAL;
		goto out_unlock;
	}

	/* Attach dquots to both inodes before moving blocks around. */
	error = xfs_qm_dqattach(ip);
	if (error)
		goto out_unlock;

	error = xfs_qm_dqattach(tip);
	if (error)
		goto out_unlock;

	/* Flush dirty data and empty the page cache of both files. */
	error = xfs_swap_extent_flush(ip);
	if (error)
		goto out_unlock;
	error = xfs_swap_extent_flush(tip);
	if (error)
		goto out_unlock;

	/* Drop any COW fork extents the donor accumulated before the swap. */
	if (xfs_inode_has_cow_data(tip)) {
		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
		if (error)
			goto out_unlock;
	}

	/*
	 * Extent "swapping" with rmap requires a permanent reservation and
	 * a block reservation because it's really just a remap operation
	 * performed with log redo items!
	 */
	if (xfs_has_rmapbt(mp)) {
		int		w = XFS_DATA_FORK;
		uint32_t	ipnext = ip->i_df.if_nextents;
		uint32_t	tipnext = tip->i_df.if_nextents;

		/*
		 * Conceptually this shouldn't affect the shape of either bmbt,
		 * but since we atomically move extents one by one, we reserve
		 * enough space to rebuild both trees.
		 */
		resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w);
		resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);

		/*
		 * If either inode straddles a bmapbt block allocation boundary,
		 * the rmapbt algorithm triggers repeated allocs and frees as
		 * extents are remapped. This can exhaust the block reservation
		 * prematurely and cause shutdown. Return freed blocks to the
		 * transaction reservation to counter this behavior.
		 */
		flags |= XFS_TRANS_RES_FDBLKS;
	}
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
				&tp);
	if (error)
		goto out_unlock;

	/*
	 * Lock and join the inodes to the transaction so that transaction
	 * commit or cancel will unlock the inodes from this point onwards.
	 */
	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);
	xfs_trans_ijoin(tp, tip, 0);

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_disk_size ||
	    sxp->sx_length != tip->i_disk_size) {
		error = -EFAULT;
		goto out_trans_cancel;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_trans_cancel;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	ctime = inode_get_ctime(VFS_I(ip));
	mtime = inode_get_mtime(VFS_I(ip));
	if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) {
		error = -EBUSY;
		goto out_trans_cancel;
	}

	/*
	 * Note the trickiness in setting the log flags - we set the owner log
	 * flag on the opposite inode (i.e. the inode we are setting the new
	 * owner to be) because once we swap the forks and log that, log
	 * recovery is going to see the fork as owned by the swapped inode,
	 * not the pre-swapped inodes.
	 */
	src_log_flags = XFS_ILOG_CORE;
	target_log_flags = XFS_ILOG_CORE;

	if (xfs_has_rmapbt(mp))
		error = xfs_swap_extent_rmap(&tp, ip, tip);
	else
		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
				&target_log_flags);
	if (error)
		goto out_trans_cancel;

	/* Do we have to swap reflink flags? */
	if ((ip->i_diflags2 & XFS_DIFLAG2_REFLINK) ^
	    (tip->i_diflags2 & XFS_DIFLAG2_REFLINK)) {
		f = ip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		ip->i_diflags2 |= tip->i_diflags2 & XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
		tip->i_diflags2 |= f & XFS_DIFLAG2_REFLINK;
	}

	/* Swap the cow forks. */
	if (xfs_has_reflink(mp)) {
		ASSERT(!ip->i_cowfp ||
		       ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
		ASSERT(!tip->i_cowfp ||
		       tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);

		swap(ip->i_cowfp, tip->i_cowfp);

		/* Retag each inode for the COW fork it now owns. */
		if (ip->i_cowfp && ip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(ip);
		else
			xfs_inode_clear_cowblocks_tag(ip);
		if (tip->i_cowfp && tip->i_cowfp->if_bytes)
			xfs_inode_set_cowblocks_tag(tip);
		else
			xfs_inode_clear_cowblocks_tag(tip);
	}

	xfs_trans_log_inode(tp, ip, src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
	 * have inode number owner values in the bmbt blocks that still refer to
	 * the old inode. Scan each bmbt to fix up the owner values with the
	 * inode number of the current inode.
	 */
	if (src_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, ip, tip);
		if (error)
			goto out_trans_cancel;
	}
	if (target_log_flags & XFS_ILOG_DOWNER) {
		error = xfs_swap_change_owner(&tp, tip, ip);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);

out_unlock_ilock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);
out_unlock:
	filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping,
				      VFS_I(tip)->i_mapping);
	unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	goto out_unlock_ilock;
}