xfs_exchrange.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928
  // SPDX-License-Identifier: GPL-2.0-or-later
  /*
   * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
   * Author: Darrick J. Wong <djwong@kernel.org>
   */
  6. #include "xfs.h"
  7. #include "xfs_shared.h"
  8. #include "xfs_format.h"
  9. #include "xfs_log_format.h"
  10. #include "xfs_trans_resv.h"
  11. #include "xfs_mount.h"
  12. #include "xfs_defer.h"
  13. #include "xfs_inode.h"
  14. #include "xfs_trans.h"
  15. #include "xfs_quota.h"
  16. #include "xfs_bmap_util.h"
  17. #include "xfs_reflink.h"
  18. #include "xfs_trace.h"
  19. #include "xfs_exchrange.h"
  20. #include "xfs_exchmaps.h"
  21. #include "xfs_sb.h"
  22. #include "xfs_icache.h"
  23. #include "xfs_log.h"
  24. #include "xfs_rtbitmap.h"
  25. #include <linux/fsnotify.h>
  26. /* Lock (and optionally join) two inodes for a file range exchange. */
  27. void
  28. xfs_exchrange_ilock(
  29. struct xfs_trans *tp,
  30. struct xfs_inode *ip1,
  31. struct xfs_inode *ip2)
  32. {
  33. if (ip1 != ip2)
  34. xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
  35. ip2, XFS_ILOCK_EXCL);
  36. else
  37. xfs_ilock(ip1, XFS_ILOCK_EXCL);
  38. if (tp) {
  39. xfs_trans_ijoin(tp, ip1, 0);
  40. if (ip2 != ip1)
  41. xfs_trans_ijoin(tp, ip2, 0);
  42. }
  43. }
  44. /* Unlock two inodes after a file range exchange operation. */
  45. void
  46. xfs_exchrange_iunlock(
  47. struct xfs_inode *ip1,
  48. struct xfs_inode *ip2)
  49. {
  50. if (ip2 != ip1)
  51. xfs_iunlock(ip2, XFS_ILOCK_EXCL);
  52. xfs_iunlock(ip1, XFS_ILOCK_EXCL);
  53. }
  54. /*
  55. * Estimate the resource requirements to exchange file contents between the two
  56. * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
  57. * have flushed both inodes' pagecache and active direct-ios.
  58. */
  59. int
  60. xfs_exchrange_estimate(
  61. struct xfs_exchmaps_req *req)
  62. {
  63. int error;
  64. xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
  65. error = xfs_exchmaps_estimate(req);
  66. xfs_exchrange_iunlock(req->ip1, req->ip2);
  67. return error;
  68. }
  69. /*
  70. * Check that file2's metadata agree with the snapshot that we took for the
  71. * range commit request.
  72. *
  73. * This should be called after the filesystem has locked /all/ inode metadata
  74. * against modification.
  75. */
  76. STATIC int
  77. xfs_exchrange_check_freshness(
  78. const struct xfs_exchrange *fxr,
  79. struct xfs_inode *ip2)
  80. {
  81. struct inode *inode2 = VFS_I(ip2);
  82. struct timespec64 ctime = inode_get_ctime(inode2);
  83. struct timespec64 mtime = inode_get_mtime(inode2);
  84. trace_xfs_exchrange_freshness(fxr, ip2);
  85. /* Check that file2 hasn't otherwise been modified. */
  86. if (fxr->file2_ino != ip2->i_ino ||
  87. fxr->file2_gen != inode2->i_generation ||
  88. !timespec64_equal(&fxr->file2_ctime, &ctime) ||
  89. !timespec64_equal(&fxr->file2_mtime, &mtime))
  90. return -EBUSY;
  91. return 0;
  92. }
  93. #define QRETRY_IP1 (0x1)
  94. #define QRETRY_IP2 (0x2)
  95. /*
  96. * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
  97. * this if quota enforcement is disabled or if both inodes' dquots are the
  98. * same. The qretry structure must be initialized to zeroes before the first
  99. * call to this function.
  100. */
  101. STATIC int
  102. xfs_exchrange_reserve_quota(
  103. struct xfs_trans *tp,
  104. const struct xfs_exchmaps_req *req,
  105. unsigned int *qretry)
  106. {
  107. int64_t ddelta, rdelta;
  108. int ip1_error = 0;
  109. int error;
  110. /*
  111. * Don't bother with a quota reservation if we're not enforcing them
  112. * or the two inodes have the same dquots.
  113. */
  114. if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
  115. (req->ip1->i_udquot == req->ip2->i_udquot &&
  116. req->ip1->i_gdquot == req->ip2->i_gdquot &&
  117. req->ip1->i_pdquot == req->ip2->i_pdquot))
  118. return 0;
  119. *qretry = 0;
  120. /*
  121. * For each file, compute the net gain in the number of regular blocks
  122. * that will be mapped into that file and reserve that much quota. The
  123. * quota counts must be able to absorb at least that much space.
  124. */
  125. ddelta = req->ip2_bcount - req->ip1_bcount;
  126. rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
  127. if (ddelta > 0 || rdelta > 0) {
  128. error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
  129. ddelta > 0 ? ddelta : 0,
  130. rdelta > 0 ? rdelta : 0,
  131. false);
  132. if (error == -EDQUOT || error == -ENOSPC) {
  133. /*
  134. * Save this error and see what happens if we try to
  135. * reserve quota for ip2. Then report both.
  136. */
  137. *qretry |= QRETRY_IP1;
  138. ip1_error = error;
  139. error = 0;
  140. }
  141. if (error)
  142. return error;
  143. }
  144. if (ddelta < 0 || rdelta < 0) {
  145. error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
  146. ddelta < 0 ? -ddelta : 0,
  147. rdelta < 0 ? -rdelta : 0,
  148. false);
  149. if (error == -EDQUOT || error == -ENOSPC)
  150. *qretry |= QRETRY_IP2;
  151. if (error)
  152. return error;
  153. }
  154. if (ip1_error)
  155. return ip1_error;
  156. /*
  157. * For each file, forcibly reserve the gross gain in mapped blocks so
  158. * that we don't trip over any quota block reservation assertions.
  159. * We must reserve the gross gain because the quota code subtracts from
  160. * bcount the number of blocks that we unmap; it does not add that
  161. * quantity back to the quota block reservation.
  162. */
  163. error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
  164. req->ip1_rtbcount, true);
  165. if (error)
  166. return error;
  167. return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
  168. req->ip2_rtbcount, true);
  169. }
  170. /* Exchange the mappings (and hence the contents) of two files' forks. */
  171. STATIC int
  172. xfs_exchrange_mappings(
  173. const struct xfs_exchrange *fxr,
  174. struct xfs_inode *ip1,
  175. struct xfs_inode *ip2)
  176. {
  177. struct xfs_mount *mp = ip1->i_mount;
  178. struct xfs_exchmaps_req req = {
  179. .ip1 = ip1,
  180. .ip2 = ip2,
  181. .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
  182. .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
  183. .blockcount = XFS_B_TO_FSB(mp, fxr->length),
  184. };
  185. struct xfs_trans *tp;
  186. unsigned int qretry;
  187. bool retried = false;
  188. int error;
  189. trace_xfs_exchrange_mappings(fxr, ip1, ip2);
  190. if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
  191. req.flags |= XFS_EXCHMAPS_SET_SIZES;
  192. if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
  193. req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
  194. /*
  195. * Round the request length up to the nearest file allocation unit.
  196. * The prep function already checked that the request offsets and
  197. * length in @fxr are safe to round up.
  198. */
  199. if (xfs_inode_has_bigrtalloc(ip2))
  200. req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
  201. error = xfs_exchrange_estimate(&req);
  202. if (error)
  203. return error;
  204. retry:
  205. /* Allocate the transaction, lock the inodes, and join them. */
  206. error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
  207. XFS_TRANS_RES_FDBLKS, &tp);
  208. if (error)
  209. return error;
  210. xfs_exchrange_ilock(tp, ip1, ip2);
  211. trace_xfs_exchrange_before(ip2, 2);
  212. trace_xfs_exchrange_before(ip1, 1);
  213. error = xfs_exchmaps_check_forks(mp, &req);
  214. if (error)
  215. goto out_trans_cancel;
  216. /*
  217. * Reserve ourselves some quota if any of them are in enforcing mode.
  218. * In theory we only need enough to satisfy the change in the number
  219. * of blocks between the two ranges being remapped.
  220. */
  221. error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
  222. if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
  223. xfs_trans_cancel(tp);
  224. xfs_exchrange_iunlock(ip1, ip2);
  225. if (qretry & QRETRY_IP1)
  226. xfs_blockgc_free_quota(ip1, 0);
  227. if (qretry & QRETRY_IP2)
  228. xfs_blockgc_free_quota(ip2, 0);
  229. retried = true;
  230. goto retry;
  231. }
  232. if (error)
  233. goto out_trans_cancel;
  234. /* If we got this far on a dry run, all parameters are ok. */
  235. if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
  236. goto out_trans_cancel;
  237. /* Update the mtime and ctime of both files. */
  238. if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
  239. xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
  240. if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
  241. xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
  242. xfs_exchange_mappings(tp, &req);
  243. /*
  244. * Force the log to persist metadata updates if the caller or the
  245. * administrator requires this. The generic prep function already
  246. * flushed the relevant parts of the page cache.
  247. */
  248. if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
  249. xfs_trans_set_sync(tp);
  250. error = xfs_trans_commit(tp);
  251. trace_xfs_exchrange_after(ip2, 2);
  252. trace_xfs_exchrange_after(ip1, 1);
  253. if (error)
  254. goto out_unlock;
  255. /*
  256. * If the caller wanted us to exchange the contents of two complete
  257. * files of unequal length, exchange the incore sizes now. This should
  258. * be safe because we flushed both files' page caches, exchanged all
  259. * the mappings, and updated the ondisk sizes.
  260. */
  261. if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
  262. loff_t temp;
  263. temp = i_size_read(VFS_I(ip2));
  264. i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
  265. i_size_write(VFS_I(ip1), temp);
  266. }
  267. out_unlock:
  268. xfs_exchrange_iunlock(ip1, ip2);
  269. return error;
  270. out_trans_cancel:
  271. xfs_trans_cancel(tp);
  272. goto out_unlock;
  273. }
  /*
   * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
   * This part deals with struct file objects and byte ranges and does not deal
   * with XFS-specific data structures such as xfs_inodes and block ranges.  This
   * separation may some day facilitate porting to another filesystem.
   *
   * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
   * file1 with the same number of bytes starting at fxr.file2_offset in file2.
   * Implementations must call xfs_exchange_range_prep to prepare the two
   * files prior to taking locks; and they must update the inode change and mod
   * times of both files as part of the metadata update.  The timestamp update
   * and freshness checks must be done atomically as part of the data exchange
   * operation to ensure correctness of the freshness check.
   * xfs_exchange_range_finish must be called after the operation completes
   * successfully but before locks are dropped.
   */
  290. /*
  291. * Performs necessary checks before doing a range exchange, having stabilized
  292. * mutable inode attributes via i_rwsem.
  293. */
  294. static inline int
  295. xfs_exchange_range_checks(
  296. struct xfs_exchrange *fxr,
  297. unsigned int alloc_unit)
  298. {
  299. struct inode *inode1 = file_inode(fxr->file1);
  300. loff_t size1 = i_size_read(inode1);
  301. struct inode *inode2 = file_inode(fxr->file2);
  302. loff_t size2 = i_size_read(inode2);
  303. uint64_t allocmask = alloc_unit - 1;
  304. int64_t test_len;
  305. uint64_t blen;
  306. loff_t tmp;
  307. int error;
  308. /* Don't touch certain kinds of inodes */
  309. if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
  310. return -EPERM;
  311. if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
  312. return -ETXTBSY;
  313. /* Ranges cannot start after EOF. */
  314. if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
  315. return -EINVAL;
  316. if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
  317. /*
  318. * If the caller said to exchange to EOF, we set the length of
  319. * the request large enough to cover everything to the end of
  320. * both files.
  321. */
  322. fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
  323. size2 - fxr->file2_offset);
  324. } else {
  325. /*
  326. * Otherwise we require both ranges to end within EOF.
  327. */
  328. if (fxr->file1_offset + fxr->length > size1 ||
  329. fxr->file2_offset + fxr->length > size2)
  330. return -EINVAL;
  331. }
  332. /*
  333. * The start of both ranges must be aligned to the file allocation
  334. * unit.
  335. */
  336. if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
  337. !IS_ALIGNED(fxr->file2_offset, alloc_unit))
  338. return -EINVAL;
  339. /* Ensure offsets don't wrap. */
  340. if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
  341. check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
  342. return -EINVAL;
  343. /*
  344. * Make sure we don't hit any file size limits. If we hit any size
  345. * limits such that test_length was adjusted, we abort the whole
  346. * operation.
  347. */
  348. test_len = fxr->length;
  349. error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
  350. &test_len);
  351. if (error)
  352. return error;
  353. error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
  354. &test_len);
  355. if (error)
  356. return error;
  357. if (test_len != fxr->length)
  358. return -EINVAL;
  359. /*
  360. * If the user wanted us to exchange up to the infile's EOF, round up
  361. * to the next allocation unit boundary for this check. Do the same
  362. * for the outfile.
  363. *
  364. * Otherwise, reject the range length if it's not aligned to an
  365. * allocation unit.
  366. */
  367. if (fxr->file1_offset + fxr->length == size1)
  368. blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
  369. else if (fxr->file2_offset + fxr->length == size2)
  370. blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
  371. else if (!IS_ALIGNED(fxr->length, alloc_unit))
  372. return -EINVAL;
  373. else
  374. blen = fxr->length;
  375. /* Don't allow overlapped exchanges within the same file. */
  376. if (inode1 == inode2 &&
  377. fxr->file2_offset + blen > fxr->file1_offset &&
  378. fxr->file1_offset + blen > fxr->file2_offset)
  379. return -EINVAL;
  380. /*
  381. * Ensure that we don't exchange a partial EOF block into the middle of
  382. * another file.
  383. */
  384. if ((fxr->length & allocmask) == 0)
  385. return 0;
  386. blen = fxr->length;
  387. if (fxr->file2_offset + blen < size2)
  388. blen &= ~allocmask;
  389. if (fxr->file1_offset + blen < size1)
  390. blen &= ~allocmask;
  391. return blen == fxr->length ? 0 : -EINVAL;
  392. }
  393. /*
  394. * Check that the two inodes are eligible for range exchanges, the ranges make
  395. * sense, and then flush all dirty data. Caller must ensure that the inodes
  396. * have been locked against any other modifications.
  397. */
  398. static inline int
  399. xfs_exchange_range_prep(
  400. struct xfs_exchrange *fxr,
  401. unsigned int alloc_unit)
  402. {
  403. struct inode *inode1 = file_inode(fxr->file1);
  404. struct inode *inode2 = file_inode(fxr->file2);
  405. bool same_inode = (inode1 == inode2);
  406. int error;
  407. /* Check that we don't violate system file offset limits. */
  408. error = xfs_exchange_range_checks(fxr, alloc_unit);
  409. if (error || fxr->length == 0)
  410. return error;
  411. /* Wait for the completion of any pending IOs on both files */
  412. inode_dio_wait(inode1);
  413. if (!same_inode)
  414. inode_dio_wait(inode2);
  415. error = filemap_write_and_wait_range(inode1->i_mapping,
  416. fxr->file1_offset,
  417. fxr->file1_offset + fxr->length - 1);
  418. if (error)
  419. return error;
  420. error = filemap_write_and_wait_range(inode2->i_mapping,
  421. fxr->file2_offset,
  422. fxr->file2_offset + fxr->length - 1);
  423. if (error)
  424. return error;
  425. /*
  426. * If the files or inodes involved require synchronous writes, amend
  427. * the request to force the filesystem to flush all data and metadata
  428. * to disk after the operation completes.
  429. */
  430. if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
  431. IS_SYNC(inode1) || IS_SYNC(inode2))
  432. fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
  433. return 0;
  434. }
  435. /*
  436. * Finish a range exchange operation, if it was successful. Caller must ensure
  437. * that the inodes are still locked against any other modifications.
  438. */
  439. static inline int
  440. xfs_exchange_range_finish(
  441. struct xfs_exchrange *fxr)
  442. {
  443. int error;
  444. error = file_remove_privs(fxr->file1);
  445. if (error)
  446. return error;
  447. if (file_inode(fxr->file1) == file_inode(fxr->file2))
  448. return 0;
  449. return file_remove_privs(fxr->file2);
  450. }
  451. /*
  452. * Check the alignment of an exchange request when the allocation unit size
  453. * isn't a power of two. The generic file-level helpers use (fast)
  454. * bitmask-based alignment checks, but here we have to use slow long division.
  455. */
  456. static int
  457. xfs_exchrange_check_rtalign(
  458. const struct xfs_exchrange *fxr,
  459. struct xfs_inode *ip1,
  460. struct xfs_inode *ip2,
  461. unsigned int alloc_unit)
  462. {
  463. uint64_t length = fxr->length;
  464. uint64_t blen;
  465. loff_t size1, size2;
  466. size1 = i_size_read(VFS_I(ip1));
  467. size2 = i_size_read(VFS_I(ip2));
  468. /* The start of both ranges must be aligned to a rt extent. */
  469. if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
  470. !isaligned_64(fxr->file2_offset, alloc_unit))
  471. return -EINVAL;
  472. if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
  473. length = max_t(int64_t, size1 - fxr->file1_offset,
  474. size2 - fxr->file2_offset);
  475. /*
  476. * If the user wanted us to exchange up to the infile's EOF, round up
  477. * to the next rt extent boundary for this check. Do the same for the
  478. * outfile.
  479. *
  480. * Otherwise, reject the range length if it's not rt extent aligned.
  481. * We already confirmed the starting offsets' rt extent block
  482. * alignment.
  483. */
  484. if (fxr->file1_offset + length == size1)
  485. blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
  486. else if (fxr->file2_offset + length == size2)
  487. blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
  488. else if (!isaligned_64(length, alloc_unit))
  489. return -EINVAL;
  490. else
  491. blen = length;
  492. /* Don't allow overlapped exchanges within the same file. */
  493. if (ip1 == ip2 &&
  494. fxr->file2_offset + blen > fxr->file1_offset &&
  495. fxr->file1_offset + blen > fxr->file2_offset)
  496. return -EINVAL;
  497. /*
  498. * Ensure that we don't exchange a partial EOF rt extent into the
  499. * middle of another file.
  500. */
  501. if (isaligned_64(length, alloc_unit))
  502. return 0;
  503. blen = length;
  504. if (fxr->file2_offset + length < size2)
  505. blen = rounddown_64(blen, alloc_unit);
  506. if (fxr->file1_offset + blen < size1)
  507. blen = rounddown_64(blen, alloc_unit);
  508. return blen == length ? 0 : -EINVAL;
  509. }
  510. /* Prepare two files to have their data exchanged. */
  511. STATIC int
  512. xfs_exchrange_prep(
  513. struct xfs_exchrange *fxr,
  514. struct xfs_inode *ip1,
  515. struct xfs_inode *ip2)
  516. {
  517. struct xfs_mount *mp = ip2->i_mount;
  518. unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
  519. int error;
  520. trace_xfs_exchrange_prep(fxr, ip1, ip2);
  521. /* Verify both files are either real-time or non-realtime */
  522. if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
  523. return -EINVAL;
  524. /* Check non-power of two alignment issues, if necessary. */
  525. if (!is_power_of_2(alloc_unit)) {
  526. error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
  527. if (error)
  528. return error;
  529. /*
  530. * Do the generic file-level checks with the regular block
  531. * alignment.
  532. */
  533. alloc_unit = mp->m_sb.sb_blocksize;
  534. }
  535. error = xfs_exchange_range_prep(fxr, alloc_unit);
  536. if (error || fxr->length == 0)
  537. return error;
  538. if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
  539. error = xfs_exchrange_check_freshness(fxr, ip2);
  540. if (error)
  541. return error;
  542. }
  543. /* Attach dquots to both inodes before changing block maps. */
  544. error = xfs_qm_dqattach(ip2);
  545. if (error)
  546. return error;
  547. error = xfs_qm_dqattach(ip1);
  548. if (error)
  549. return error;
  550. trace_xfs_exchrange_flush(fxr, ip1, ip2);
  551. /* Flush the relevant ranges of both files. */
  552. error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
  553. if (error)
  554. return error;
  555. error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
  556. if (error)
  557. return error;
  558. /*
  559. * Cancel CoW fork preallocations for the ranges of both files. The
  560. * prep function should have flushed all the dirty data, so the only
  561. * CoW mappings remaining should be speculative.
  562. */
  563. if (xfs_inode_has_cow_data(ip1)) {
  564. error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
  565. fxr->length, true);
  566. if (error)
  567. return error;
  568. }
  569. if (xfs_inode_has_cow_data(ip2)) {
  570. error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
  571. fxr->length, true);
  572. if (error)
  573. return error;
  574. }
  575. return 0;
  576. }
  577. /*
  578. * Exchange contents of files. This is the binding between the generic
  579. * file-level concepts and the XFS inode-specific implementation.
  580. */
  581. STATIC int
  582. xfs_exchrange_contents(
  583. struct xfs_exchrange *fxr)
  584. {
  585. struct inode *inode1 = file_inode(fxr->file1);
  586. struct inode *inode2 = file_inode(fxr->file2);
  587. struct xfs_inode *ip1 = XFS_I(inode1);
  588. struct xfs_inode *ip2 = XFS_I(inode2);
  589. struct xfs_mount *mp = ip1->i_mount;
  590. int error;
  591. if (!xfs_has_exchange_range(mp))
  592. return -EOPNOTSUPP;
  593. if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
  594. XFS_EXCHANGE_RANGE_PRIV_FLAGS))
  595. return -EINVAL;
  596. if (xfs_is_shutdown(mp))
  597. return -EIO;
  598. /* Lock both files against IO */
  599. error = xfs_ilock2_io_mmap(ip1, ip2);
  600. if (error)
  601. goto out_err;
  602. /* Prepare and then exchange file contents. */
  603. error = xfs_exchrange_prep(fxr, ip1, ip2);
  604. if (error)
  605. goto out_unlock;
  606. error = xfs_exchrange_mappings(fxr, ip1, ip2);
  607. if (error)
  608. goto out_unlock;
  609. /*
  610. * Finish the exchange by removing special file privileges like any
  611. * other file write would do. This may involve turning on support for
  612. * logged xattrs if either file has security capabilities.
  613. */
  614. error = xfs_exchange_range_finish(fxr);
  615. if (error)
  616. goto out_unlock;
  617. out_unlock:
  618. xfs_iunlock2_io_mmap(ip1, ip2);
  619. out_err:
  620. if (error)
  621. trace_xfs_exchrange_error(ip2, error, _RET_IP_);
  622. return error;
  623. }
  624. /* Exchange parts of two files. */
  625. static int
  626. xfs_exchange_range(
  627. struct xfs_exchrange *fxr)
  628. {
  629. struct inode *inode1 = file_inode(fxr->file1);
  630. struct inode *inode2 = file_inode(fxr->file2);
  631. loff_t check_len = fxr->length;
  632. int ret;
  633. BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
  634. XFS_EXCHANGE_RANGE_PRIV_FLAGS);
  635. /* Both files must be on the same mount/filesystem. */
  636. if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
  637. return -EXDEV;
  638. if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
  639. __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
  640. return -EINVAL;
  641. /* Userspace requests only honored for regular files. */
  642. if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
  643. return -EISDIR;
  644. if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
  645. return -EINVAL;
  646. /* Both files must be opened for read and write. */
  647. if (!(fxr->file1->f_mode & FMODE_READ) ||
  648. !(fxr->file1->f_mode & FMODE_WRITE) ||
  649. !(fxr->file2->f_mode & FMODE_READ) ||
  650. !(fxr->file2->f_mode & FMODE_WRITE))
  651. return -EBADF;
  652. /* Neither file can be opened append-only. */
  653. if ((fxr->file1->f_flags & O_APPEND) ||
  654. (fxr->file2->f_flags & O_APPEND))
  655. return -EBADF;
  656. /*
  657. * If we're exchanging to EOF we can't calculate the length until taking
  658. * the iolock. Pass a 0 length to remap_verify_area similar to the
  659. * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
  660. */
  661. if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
  662. check_len = 0;
  663. ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
  664. if (ret)
  665. return ret;
  666. ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
  667. if (ret)
  668. return ret;
  669. /* Update cmtime if the fd/inode don't forbid it. */
  670. if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
  671. fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
  672. if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
  673. fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
  674. file_start_write(fxr->file2);
  675. ret = xfs_exchrange_contents(fxr);
  676. file_end_write(fxr->file2);
  677. if (ret)
  678. return ret;
  679. fsnotify_modify(fxr->file1);
  680. if (fxr->file2 != fxr->file1)
  681. fsnotify_modify(fxr->file2);
  682. return 0;
  683. }
  684. /* Collect exchange-range arguments from userspace. */
  685. long
  686. xfs_ioc_exchange_range(
  687. struct file *file,
  688. struct xfs_exchange_range __user *argp)
  689. {
  690. struct xfs_exchrange fxr = {
  691. .file2 = file,
  692. };
  693. struct xfs_exchange_range args;
  694. struct fd file1;
  695. int error;
  696. if (copy_from_user(&args, argp, sizeof(args)))
  697. return -EFAULT;
  698. if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
  699. return -EINVAL;
  700. if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
  701. return -EINVAL;
  702. fxr.file1_offset = args.file1_offset;
  703. fxr.file2_offset = args.file2_offset;
  704. fxr.length = args.length;
  705. fxr.flags = args.flags;
  706. file1 = fdget(args.file1_fd);
  707. if (!fd_file(file1))
  708. return -EBADF;
  709. fxr.file1 = fd_file(file1);
  710. error = xfs_exchange_range(&fxr);
  711. fdput(file1);
  712. return error;
  713. }
  714. /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
  715. struct xfs_commit_range_fresh {
  716. xfs_fsid_t fsid; /* m_fixedfsid */
  717. __u64 file2_ino; /* inode number */
  718. __s64 file2_mtime; /* modification time */
  719. __s64 file2_ctime; /* change time */
  720. __s32 file2_mtime_nsec; /* mod time, nsec */
  721. __s32 file2_ctime_nsec; /* change time, nsec */
  722. __u32 file2_gen; /* inode generation */
  723. __u32 magic; /* zero */
  724. };
  725. #define XCR_FRESH_MAGIC 0x444F524B /* DORK */
  726. /* Set up a commitrange operation by sampling file2's write-related attrs */
  727. long
  728. xfs_ioc_start_commit(
  729. struct file *file,
  730. struct xfs_commit_range __user *argp)
  731. {
  732. struct xfs_commit_range args = { };
  733. struct timespec64 ts;
  734. struct xfs_commit_range_fresh *kern_f;
  735. struct xfs_commit_range_fresh __user *user_f;
  736. struct inode *inode2 = file_inode(file);
  737. struct xfs_inode *ip2 = XFS_I(inode2);
  738. const unsigned int lockflags = XFS_IOLOCK_SHARED |
  739. XFS_MMAPLOCK_SHARED |
  740. XFS_ILOCK_SHARED;
  741. BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
  742. sizeof(args.file2_freshness));
  743. kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
  744. memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
  745. xfs_ilock(ip2, lockflags);
  746. ts = inode_get_ctime(inode2);
  747. kern_f->file2_ctime = ts.tv_sec;
  748. kern_f->file2_ctime_nsec = ts.tv_nsec;
  749. ts = inode_get_mtime(inode2);
  750. kern_f->file2_mtime = ts.tv_sec;
  751. kern_f->file2_mtime_nsec = ts.tv_nsec;
  752. kern_f->file2_ino = ip2->i_ino;
  753. kern_f->file2_gen = inode2->i_generation;
  754. kern_f->magic = XCR_FRESH_MAGIC;
  755. xfs_iunlock(ip2, lockflags);
  756. user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
  757. if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
  758. return -EFAULT;
  759. return 0;
  760. }
  761. /*
  762. * Exchange file1 and file2 contents if file2 has not been written since the
  763. * start commit operation.
  764. */
  765. long
  766. xfs_ioc_commit_range(
  767. struct file *file,
  768. struct xfs_commit_range __user *argp)
  769. {
  770. struct xfs_exchrange fxr = {
  771. .file2 = file,
  772. };
  773. struct xfs_commit_range args;
  774. struct xfs_commit_range_fresh *kern_f;
  775. struct xfs_inode *ip2 = XFS_I(file_inode(file));
  776. struct xfs_mount *mp = ip2->i_mount;
  777. struct fd file1;
  778. int error;
  779. kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
  780. if (copy_from_user(&args, argp, sizeof(args)))
  781. return -EFAULT;
  782. if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
  783. return -EINVAL;
  784. if (kern_f->magic != XCR_FRESH_MAGIC)
  785. return -EBUSY;
  786. if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
  787. return -EBUSY;
  788. fxr.file1_offset = args.file1_offset;
  789. fxr.file2_offset = args.file2_offset;
  790. fxr.length = args.length;
  791. fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
  792. fxr.file2_ino = kern_f->file2_ino;
  793. fxr.file2_gen = kern_f->file2_gen;
  794. fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
  795. fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
  796. fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
  797. fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
  798. file1 = fdget(args.file1_fd);
  799. if (fd_empty(file1))
  800. return -EBADF;
  801. fxr.file1 = fd_file(file1);
  802. error = xfs_exchange_range(&fxr);
  803. fdput(file1);
  804. return error;
  805. }