remap_range.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. #include <linux/slab.h>
  3. #include <linux/stat.h>
  4. #include <linux/sched/xacct.h>
  5. #include <linux/fcntl.h>
  6. #include <linux/file.h>
  7. #include <linux/uio.h>
  8. #include <linux/fsnotify.h>
  9. #include <linux/security.h>
  10. #include <linux/export.h>
  11. #include <linux/syscalls.h>
  12. #include <linux/pagemap.h>
  13. #include <linux/splice.h>
  14. #include <linux/compat.h>
  15. #include <linux/mount.h>
  16. #include <linux/fs.h>
  17. #include <linux/dax.h>
  18. #include <linux/overflow.h>
  19. #include "internal.h"
  20. #include <linux/uaccess.h>
  21. #include <asm/unistd.h>
  22. /*
  23. * Performs necessary checks before doing a clone.
  24. *
  25. * Can adjust amount of bytes to clone via @req_count argument.
  26. * Returns appropriate error code that caller should return or
  27. * zero in case the clone should be allowed.
  28. */
  29. static int generic_remap_checks(struct file *file_in, loff_t pos_in,
  30. struct file *file_out, loff_t pos_out,
  31. loff_t *req_count, unsigned int remap_flags)
  32. {
  33. struct inode *inode_in = file_in->f_mapping->host;
  34. struct inode *inode_out = file_out->f_mapping->host;
  35. uint64_t count = *req_count;
  36. uint64_t bcount;
  37. loff_t size_in, size_out;
  38. loff_t bs = inode_out->i_sb->s_blocksize;
  39. int ret;
  40. /* The start of both ranges must be aligned to an fs block. */
  41. if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
  42. return -EINVAL;
  43. /* Ensure offsets don't wrap. */
  44. if (pos_in + count < pos_in || pos_out + count < pos_out)
  45. return -EINVAL;
  46. size_in = i_size_read(inode_in);
  47. size_out = i_size_read(inode_out);
  48. /* Dedupe requires both ranges to be within EOF. */
  49. if ((remap_flags & REMAP_FILE_DEDUP) &&
  50. (pos_in >= size_in || pos_in + count > size_in ||
  51. pos_out >= size_out || pos_out + count > size_out))
  52. return -EINVAL;
  53. /* Ensure the infile range is within the infile. */
  54. if (pos_in >= size_in)
  55. return -EINVAL;
  56. count = min(count, size_in - (uint64_t)pos_in);
  57. ret = generic_write_check_limits(file_out, pos_out, &count);
  58. if (ret)
  59. return ret;
  60. /*
  61. * If the user wanted us to link to the infile's EOF, round up to the
  62. * next block boundary for this check.
  63. *
  64. * Otherwise, make sure the count is also block-aligned, having
  65. * already confirmed the starting offsets' block alignment.
  66. */
  67. if (pos_in + count == size_in &&
  68. (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
  69. bcount = ALIGN(size_in, bs) - pos_in;
  70. } else {
  71. if (!IS_ALIGNED(count, bs))
  72. count = ALIGN_DOWN(count, bs);
  73. bcount = count;
  74. }
  75. /* Don't allow overlapped cloning within the same file. */
  76. if (inode_in == inode_out &&
  77. pos_out + bcount > pos_in &&
  78. pos_out < pos_in + bcount)
  79. return -EINVAL;
  80. /*
  81. * We shortened the request but the caller can't deal with that, so
  82. * bounce the request back to userspace.
  83. */
  84. if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
  85. return -EINVAL;
  86. *req_count = count;
  87. return 0;
  88. }
  89. int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write)
  90. {
  91. int mask = write ? MAY_WRITE : MAY_READ;
  92. loff_t tmp;
  93. int ret;
  94. if (unlikely(pos < 0 || len < 0))
  95. return -EINVAL;
  96. if (unlikely(check_add_overflow(pos, len, &tmp)))
  97. return -EINVAL;
  98. ret = security_file_permission(file, mask);
  99. if (ret)
  100. return ret;
  101. return fsnotify_file_area_perm(file, mask, &pos, len);
  102. }
  103. EXPORT_SYMBOL_GPL(remap_verify_area);
  104. /*
  105. * Ensure that we don't remap a partial EOF block in the middle of something
  106. * else. Assume that the offsets have already been checked for block
  107. * alignment.
  108. *
  109. * For clone we only link a partial EOF block above or at the destination file's
  110. * EOF. For deduplication we accept a partial EOF block only if it ends at the
  111. * destination file's EOF (can not link it into the middle of a file).
  112. *
  113. * Shorten the request if possible.
  114. */
  115. static int generic_remap_check_len(struct inode *inode_in,
  116. struct inode *inode_out,
  117. loff_t pos_out,
  118. loff_t *len,
  119. unsigned int remap_flags)
  120. {
  121. u64 blkmask = i_blocksize(inode_in) - 1;
  122. loff_t new_len = *len;
  123. if ((*len & blkmask) == 0)
  124. return 0;
  125. if (pos_out + *len < i_size_read(inode_out))
  126. new_len &= ~blkmask;
  127. if (new_len == *len)
  128. return 0;
  129. if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
  130. *len = new_len;
  131. return 0;
  132. }
  133. return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
  134. }
  135. /* Read a page's worth of file data into the page cache. */
  136. static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
  137. {
  138. return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
  139. }
  140. /*
  141. * Lock two folios, ensuring that we lock in offset order if the folios
  142. * are from the same file.
  143. */
  144. static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2)
  145. {
  146. /* Always lock in order of increasing index. */
  147. if (folio1->index > folio2->index)
  148. swap(folio1, folio2);
  149. folio_lock(folio1);
  150. if (folio1 != folio2)
  151. folio_lock(folio2);
  152. }
  /*
   * Release the locks of two folios; when both arguments are the same
   * folio it is unlocked exactly once.
   */
  static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
  {
      folio_unlock(folio1);
      if (folio2 == folio1)
          return;
      folio_unlock(folio2);
  }
  160. /*
  161. * Compare extents of two files to see if they are the same.
  162. * Caller must have locked both inodes to prevent write races.
  163. */
  164. static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
  165. struct file *dest, loff_t dstoff,
  166. loff_t len, bool *is_same)
  167. {
  168. bool same = true;
  169. int error = -EINVAL;
  170. while (len) {
  171. struct folio *src_folio, *dst_folio;
  172. void *src_addr, *dst_addr;
  173. loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff),
  174. PAGE_SIZE - offset_in_page(dstoff));
  175. cmp_len = min(cmp_len, len);
  176. if (cmp_len <= 0)
  177. goto out_error;
  178. src_folio = vfs_dedupe_get_folio(src, srcoff);
  179. if (IS_ERR(src_folio)) {
  180. error = PTR_ERR(src_folio);
  181. goto out_error;
  182. }
  183. dst_folio = vfs_dedupe_get_folio(dest, dstoff);
  184. if (IS_ERR(dst_folio)) {
  185. error = PTR_ERR(dst_folio);
  186. folio_put(src_folio);
  187. goto out_error;
  188. }
  189. vfs_lock_two_folios(src_folio, dst_folio);
  190. /*
  191. * Now that we've locked both folios, make sure they're still
  192. * mapped to the file data we're interested in. If not,
  193. * someone is invalidating pages on us and we lose.
  194. */
  195. if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
  196. src_folio->mapping != src->f_mapping ||
  197. dst_folio->mapping != dest->f_mapping) {
  198. same = false;
  199. goto unlock;
  200. }
  201. src_addr = kmap_local_folio(src_folio,
  202. offset_in_folio(src_folio, srcoff));
  203. dst_addr = kmap_local_folio(dst_folio,
  204. offset_in_folio(dst_folio, dstoff));
  205. flush_dcache_folio(src_folio);
  206. flush_dcache_folio(dst_folio);
  207. if (memcmp(src_addr, dst_addr, cmp_len))
  208. same = false;
  209. kunmap_local(dst_addr);
  210. kunmap_local(src_addr);
  211. unlock:
  212. vfs_unlock_two_folios(src_folio, dst_folio);
  213. folio_put(dst_folio);
  214. folio_put(src_folio);
  215. if (!same)
  216. break;
  217. srcoff += cmp_len;
  218. dstoff += cmp_len;
  219. len -= cmp_len;
  220. }
  221. *is_same = same;
  222. return 0;
  223. out_error:
  224. return error;
  225. }
  226. /*
  227. * Check that the two inodes are eligible for cloning, the ranges make
  228. * sense, and then flush all dirty data. Caller must ensure that the
  229. * inodes have been locked against any other modifications.
  230. *
  231. * If there's an error, then the usual negative error code is returned.
  232. * Otherwise returns 0 with *len set to the request length.
  233. */
  234. int
  235. __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  236. struct file *file_out, loff_t pos_out,
  237. loff_t *len, unsigned int remap_flags,
  238. const struct iomap_ops *dax_read_ops)
  239. {
  240. struct inode *inode_in = file_inode(file_in);
  241. struct inode *inode_out = file_inode(file_out);
  242. bool same_inode = (inode_in == inode_out);
  243. int ret;
  244. /* Don't touch certain kinds of inodes */
  245. if (IS_IMMUTABLE(inode_out))
  246. return -EPERM;
  247. if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
  248. return -ETXTBSY;
  249. /* Don't reflink dirs, pipes, sockets... */
  250. if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
  251. return -EISDIR;
  252. if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
  253. return -EINVAL;
  254. /* Zero length dedupe exits immediately; reflink goes to EOF. */
  255. if (*len == 0) {
  256. loff_t isize = i_size_read(inode_in);
  257. if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
  258. return 0;
  259. if (pos_in > isize)
  260. return -EINVAL;
  261. *len = isize - pos_in;
  262. if (*len == 0)
  263. return 0;
  264. }
  265. /* Check that we don't violate system file offset limits. */
  266. ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
  267. remap_flags);
  268. if (ret || *len == 0)
  269. return ret;
  270. /* Wait for the completion of any pending IOs on both files */
  271. inode_dio_wait(inode_in);
  272. if (!same_inode)
  273. inode_dio_wait(inode_out);
  274. ret = filemap_write_and_wait_range(inode_in->i_mapping,
  275. pos_in, pos_in + *len - 1);
  276. if (ret)
  277. return ret;
  278. ret = filemap_write_and_wait_range(inode_out->i_mapping,
  279. pos_out, pos_out + *len - 1);
  280. if (ret)
  281. return ret;
  282. /*
  283. * Check that the extents are the same.
  284. */
  285. if (remap_flags & REMAP_FILE_DEDUP) {
  286. bool is_same = false;
  287. if (!IS_DAX(inode_in))
  288. ret = vfs_dedupe_file_range_compare(file_in, pos_in,
  289. file_out, pos_out, *len, &is_same);
  290. else if (dax_read_ops)
  291. ret = dax_dedupe_file_range_compare(inode_in, pos_in,
  292. inode_out, pos_out, *len, &is_same,
  293. dax_read_ops);
  294. else
  295. return -EINVAL;
  296. if (ret)
  297. return ret;
  298. if (!is_same)
  299. return -EBADE;
  300. }
  301. ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
  302. remap_flags);
  303. if (ret || *len == 0)
  304. return ret;
  305. /* If can't alter the file contents, we're done. */
  306. if (!(remap_flags & REMAP_FILE_DEDUP))
  307. ret = file_modified(file_out);
  308. return ret;
  309. }
  310. int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  311. struct file *file_out, loff_t pos_out,
  312. loff_t *len, unsigned int remap_flags)
  313. {
  314. return __generic_remap_file_range_prep(file_in, pos_in, file_out,
  315. pos_out, len, remap_flags, NULL);
  316. }
  317. EXPORT_SYMBOL(generic_remap_file_range_prep);
  318. loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
  319. struct file *file_out, loff_t pos_out,
  320. loff_t len, unsigned int remap_flags)
  321. {
  322. loff_t ret;
  323. WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
  324. if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
  325. return -EXDEV;
  326. ret = generic_file_rw_checks(file_in, file_out);
  327. if (ret < 0)
  328. return ret;
  329. if (!file_in->f_op->remap_file_range)
  330. return -EOPNOTSUPP;
  331. ret = remap_verify_area(file_in, pos_in, len, false);
  332. if (ret)
  333. return ret;
  334. ret = remap_verify_area(file_out, pos_out, len, true);
  335. if (ret)
  336. return ret;
  337. file_start_write(file_out);
  338. ret = file_in->f_op->remap_file_range(file_in, pos_in,
  339. file_out, pos_out, len, remap_flags);
  340. file_end_write(file_out);
  341. if (ret < 0)
  342. return ret;
  343. fsnotify_access(file_in);
  344. fsnotify_modify(file_out);
  345. return ret;
  346. }
  347. EXPORT_SYMBOL(vfs_clone_file_range);
  348. /* Check whether we are allowed to dedupe the destination file */
  349. static bool may_dedupe_file(struct file *file)
  350. {
  351. struct mnt_idmap *idmap = file_mnt_idmap(file);
  352. struct inode *inode = file_inode(file);
  353. if (capable(CAP_SYS_ADMIN))
  354. return true;
  355. if (file->f_mode & FMODE_WRITE)
  356. return true;
  357. if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()))
  358. return true;
  359. if (!inode_permission(idmap, inode, MAY_WRITE))
  360. return true;
  361. return false;
  362. }
  363. loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
  364. struct file *dst_file, loff_t dst_pos,
  365. loff_t len, unsigned int remap_flags)
  366. {
  367. loff_t ret;
  368. WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
  369. REMAP_FILE_CAN_SHORTEN));
  370. /*
  371. * This is redundant if called from vfs_dedupe_file_range(), but other
  372. * callers need it and it's not performance sesitive...
  373. */
  374. ret = remap_verify_area(src_file, src_pos, len, false);
  375. if (ret)
  376. return ret;
  377. ret = remap_verify_area(dst_file, dst_pos, len, true);
  378. if (ret)
  379. return ret;
  380. /*
  381. * This needs to be called after remap_verify_area() because of
  382. * sb_start_write() and before may_dedupe_file() because the mount's
  383. * MAY_WRITE need to be checked with mnt_get_write_access_file() held.
  384. */
  385. ret = mnt_want_write_file(dst_file);
  386. if (ret)
  387. return ret;
  388. ret = -EPERM;
  389. if (!may_dedupe_file(dst_file))
  390. goto out_drop_write;
  391. ret = -EXDEV;
  392. if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb)
  393. goto out_drop_write;
  394. ret = -EISDIR;
  395. if (S_ISDIR(file_inode(dst_file)->i_mode))
  396. goto out_drop_write;
  397. ret = -EINVAL;
  398. if (!dst_file->f_op->remap_file_range)
  399. goto out_drop_write;
  400. if (len == 0) {
  401. ret = 0;
  402. goto out_drop_write;
  403. }
  404. ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
  405. dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
  406. out_drop_write:
  407. mnt_drop_write_file(dst_file);
  408. return ret;
  409. }
  410. EXPORT_SYMBOL(vfs_dedupe_file_range_one);
  411. int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
  412. {
  413. struct file_dedupe_range_info *info;
  414. struct inode *src = file_inode(file);
  415. u64 off;
  416. u64 len;
  417. int i;
  418. int ret;
  419. u16 count = same->dest_count;
  420. loff_t deduped;
  421. if (!(file->f_mode & FMODE_READ))
  422. return -EINVAL;
  423. if (same->reserved1 || same->reserved2)
  424. return -EINVAL;
  425. off = same->src_offset;
  426. len = same->src_length;
  427. if (S_ISDIR(src->i_mode))
  428. return -EISDIR;
  429. if (!S_ISREG(src->i_mode))
  430. return -EINVAL;
  431. if (!file->f_op->remap_file_range)
  432. return -EOPNOTSUPP;
  433. ret = remap_verify_area(file, off, len, false);
  434. if (ret < 0)
  435. return ret;
  436. ret = 0;
  437. if (off + len > i_size_read(src))
  438. return -EINVAL;
  439. /* Arbitrary 1G limit on a single dedupe request, can be raised. */
  440. len = min_t(u64, len, 1 << 30);
  441. /* pre-format output fields to sane values */
  442. for (i = 0; i < count; i++) {
  443. same->info[i].bytes_deduped = 0ULL;
  444. same->info[i].status = FILE_DEDUPE_RANGE_SAME;
  445. }
  446. for (i = 0, info = same->info; i < count; i++, info++) {
  447. struct fd dst_fd = fdget(info->dest_fd);
  448. struct file *dst_file = fd_file(dst_fd);
  449. if (!dst_file) {
  450. info->status = -EBADF;
  451. goto next_loop;
  452. }
  453. if (info->reserved) {
  454. info->status = -EINVAL;
  455. goto next_fdput;
  456. }
  457. deduped = vfs_dedupe_file_range_one(file, off, dst_file,
  458. info->dest_offset, len,
  459. REMAP_FILE_CAN_SHORTEN);
  460. if (deduped == -EBADE)
  461. info->status = FILE_DEDUPE_RANGE_DIFFERS;
  462. else if (deduped < 0)
  463. info->status = deduped;
  464. else
  465. info->bytes_deduped = len;
  466. next_fdput:
  467. fdput(dst_fd);
  468. next_loop:
  469. if (fatal_signal_pending(current))
  470. break;
  471. }
  472. return ret;
  473. }
  474. EXPORT_SYMBOL(vfs_dedupe_file_range);