/* fs/btrfs/direct-io.c */
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/fsverity.h>
  3. #include <linux/iomap.h>
  4. #include "ctree.h"
  5. #include "delalloc-space.h"
  6. #include "direct-io.h"
  7. #include "extent-tree.h"
  8. #include "file.h"
  9. #include "fs.h"
  10. #include "transaction.h"
  11. #include "volumes.h"
/*
 * Per-call direct IO state, handed to the iomap callbacks through
 * iter->private (see btrfs_dio_read()/btrfs_dio_write()).
 */
struct btrfs_dio_data {
	/* Bytes for which bios have actually been submitted so far. */
	ssize_t submitted;
	/* Data space reservation tracking (btrfs_check_data_free_space()). */
	struct extent_changeset *data_reserved;
	/* Ordered extent for the current write range, put in iomap_end. */
	struct btrfs_ordered_extent *ordered;
	/* True once data space was reserved up front for a write. */
	bool data_space_reserved;
	/* True when the write went down the NOCOW path successfully. */
	bool nocow_done;
};
/*
 * Per-bio private data for direct IO, allocated from btrfs_dio_bioset and
 * recovered from the embedded btrfs_bio via container_of().
 */
struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};
/* Bioset backing direct IO bios; wired into btrfs_dio_ops.bio_set below. */
static struct bio_set btrfs_dio_bioset;
/*
 * Lock the file range [lockstart, lockend] for direct IO.
 *
 * Takes the DIO extent lock first and then the regular extent lock, retrying
 * until the range has no ordered extents and (for writes) no buffered pages.
 *
 * Returns 0 with both locks held on success.  Returns -EAGAIN when
 * IOMAP_NOWAIT is set and we would have to block, or -ENOTBLK when the caller
 * must fall back to buffered IO; on any error all locks taken here have been
 * dropped again.
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
			return -EAGAIN;
	} else {
		lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there's no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent. The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		/* Drop the extent lock (not the DIO lock) before waiting/retrying. */
		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	/* On failure also drop the outer DIO lock so no locks remain held. */
	if (ret)
		unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
	return ret;
}
/*
 * Create the IO extent map (except for NOCOW, which writes into an existing
 * extent) and the matching ordered extent for a direct IO write at @start.
 *
 * On success the new ordered extent is stashed in dio_data->ordered (which
 * must be empty on entry) and the extent map (or NULL for NOCOW) is returned.
 * On failure any created extent map is freed and dropped from the inode's
 * extent map range, and an ERR_PTR is returned.
 */
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const struct btrfs_file_extent *file_extent,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = btrfs_create_io_em(inode, start, file_extent, type);
		if (IS_ERR(em))
			goto out;
	}

	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
					     (1U << type) |
					     (1U << BTRFS_ORDERED_DIRECT));
	if (IS_ERR(ordered)) {
		/* Undo the extent map we created above before propagating the error. */
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
						    start + file_extent->num_bytes - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
out:
	return em;
}
/*
 * Allocate a brand new data extent of @len bytes for a COW direct IO write at
 * file offset @start and create its extent map + ordered extent.
 *
 * On zoned filesystems btrfs_reserve_extent() may return -EAGAIN; in that
 * case wait for a zone finish and retry.  On failure the reserved extent is
 * freed again and an ERR_PTR is returned.
 */
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_file_extent file_extent;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
again:
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret == -EAGAIN) {
		/* -EAGAIN here only happens on zoned filesystems. */
		ASSERT(btrfs_is_zoned(fs_info));
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
			       TASK_UNINTERRUPTIBLE);
		goto again;
	}
	if (ret)
		return ERR_PTR(ret);

	file_extent.disk_bytenr = ins.objectid;
	file_extent.disk_num_bytes = ins.offset;
	file_extent.num_bytes = ins.offset;
	file_extent.ram_bytes = ins.offset;
	file_extent.offset = 0;
	file_extent.compression = BTRFS_COMPRESS_NONE;

	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
				     BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);

	return em;
}
/*
 * Set up the extent for a direct IO write over [start, start + *lenp).
 *
 * If the range can be written NOCOW (NODATACOW inode or a PREALLOC extent,
 * confirmed by can_nocow_extent()) only metadata space is reserved and a
 * NOCOW/PREALLOC ordered extent is created over the existing extent.
 * Otherwise a new extent is allocated (COW), which requires data space to
 * have been reserved earlier by the caller (dio_data->data_space_reserved),
 * failing with -ENOSPC if it was not.
 *
 * On success *map and *lenp are updated to describe the extent actually
 * covered, and i_size is extended under the extent lock if the write goes
 * past it.  On most failure paths the input extent map is consumed (freed,
 * with *map set to NULL) and any reservation taken here is released.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct btrfs_file_extent file_extent;
	struct extent_map *em = *map;
	int type;
	u64 block_start;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 *    existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 *
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = extent_map_block_start(em) + (start - em->start);

		if (can_nocow_extent(inode, start, &len,
				     &file_extent, false, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	/* Remember the length before it may get trimmed, for reservation math. */
	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			/* PREALLOC gets a fresh extent map; replace the input one. */
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		/* Return the metadata reserved for the part we won't write. */
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	/* On error, give back whatever metadata reservation we still hold. */
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}
/*
 * iomap_begin callback for direct IO: map [start, start + length) to an
 * iomap the DIO code can issue bios against.
 *
 * Flushes async/compressed writeback if needed, optionally reserves data
 * space for writes, locks the range (DIO lock + extent lock), looks up the
 * extent map and, for writes, creates the ordered extent via
 * btrfs_get_blocks_direct_write().  Compressed/inline extents force a
 * fallback to buffered IO (-ENOTBLK, or -EAGAIN under IOMAP_NOWAIT).
 *
 * On success, writes drop both lock bits here while reads keep
 * EXTENT_DIO_LOCKED until bio completion (see btrfs_dio_end_io()).
 */
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
				 loff_t length, unsigned int flags, struct iomap *iomap,
				 struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	u32 unlock_bits = EXTENT_LOCKED;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages no longer under writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (ret && !(BTRFS_I(inode)->flags &
				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fallback to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fallback to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data then what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			/* Return any data space reserved beyond what we'll write. */
			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	free_extent_map(em);

	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
				  lockend, NULL);

	return 0;

unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}
/*
 * iomap_end callback for direct IO: clean up after the bios (if any) were
 * submitted for [pos, pos + length).
 *
 * For a read from a hole just drops the DIO lock.  If fewer bytes were
 * submitted than mapped, finishes/unlocks the unsubmitted tail and returns
 * -ENOTBLK so the caller falls back to buffered IO for it.  For writes also
 * drops the ordered extent reference and frees the data space changeset.
 */
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			       ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
				  pos + length - 1, NULL);
		return 0;
	}

	if (submitted < length) {
		/* Handle the part of the range no bio was submitted for. */
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
					  pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}
/*
 * Bio completion handler for direct IO bios.
 *
 * Writes finish their ordered extent (marking it failed on IO error); reads
 * drop the DIO lock that btrfs_dio_iomap_begin() left held for the range.
 * Finally hands the bio back to iomap for its own completion handling.
 */
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		unlock_dio_extent(&inode->io_tree, dip->file_offset,
				  dip->file_offset + dip->bytes - 1, NULL);
	}

	/* Restore the private pointer iomap expects before ending the bio. */
	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}
/*
 * Make @bbio's ordered extent match the bio's actual range.
 *
 * If the bio covers the whole ordered extent, just take an extra reference.
 * Otherwise split off the bio-sized front portion (splitting the extent map
 * too, except for NOCOW where we write into a pre-existing extent) and attach
 * the new ordered extent to the bio.  Returns 0 or a negative errno.
 */
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
					struct btrfs_ordered_extent *ordered)
{
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 len = bbio->bio.bi_iter.bi_size;
	struct btrfs_ordered_extent *new;
	int ret;

	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}

	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = split_extent_map(bbio->inode, bbio->file_offset,
				       ordered->num_bytes, len,
				       ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}
/*
 * submit_io callback for iomap direct IO: initialize the btrfs_bio, account
 * the submitted bytes in dio_data, split the ordered extent for partial
 * writes, and submit the bio.  On a split failure the bio is failed and
 * ended through iomap without being submitted.
 */
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
		       btrfs_dio_end_io, bio->bi_private);
	bbio->inode = BTRFS_I(iter->inode);
	bbio->file_offset = file_offset;

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bbio(bbio, 0);
}
/* iomap mapping callbacks used for btrfs direct IO. */
static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};
/* DIO submission hooks: custom bio submission plus our private bioset. */
static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};
  680. static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
  681. size_t done_before)
  682. {
  683. struct btrfs_dio_data data = { 0 };
  684. return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
  685. IOMAP_DIO_PARTIAL, &data, done_before);
  686. }
  687. static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
  688. size_t done_before)
  689. {
  690. struct btrfs_dio_data data = { 0 };
  691. return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
  692. IOMAP_DIO_PARTIAL, &data, done_before);
  693. }
  694. static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
  695. const struct iov_iter *iter, loff_t offset)
  696. {
  697. const u32 blocksize_mask = fs_info->sectorsize - 1;
  698. if (offset & blocksize_mask)
  699. return -EINVAL;
  700. if (iov_iter_alignment(iter) & blocksize_mask)
  701. return -EINVAL;
  702. return 0;
  703. }
/*
 * btrfs_direct_write - handle a direct IO write, with buffered fallback.
 * @iocb: kiocb describing the write (position, flags, target file).
 * @from: iov_iter holding the source data.
 *
 * Performs the write through the iomap DIO path, retrying after faulting in
 * user pages on -EFAULT, and falls back to buffered IO for misaligned
 * requests or when no forward progress is possible. Returns the total number
 * of bytes written (direct plus buffered fallback) or a negative errno.
 */
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	loff_t pos;
	ssize_t written = 0;
	ssize_t written_buffered;
	size_t prev_left = 0;
	loff_t endbyte;
	ssize_t ret;
	unsigned int ilock_flags = 0;
	struct iomap_dio *dio;

	/* In NOWAIT context we must not block waiting for the inode lock. */
	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;

relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, from, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}

	pos = iocb->ki_pos;
	/*
	 * Re-check since file size may have changed just before taking the
	 * lock or pos may have changed because of O_APPEND in generic_write_check()
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}

	/* Misaligned offset or buffers: serve the whole write buffered. */
	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
	from->nofault = true;
	dio = btrfs_dio_write(iocb, from, written);
	from->nofault = false;

	if (IS_ERR_OR_NULL(dio)) {
		/* PTR_ERR_OR_ZERO() maps a NULL dio to ret == 0. */
		ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
		ASSERT(current->journal_info == NULL);
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
		ret = iomap_dio_complete(dio);
		current->journal_info = NULL;
	}

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;

	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);

		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of iov in the previous EFAULT and fallback
		 * to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
	}

	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);

	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fallback to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;

buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	/*
	 * Drop the page cache pages for the written range, so a later direct
	 * IO read is not served stale cached copies.
	 */
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}
  869. static int check_direct_read(struct btrfs_fs_info *fs_info,
  870. const struct iov_iter *iter, loff_t offset)
  871. {
  872. int ret;
  873. int i, seg;
  874. ret = check_direct_IO(fs_info, iter, offset);
  875. if (ret < 0)
  876. return ret;
  877. if (!iter_is_iovec(iter))
  878. return 0;
  879. for (seg = 0; seg < iter->nr_segs; seg++) {
  880. for (i = seg + 1; i < iter->nr_segs; i++) {
  881. const struct iovec *iov1 = iter_iov(iter) + seg;
  882. const struct iovec *iov2 = iter_iov(iter) + i;
  883. if (iov1->iov_base == iov2->iov_base)
  884. return -EINVAL;
  885. }
  886. }
  887. return 0;
  888. }
/*
 * btrfs_direct_read - handle a direct IO read.
 * @iocb: kiocb describing the read (position, flags, source file).
 * @to: iov_iter to fill with the data read.
 *
 * Returns the number of bytes read or a negative errno. Returns 0 without
 * doing any IO for fs-verity files and for requests that fail the alignment
 * checks — presumably so the caller falls back to buffered IO; verify
 * against the caller.
 */
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t prev_left = 0;
	ssize_t read = 0;
	ssize_t ret;

	if (fsverity_active(inode))
		return 0;

	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins despite having set ->nofault
	 * to true of our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during he page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
	pagefault_disable();
	to->nofault = true;
	ret = btrfs_dio_read(iocb, to, read);
	to->nofault = false;
	pagefault_enable();

	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;

	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fallback to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of looping
			 * for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
		}
	}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
	return ret < 0 ? ret : read;
}
  948. int __init btrfs_init_dio(void)
  949. {
  950. if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
  951. offsetof(struct btrfs_dio_private, bbio.bio),
  952. BIOSET_NEED_BVECS))
  953. return -ENOMEM;
  954. return 0;
  955. }
/* Tear down the bio set created by btrfs_init_dio(). */
void __cold btrfs_destroy_dio(void)
{
	bioset_exit(&btrfs_dio_bioset);
}