ordered-data.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

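/*
 * Return the end byte (exclusive) of an ordered extent, clamped to (u64)-1
 * if file_offset + len would overflow.
 */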
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

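/*
 * Helper to panic the filesystem when a duplicate ordered extent is found
 * during insertion into the per-inode tree.
 */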
static void ordered_data_tree_panic(struct inode *inode, int errno,
				    u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	btrfs_panic(fs_info, errno,
		    "Inconsistency in ordered tree at offset %llu", offset);
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

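/*
 * Check whether the byte range [file_offset, file_offset + len) overlaps the
 * range covered by the ordered extent; returns 1 on overlap, 0 otherwise.
 */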
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

/*
 * find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int dio, int compress_type)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;

	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
	entry->disk_len = disk_len;
	entry->bytes_left = len;
	entry->inode = igrab(inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
		set_bit(type, &entry->flags);

	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

	/* one ref for the tree */
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->trans_list);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
	spin_unlock(&BTRFS_I(inode)->lock);

	return 0;
}

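/*
 * The three helpers below are thin wrappers around __btrfs_add_ordered_extent
 * for regular buffered writes, direct IO writes and compressed writes.
 */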
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiples.
 */
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO may span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete. This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size, int uptodate)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	unsigned long flags;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
			   dec_start, dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		btrfs_crit(fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, to_dec);
	}
	entry->bytes_left -= to_dec;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO should not span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
	}

	if (io_size > entry->bytes_left) {
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);
	}
	entry->bytes_left -= io_size;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->log_list));
		ASSERT(list_empty(&entry->trans_list));
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * remove an ordered extent from the tree. No references are dropped
 * and waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	struct btrfs_root *root = btrfs_inode->root;
	struct rb_node *node;
	bool dec_pending_ordered = false;

	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
		btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);

	tree = &btrfs_inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
		dec_pending_ordered = true;
	spin_unlock_irq(&tree->lock);

	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (dec_pending_ordered) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
}

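/*
 * Work item callback used by btrfs_wait_ordered_extents(): wait for the
 * ordered extent to finish and then signal its completion.
 */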
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->start ||
		    ordered->start + ordered->disk_len <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_flush_delalloc_helper,
				btrfs_run_ordered_extent_work, NULL, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

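/*
 * Wait for up to 'nr' ordered extents in the given range across all roots
 * that currently have ordered extents pending.
 */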
u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			     const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	struct list_head splice;
	u64 total_done = 0;
	u64 done;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_fs_root(root);
		total_done += done;

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX) {
			nr -= done;
		}
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);

	return total_done;
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				struct btrfs_ordered_extent *entry,
				int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * pages in the range can be dirty, clean or writeback. We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (wait) {
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->len <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(inode, ordered, 1);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

/*
 * find an ordered extent corresponding to file_offset. return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		refcount_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size. i_size is updated to cover any fully written part of the file.
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
	u64 i_size = i_size_read(inode);
	struct rb_node *node;
	struct rb_node *prev = NULL;
	struct btrfs_ordered_extent *test;
	int ret = 1;
	u64 orig_offset = offset;

	spin_lock_irq(&tree->lock);
	if (ordered) {
		offset = entry_end(ordered);
		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
			offset = min(offset,
				     ordered->file_offset +
				     ordered->truncated_len);
	} else {
		offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
	}
	disk_i_size = BTRFS_I(inode)->disk_i_size;

	/*
	 * truncate file.
	 * If ordered is not NULL, then this is called from endio and
	 * disk_i_size will be updated by either truncate itself or any
	 * in-flight IOs which are inside the disk_i_size.
	 *
	 * Because btrfs_setsize() may set i_size with disk_i_size if truncate
	 * fails somehow, we need to make sure we have a precise disk_i_size by
	 * updating it as usual.
	 */
	if (!ordered && disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = orig_offset;
		ret = 0;
		goto out;
	}

	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct covering this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
	for (; node; node = rb_prev(node)) {
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		/* We treat this entry as if it doesn't exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;

		if (entry_end(test) <= disk_i_size)
			break;
		if (test->file_offset >= i_size)
			break;

		/*
		 * We don't update disk_i_size now, so record this pending
		 * i_size, otherwise we will not know the real i_size.
		 */
		if (test->outstanding_isize < offset)
			test->outstanding_isize = offset;
		if (ordered &&
		    ordered->outstanding_isize > test->outstanding_isize)
			test->outstanding_isize = ordered->outstanding_isize;
		goto out;
	}
	new_i_size = min_t(u64, offset, i_size);

	/*
	 * Some ordered extents may have completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
	 */
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
	BTRFS_I(inode)->disk_i_size = new_i_size;
	ret = 0;
out:
	/*
	 * We need to do this because we can't remove ordered extents until
	 * after the disk_i_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
	 */
	if (ordered)
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
	return ret;
}

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum. This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u32 *sum, int len)
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = btrfs_inode_sectorsize(inode);
	int index = 0;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 0;

	spin_lock_irq(&tree->lock);
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i,
			       num_sectors);
			index += (int)num_sectors;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
		}
	}
out:
	spin_unlock_irq(&tree->lock);
	btrfs_put_ordered_extent(ordered);
	return index;
}

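/*
 * Create and destroy the slab cache used for struct btrfs_ordered_extent
 * allocations, called at module init and exit.
 */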
int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}