bio.c 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
  4. */
  5. #include <linux/mm.h>
  6. #include <linux/swap.h>
  7. #include <linux/bio-integrity.h>
  8. #include <linux/blkdev.h>
  9. #include <linux/uio.h>
  10. #include <linux/iocontext.h>
  11. #include <linux/slab.h>
  12. #include <linux/init.h>
  13. #include <linux/kernel.h>
  14. #include <linux/export.h>
  15. #include <linux/mempool.h>
  16. #include <linux/workqueue.h>
  17. #include <linux/cgroup.h>
  18. #include <linux/highmem.h>
  19. #include <linux/blk-crypto.h>
  20. #include <linux/xarray.h>
  21. #include <trace/events/block.h>
  22. #include "blk.h"
  23. #include "blk-rq-qos.h"
  24. #include "blk-cgroup.h"
/* Splice the irq free list into the task-context list once it grows this big. */
#define ALLOC_CACHE_THRESHOLD	16
/* Hard cap on bios held in one CPU's alloc cache (both lists combined). */
#define ALLOC_CACHE_MAX		256

/*
 * Per-cpu cache of recently freed bios, used to avoid mempool round trips
 * for REQ_ALLOC_CACHE allocations.  free_list is only touched from task
 * context; free_list_irq collects bios freed from hardirq context and is
 * drained into free_list with interrupts disabled.
 */
struct bio_alloc_cache {
	struct bio		*free_list;	/* task-context freelist */
	struct bio		*free_list_irq;	/* hardirq-context freelist */
	unsigned int		nr;		/* entries on free_list */
	unsigned int		nr_irq;		/* entries on free_list_irq */
};
/*
 * Slab caches for externally allocated bio_vec arrays, one per size bucket.
 * Requests small enough for the bio's inline vecs never use these; the
 * BIO_MAX_VECS bucket is additionally backed by a mempool (see bvec_alloc()).
 */
static struct biovec_slab {
	int nr_vecs;			/* capacity of one allocation from this slab */
	char *name;			/* slab cache name shown in /proc/slabinfo */
	struct kmem_cache *slab;	/* created at init time */
} bvec_slabs[] __read_mostly = {
	{ .nr_vecs = 16, .name = "biovec-16" },
	{ .nr_vecs = 64, .name = "biovec-64" },
	{ .nr_vecs = 128, .name = "biovec-128" },
	{ .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
};
  43. static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
  44. {
  45. switch (nr_vecs) {
  46. /* smaller bios use inline vecs */
  47. case 5 ... 16:
  48. return &bvec_slabs[0];
  49. case 17 ... 64:
  50. return &bvec_slabs[1];
  51. case 65 ... 128:
  52. return &bvec_slabs[2];
  53. case 129 ... BIO_MAX_VECS:
  54. return &bvec_slabs[3];
  55. default:
  56. BUG();
  57. return NULL;
  58. }
  59. }
/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);
  66. /*
  67. * Our slab pool management
  68. */
/*
 * Our slab pool management.  bio_sets with identical front/back padding
 * share one reference-counted slab, looked up by total object size.
 */
struct bio_slab {
	struct kmem_cache *slab;
	unsigned int slab_ref;		/* number of bio_sets using this slab */
	unsigned int slab_size;		/* object size; also the xarray index */
	char name[8];			/* "bio-<size>" */
};
/* Protects bio_slabs and the slab_ref counts. */
static DEFINE_MUTEX(bio_slab_lock);
/* size -> struct bio_slab lookup table. */
static DEFINE_XARRAY(bio_slabs);
  77. static struct bio_slab *create_bio_slab(unsigned int size)
  78. {
  79. struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
  80. if (!bslab)
  81. return NULL;
  82. snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
  83. bslab->slab = kmem_cache_create(bslab->name, size,
  84. ARCH_KMALLOC_MINALIGN,
  85. SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
  86. if (!bslab->slab)
  87. goto fail_alloc_slab;
  88. bslab->slab_ref = 1;
  89. bslab->slab_size = size;
  90. if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
  91. return bslab;
  92. kmem_cache_destroy(bslab->slab);
  93. fail_alloc_slab:
  94. kfree(bslab);
  95. return NULL;
  96. }
  97. static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
  98. {
  99. return bs->front_pad + sizeof(struct bio) + bs->back_pad;
  100. }
  101. static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
  102. {
  103. unsigned int size = bs_bio_slab_size(bs);
  104. struct bio_slab *bslab;
  105. mutex_lock(&bio_slab_lock);
  106. bslab = xa_load(&bio_slabs, size);
  107. if (bslab)
  108. bslab->slab_ref++;
  109. else
  110. bslab = create_bio_slab(size);
  111. mutex_unlock(&bio_slab_lock);
  112. if (bslab)
  113. return bslab->slab;
  114. return NULL;
  115. }
/*
 * Drop @bs's reference on its shared bio slab; destroy the slab and its
 * bookkeeping once the last bio_set using that size goes away.
 */
static void bio_put_slab(struct bio_set *bs)
{
	struct bio_slab *bslab = NULL;
	unsigned int slab_size = bs_bio_slab_size(bs);

	mutex_lock(&bio_slab_lock);
	bslab = xa_load(&bio_slabs, slab_size);
	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
		goto out;

	/* Sanity: the table entry must match what this bio_set allocated from. */
	WARN_ON_ONCE(bslab->slab != bs->bio_slab);
	WARN_ON(!bslab->slab_ref);

	if (--bslab->slab_ref)
		goto out;		/* other bio_sets still share it */

	xa_erase(&bio_slabs, slab_size);
	kmem_cache_destroy(bslab->slab);
	kfree(bslab);

out:
	mutex_unlock(&bio_slab_lock);
}
/*
 * Free an externally allocated bvec array.  BIO_MAX_VECS arrays came from
 * the mempool (see bvec_alloc()); smaller external arrays came straight
 * from a bvec slab.  nr_vecs <= BIO_INLINE_VECS means the vecs are inline
 * in the bio and there is nothing to free.
 */
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
	BUG_ON(nr_vecs > BIO_MAX_VECS);

	if (nr_vecs == BIO_MAX_VECS)
		mempool_free(bv, pool);
	else if (nr_vecs > BIO_INLINE_VECS)
		kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
}
  142. /*
  143. * Make the first allocation restricted and don't dump info on allocation
  144. * failures, since we'll fall back to the mempool in case of failure.
  145. */
  146. static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
  147. {
  148. return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
  149. __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
  150. }
/*
 * Allocate a bvec array for *@nr_vecs entries.  NOTE: *@nr_vecs is updated
 * in place to the bucket capacity actually allocated; bvec_free() relies on
 * seeing that rounded-up value.
 */
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
		gfp_t gfp_mask)
{
	struct biovec_slab *bvs = biovec_slab(*nr_vecs);

	if (WARN_ON_ONCE(!bvs))
		return NULL;

	/*
	 * Upgrade the nr_vecs request to take full advantage of the allocation.
	 * We also rely on this in the bvec_free path.
	 */
	*nr_vecs = bvs->nr_vecs;

	/*
	 * Try a slab allocation first for all smaller allocations.  If that
	 * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
	 * The mempool is sized to handle up to BIO_MAX_VECS entries.
	 */
	if (*nr_vecs < BIO_MAX_VECS) {
		struct bio_vec *bvl;

		/* Restricted gfp: quiet, no-reclaim first attempt. */
		bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
		if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
			return bvl;
		/* Falling back to the mempool: it hands out max-size arrays. */
		*nr_vecs = BIO_MAX_VECS;
	}

	return mempool_alloc(pool, gfp_mask);
}
/*
 * Release the resources bio_init()/IO attached to @bio: the blkcg
 * reference, integrity payload and crypto context.  Safe to call more
 * than once; each teardown is idempotent or NULL-checked.
 */
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
	if (bio->bi_blkg) {
		blkg_put(bio->bi_blkg);
		bio->bi_blkg = NULL;
	}
#endif
	if (bio_integrity(bio))
		bio_integrity_free(bio);

	bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);
/*
 * Return a bio_set-allocated bio and its external bvec array to their
 * pools.  The mempool object actually starts front_pad bytes before the
 * bio itself, so step back before freeing.
 */
static void bio_free(struct bio *bio)
{
	struct bio_set *bs = bio->bi_pool;
	void *p = bio;

	WARN_ON_ONCE(!bs);

	bio_uninit(bio);
	bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
	mempool_free(p - bs->front_pad, &bs->bio_pool);
}
/*
 * Users of this function have their own bio allocation. Subsequently,
 * they must remember to pair any call to bio_init() with bio_uninit()
 * when IO has completed, or when the bio is released.
 */
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
	      unsigned short max_vecs, blk_opf_t opf)
{
	bio->bi_next = NULL;
	bio->bi_bdev = bdev;
	bio->bi_opf = opf;
	bio->bi_flags = 0;
	bio->bi_ioprio = 0;
	bio->bi_write_hint = 0;
	bio->bi_status = 0;
	bio->bi_iter.bi_sector = 0;
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_idx = 0;
	bio->bi_iter.bi_bvec_done = 0;
	bio->bi_end_io = NULL;
	bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
	bio->bi_blkg = NULL;
	bio->bi_issue.value = 0;
	/* Associate with the current cgroup only once a device is known. */
	if (bdev)
		bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
	bio->bi_iocost_cost = 0;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	bio->bi_crypt_context = NULL;
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
	bio->bi_integrity = NULL;
#endif
	bio->bi_vcnt = 0;

	atomic_set(&bio->__bi_remaining, 1);
	atomic_set(&bio->__bi_cnt, 1);
	bio->bi_cookie = BLK_QC_T_NONE;

	bio->bi_max_vecs = max_vecs;
	bio->bi_io_vec = table;
	bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
/**
 * bio_reset - reinitialize a bio
 * @bio:	bio to reset
 * @bdev:	block device to use the bio for
 * @opf:	operation and flags for bio
 *
 * Description:
 *   After calling bio_reset(), @bio will be in the same state as a freshly
 *   allocated bio returned bio bio_alloc_bioset() - the only fields that are
 *   preserved are the ones that are initialized by bio_alloc_bioset(). See
 *   comment in struct bio.
 */
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
{
	/* Drop held resources before wiping the resettable prefix. */
	bio_uninit(bio);
	memset(bio, 0, BIO_RESET_BYTES);
	atomic_set(&bio->__bi_remaining, 1);
	bio->bi_bdev = bdev;
	if (bio->bi_bdev)
		bio_associate_blkg(bio);
	bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);
  266. static struct bio *__bio_chain_endio(struct bio *bio)
  267. {
  268. struct bio *parent = bio->bi_private;
  269. if (bio->bi_status && !parent->bi_status)
  270. parent->bi_status = bio->bi_status;
  271. bio_put(bio);
  272. return parent;
  273. }
  274. static void bio_chain_endio(struct bio *bio)
  275. {
  276. bio_endio(__bio_chain_endio(bio));
  277. }
/**
 * bio_chain - chain bio completions
 * @bio: the target bio
 * @parent: the parent bio of @bio
 *
 * The caller won't have a bi_end_io called when @bio completes - instead,
 * @parent's bi_end_io won't be called until both @parent and @bio have
 * completed; the chained bio will also be freed when it completes.
 *
 * The caller must not set bi_private or bi_end_io in @bio.
 */
void bio_chain(struct bio *bio, struct bio *parent)
{
	BUG_ON(bio->bi_private || bio->bi_end_io);

	bio->bi_private = parent;
	bio->bi_end_io	= bio_chain_endio;
	/* Parent completion now also waits for @bio. */
	bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);
  297. /**
  298. * bio_chain_and_submit - submit a bio after chaining it to another one
  299. * @prev: bio to chain and submit
  300. * @new: bio to chain to
  301. *
  302. * If @prev is non-NULL, chain it to @new and submit it.
  303. *
  304. * Return: @new.
  305. */
  306. struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
  307. {
  308. if (prev) {
  309. bio_chain(prev, new);
  310. submit_bio(prev);
  311. }
  312. return new;
  313. }
  314. struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
  315. unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
  316. {
  317. return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
  318. }
  319. EXPORT_SYMBOL_GPL(blk_next_bio);
  320. static void bio_alloc_rescue(struct work_struct *work)
  321. {
  322. struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
  323. struct bio *bio;
  324. while (1) {
  325. spin_lock(&bs->rescue_lock);
  326. bio = bio_list_pop(&bs->rescue_list);
  327. spin_unlock(&bs->rescue_lock);
  328. if (!bio)
  329. break;
  330. submit_bio_noacct(bio);
  331. }
  332. }
/*
 * Hand the bios parked on current->bio_list that came from @bs over to
 * the bio_set's rescuer thread, so a subsequent blocking allocation from
 * @bs cannot deadlock on bios we would otherwise only submit after it.
 */
static void punt_bios_to_rescuer(struct bio_set *bs)
{
	struct bio_list punt, nopunt;
	struct bio *bio;

	if (WARN_ON_ONCE(!bs->rescue_workqueue))
		return;
	/*
	 * In order to guarantee forward progress we must punt only bios that
	 * were allocated from this bio_set; otherwise, if there was a bio on
	 * there for a stacking driver higher up in the stack, processing it
	 * could require allocating bios from this bio_set, and doing that from
	 * our own rescuer would be bad.
	 *
	 * Since bio lists are singly linked, pop them all instead of trying to
	 * remove from the middle of the list:
	 */
	bio_list_init(&punt);
	bio_list_init(&nopunt);

	while ((bio = bio_list_pop(&current->bio_list[0])))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
	current->bio_list[0] = nopunt;

	/* Same split for the second (recursion) list. */
	bio_list_init(&nopunt);
	while ((bio = bio_list_pop(&current->bio_list[1])))
		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
	current->bio_list[1] = nopunt;

	spin_lock(&bs->rescue_lock);
	bio_list_merge(&bs->rescue_list, &punt);
	spin_unlock(&bs->rescue_lock);

	queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
/*
 * Move the bios collected on the hardirq freelist onto the task-context
 * freelist.  Interrupts are disabled around the splice so a concurrent
 * hardirq free on this CPU cannot corrupt the lists.
 */
static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
{
	unsigned long flags;

	/* cache->free_list must be empty */
	if (WARN_ON_ONCE(cache->free_list))
		return;

	local_irq_save(flags);
	cache->free_list = cache->free_list_irq;
	cache->free_list_irq = NULL;
	cache->nr += cache->nr_irq;
	cache->nr_irq = 0;
	local_irq_restore(flags);
}
/*
 * Fast-path allocation from this CPU's bio cache.  Returns NULL when the
 * cache is empty (after opportunistically splicing in the irq freelist),
 * in which case the caller falls back to the mempool path.  Preemption is
 * disabled (get_cpu/put_cpu) while the per-cpu lists are touched.
 */
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
		unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
		struct bio_set *bs)
{
	struct bio_alloc_cache *cache;
	struct bio *bio;

	cache = per_cpu_ptr(bs->cache, get_cpu());
	if (!cache->free_list) {
		/* Only splice when enough has accumulated to be worth it. */
		if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
			bio_alloc_irq_cache_splice(cache);
		if (!cache->free_list) {
			put_cpu();
			return NULL;
		}
	}
	bio = cache->free_list;
	cache->free_list = bio->bi_next;
	cache->nr--;
	put_cpu();

	/* Cached bios carry inline vecs; nr_vecs <= BIO_INLINE_VECS here. */
	bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf);
	bio->bi_pool = bs;
	return bio;
}
/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @bdev:	block device to allocate the bio for (can be %NULL)
 * @nr_vecs:	number of bvecs to pre-allocate
 * @opf:	operation and flags for bio
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 * @bs:		the bio_set to allocate from.
 *
 * Allocate a bio from the mempools in @bs.
 *
 * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
 * allocate a bio.  This is due to the mempool guarantees.  To make this work,
 * callers must never allocate more than 1 bio at a time from the general pool.
 * Callers that need to allocate more than 1 bio must always submit the
 * previously allocated bio for IO before attempting to allocate a new one.
 * Failure to do so can cause deadlocks under memory pressure.
 *
 * Note that when running under submit_bio_noacct() (i.e. any block driver),
 * bios are not submitted until after you return - see the code in
 * submit_bio_noacct() that converts recursion into iteration, to prevent
 * stack overflows.
 *
 * This would normally mean allocating multiple bios under submit_bio_noacct()
 * would be susceptible to deadlocks, but we have
 * deadlock avoidance code that resubmits any blocked bios from a rescuer
 * thread.
 *
 * However, we do not guarantee forward progress for allocations from other
 * mempools. Doing multiple allocations from the same mempool under
 * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
 * for per bio allocations.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
			     blk_opf_t opf, gfp_t gfp_mask,
			     struct bio_set *bs)
{
	gfp_t saved_gfp = gfp_mask;
	struct bio *bio;
	void *p;

	/* should not use nobvec bioset for nr_vecs > 0 */
	if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
		return NULL;

	if (opf & REQ_ALLOC_CACHE) {
		if (bs->cache && nr_vecs <= BIO_INLINE_VECS) {
			bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf,
						     gfp_mask, bs);
			if (bio)
				return bio;
			/*
			 * No cached bio available, bio returned below marked with
			 * REQ_ALLOC_CACHE to particpate in per-cpu alloc cache.
			 */
		} else {
			/* Too many vecs or no cache: drop the hint. */
			opf &= ~REQ_ALLOC_CACHE;
		}
	}

	/*
	 * submit_bio_noacct() converts recursion to iteration; this means if
	 * we're running beneath it, any bios we allocate and submit will not be
	 * submitted (and thus freed) until after we return.
	 *
	 * This exposes us to a potential deadlock if we allocate multiple bios
	 * from the same bio_set() while running underneath submit_bio_noacct().
	 * If we were to allocate multiple bios (say a stacking block driver
	 * that was splitting bios), we would deadlock if we exhausted the
	 * mempool's reserve.
	 *
	 * We solve this, and guarantee forward progress, with a rescuer
	 * workqueue per bio_set. If we go to allocate and there are bios on
	 * current->bio_list, we first try the allocation without
	 * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
	 * blocking to the rescuer workqueue before we retry with the original
	 * gfp_flags.
	 */
	if (current->bio_list &&
	    (!bio_list_empty(&current->bio_list[0]) ||
	     !bio_list_empty(&current->bio_list[1])) &&
	    bs->rescue_workqueue)
		gfp_mask &= ~__GFP_DIRECT_RECLAIM;

	p = mempool_alloc(&bs->bio_pool, gfp_mask);
	if (!p && gfp_mask != saved_gfp) {
		/* Non-blocking attempt failed: punt, then block for real. */
		punt_bios_to_rescuer(bs);
		gfp_mask = saved_gfp;
		p = mempool_alloc(&bs->bio_pool, gfp_mask);
	}
	if (unlikely(!p))
		return NULL;
	/* Don't cache-return bios while the pool is starved of reserves. */
	if (!mempool_is_saturated(&bs->bio_pool))
		opf &= ~REQ_ALLOC_CACHE;

	/* The usable bio starts front_pad bytes into the mempool object. */
	bio = p + bs->front_pad;
	if (nr_vecs > BIO_INLINE_VECS) {
		struct bio_vec *bvl = NULL;

		bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
		if (!bvl && gfp_mask != saved_gfp) {
			punt_bios_to_rescuer(bs);
			gfp_mask = saved_gfp;
			bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
		}
		if (unlikely(!bvl))
			goto err_free;

		bio_init(bio, bdev, bvl, nr_vecs, opf);
	} else if (nr_vecs) {
		bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
	} else {
		bio_init(bio, bdev, NULL, 0, opf);
	}

	bio->bi_pool = bs;
	return bio;

err_free:
	mempool_free(p, &bs->bio_pool);
	return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
/**
 * bio_kmalloc - kmalloc a bio
 * @nr_vecs:	number of bio_vecs to allocate
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 *
 * Use kmalloc to allocate a bio (including bvecs).  The bio must be initialized
 * using bio_init() before use.  To free a bio returned from this function use
 * kfree() after calling bio_uninit().  A bio returned from this function can
 * be reused by calling bio_uninit() before calling bio_init() again.
 *
 * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this
 * function are not backed by a mempool can fail.  Do not use this function
 * for allocations in the file system I/O path.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
	/* Only used as the type operand of struct_size() below. */
	struct bio *bio;

	if (nr_vecs > UIO_MAXIOV)
		return NULL;
	/* One allocation covering the bio plus its trailing inline vecs. */
	return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
}
EXPORT_SYMBOL(bio_kmalloc);
/*
 * Zero the data of every segment of @bio from iterator position @start to
 * the end.  @start is taken by value, so the caller's iterator is untouched.
 */
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
	struct bio_vec bv;
	struct bvec_iter iter;

	__bio_for_each_segment(bv, bio, iter, start)
		memzero_bvec(&bv);
}
EXPORT_SYMBOL(zero_fill_bio_iter);
/**
 * bio_truncate - truncate the bio to small size of @new_size
 * @bio:	the bio to be truncated
 * @new_size:	new size for truncating the bio
 *
 * Description:
 *   Truncate the bio to new size of @new_size. If bio_op(bio) is
 *   REQ_OP_READ, zero the truncated part. This function should only
 *   be used for handling corner cases, such as bio eod.
 */
static void bio_truncate(struct bio *bio, unsigned new_size)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned int done = 0;
	bool truncated = false;

	if (new_size >= bio->bi_iter.bi_size)
		return;

	/* Only reads hand data back to the caller, so only reads need zeroing. */
	if (bio_op(bio) != REQ_OP_READ)
		goto exit;

	bio_for_each_segment(bv, bio, iter) {
		if (done + bv.bv_len > new_size) {
			unsigned offset;

			/*
			 * First segment crossing the cut keeps its head;
			 * later segments are zeroed entirely.
			 */
			if (!truncated)
				offset = new_size - done;
			else
				offset = 0;
			zero_user(bv.bv_page, bv.bv_offset + offset,
				  bv.bv_len - offset);
			truncated = true;
		}
		done += bv.bv_len;
	}

 exit:
	/*
	 * Don't touch bvec table here and make it really immutable, since
	 * fs bio user has to retrieve all pages via bio_for_each_segment_all
	 * in its .end_bio() callback.
	 *
	 * It is enough to truncate bio by updating .bi_size since we can make
	 * correct bvec with the updated .bi_size for drivers.
	 */
	bio->bi_iter.bi_size = new_size;
}
/**
 * guard_bio_eod - truncate a BIO to fit the block device
 * @bio:	bio to truncate
 *
 * This allows us to do IO even on the odd last sectors of a device, even if the
 * block size is some multiple of the physical sector size.
 *
 * We'll just truncate the bio to the size of the device, and clear the end of
 * the buffer head manually.  Truly out-of-range accesses will turn into actual
 * I/O errors, this only handles the "we need to be able to do I/O at the final
 * sector" case.
 */
void guard_bio_eod(struct bio *bio)
{
	sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);

	/* Zero-sized device: nothing sensible to clamp against. */
	if (!maxsector)
		return;

	/*
	 * If the *whole* IO is past the end of the device,
	 * let it through, and the IO layer will turn it into
	 * an EIO.
	 */
	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
		return;

	/* Sectors remaining from the bio's start to end of device. */
	maxsector -= bio->bi_iter.bi_sector;
	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
		return;

	bio_truncate(bio, maxsector << 9);
}
  619. static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
  620. unsigned int nr)
  621. {
  622. unsigned int i = 0;
  623. struct bio *bio;
  624. while ((bio = cache->free_list) != NULL) {
  625. cache->free_list = bio->bi_next;
  626. cache->nr--;
  627. bio_free(bio);
  628. if (++i == nr)
  629. break;
  630. }
  631. return i;
  632. }
/*
 * Prune up to @nr bios from @cache.  If the task-context list runs dry,
 * splice in whatever accumulated on the irq list and keep pruning with
 * the remaining budget.
 */
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
				  unsigned int nr)
{
	nr -= __bio_alloc_cache_prune(cache, nr);
	if (!READ_ONCE(cache->free_list)) {
		bio_alloc_irq_cache_splice(cache);
		__bio_alloc_cache_prune(cache, nr);
	}
}
  642. static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
  643. {
  644. struct bio_set *bs;
  645. bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
  646. if (bs->cache) {
  647. struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
  648. bio_alloc_cache_prune(cache, -1U);
  649. }
  650. return 0;
  651. }
  652. static void bio_alloc_cache_destroy(struct bio_set *bs)
  653. {
  654. int cpu;
  655. if (!bs->cache)
  656. return;
  657. cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
  658. for_each_possible_cpu(cpu) {
  659. struct bio_alloc_cache *cache;
  660. cache = per_cpu_ptr(bs->cache, cpu);
  661. bio_alloc_cache_prune(cache, -1U);
  662. }
  663. free_percpu(bs->cache);
  664. bs->cache = NULL;
  665. }
/*
 * Return a REQ_ALLOC_CACHE bio to this CPU's alloc cache instead of the
 * mempool.  Task context uses free_list; hardirq context uses the separate
 * free_list_irq.  From any other context (e.g. softirq), or when the cache
 * is full, fall back to a real free.
 */
static inline void bio_put_percpu_cache(struct bio *bio)
{
	struct bio_alloc_cache *cache;

	cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
	if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
		goto out_free;

	if (in_task()) {
		bio_uninit(bio);
		bio->bi_next = cache->free_list;
		/* Not necessary but helps not to iopoll already freed bios */
		bio->bi_bdev = NULL;
		cache->free_list = bio;
		cache->nr++;
	} else if (in_hardirq()) {
		lockdep_assert_irqs_disabled();

		bio_uninit(bio);
		bio->bi_next = cache->free_list_irq;
		cache->free_list_irq = bio;
		cache->nr_irq++;
	} else {
		goto out_free;
	}
	put_cpu();
	return;
out_free:
	put_cpu();
	bio_free(bio);
}
/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	/* Refcounting only kicks in once bio_get() sets BIO_REFFED. */
	if (unlikely(bio_flagged(bio, BIO_REFFED))) {
		BUG_ON(!atomic_read(&bio->__bi_cnt));
		if (!atomic_dec_and_test(&bio->__bi_cnt))
			return;
	}
	if (bio->bi_opf & REQ_ALLOC_CACHE)
		bio_put_percpu_cache(bio);
	else
		bio_free(bio);
}
EXPORT_SYMBOL(bio_put);
/*
 * Copy the cloneable state of @bio_src into @bio: iterator, priority,
 * write hint, blkcg association, and (if present) crypto and integrity
 * contexts.  The bvec table itself is NOT copied; callers share or
 * duplicate it separately.  Returns 0 or -ENOMEM.
 */
static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
	bio_set_flag(bio, BIO_CLONED);
	bio->bi_ioprio = bio_src->bi_ioprio;
	bio->bi_write_hint = bio_src->bi_write_hint;
	bio->bi_iter = bio_src->bi_iter;

	if (bio->bi_bdev) {
		/* Same device: the source's remap decision still applies. */
		if (bio->bi_bdev == bio_src->bi_bdev &&
		    bio_flagged(bio_src, BIO_REMAPPED))
			bio_set_flag(bio, BIO_REMAPPED);
		bio_clone_blkg_association(bio, bio_src);
	}

	if (bio_crypt_clone(bio, bio_src, gfp) < 0)
		return -ENOMEM;
	if (bio_integrity(bio_src) &&
	    bio_integrity_clone(bio, bio_src, gfp) < 0)
		return -ENOMEM;
	return 0;
}
  734. /**
  735. * bio_alloc_clone - clone a bio that shares the original bio's biovec
  736. * @bdev: block_device to clone onto
  737. * @bio_src: bio to clone from
  738. * @gfp: allocation priority
  739. * @bs: bio_set to allocate from
  740. *
  741. * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
  742. * bio, but not the actual data it points to.
  743. *
  744. * The caller must ensure that the return bio is not freed before @bio_src.
  745. */
  746. struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
  747. gfp_t gfp, struct bio_set *bs)
  748. {
  749. struct bio *bio;
  750. bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
  751. if (!bio)
  752. return NULL;
  753. if (__bio_clone(bio, bio_src, gfp) < 0) {
  754. bio_put(bio);
  755. return NULL;
  756. }
  757. bio->bi_io_vec = bio_src->bi_io_vec;
  758. return bio;
  759. }
  760. EXPORT_SYMBOL(bio_alloc_clone);
  761. /**
  762. * bio_init_clone - clone a bio that shares the original bio's biovec
  763. * @bdev: block_device to clone onto
  764. * @bio: bio to clone into
  765. * @bio_src: bio to clone from
  766. * @gfp: allocation priority
  767. *
  768. * Initialize a new bio in caller provided memory that is a clone of @bio_src.
  769. * The caller owns the returned bio, but not the actual data it points to.
  770. *
  771. * The caller must ensure that @bio_src is not freed before @bio.
  772. */
  773. int bio_init_clone(struct block_device *bdev, struct bio *bio,
  774. struct bio *bio_src, gfp_t gfp)
  775. {
  776. int ret;
  777. bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
  778. ret = __bio_clone(bio, bio_src, gfp);
  779. if (ret)
  780. bio_uninit(bio);
  781. return ret;
  782. }
  783. EXPORT_SYMBOL(bio_init_clone);
  784. /**
  785. * bio_full - check if the bio is full
  786. * @bio: bio to check
  787. * @len: length of one segment to be added
  788. *
  789. * Return true if @bio is full and one segment with @len bytes can't be
  790. * added to the bio, otherwise return false
  791. */
  792. static inline bool bio_full(struct bio *bio, unsigned len)
  793. {
  794. if (bio->bi_vcnt >= bio->bi_max_vecs)
  795. return true;
  796. if (bio->bi_iter.bi_size > UINT_MAX - len)
  797. return true;
  798. return false;
  799. }
/*
 * Try to append @len bytes at @page + @off to the tail bvec @bv.  Succeeds
 * only when the new data is physically contiguous with the end of @bv and
 * the pages are mergeable (Xen, zone-device pgmap constraints).  On success
 * the bvec length is extended and *same_page reports whether the data landed
 * in the same physical page that @bv already ends in.
 */
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
		unsigned int len, unsigned int off, bool *same_page)
{
	size_t bv_end = bv->bv_offset + bv->bv_len;
	phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
	phys_addr_t page_addr = page_to_phys(page);

	/* The new range must start exactly where the bvec ends. */
	if (vec_end_addr + 1 != page_addr + off)
		return false;
	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
		return false;
	if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
		return false;

	*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
		     PAGE_MASK));
	if (!*same_page) {
		if (IS_ENABLED(CONFIG_KMSAN))
			return false;
		/* The struct page pointers must also be contiguous. */
		if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
			return false;
	}

	bv->bv_len += len;
	return true;
}
  823. /*
  824. * Try to merge a page into a segment, while obeying the hardware segment
  825. * size limit. This is not for normal read/write bios, but for passthrough
  826. * or Zone Append operations that we can't split.
  827. */
  828. bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
  829. struct page *page, unsigned len, unsigned offset,
  830. bool *same_page)
  831. {
  832. unsigned long mask = queue_segment_boundary(q);
  833. phys_addr_t addr1 = bvec_phys(bv);
  834. phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
  835. if ((addr1 | mask) != (addr2 | mask))
  836. return false;
  837. if (len > queue_max_segment_size(q) - bv->bv_len)
  838. return false;
  839. return bvec_try_merge_page(bv, page, len, offset, same_page);
  840. }
/**
 * bio_add_hw_page - attempt to add a page to a bio with hw constraints
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 * @max_sectors: maximum number of sectors that can be added
 * @same_page: return if the segment has been merged inside the same page
 *
 * Add a page to a bio while respecting the hardware max_sectors, max_segment
 * and gap limitations.
 *
 * Returns the number of bytes actually added (possibly clamped below @len),
 * or 0 if nothing could be added.
 */
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
		struct page *page, unsigned int len, unsigned int offset,
		unsigned int max_sectors, bool *same_page)
{
	unsigned int max_size = max_sectors << SECTOR_SHIFT;

	/* A cloned bio shares its biovec and must not grow it. */
	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
		return 0;

	/* Clamp to the per-segment limit, then enforce the total size cap. */
	len = min3(len, max_size, queue_max_segment_size(q));
	if (len > max_size - bio->bi_iter.bi_size)
		return 0;

	if (bio->bi_vcnt > 0) {
		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];

		/* Prefer extending the last bvec over using a new slot. */
		if (bvec_try_merge_hw_page(q, bv, page, len, offset,
				same_page)) {
			bio->bi_iter.bi_size += len;
			return len;
		}

		if (bio->bi_vcnt >=
		    min(bio->bi_max_vecs, queue_max_segments(q)))
			return 0;

		/*
		 * If the queue doesn't support SG gaps and adding this segment
		 * would create a gap, disallow it.
		 */
		if (bvec_gap_to_prev(&q->limits, bv, offset))
			return 0;
	}

	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
	bio->bi_vcnt++;
	bio->bi_iter.bi_size += len;
	return len;
}
  886. /**
  887. * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints
  888. * @q: the target queue
  889. * @bio: destination bio
  890. * @folio: folio to add
  891. * @len: vec entry length
  892. * @offset: vec entry offset in the folio
  893. * @max_sectors: maximum number of sectors that can be added
  894. * @same_page: return if the segment has been merged inside the same folio
  895. *
  896. * Add a folio to a bio while respecting the hardware max_sectors, max_segment
  897. * and gap limitations.
  898. */
  899. int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
  900. struct folio *folio, size_t len, size_t offset,
  901. unsigned int max_sectors, bool *same_page)
  902. {
  903. if (len > UINT_MAX || offset > UINT_MAX)
  904. return 0;
  905. return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset,
  906. max_sectors, same_page);
  907. }
  908. /**
  909. * bio_add_pc_page - attempt to add page to passthrough bio
  910. * @q: the target queue
  911. * @bio: destination bio
  912. * @page: page to add
  913. * @len: vec entry length
  914. * @offset: vec entry offset
  915. *
  916. * Attempt to add a page to the bio_vec maplist. This can fail for a
  917. * number of reasons, such as the bio being full or target block device
  918. * limitations. The target block device must allow bio's up to PAGE_SIZE,
  919. * so it is always possible to add a single page to an empty bio.
  920. *
  921. * This should only be used by passthrough bios.
  922. */
  923. int bio_add_pc_page(struct request_queue *q, struct bio *bio,
  924. struct page *page, unsigned int len, unsigned int offset)
  925. {
  926. bool same_page = false;
  927. return bio_add_hw_page(q, bio, page, len, offset,
  928. queue_max_hw_sectors(q), &same_page);
  929. }
  930. EXPORT_SYMBOL(bio_add_pc_page);
  931. /**
  932. * bio_add_zone_append_page - attempt to add page to zone-append bio
  933. * @bio: destination bio
  934. * @page: page to add
  935. * @len: vec entry length
  936. * @offset: vec entry offset
  937. *
  938. * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
  939. * for a zone-append request. This can fail for a number of reasons, such as the
  940. * bio being full or the target block device is not a zoned block device or
  941. * other limitations of the target block device. The target block device must
  942. * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
  943. * to an empty bio.
  944. *
  945. * Returns: number of bytes added to the bio, or 0 in case of a failure.
  946. */
  947. int bio_add_zone_append_page(struct bio *bio, struct page *page,
  948. unsigned int len, unsigned int offset)
  949. {
  950. struct request_queue *q = bdev_get_queue(bio->bi_bdev);
  951. bool same_page = false;
  952. if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
  953. return 0;
  954. if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
  955. return 0;
  956. return bio_add_hw_page(q, bio, page, len, offset,
  957. queue_max_zone_append_sectors(q), &same_page);
  958. }
  959. EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
  960. /**
  961. * __bio_add_page - add page(s) to a bio in a new segment
  962. * @bio: destination bio
  963. * @page: start page to add
  964. * @len: length of the data to add, may cross pages
  965. * @off: offset of the data relative to @page, may cross pages
  966. *
  967. * Add the data at @page + @off to @bio as a new bvec. The caller must ensure
  968. * that @bio has space for another bvec.
  969. */
  970. void __bio_add_page(struct bio *bio, struct page *page,
  971. unsigned int len, unsigned int off)
  972. {
  973. WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
  974. WARN_ON_ONCE(bio_full(bio, len));
  975. bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
  976. bio->bi_iter.bi_size += len;
  977. bio->bi_vcnt++;
  978. }
  979. EXPORT_SYMBOL_GPL(__bio_add_page);
  980. /**
  981. * bio_add_page - attempt to add page(s) to bio
  982. * @bio: destination bio
  983. * @page: start page to add
  984. * @len: vec entry length, may cross pages
  985. * @offset: vec entry offset relative to @page, may cross pages
  986. *
  987. * Attempt to add page(s) to the bio_vec maplist. This will only fail
  988. * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
  989. */
  990. int bio_add_page(struct bio *bio, struct page *page,
  991. unsigned int len, unsigned int offset)
  992. {
  993. bool same_page = false;
  994. if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
  995. return 0;
  996. if (bio->bi_iter.bi_size > UINT_MAX - len)
  997. return 0;
  998. if (bio->bi_vcnt > 0 &&
  999. bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
  1000. page, len, offset, &same_page)) {
  1001. bio->bi_iter.bi_size += len;
  1002. return len;
  1003. }
  1004. if (bio->bi_vcnt >= bio->bi_max_vecs)
  1005. return 0;
  1006. __bio_add_page(bio, page, len, offset);
  1007. return len;
  1008. }
  1009. EXPORT_SYMBOL(bio_add_page);
/*
 * Add part of a folio to a bio that is known to have room for it.  The
 * length/offset must fit the 32-bit bvec fields; the WARNs catch misuse.
 */
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
		size_t off)
{
	WARN_ON_ONCE(len > UINT_MAX);
	WARN_ON_ONCE(off > UINT_MAX);
	__bio_add_page(bio, &folio->page, len, off);
}
EXPORT_SYMBOL_GPL(bio_add_folio_nofail);
  1018. /**
  1019. * bio_add_folio - Attempt to add part of a folio to a bio.
  1020. * @bio: BIO to add to.
  1021. * @folio: Folio to add.
  1022. * @len: How many bytes from the folio to add.
  1023. * @off: First byte in this folio to add.
  1024. *
  1025. * Filesystems that use folios can call this function instead of calling
  1026. * bio_add_page() for each page in the folio. If @off is bigger than
  1027. * PAGE_SIZE, this function can create a bio_vec that starts in a page
  1028. * after the bv_page. BIOs do not support folios that are 4GiB or larger.
  1029. *
  1030. * Return: Whether the addition was successful.
  1031. */
  1032. bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
  1033. size_t off)
  1034. {
  1035. if (len > UINT_MAX || off > UINT_MAX)
  1036. return false;
  1037. return bio_add_page(bio, &folio->page, len, off) > 0;
  1038. }
  1039. EXPORT_SYMBOL(bio_add_folio);
/*
 * Unpin every folio attached to @bio, optionally marking each dirty first
 * (used when completing a direct-IO read into user pages).
 */
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio) {
		size_t nr_pages;

		if (mark_dirty) {
			folio_lock(fi.folio);
			folio_mark_dirty(fi.folio);
			folio_unlock(fi.folio);
		}
		/* Number of PAGE_SIZE pages this bvec segment spans. */
		nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
			   fi.offset / PAGE_SIZE + 1;
		unpin_user_folio(fi.folio, nr_pages);
	}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
/*
 * Point @bio directly at the bvec array of a BVEC iterator instead of
 * copying it.  The bio is marked BIO_CLONED because it does not own the
 * vecs; the caller must keep them alive until the I/O completes.  Zone
 * append bios are additionally capped to the queue's zone-append limit.
 */
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
	size_t size = iov_iter_count(iter);

	/* Only valid on a bio that has no vecs of its own. */
	WARN_ON_ONCE(bio->bi_max_vecs);

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
		size_t max_sectors = queue_max_zone_append_sectors(q);

		size = min(size, max_sectors << SECTOR_SHIFT);
	}

	bio->bi_vcnt = iter->nr_segs;
	bio->bi_io_vec = (struct bio_vec *)iter->bvec;
	bio->bi_iter.bi_bvec_done = iter->iov_offset;
	bio->bi_iter.bi_size = size;
	bio_set_flag(bio, BIO_CLONED);
}
/*
 * Add @len bytes of @folio at @offset to @bio, merging into the last bvec
 * when physically contiguous.  When a merge stays within the same page the
 * extra pin taken during extraction is dropped immediately.  Returns 0, or
 * -EIO if the addition would overflow ->bi_size (should not happen — the
 * extraction path already bounds the size).
 */
static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
			     size_t offset)
{
	bool same_page = false;

	if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
		return -EIO;

	if (bio->bi_vcnt > 0 &&
	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
				folio_page(folio, 0), len, offset,
				&same_page)) {
		bio->bi_iter.bi_size += len;
		/* Merged into an existing pin of the same page: drop ours. */
		if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
			unpin_user_folio(folio, 1);
		return 0;
	}
	bio_add_folio_nofail(bio, folio, len, offset);
	return 0;
}
/*
 * Zone-append variant of bio_iov_add_folio(): the folio must be added in
 * full under the hardware zone-append limits, otherwise -EINVAL is returned
 * (zone-append bios cannot be split later).
 */
static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
					 size_t len, size_t offset)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	bool same_page = false;

	if (bio_add_hw_folio(q, bio, folio, len, offset,
			queue_max_zone_append_sectors(q), &same_page) != len)
		return -EINVAL;
	/* Merged into an existing pin of the same page: drop ours. */
	if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
		unpin_user_folio(folio, 1);
	return 0;
}
/*
 * Starting at pages[i] (the first page of a chunk inside @folio), work out
 * how many of the following extracted pages are physically consecutive and
 * belong to the same folio.  On return *num_pages is the count of pages that
 * can be covered by one bvec, and the byte length of that contiguous run
 * (bounded by @left, with @offset applying to the first page) is returned.
 */
static unsigned int get_contig_folio_len(unsigned int *num_pages,
					 struct page **pages, unsigned int i,
					 struct folio *folio, size_t left,
					 size_t offset)
{
	size_t bytes = left;
	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
	unsigned int j;

	/*
	 * We might COW a single page in the middle of
	 * a large folio, so we have to check that all
	 * pages belong to the same folio.
	 */
	bytes -= contig_sz;
	for (j = i + 1; j < i + *num_pages; j++) {
		size_t next = min_t(size_t, PAGE_SIZE, bytes);

		if (page_folio(pages[j]) != folio ||
		    pages[j] != pages[j - 1] + 1) {
			break;
		}
		contig_sz += next;
		bytes -= next;
	}
	*num_pages = j - i;

	return contig_sz;
}
/* How many page pointers fit in the space of one bio_vec. */
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))

/**
 * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Extracts pages from *iter and appends them to @bio's bvec array. The pages
 * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
 * For a multi-segment *iter, this function only adds pages from the next
 * non-empty segment of the iov iterator.
 */
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
	iov_iter_extraction_t extraction_flags = 0;
	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
	/* The unused tail of the bvec array doubles as a temporary page array. */
	struct page **pages = (struct page **)bv;
	ssize_t size;
	unsigned int num_pages, i = 0;
	size_t offset, folio_offset, left, len;
	int ret = 0;

	/*
	 * Move page array up in the allocated memory for the bio vecs as far as
	 * possible so that we can start filling biovecs from the beginning
	 * without overwriting the temporary page array.
	 */
	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
		extraction_flags |= ITER_ALLOW_P2PDMA;

	/*
	 * Each segment in the iov is required to be a block size multiple.
	 * However, we may not be able to get the entire segment if it spans
	 * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
	 * result to ensure the bio's total size is correct. The remainder of
	 * the iov data will be picked up in the next bio iteration.
	 */
	size = iov_iter_extract_pages(iter, &pages,
				      UINT_MAX - bio->bi_iter.bi_size,
				      nr_pages, extraction_flags, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;

	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);

	if (bio->bi_bdev) {
		/* Trim the extraction back to a logical block size multiple. */
		size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);

		iov_iter_revert(iter, trim);
		size -= trim;
	}

	if (unlikely(!size)) {
		ret = -EFAULT;
		goto out;
	}

	/* Coalesce the extracted pages into as few (folio) bvecs as possible. */
	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
		struct page *page = pages[i];
		struct folio *folio = page_folio(page);

		/* Byte offset of this chunk from the start of its folio. */
		folio_offset = ((size_t)folio_page_idx(folio, page) <<
			       PAGE_SHIFT) + offset;

		len = min(folio_size(folio) - folio_offset, left);

		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);

		if (num_pages > 1)
			len = get_contig_folio_len(&num_pages, pages, i,
						   folio, left, offset);

		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			ret = bio_iov_add_zone_append_folio(bio, folio, len,
					folio_offset);
			if (ret)
				break;
		} else
			bio_iov_add_folio(bio, folio, len, folio_offset);

		/* Only the first chunk carries a non-zero page offset. */
		offset = 0;
	}

	/* Give back whatever could not be added to the bio. */
	iov_iter_revert(iter, left);
out:
	/* Release any extracted pages that were never attached to the bio. */
	while (i < nr_pages)
		bio_release_page(bio, pages[i++]);
	return ret;
}
  1205. /**
  1206. * bio_iov_iter_get_pages - add user or kernel pages to a bio
  1207. * @bio: bio to add pages to
  1208. * @iter: iov iterator describing the region to be added
  1209. *
  1210. * This takes either an iterator pointing to user memory, or one pointing to
  1211. * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
  1212. * map them into the kernel. On IO completion, the caller should put those
  1213. * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
  1214. * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
  1215. * to ensure the bvecs and pages stay referenced until the submitted I/O is
  1216. * completed by a call to ->ki_complete() or returns with an error other than
  1217. * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
  1218. * on IO completion. If it isn't, then pages should be released.
  1219. *
  1220. * The function tries, but does not guarantee, to pin as many pages as
  1221. * fit into the bio, or are requested in @iter, whatever is smaller. If
  1222. * MM encounters an error pinning the requested pages, it stops. Error
  1223. * is returned only if 0 pages could be pinned.
  1224. */
  1225. int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  1226. {
  1227. int ret = 0;
  1228. if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
  1229. return -EIO;
  1230. if (iov_iter_is_bvec(iter)) {
  1231. bio_iov_bvec_set(bio, iter);
  1232. iov_iter_advance(iter, bio->bi_iter.bi_size);
  1233. return 0;
  1234. }
  1235. if (iov_iter_extract_will_pin(iter))
  1236. bio_set_flag(bio, BIO_PAGE_PINNED);
  1237. do {
  1238. ret = __bio_iov_iter_get_pages(bio, iter);
  1239. } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
  1240. return bio->bi_vcnt ? 0 : ret;
  1241. }
  1242. EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
/* Completion callback for submit_bio_wait(): wake the waiting submitter. */
static void submit_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}
/**
 * submit_bio_wait - submit a bio, and wait until it completes
 * @bio: The &struct bio which describes the I/O
 *
 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
 * bio_endio() on failure.
 *
 * WARNING: Unlike to how submit_bio() is usually used, this function does not
 * result in bio reference to be consumed. The caller must drop the reference
 * on his own.
 */
int submit_bio_wait(struct bio *bio)
{
	/* Map the completion to the disk's lockdep map for deadlock analysis. */
	DECLARE_COMPLETION_ONSTACK_MAP(done,
			bio->bi_bdev->bd_disk->lockdep_map);

	bio->bi_private = &done;
	bio->bi_end_io = submit_bio_wait_endio;
	/* REQ_SYNC: the submitter is about to block on this I/O. */
	bio->bi_opf |= REQ_SYNC;
	submit_bio(bio);
	blk_wait_io(&done);

	return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
/* Completion callback for bio_await_chain(): signal the waiter, then free. */
static void bio_wait_end_io(struct bio *bio)
{
	complete(bio->bi_private);
	bio_put(bio);
}
/*
 * bio_await_chain - ends @bio and waits for every chained bio to complete
 *
 * Ends @bio with bio_wait_end_io() as its completion handler and blocks
 * until the whole chain has finished.  @bio is consumed (bio_put() runs in
 * the handler).
 */
void bio_await_chain(struct bio *bio)
{
	/* Map the completion to the disk's lockdep map for deadlock analysis. */
	DECLARE_COMPLETION_ONSTACK_MAP(done,
			bio->bi_bdev->bd_disk->lockdep_map);

	bio->bi_private = &done;
	bio->bi_end_io = bio_wait_end_io;
	bio_endio(bio);
	blk_wait_io(&done);
}
/*
 * Advance @bio by @bytes, keeping the integrity and crypt contexts in step
 * with the main iterator.  The auxiliary iterators are advanced first so
 * they observe the pre-advance state of ->bi_iter.
 */
void __bio_advance(struct bio *bio, unsigned bytes)
{
	if (bio_integrity(bio))
		bio_integrity_advance(bio, bytes);

	bio_crypt_advance(bio, bytes);
	bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(__bio_advance);
  1295. void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
  1296. struct bio *src, struct bvec_iter *src_iter)
  1297. {
  1298. while (src_iter->bi_size && dst_iter->bi_size) {
  1299. struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
  1300. struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
  1301. unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
  1302. void *src_buf = bvec_kmap_local(&src_bv);
  1303. void *dst_buf = bvec_kmap_local(&dst_bv);
  1304. memcpy(dst_buf, src_buf, bytes);
  1305. kunmap_local(dst_buf);
  1306. kunmap_local(src_buf);
  1307. bio_advance_iter_single(src, src_iter, bytes);
  1308. bio_advance_iter_single(dst, dst_iter, bytes);
  1309. }
  1310. }
  1311. EXPORT_SYMBOL(bio_copy_data_iter);
  1312. /**
  1313. * bio_copy_data - copy contents of data buffers from one bio to another
  1314. * @src: source bio
  1315. * @dst: destination bio
  1316. *
  1317. * Stops when it reaches the end of either @src or @dst - that is, copies
  1318. * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
  1319. */
  1320. void bio_copy_data(struct bio *dst, struct bio *src)
  1321. {
  1322. struct bvec_iter src_iter = src->bi_iter;
  1323. struct bvec_iter dst_iter = dst->bi_iter;
  1324. bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
  1325. }
  1326. EXPORT_SYMBOL(bio_copy_data);
  1327. void bio_free_pages(struct bio *bio)
  1328. {
  1329. struct bio_vec *bvec;
  1330. struct bvec_iter_all iter_all;
  1331. bio_for_each_segment_all(bvec, bio, iter_all)
  1332. __free_page(bvec->bv_page);
  1333. }
  1334. EXPORT_SYMBOL(bio_free_pages);
  1335. /*
  1336. * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
  1337. * for performing direct-IO in BIOs.
  1338. *
  1339. * The problem is that we cannot run folio_mark_dirty() from interrupt context
  1340. * because the required locks are not interrupt-safe. So what we can do is to
  1341. * mark the pages dirty _before_ performing IO. And in interrupt context,
  1342. * check that the pages are still dirty. If so, fine. If not, redirty them
  1343. * in process context.
  1344. *
  1345. * Note that this code is very hard to test under normal circumstances because
  1346. * direct-io pins the pages with get_user_pages(). This makes
  1347. * is_page_cache_freeable return false, and the VM will not clean the pages.
  1348. * But other code (eg, flusher threads) could clean the pages if they are mapped
  1349. * pagecache.
  1350. *
  1351. * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
  1352. * deferred bio dirtying paths.
  1353. */
  1354. /*
  1355. * bio_set_pages_dirty() will mark all the bio's pages as dirty.
  1356. */
  1357. void bio_set_pages_dirty(struct bio *bio)
  1358. {
  1359. struct folio_iter fi;
  1360. bio_for_each_folio_all(fi, bio) {
  1361. folio_lock(fi.folio);
  1362. folio_mark_dirty(fi.folio);
  1363. folio_unlock(fi.folio);
  1364. }
  1365. }
  1366. EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
  1367. /*
  1368. * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
  1369. * If they are, then fine. If, however, some pages are clean then they must
  1370. * have been written out during the direct-IO read. So we take another ref on
  1371. * the BIO and re-dirty the pages in process context.
  1372. *
  1373. * It is expected that bio_check_pages_dirty() will wholly own the BIO from
  1374. * here on. It will unpin each page and will run one bio_put() against the
  1375. * BIO.
  1376. */
static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
/* Bios awaiting re-dirtying, linked via ->bi_private, under bio_dirty_lock. */
static struct bio *bio_dirty_list;
  1381. /*
  1382. * This runs in process context
  1383. */
  1384. static void bio_dirty_fn(struct work_struct *work)
  1385. {
  1386. struct bio *bio, *next;
  1387. spin_lock_irq(&bio_dirty_lock);
  1388. next = bio_dirty_list;
  1389. bio_dirty_list = NULL;
  1390. spin_unlock_irq(&bio_dirty_lock);
  1391. while ((bio = next) != NULL) {
  1392. next = bio->bi_private;
  1393. bio_release_pages(bio, true);
  1394. bio_put(bio);
  1395. }
  1396. }
/*
 * If every folio in @bio is still dirty, release the pages and free the bio
 * right here.  Otherwise defer the whole bio to bio_dirty_work, which will
 * re-dirty the pages and release them from process context (we may be in
 * interrupt context, where folio_mark_dirty() is not allowed).
 */
void bio_check_pages_dirty(struct bio *bio)
{
	struct folio_iter fi;
	unsigned long flags;

	bio_for_each_folio_all(fi, bio) {
		if (!folio_test_dirty(fi.folio))
			goto defer;
	}

	bio_release_pages(bio, false);
	bio_put(bio);
	return;
defer:
	spin_lock_irqsave(&bio_dirty_lock, flags);
	/* Chain onto the pending list via ->bi_private. */
	bio->bi_private = bio_dirty_list;
	bio_dirty_list = bio;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);
	schedule_work(&bio_dirty_work);
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
  1416. static inline bool bio_remaining_done(struct bio *bio)
  1417. {
  1418. /*
  1419. * If we're not chaining, then ->__bi_remaining is always 1 and
  1420. * we always end io on the first invocation.
  1421. */
  1422. if (!bio_flagged(bio, BIO_CHAIN))
  1423. return true;
  1424. BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
  1425. if (atomic_dec_and_test(&bio->__bi_remaining)) {
  1426. bio_clear_flag(bio, BIO_CHAIN);
  1427. return true;
  1428. }
  1429. return false;
  1430. }
/**
 * bio_endio - end I/O on a bio
 * @bio: bio
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
 *   way to end I/O on a bio. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io function.
 *
 *   bio_endio() can be called several times on a bio that has been chained
 *   using bio_chain(). The ->bi_end_io() function will only be called the
 *   last time.
 **/
void bio_endio(struct bio *bio)
{
again:
	/* Chained bios only complete when the last reference drops. */
	if (!bio_remaining_done(bio))
		return;
	/* Integrity verification may defer the completion to a workqueue. */
	if (!bio_integrity_endio(bio))
		return;

	blk_zone_bio_endio(bio);

	rq_qos_done_bio(bio);

	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
	}

	/*
	 * Need to have a real endio function for chained bios, otherwise
	 * various corner cases will break (like stacking block devices that
	 * save/restore bi_end_io) - however, we want to avoid unbounded
	 * recursion and blowing the stack. Tail call optimization would
	 * handle this, but compiling with frame pointers also disables
	 * gcc's sibling call optimization.
	 */
	if (bio->bi_end_io == bio_chain_endio) {
		bio = __bio_chain_endio(bio);
		goto again;
	}

#ifdef CONFIG_BLK_CGROUP
	/*
	 * Release cgroup info. We shouldn't have to do this here, but quite
	 * a few callers of bio_init fail to call bio_uninit, so we cover up
	 * for that here at least for now.
	 */
	if (bio->bi_blkg) {
		blkg_put(bio->bi_blkg);
		bio->bi_blkg = NULL;
	}
#endif

	if (bio->bi_end_io)
		bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
/**
 * bio_split - split a bio
 * @bio: bio to split
 * @sectors: number of sectors to split from the front of @bio
 * @gfp: gfp mask
 * @bs: bio set to allocate from
 *
 * Allocates and returns a new bio which represents @sectors from the start of
 * @bio, and updates @bio to represent the remaining sectors.
 *
 * Unless this is a discard request the newly allocated bio will point
 * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
 * neither @bio nor @bs are freed before the split bio.
 */
struct bio *bio_split(struct bio *bio, int sectors,
		      gfp_t gfp, struct bio_set *bs)
{
	struct bio *split;

	/* The split point must fall strictly inside @bio. */
	BUG_ON(sectors <= 0);
	BUG_ON(sectors >= bio_sectors(bio));

	/* Zone append commands cannot be split */
	if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
		return NULL;

	split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
	if (!split)
		return NULL;

	/* Shrink the clone to the front @sectors, trim integrity to match. */
	split->bi_iter.bi_size = sectors << 9;

	if (bio_integrity(split))
		bio_integrity_trim(split);

	/* Advance the original past the part now owned by the clone. */
	bio_advance(bio, split->bi_iter.bi_size);

	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
		bio_set_flag(split, BIO_TRACE_COMPLETION);

	return split;
}
EXPORT_SYMBOL(bio_split);
/**
 * bio_trim - trim a bio
 * @bio: bio to trim
 * @offset: number of sectors to trim from the front of @bio
 * @size: size we want to trim @bio to, in sectors
 *
 * This function is typically used for bios that are cloned and submitted
 * to the underlying device in parts.  Out-of-range arguments trigger a WARN
 * and leave @bio untouched.
 */
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
	if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
			 offset + size > bio_sectors(bio)))
		return;

	/* Convert to bytes; a whole-bio trim is a no-op. */
	size <<= 9;
	if (offset == 0 && size == bio->bi_iter.bi_size)
		return;

	bio_advance(bio, offset << 9);
	bio->bi_iter.bi_size = size;

	if (bio_integrity(bio))
		bio_integrity_trim(bio);
}
EXPORT_SYMBOL_GPL(bio_trim);
  1542. /*
  1543. * create memory pools for biovec's in a bio_set.
  1544. * use the global biovec slabs created for general use.
  1545. */
  1546. int biovec_init_pool(mempool_t *pool, int pool_entries)
  1547. {
  1548. struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
  1549. return mempool_init_slab_pool(pool, pool_entries, bp->slab);
  1550. }
  1551. /*
  1552. * bioset_exit - exit a bioset initialized with bioset_init()
  1553. *
  1554. * May be called on a zeroed but uninitialized bioset (i.e. allocated with
  1555. * kzalloc()).
  1556. */
void bioset_exit(struct bio_set *bs)
{
	/* Drain any per-cpu bio caches before the pools go away. */
	bio_alloc_cache_destroy(bs);
	if (bs->rescue_workqueue)
		destroy_workqueue(bs->rescue_workqueue);
	/* NULL it so a repeated bioset_exit() call is harmless. */
	bs->rescue_workqueue = NULL;
	/* mempool_exit() is safe on pools that were never initialized. */
	mempool_exit(&bs->bio_pool);
	mempool_exit(&bs->bvec_pool);
	bioset_integrity_free(bs);
	if (bs->bio_slab)
		bio_put_slab(bs);
	bs->bio_slab = NULL;
}
EXPORT_SYMBOL(bioset_exit);
  1571. /**
  1572. * bioset_init - Initialize a bio_set
  1573. * @bs: pool to initialize
  1574. * @pool_size: Number of bio and bio_vecs to cache in the mempool
  1575. * @front_pad: Number of bytes to allocate in front of the returned bio
  1576. * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
  1577. * and %BIOSET_NEED_RESCUER
  1578. *
  1579. * Description:
  1580. * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
  1581. * to ask for a number of bytes to be allocated in front of the bio.
  1582. * Front pad allocation is useful for embedding the bio inside
  1583. * another structure, to avoid allocating extra data to go with the bio.
  1584. * Note that the bio must be embedded at the END of that structure always,
  1585. * or things will break badly.
  1586. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
  1587. * for allocating iovecs. This pool is not needed e.g. for bio_init_clone().
  1588. * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
  1589. * to dispatch queued requests when the mempool runs out of space.
  1590. *
  1591. */
int bioset_init(struct bio_set *bs,
		unsigned int pool_size,
		unsigned int front_pad,
		int flags)
{
	bs->front_pad = front_pad;
	/* Reserve room behind each bio for the inline biovecs, if wanted. */
	if (flags & BIOSET_NEED_BVECS)
		bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
	else
		bs->back_pad = 0;
	/* Rescue infrastructure is always initialized; the wq is optional. */
	spin_lock_init(&bs->rescue_lock);
	bio_list_init(&bs->rescue_list);
	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
	/* Slab size depends on front_pad/back_pad, so set those first. */
	bs->bio_slab = bio_find_or_create_slab(bs);
	if (!bs->bio_slab)
		return -ENOMEM;
	if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
		goto bad;
	if ((flags & BIOSET_NEED_BVECS) &&
	    biovec_init_pool(&bs->bvec_pool, pool_size))
		goto bad;
	if (flags & BIOSET_NEED_RESCUER) {
		bs->rescue_workqueue = alloc_workqueue("bioset",
							WQ_MEM_RECLAIM, 0);
		if (!bs->rescue_workqueue)
			goto bad;
	}
	if (flags & BIOSET_PERCPU_CACHE) {
		bs->cache = alloc_percpu(struct bio_alloc_cache);
		if (!bs->cache)
			goto bad;
		cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
	}
	return 0;
bad:
	/* bioset_exit() tolerates a partially initialized bio_set. */
	bioset_exit(bs);
	return -ENOMEM;
}
EXPORT_SYMBOL(bioset_init);
  1631. static int __init init_bio(void)
  1632. {
  1633. int i;
  1634. BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
  1635. bio_integrity_init();
  1636. for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
  1637. struct biovec_slab *bvs = bvec_slabs + i;
  1638. bvs->slab = kmem_cache_create(bvs->name,
  1639. bvs->nr_vecs * sizeof(struct bio_vec), 0,
  1640. SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
  1641. }
  1642. cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
  1643. bio_cpu_dead);
  1644. if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
  1645. BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
  1646. panic("bio: can't allocate bios\n");
  1647. if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
  1648. panic("bio: can't create integrity pool\n");
  1649. return 0;
  1650. }
  1651. subsys_initcall(init_bio);