super.c 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * bcache setup/teardown code, and some metadata io - read a superblock and
  4. * figure out what to do with it.
  5. *
  6. * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
  7. * Copyright 2012 Google, Inc.
  8. */
  9. #include "bcache.h"
  10. #include "btree.h"
  11. #include "debug.h"
  12. #include "extents.h"
  13. #include "request.h"
  14. #include "writeback.h"
  15. #include <linux/blkdev.h>
  16. #include <linux/buffer_head.h>
  17. #include <linux/debugfs.h>
  18. #include <linux/genhd.h>
  19. #include <linux/idr.h>
  20. #include <linux/kthread.h>
  21. #include <linux/module.h>
  22. #include <linux/random.h>
  23. #include <linux/reboot.h>
  24. #include <linux/sysfs.h>
  25. MODULE_LICENSE("GPL");
  26. MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
  27. static const char bcache_magic[] = {
  28. 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
  29. 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
  30. };
  31. static const char invalid_uuid[] = {
  32. 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
  33. 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
  34. };
  35. static struct kobject *bcache_kobj;
  36. struct mutex bch_register_lock;
  37. LIST_HEAD(bch_cache_sets);
  38. static LIST_HEAD(uncached_devices);
  39. static int bcache_major;
  40. static DEFINE_IDA(bcache_device_idx);
  41. static wait_queue_head_t unregister_wait;
  42. struct workqueue_struct *bcache_wq;
  43. struct workqueue_struct *bch_journal_wq;
  44. #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
  45. /* limitation of partitions number on single bcache device */
  46. #define BCACHE_MINORS 128
  47. /* limitation of bcache devices number on single system */
  48. #define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
  49. /* Superblock */
  50. static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
  51. struct page **res)
  52. {
  53. const char *err;
  54. struct cache_sb *s;
  55. struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
  56. unsigned int i;
  57. if (!bh)
  58. return "IO error";
  59. s = (struct cache_sb *) bh->b_data;
  60. sb->offset = le64_to_cpu(s->offset);
  61. sb->version = le64_to_cpu(s->version);
  62. memcpy(sb->magic, s->magic, 16);
  63. memcpy(sb->uuid, s->uuid, 16);
  64. memcpy(sb->set_uuid, s->set_uuid, 16);
  65. memcpy(sb->label, s->label, SB_LABEL_SIZE);
  66. sb->flags = le64_to_cpu(s->flags);
  67. sb->seq = le64_to_cpu(s->seq);
  68. sb->last_mount = le32_to_cpu(s->last_mount);
  69. sb->first_bucket = le16_to_cpu(s->first_bucket);
  70. sb->keys = le16_to_cpu(s->keys);
  71. for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
  72. sb->d[i] = le64_to_cpu(s->d[i]);
  73. pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
  74. sb->version, sb->flags, sb->seq, sb->keys);
  75. err = "Not a bcache superblock";
  76. if (sb->offset != SB_SECTOR)
  77. goto err;
  78. if (memcmp(sb->magic, bcache_magic, 16))
  79. goto err;
  80. err = "Too many journal buckets";
  81. if (sb->keys > SB_JOURNAL_BUCKETS)
  82. goto err;
  83. err = "Bad checksum";
  84. if (s->csum != csum_set(s))
  85. goto err;
  86. err = "Bad UUID";
  87. if (bch_is_zero(sb->uuid, 16))
  88. goto err;
  89. sb->block_size = le16_to_cpu(s->block_size);
  90. err = "Superblock block size smaller than device block size";
  91. if (sb->block_size << 9 < bdev_logical_block_size(bdev))
  92. goto err;
  93. switch (sb->version) {
  94. case BCACHE_SB_VERSION_BDEV:
  95. sb->data_offset = BDEV_DATA_START_DEFAULT;
  96. break;
  97. case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
  98. sb->data_offset = le64_to_cpu(s->data_offset);
  99. err = "Bad data offset";
  100. if (sb->data_offset < BDEV_DATA_START_DEFAULT)
  101. goto err;
  102. break;
  103. case BCACHE_SB_VERSION_CDEV:
  104. case BCACHE_SB_VERSION_CDEV_WITH_UUID:
  105. sb->nbuckets = le64_to_cpu(s->nbuckets);
  106. sb->bucket_size = le16_to_cpu(s->bucket_size);
  107. sb->nr_in_set = le16_to_cpu(s->nr_in_set);
  108. sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
  109. err = "Too many buckets";
  110. if (sb->nbuckets > LONG_MAX)
  111. goto err;
  112. err = "Not enough buckets";
  113. if (sb->nbuckets < 1 << 7)
  114. goto err;
  115. err = "Bad block/bucket size";
  116. if (!is_power_of_2(sb->block_size) ||
  117. sb->block_size > PAGE_SECTORS ||
  118. !is_power_of_2(sb->bucket_size) ||
  119. sb->bucket_size < PAGE_SECTORS)
  120. goto err;
  121. err = "Invalid superblock: device too small";
  122. if (get_capacity(bdev->bd_disk) <
  123. sb->bucket_size * sb->nbuckets)
  124. goto err;
  125. err = "Bad UUID";
  126. if (bch_is_zero(sb->set_uuid, 16))
  127. goto err;
  128. err = "Bad cache device number in set";
  129. if (!sb->nr_in_set ||
  130. sb->nr_in_set <= sb->nr_this_dev ||
  131. sb->nr_in_set > MAX_CACHES_PER_SET)
  132. goto err;
  133. err = "Journal buckets not sequential";
  134. for (i = 0; i < sb->keys; i++)
  135. if (sb->d[i] != sb->first_bucket + i)
  136. goto err;
  137. err = "Too many journal buckets";
  138. if (sb->first_bucket + sb->keys > sb->nbuckets)
  139. goto err;
  140. err = "Invalid superblock: first bucket comes before end of super";
  141. if (sb->first_bucket * sb->bucket_size < 16)
  142. goto err;
  143. break;
  144. default:
  145. err = "Unsupported superblock version";
  146. goto err;
  147. }
  148. sb->last_mount = (u32)ktime_get_real_seconds();
  149. err = NULL;
  150. get_page(bh->b_page);
  151. *res = bh->b_page;
  152. err:
  153. put_bh(bh);
  154. return err;
  155. }
  156. static void write_bdev_super_endio(struct bio *bio)
  157. {
  158. struct cached_dev *dc = bio->bi_private;
  159. /* XXX: error checking */
  160. closure_put(&dc->sb_write);
  161. }
  162. static void __write_super(struct cache_sb *sb, struct bio *bio)
  163. {
  164. struct cache_sb *out = page_address(bio_first_page_all(bio));
  165. unsigned int i;
  166. bio->bi_iter.bi_sector = SB_SECTOR;
  167. bio->bi_iter.bi_size = SB_SIZE;
  168. bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
  169. bch_bio_map(bio, NULL);
  170. out->offset = cpu_to_le64(sb->offset);
  171. out->version = cpu_to_le64(sb->version);
  172. memcpy(out->uuid, sb->uuid, 16);
  173. memcpy(out->set_uuid, sb->set_uuid, 16);
  174. memcpy(out->label, sb->label, SB_LABEL_SIZE);
  175. out->flags = cpu_to_le64(sb->flags);
  176. out->seq = cpu_to_le64(sb->seq);
  177. out->last_mount = cpu_to_le32(sb->last_mount);
  178. out->first_bucket = cpu_to_le16(sb->first_bucket);
  179. out->keys = cpu_to_le16(sb->keys);
  180. for (i = 0; i < sb->keys; i++)
  181. out->d[i] = cpu_to_le64(sb->d[i]);
  182. out->csum = csum_set(out);
  183. pr_debug("ver %llu, flags %llu, seq %llu",
  184. sb->version, sb->flags, sb->seq);
  185. submit_bio(bio);
  186. }
  187. static void bch_write_bdev_super_unlock(struct closure *cl)
  188. {
  189. struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
  190. up(&dc->sb_write_mutex);
  191. }
  192. void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
  193. {
  194. struct closure *cl = &dc->sb_write;
  195. struct bio *bio = &dc->sb_bio;
  196. down(&dc->sb_write_mutex);
  197. closure_init(cl, parent);
  198. bio_reset(bio);
  199. bio_set_dev(bio, dc->bdev);
  200. bio->bi_end_io = write_bdev_super_endio;
  201. bio->bi_private = dc;
  202. closure_get(cl);
  203. /* I/O request sent to backing device */
  204. __write_super(&dc->sb, bio);
  205. closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
  206. }
  207. static void write_super_endio(struct bio *bio)
  208. {
  209. struct cache *ca = bio->bi_private;
  210. /* is_read = 0 */
  211. bch_count_io_errors(ca, bio->bi_status, 0,
  212. "writing superblock");
  213. closure_put(&ca->set->sb_write);
  214. }
  215. static void bcache_write_super_unlock(struct closure *cl)
  216. {
  217. struct cache_set *c = container_of(cl, struct cache_set, sb_write);
  218. up(&c->sb_write_mutex);
  219. }
  220. void bcache_write_super(struct cache_set *c)
  221. {
  222. struct closure *cl = &c->sb_write;
  223. struct cache *ca;
  224. unsigned int i;
  225. down(&c->sb_write_mutex);
  226. closure_init(cl, &c->cl);
  227. c->sb.seq++;
  228. for_each_cache(ca, c, i) {
  229. struct bio *bio = &ca->sb_bio;
  230. ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
  231. ca->sb.seq = c->sb.seq;
  232. ca->sb.last_mount = c->sb.last_mount;
  233. SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
  234. bio_reset(bio);
  235. bio_set_dev(bio, ca->bdev);
  236. bio->bi_end_io = write_super_endio;
  237. bio->bi_private = ca;
  238. closure_get(cl);
  239. __write_super(&ca->sb, bio);
  240. }
  241. closure_return_with_destructor(cl, bcache_write_super_unlock);
  242. }
  243. /* UUID io */
  244. static void uuid_endio(struct bio *bio)
  245. {
  246. struct closure *cl = bio->bi_private;
  247. struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
  248. cache_set_err_on(bio->bi_status, c, "accessing uuids");
  249. bch_bbio_free(bio, c);
  250. closure_put(cl);
  251. }
  252. static void uuid_io_unlock(struct closure *cl)
  253. {
  254. struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
  255. up(&c->uuid_write_mutex);
  256. }
  257. static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
  258. struct bkey *k, struct closure *parent)
  259. {
  260. struct closure *cl = &c->uuid_write;
  261. struct uuid_entry *u;
  262. unsigned int i;
  263. char buf[80];
  264. BUG_ON(!parent);
  265. down(&c->uuid_write_mutex);
  266. closure_init(cl, parent);
  267. for (i = 0; i < KEY_PTRS(k); i++) {
  268. struct bio *bio = bch_bbio_alloc(c);
  269. bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
  270. bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
  271. bio->bi_end_io = uuid_endio;
  272. bio->bi_private = cl;
  273. bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
  274. bch_bio_map(bio, c->uuids);
  275. bch_submit_bbio(bio, c, k, i);
  276. if (op != REQ_OP_WRITE)
  277. break;
  278. }
  279. bch_extent_to_text(buf, sizeof(buf), k);
  280. pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
  281. for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
  282. if (!bch_is_zero(u->uuid, 16))
  283. pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
  284. u - c->uuids, u->uuid, u->label,
  285. u->first_reg, u->last_reg, u->invalidated);
  286. closure_return_with_destructor(cl, uuid_io_unlock);
  287. }
  288. static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
  289. {
  290. struct bkey *k = &j->uuid_bucket;
  291. if (__bch_btree_ptr_invalid(c, k))
  292. return "bad uuid pointer";
  293. bkey_copy(&c->uuid_bucket, k);
  294. uuid_io(c, REQ_OP_READ, 0, k, cl);
  295. if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
  296. struct uuid_entry_v0 *u0 = (void *) c->uuids;
  297. struct uuid_entry *u1 = (void *) c->uuids;
  298. int i;
  299. closure_sync(cl);
  300. /*
  301. * Since the new uuid entry is bigger than the old, we have to
  302. * convert starting at the highest memory address and work down
  303. * in order to do it in place
  304. */
  305. for (i = c->nr_uuids - 1;
  306. i >= 0;
  307. --i) {
  308. memcpy(u1[i].uuid, u0[i].uuid, 16);
  309. memcpy(u1[i].label, u0[i].label, 32);
  310. u1[i].first_reg = u0[i].first_reg;
  311. u1[i].last_reg = u0[i].last_reg;
  312. u1[i].invalidated = u0[i].invalidated;
  313. u1[i].flags = 0;
  314. u1[i].sectors = 0;
  315. }
  316. }
  317. return NULL;
  318. }
  319. static int __uuid_write(struct cache_set *c)
  320. {
  321. BKEY_PADDED(key) k;
  322. struct closure cl;
  323. struct cache *ca;
  324. closure_init_stack(&cl);
  325. lockdep_assert_held(&bch_register_lock);
  326. if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
  327. return 1;
  328. SET_KEY_SIZE(&k.key, c->sb.bucket_size);
  329. uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
  330. closure_sync(&cl);
  331. /* Only one bucket used for uuid write */
  332. ca = PTR_CACHE(c, &k.key, 0);
  333. atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
  334. bkey_copy(&c->uuid_bucket, &k.key);
  335. bkey_put(c, &k.key);
  336. return 0;
  337. }
  338. int bch_uuid_write(struct cache_set *c)
  339. {
  340. int ret = __uuid_write(c);
  341. if (!ret)
  342. bch_journal_meta(c, NULL);
  343. return ret;
  344. }
  345. static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
  346. {
  347. struct uuid_entry *u;
  348. for (u = c->uuids;
  349. u < c->uuids + c->nr_uuids; u++)
  350. if (!memcmp(u->uuid, uuid, 16))
  351. return u;
  352. return NULL;
  353. }
  354. static struct uuid_entry *uuid_find_empty(struct cache_set *c)
  355. {
  356. static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
  357. return uuid_find(c, zero_uuid);
  358. }
/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *   8 bit gen
 *  16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets as are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */
  385. static void prio_endio(struct bio *bio)
  386. {
  387. struct cache *ca = bio->bi_private;
  388. cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
  389. bch_bbio_free(bio, ca->set);
  390. closure_put(&ca->prio);
  391. }
  392. static void prio_io(struct cache *ca, uint64_t bucket, int op,
  393. unsigned long op_flags)
  394. {
  395. struct closure *cl = &ca->prio;
  396. struct bio *bio = bch_bbio_alloc(ca->set);
  397. closure_init_stack(cl);
  398. bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
  399. bio_set_dev(bio, ca->bdev);
  400. bio->bi_iter.bi_size = bucket_bytes(ca);
  401. bio->bi_end_io = prio_endio;
  402. bio->bi_private = ca;
  403. bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
  404. bch_bio_map(bio, ca->disk_buckets);
  405. closure_bio_submit(ca->set, bio, &ca->prio);
  406. closure_sync(cl);
  407. }
  408. int bch_prio_write(struct cache *ca, bool wait)
  409. {
  410. int i;
  411. struct bucket *b;
  412. struct closure cl;
  413. pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu",
  414. fifo_used(&ca->free[RESERVE_PRIO]),
  415. fifo_used(&ca->free[RESERVE_NONE]),
  416. fifo_used(&ca->free_inc));
  417. /*
  418. * Pre-check if there are enough free buckets. In the non-blocking
  419. * scenario it's better to fail early rather than starting to allocate
  420. * buckets and do a cleanup later in case of failure.
  421. */
  422. if (!wait) {
  423. size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
  424. fifo_used(&ca->free[RESERVE_NONE]);
  425. if (prio_buckets(ca) > avail)
  426. return -ENOMEM;
  427. }
  428. closure_init_stack(&cl);
  429. lockdep_assert_held(&ca->set->bucket_lock);
  430. ca->disk_buckets->seq++;
  431. atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
  432. &ca->meta_sectors_written);
  433. for (i = prio_buckets(ca) - 1; i >= 0; --i) {
  434. long bucket;
  435. struct prio_set *p = ca->disk_buckets;
  436. struct bucket_disk *d = p->data;
  437. struct bucket_disk *end = d + prios_per_bucket(ca);
  438. for (b = ca->buckets + i * prios_per_bucket(ca);
  439. b < ca->buckets + ca->sb.nbuckets && d < end;
  440. b++, d++) {
  441. d->prio = cpu_to_le16(b->prio);
  442. d->gen = b->gen;
  443. }
  444. p->next_bucket = ca->prio_buckets[i + 1];
  445. p->magic = pset_magic(&ca->sb);
  446. p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
  447. bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
  448. BUG_ON(bucket == -1);
  449. mutex_unlock(&ca->set->bucket_lock);
  450. prio_io(ca, bucket, REQ_OP_WRITE, 0);
  451. mutex_lock(&ca->set->bucket_lock);
  452. ca->prio_buckets[i] = bucket;
  453. atomic_dec_bug(&ca->buckets[bucket].pin);
  454. }
  455. mutex_unlock(&ca->set->bucket_lock);
  456. bch_journal_meta(ca->set, &cl);
  457. closure_sync(&cl);
  458. mutex_lock(&ca->set->bucket_lock);
  459. /*
  460. * Don't want the old priorities to get garbage collected until after we
  461. * finish writing the new ones, and they're journalled
  462. */
  463. for (i = 0; i < prio_buckets(ca); i++) {
  464. if (ca->prio_last_buckets[i])
  465. __bch_bucket_free(ca,
  466. &ca->buckets[ca->prio_last_buckets[i]]);
  467. ca->prio_last_buckets[i] = ca->prio_buckets[i];
  468. }
  469. return 0;
  470. }
  471. static void prio_read(struct cache *ca, uint64_t bucket)
  472. {
  473. struct prio_set *p = ca->disk_buckets;
  474. struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
  475. struct bucket *b;
  476. unsigned int bucket_nr = 0;
  477. for (b = ca->buckets;
  478. b < ca->buckets + ca->sb.nbuckets;
  479. b++, d++) {
  480. if (d == end) {
  481. ca->prio_buckets[bucket_nr] = bucket;
  482. ca->prio_last_buckets[bucket_nr] = bucket;
  483. bucket_nr++;
  484. prio_io(ca, bucket, REQ_OP_READ, 0);
  485. if (p->csum !=
  486. bch_crc64(&p->magic, bucket_bytes(ca) - 8))
  487. pr_warn("bad csum reading priorities");
  488. if (p->magic != pset_magic(&ca->sb))
  489. pr_warn("bad magic reading priorities");
  490. bucket = p->next_bucket;
  491. d = p->data;
  492. }
  493. b->prio = le16_to_cpu(d->prio);
  494. b->gen = b->last_gc = d->gen;
  495. }
  496. }
  497. /* Bcache device */
  498. static int open_dev(struct block_device *b, fmode_t mode)
  499. {
  500. struct bcache_device *d = b->bd_disk->private_data;
  501. if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
  502. return -ENXIO;
  503. closure_get(&d->cl);
  504. return 0;
  505. }
  506. static void release_dev(struct gendisk *b, fmode_t mode)
  507. {
  508. struct bcache_device *d = b->private_data;
  509. closure_put(&d->cl);
  510. }
  511. static int ioctl_dev(struct block_device *b, fmode_t mode,
  512. unsigned int cmd, unsigned long arg)
  513. {
  514. struct bcache_device *d = b->bd_disk->private_data;
  515. return d->ioctl(d, mode, cmd, arg);
  516. }
  517. static const struct block_device_operations bcache_ops = {
  518. .open = open_dev,
  519. .release = release_dev,
  520. .ioctl = ioctl_dev,
  521. .owner = THIS_MODULE,
  522. };
  523. void bcache_device_stop(struct bcache_device *d)
  524. {
  525. if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
  526. closure_queue(&d->cl);
  527. }
  528. static void bcache_device_unlink(struct bcache_device *d)
  529. {
  530. lockdep_assert_held(&bch_register_lock);
  531. if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
  532. unsigned int i;
  533. struct cache *ca;
  534. sysfs_remove_link(&d->c->kobj, d->name);
  535. sysfs_remove_link(&d->kobj, "cache");
  536. for_each_cache(ca, d->c, i)
  537. bd_unlink_disk_holder(ca->bdev, d->disk);
  538. }
  539. }
  540. static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
  541. const char *name)
  542. {
  543. unsigned int i;
  544. struct cache *ca;
  545. for_each_cache(ca, d->c, i)
  546. bd_link_disk_holder(ca->bdev, d->disk);
  547. snprintf(d->name, BCACHEDEVNAME_SIZE,
  548. "%s%u", name, d->id);
  549. WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
  550. sysfs_create_link(&c->kobj, &d->kobj, d->name),
  551. "Couldn't create device <-> cache set symlinks");
  552. clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
  553. }
  554. static void bcache_device_detach(struct bcache_device *d)
  555. {
  556. lockdep_assert_held(&bch_register_lock);
  557. atomic_dec(&d->c->attached_dev_nr);
  558. if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
  559. struct uuid_entry *u = d->c->uuids + d->id;
  560. SET_UUID_FLASH_ONLY(u, 0);
  561. memcpy(u->uuid, invalid_uuid, 16);
  562. u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
  563. bch_uuid_write(d->c);
  564. }
  565. bcache_device_unlink(d);
  566. d->c->devices[d->id] = NULL;
  567. closure_put(&d->c->caching);
  568. d->c = NULL;
  569. }
  570. static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
  571. unsigned int id)
  572. {
  573. d->id = id;
  574. d->c = c;
  575. c->devices[id] = d;
  576. if (id >= c->devices_max_used)
  577. c->devices_max_used = id + 1;
  578. closure_get(&c->caching);
  579. }
  580. static inline int first_minor_to_idx(int first_minor)
  581. {
  582. return (first_minor/BCACHE_MINORS);
  583. }
  584. static inline int idx_to_first_minor(int idx)
  585. {
  586. return (idx * BCACHE_MINORS);
  587. }
  588. static void bcache_device_free(struct bcache_device *d)
  589. {
  590. struct gendisk *disk = d->disk;
  591. lockdep_assert_held(&bch_register_lock);
  592. if (disk)
  593. pr_info("%s stopped", disk->disk_name);
  594. else
  595. pr_err("bcache device (NULL gendisk) stopped");
  596. if (d->c)
  597. bcache_device_detach(d);
  598. if (disk) {
  599. bool disk_added = (disk->flags & GENHD_FL_UP) != 0;
  600. if (disk_added)
  601. del_gendisk(disk);
  602. if (disk->queue)
  603. blk_cleanup_queue(disk->queue);
  604. ida_simple_remove(&bcache_device_idx,
  605. first_minor_to_idx(disk->first_minor));
  606. if (disk_added)
  607. put_disk(disk);
  608. }
  609. bioset_exit(&d->bio_split);
  610. kvfree(d->full_dirty_stripes);
  611. kvfree(d->stripe_sectors_dirty);
  612. closure_debug_destroy(&d->cl);
  613. }
  614. static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
  615. sector_t sectors)
  616. {
  617. struct request_queue *q;
  618. const size_t max_stripes = min_t(size_t, INT_MAX,
  619. SIZE_MAX / sizeof(atomic_t));
  620. size_t n;
  621. int idx;
  622. if (!d->stripe_size)
  623. d->stripe_size = 1 << 31;
  624. d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
  625. if (!d->nr_stripes || d->nr_stripes > max_stripes) {
  626. pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
  627. (unsigned int)d->nr_stripes);
  628. return -ENOMEM;
  629. }
  630. n = d->nr_stripes * sizeof(atomic_t);
  631. d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
  632. if (!d->stripe_sectors_dirty)
  633. return -ENOMEM;
  634. n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
  635. d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
  636. if (!d->full_dirty_stripes)
  637. return -ENOMEM;
  638. idx = ida_simple_get(&bcache_device_idx, 0,
  639. BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
  640. if (idx < 0)
  641. return idx;
  642. if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
  643. BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
  644. goto err;
  645. d->disk = alloc_disk(BCACHE_MINORS);
  646. if (!d->disk)
  647. goto err;
  648. set_capacity(d->disk, sectors);
  649. snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
  650. d->disk->major = bcache_major;
  651. d->disk->first_minor = idx_to_first_minor(idx);
  652. d->disk->fops = &bcache_ops;
  653. d->disk->private_data = d;
  654. q = blk_alloc_queue(GFP_KERNEL);
  655. if (!q)
  656. return -ENOMEM;
  657. blk_queue_make_request(q, NULL);
  658. d->disk->queue = q;
  659. q->queuedata = d;
  660. q->backing_dev_info->congested_data = d;
  661. q->limits.max_hw_sectors = UINT_MAX;
  662. q->limits.max_sectors = UINT_MAX;
  663. q->limits.max_segment_size = UINT_MAX;
  664. q->limits.max_segments = BIO_MAX_PAGES;
  665. blk_queue_max_discard_sectors(q, UINT_MAX);
  666. q->limits.discard_granularity = 512;
  667. q->limits.io_min = block_size;
  668. q->limits.logical_block_size = block_size;
  669. q->limits.physical_block_size = block_size;
  670. blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
  671. blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
  672. blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
  673. blk_queue_write_cache(q, true, true);
  674. return 0;
  675. err:
  676. ida_simple_remove(&bcache_device_idx, idx);
  677. return -ENOMEM;
  678. }
  679. /* Cached device */
  680. static void calc_cached_dev_sectors(struct cache_set *c)
  681. {
  682. uint64_t sectors = 0;
  683. struct cached_dev *dc;
  684. list_for_each_entry(dc, &c->cached_devs, list)
  685. sectors += bdev_sectors(dc->bdev);
  686. c->cached_dev_sectors = sectors;
  687. }
  688. #define BACKING_DEV_OFFLINE_TIMEOUT 5
  689. static int cached_dev_status_update(void *arg)
  690. {
  691. struct cached_dev *dc = arg;
  692. struct request_queue *q;
  693. /*
  694. * If this delayed worker is stopping outside, directly quit here.
  695. * dc->io_disable might be set via sysfs interface, so check it
  696. * here too.
  697. */
  698. while (!kthread_should_stop() && !dc->io_disable) {
  699. q = bdev_get_queue(dc->bdev);
  700. if (blk_queue_dying(q))
  701. dc->offline_seconds++;
  702. else
  703. dc->offline_seconds = 0;
  704. if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
  705. pr_err("%s: device offline for %d seconds",
  706. dc->backing_dev_name,
  707. BACKING_DEV_OFFLINE_TIMEOUT);
  708. pr_err("%s: disable I/O request due to backing "
  709. "device offline", dc->disk.name);
  710. dc->io_disable = true;
  711. /* let others know earlier that io_disable is true */
  712. smp_mb();
  713. bcache_device_stop(&dc->disk);
  714. break;
  715. }
  716. schedule_timeout_interruptible(HZ);
  717. }
  718. wait_for_kthread_stop();
  719. return 0;
  720. }
  721. void bch_cached_dev_run(struct cached_dev *dc)
  722. {
  723. struct bcache_device *d = &dc->disk;
  724. char buf[SB_LABEL_SIZE + 1];
  725. char *env[] = {
  726. "DRIVER=bcache",
  727. kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
  728. NULL,
  729. NULL,
  730. };
  731. memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
  732. buf[SB_LABEL_SIZE] = '\0';
  733. env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
  734. if (atomic_xchg(&dc->running, 1)) {
  735. kfree(env[1]);
  736. kfree(env[2]);
  737. return;
  738. }
  739. if (!d->c &&
  740. BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
  741. struct closure cl;
  742. closure_init_stack(&cl);
  743. SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
  744. bch_write_bdev_super(dc, &cl);
  745. closure_sync(&cl);
  746. }
  747. add_disk(d->disk);
  748. bd_link_disk_holder(dc->bdev, dc->disk.disk);
  749. /*
  750. * won't show up in the uevent file, use udevadm monitor -e instead
  751. * only class / kset properties are persistent
  752. */
  753. kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
  754. kfree(env[1]);
  755. kfree(env[2]);
  756. if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
  757. sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
  758. pr_debug("error creating sysfs link");
  759. dc->status_update_thread = kthread_run(cached_dev_status_update,
  760. dc, "bcache_status_update");
  761. if (IS_ERR(dc->status_update_thread)) {
  762. pr_warn("failed to create bcache_status_update kthread, "
  763. "continue to run without monitoring backing "
  764. "device status");
  765. }
  766. }
  767. /*
  768. * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
  769. * work dc->writeback_rate_update is running. Wait until the routine
  770. * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
  771. * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
  772. * seconds, give up waiting here and continue to cancel it too.
  773. */
  774. static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
  775. {
  776. int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
  777. do {
  778. if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
  779. &dc->disk.flags))
  780. break;
  781. time_out--;
  782. schedule_timeout_interruptible(1);
  783. } while (time_out > 0);
  784. if (time_out == 0)
  785. pr_warn("give up waiting for dc->writeback_write_update to quit");
  786. cancel_delayed_work_sync(&dc->writeback_rate_update);
  787. }
  788. static void cached_dev_detach_finish(struct work_struct *w)
  789. {
  790. struct cached_dev *dc = container_of(w, struct cached_dev, detach);
  791. struct closure cl;
  792. closure_init_stack(&cl);
  793. BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
  794. BUG_ON(refcount_read(&dc->count));
  795. mutex_lock(&bch_register_lock);
  796. if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
  797. cancel_writeback_rate_update_dwork(dc);
  798. if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
  799. kthread_stop(dc->writeback_thread);
  800. dc->writeback_thread = NULL;
  801. }
  802. memset(&dc->sb.set_uuid, 0, 16);
  803. SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
  804. bch_write_bdev_super(dc, &cl);
  805. closure_sync(&cl);
  806. calc_cached_dev_sectors(dc->disk.c);
  807. bcache_device_detach(&dc->disk);
  808. list_move(&dc->list, &uncached_devices);
  809. clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
  810. clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
  811. mutex_unlock(&bch_register_lock);
  812. pr_info("Caching disabled for %s", dc->backing_dev_name);
  813. /* Drop ref we took in cached_dev_detach() */
  814. closure_put(&dc->disk.cl);
  815. }
  816. void bch_cached_dev_detach(struct cached_dev *dc)
  817. {
  818. lockdep_assert_held(&bch_register_lock);
  819. if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
  820. return;
  821. if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
  822. return;
  823. /*
  824. * Block the device from being closed and freed until we're finished
  825. * detaching
  826. */
  827. closure_get(&dc->disk.cl);
  828. bch_writeback_queue(dc);
  829. cached_dev_put(dc);
  830. }
  831. int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
  832. uint8_t *set_uuid)
  833. {
  834. uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
  835. struct uuid_entry *u;
  836. struct cached_dev *exist_dc, *t;
  837. if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
  838. (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
  839. return -ENOENT;
  840. if (dc->disk.c) {
  841. pr_err("Can't attach %s: already attached",
  842. dc->backing_dev_name);
  843. return -EINVAL;
  844. }
  845. if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
  846. pr_err("Can't attach %s: shutting down",
  847. dc->backing_dev_name);
  848. return -EINVAL;
  849. }
  850. if (dc->sb.block_size < c->sb.block_size) {
  851. /* Will die */
  852. pr_err("Couldn't attach %s: block size less than set's block size",
  853. dc->backing_dev_name);
  854. return -EINVAL;
  855. }
  856. /* Check whether already attached */
  857. list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
  858. if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
  859. pr_err("Tried to attach %s but duplicate UUID already attached",
  860. dc->backing_dev_name);
  861. return -EINVAL;
  862. }
  863. }
  864. u = uuid_find(c, dc->sb.uuid);
  865. if (u &&
  866. (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
  867. BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
  868. memcpy(u->uuid, invalid_uuid, 16);
  869. u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
  870. u = NULL;
  871. }
  872. if (!u) {
  873. if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
  874. pr_err("Couldn't find uuid for %s in set",
  875. dc->backing_dev_name);
  876. return -ENOENT;
  877. }
  878. u = uuid_find_empty(c);
  879. if (!u) {
  880. pr_err("Not caching %s, no room for UUID",
  881. dc->backing_dev_name);
  882. return -EINVAL;
  883. }
  884. }
  885. /*
  886. * Deadlocks since we're called via sysfs...
  887. * sysfs_remove_file(&dc->kobj, &sysfs_attach);
  888. */
  889. if (bch_is_zero(u->uuid, 16)) {
  890. struct closure cl;
  891. closure_init_stack(&cl);
  892. memcpy(u->uuid, dc->sb.uuid, 16);
  893. memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
  894. u->first_reg = u->last_reg = rtime;
  895. bch_uuid_write(c);
  896. memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
  897. SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
  898. bch_write_bdev_super(dc, &cl);
  899. closure_sync(&cl);
  900. } else {
  901. u->last_reg = rtime;
  902. bch_uuid_write(c);
  903. }
  904. bcache_device_attach(&dc->disk, c, u - c->uuids);
  905. list_move(&dc->list, &c->cached_devs);
  906. calc_cached_dev_sectors(c);
  907. /*
  908. * dc->c must be set before dc->count != 0 - paired with the mb in
  909. * cached_dev_get()
  910. */
  911. smp_wmb();
  912. refcount_set(&dc->count, 1);
  913. /* Block writeback thread, but spawn it */
  914. down_write(&dc->writeback_lock);
  915. if (bch_cached_dev_writeback_start(dc)) {
  916. up_write(&dc->writeback_lock);
  917. return -ENOMEM;
  918. }
  919. if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
  920. atomic_set(&dc->has_dirty, 1);
  921. bch_writeback_queue(dc);
  922. }
  923. bch_sectors_dirty_init(&dc->disk);
  924. bch_cached_dev_run(dc);
  925. bcache_device_link(&dc->disk, c, "bdev");
  926. atomic_inc(&c->attached_dev_nr);
  927. /* Allow the writeback thread to proceed */
  928. up_write(&dc->writeback_lock);
  929. pr_info("Caching %s as %s on set %pU",
  930. dc->backing_dev_name,
  931. dc->disk.disk->disk_name,
  932. dc->disk.c->sb.set_uuid);
  933. return 0;
  934. }
  935. void bch_cached_dev_release(struct kobject *kobj)
  936. {
  937. struct cached_dev *dc = container_of(kobj, struct cached_dev,
  938. disk.kobj);
  939. kfree(dc);
  940. module_put(THIS_MODULE);
  941. }
  942. static void cached_dev_free(struct closure *cl)
  943. {
  944. struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
  945. if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
  946. cancel_writeback_rate_update_dwork(dc);
  947. if (!IS_ERR_OR_NULL(dc->writeback_thread))
  948. kthread_stop(dc->writeback_thread);
  949. if (!IS_ERR_OR_NULL(dc->status_update_thread))
  950. kthread_stop(dc->status_update_thread);
  951. mutex_lock(&bch_register_lock);
  952. if (atomic_read(&dc->running))
  953. bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
  954. bcache_device_free(&dc->disk);
  955. list_del(&dc->list);
  956. mutex_unlock(&bch_register_lock);
  957. if (dc->sb_bio.bi_inline_vecs[0].bv_page)
  958. put_page(bio_first_page_all(&dc->sb_bio));
  959. if (!IS_ERR_OR_NULL(dc->bdev))
  960. blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
  961. wake_up(&unregister_wait);
  962. kobject_put(&dc->disk.kobj);
  963. }
  964. static void cached_dev_flush(struct closure *cl)
  965. {
  966. struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
  967. struct bcache_device *d = &dc->disk;
  968. mutex_lock(&bch_register_lock);
  969. bcache_device_unlink(d);
  970. mutex_unlock(&bch_register_lock);
  971. bch_cache_accounting_destroy(&dc->accounting);
  972. kobject_del(&d->kobj);
  973. continue_at(cl, cached_dev_free, system_wq);
  974. }
  975. static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
  976. {
  977. int ret;
  978. struct io *io;
  979. struct request_queue *q = bdev_get_queue(dc->bdev);
  980. __module_get(THIS_MODULE);
  981. INIT_LIST_HEAD(&dc->list);
  982. closure_init(&dc->disk.cl, NULL);
  983. set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
  984. kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
  985. INIT_WORK(&dc->detach, cached_dev_detach_finish);
  986. sema_init(&dc->sb_write_mutex, 1);
  987. INIT_LIST_HEAD(&dc->io_lru);
  988. spin_lock_init(&dc->io_lock);
  989. bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
  990. dc->sequential_cutoff = 4 << 20;
  991. for (io = dc->io; io < dc->io + RECENT_IO; io++) {
  992. list_add(&io->lru, &dc->io_lru);
  993. hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
  994. }
  995. dc->disk.stripe_size = q->limits.io_opt >> 9;
  996. if (dc->disk.stripe_size)
  997. dc->partial_stripes_expensive =
  998. q->limits.raid_partial_stripes_expensive;
  999. ret = bcache_device_init(&dc->disk, block_size,
  1000. dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
  1001. if (ret)
  1002. return ret;
  1003. dc->disk.disk->queue->backing_dev_info->ra_pages =
  1004. max(dc->disk.disk->queue->backing_dev_info->ra_pages,
  1005. q->backing_dev_info->ra_pages);
  1006. atomic_set(&dc->io_errors, 0);
  1007. dc->io_disable = false;
  1008. dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
  1009. /* default to auto */
  1010. dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
  1011. bch_cached_dev_request_init(dc);
  1012. bch_cached_dev_writeback_init(dc);
  1013. return 0;
  1014. }
  1015. /* Cached device - bcache superblock */
  1016. static void register_bdev(struct cache_sb *sb, struct page *sb_page,
  1017. struct block_device *bdev,
  1018. struct cached_dev *dc)
  1019. {
  1020. const char *err = "cannot allocate memory";
  1021. struct cache_set *c;
  1022. bdevname(bdev, dc->backing_dev_name);
  1023. memcpy(&dc->sb, sb, sizeof(struct cache_sb));
  1024. dc->bdev = bdev;
  1025. dc->bdev->bd_holder = dc;
  1026. bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
  1027. bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
  1028. get_page(sb_page);
  1029. if (cached_dev_init(dc, sb->block_size << 9))
  1030. goto err;
  1031. err = "error creating kobject";
  1032. if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
  1033. "bcache"))
  1034. goto err;
  1035. if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
  1036. goto err;
  1037. pr_info("registered backing device %s", dc->backing_dev_name);
  1038. list_add(&dc->list, &uncached_devices);
  1039. /* attach to a matched cache set if it exists */
  1040. list_for_each_entry(c, &bch_cache_sets, list)
  1041. bch_cached_dev_attach(dc, c, NULL);
  1042. if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
  1043. BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
  1044. bch_cached_dev_run(dc);
  1045. return;
  1046. err:
  1047. pr_notice("error %s: %s", dc->backing_dev_name, err);
  1048. bcache_device_stop(&dc->disk);
  1049. }
  1050. /* Flash only volumes */
  1051. void bch_flash_dev_release(struct kobject *kobj)
  1052. {
  1053. struct bcache_device *d = container_of(kobj, struct bcache_device,
  1054. kobj);
  1055. kfree(d);
  1056. }
  1057. static void flash_dev_free(struct closure *cl)
  1058. {
  1059. struct bcache_device *d = container_of(cl, struct bcache_device, cl);
  1060. mutex_lock(&bch_register_lock);
  1061. atomic_long_sub(bcache_dev_sectors_dirty(d),
  1062. &d->c->flash_dev_dirty_sectors);
  1063. bcache_device_free(d);
  1064. mutex_unlock(&bch_register_lock);
  1065. kobject_put(&d->kobj);
  1066. }
  1067. static void flash_dev_flush(struct closure *cl)
  1068. {
  1069. struct bcache_device *d = container_of(cl, struct bcache_device, cl);
  1070. mutex_lock(&bch_register_lock);
  1071. bcache_device_unlink(d);
  1072. mutex_unlock(&bch_register_lock);
  1073. kobject_del(&d->kobj);
  1074. continue_at(cl, flash_dev_free, system_wq);
  1075. }
  1076. static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
  1077. {
  1078. struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
  1079. GFP_KERNEL);
  1080. if (!d)
  1081. return -ENOMEM;
  1082. closure_init(&d->cl, NULL);
  1083. set_closure_fn(&d->cl, flash_dev_flush, system_wq);
  1084. kobject_init(&d->kobj, &bch_flash_dev_ktype);
  1085. if (bcache_device_init(d, block_bytes(c), u->sectors))
  1086. goto err;
  1087. bcache_device_attach(d, c, u - c->uuids);
  1088. bch_sectors_dirty_init(d);
  1089. bch_flash_dev_request_init(d);
  1090. add_disk(d->disk);
  1091. if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
  1092. goto err;
  1093. bcache_device_link(d, c, "volume");
  1094. return 0;
  1095. err:
  1096. kobject_put(&d->kobj);
  1097. return -ENOMEM;
  1098. }
  1099. static int flash_devs_run(struct cache_set *c)
  1100. {
  1101. int ret = 0;
  1102. struct uuid_entry *u;
  1103. for (u = c->uuids;
  1104. u < c->uuids + c->nr_uuids && !ret;
  1105. u++)
  1106. if (UUID_FLASH_ONLY(u))
  1107. ret = flash_dev_run(c, u);
  1108. return ret;
  1109. }
  1110. int bch_flash_dev_create(struct cache_set *c, uint64_t size)
  1111. {
  1112. struct uuid_entry *u;
  1113. if (test_bit(CACHE_SET_STOPPING, &c->flags))
  1114. return -EINTR;
  1115. if (!test_bit(CACHE_SET_RUNNING, &c->flags))
  1116. return -EPERM;
  1117. u = uuid_find_empty(c);
  1118. if (!u) {
  1119. pr_err("Can't create volume, no room for UUID");
  1120. return -EINVAL;
  1121. }
  1122. get_random_bytes(u->uuid, 16);
  1123. memset(u->label, 0, 32);
  1124. u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
  1125. SET_UUID_FLASH_ONLY(u, 1);
  1126. u->sectors = size >> 9;
  1127. bch_uuid_write(c);
  1128. return flash_dev_run(c, u);
  1129. }
  1130. bool bch_cached_dev_error(struct cached_dev *dc)
  1131. {
  1132. if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
  1133. return false;
  1134. dc->io_disable = true;
  1135. /* make others know io_disable is true earlier */
  1136. smp_mb();
  1137. pr_err("stop %s: too many IO errors on backing device %s\n",
  1138. dc->disk.disk->disk_name, dc->backing_dev_name);
  1139. bcache_device_stop(&dc->disk);
  1140. return true;
  1141. }
  1142. /* Cache set */
  1143. __printf(2, 3)
  1144. bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
  1145. {
  1146. va_list args;
  1147. if (c->on_error != ON_ERROR_PANIC &&
  1148. test_bit(CACHE_SET_STOPPING, &c->flags))
  1149. return false;
  1150. if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
  1151. pr_info("CACHE_SET_IO_DISABLE already set");
  1152. /*
  1153. * XXX: we can be called from atomic context
  1154. * acquire_console_sem();
  1155. */
  1156. pr_err("bcache: error on %pU: ", c->sb.set_uuid);
  1157. va_start(args, fmt);
  1158. vprintk(fmt, args);
  1159. va_end(args);
  1160. pr_err(", disabling caching\n");
  1161. if (c->on_error == ON_ERROR_PANIC)
  1162. panic("panic forced after error\n");
  1163. bch_cache_set_unregister(c);
  1164. return true;
  1165. }
  1166. void bch_cache_set_release(struct kobject *kobj)
  1167. {
  1168. struct cache_set *c = container_of(kobj, struct cache_set, kobj);
  1169. kfree(c);
  1170. module_put(THIS_MODULE);
  1171. }
  1172. static void cache_set_free(struct closure *cl)
  1173. {
  1174. struct cache_set *c = container_of(cl, struct cache_set, cl);
  1175. struct cache *ca;
  1176. unsigned int i;
  1177. debugfs_remove(c->debug);
  1178. bch_open_buckets_free(c);
  1179. bch_btree_cache_free(c);
  1180. bch_journal_free(c);
  1181. mutex_lock(&bch_register_lock);
  1182. for_each_cache(ca, c, i)
  1183. if (ca) {
  1184. ca->set = NULL;
  1185. c->cache[ca->sb.nr_this_dev] = NULL;
  1186. kobject_put(&ca->kobj);
  1187. }
  1188. bch_bset_sort_state_free(&c->sort);
  1189. free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
  1190. if (c->moving_gc_wq)
  1191. destroy_workqueue(c->moving_gc_wq);
  1192. bioset_exit(&c->bio_split);
  1193. mempool_exit(&c->fill_iter);
  1194. mempool_exit(&c->bio_meta);
  1195. mempool_exit(&c->search);
  1196. kfree(c->devices);
  1197. list_del(&c->list);
  1198. mutex_unlock(&bch_register_lock);
  1199. pr_info("Cache set %pU unregistered", c->sb.set_uuid);
  1200. wake_up(&unregister_wait);
  1201. closure_debug_destroy(&c->cl);
  1202. kobject_put(&c->kobj);
  1203. }
  1204. static void cache_set_flush(struct closure *cl)
  1205. {
  1206. struct cache_set *c = container_of(cl, struct cache_set, caching);
  1207. struct cache *ca;
  1208. struct btree *b;
  1209. unsigned int i;
  1210. bch_cache_accounting_destroy(&c->accounting);
  1211. kobject_put(&c->internal);
  1212. kobject_del(&c->kobj);
  1213. if (!IS_ERR_OR_NULL(c->gc_thread))
  1214. kthread_stop(c->gc_thread);
  1215. if (!IS_ERR_OR_NULL(c->root))
  1216. list_add(&c->root->list, &c->btree_cache);
  1217. /* Should skip this if we're unregistering because of an error */
  1218. list_for_each_entry(b, &c->btree_cache, list) {
  1219. mutex_lock(&b->write_lock);
  1220. if (btree_node_dirty(b))
  1221. __bch_btree_node_write(b, NULL);
  1222. mutex_unlock(&b->write_lock);
  1223. }
  1224. for_each_cache(ca, c, i)
  1225. if (ca->alloc_thread)
  1226. kthread_stop(ca->alloc_thread);
  1227. if (c->journal.cur) {
  1228. cancel_delayed_work_sync(&c->journal.work);
  1229. /* flush last journal entry if needed */
  1230. c->journal.work.work.func(&c->journal.work.work);
  1231. }
  1232. closure_return(cl);
  1233. }
  1234. /*
  1235. * This function is only called when CACHE_SET_IO_DISABLE is set, which means
  1236. * cache set is unregistering due to too many I/O errors. In this condition,
  1237. * the bcache device might be stopped, it depends on stop_when_cache_set_failed
  1238. * value and whether the broken cache has dirty data:
  1239. *
  1240. * dc->stop_when_cache_set_failed dc->has_dirty stop bcache device
  1241. * BCH_CACHED_STOP_AUTO 0 NO
  1242. * BCH_CACHED_STOP_AUTO 1 YES
  1243. * BCH_CACHED_DEV_STOP_ALWAYS 0 YES
  1244. * BCH_CACHED_DEV_STOP_ALWAYS 1 YES
  1245. *
  1246. * The expected behavior is, if stop_when_cache_set_failed is configured to
  1247. * "auto" via sysfs interface, the bcache device will not be stopped if the
  1248. * backing device is clean on the broken cache device.
  1249. */
  1250. static void conditional_stop_bcache_device(struct cache_set *c,
  1251. struct bcache_device *d,
  1252. struct cached_dev *dc)
  1253. {
  1254. if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
  1255. pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
  1256. d->disk->disk_name, c->sb.set_uuid);
  1257. bcache_device_stop(d);
  1258. } else if (atomic_read(&dc->has_dirty)) {
  1259. /*
  1260. * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
  1261. * and dc->has_dirty == 1
  1262. */
  1263. pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
  1264. d->disk->disk_name);
  1265. /*
  1266. * There might be a small time gap that cache set is
  1267. * released but bcache device is not. Inside this time
  1268. * gap, regular I/O requests will directly go into
  1269. * backing device as no cache set attached to. This
  1270. * behavior may also introduce potential inconsistence
  1271. * data in writeback mode while cache is dirty.
  1272. * Therefore before calling bcache_device_stop() due
  1273. * to a broken cache device, dc->io_disable should be
  1274. * explicitly set to true.
  1275. */
  1276. dc->io_disable = true;
  1277. /* make others know io_disable is true earlier */
  1278. smp_mb();
  1279. bcache_device_stop(d);
  1280. } else {
  1281. /*
  1282. * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
  1283. * and dc->has_dirty == 0
  1284. */
  1285. pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
  1286. d->disk->disk_name);
  1287. }
  1288. }
  1289. static void __cache_set_unregister(struct closure *cl)
  1290. {
  1291. struct cache_set *c = container_of(cl, struct cache_set, caching);
  1292. struct cached_dev *dc;
  1293. struct bcache_device *d;
  1294. size_t i;
  1295. mutex_lock(&bch_register_lock);
  1296. for (i = 0; i < c->devices_max_used; i++) {
  1297. d = c->devices[i];
  1298. if (!d)
  1299. continue;
  1300. if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
  1301. test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
  1302. dc = container_of(d, struct cached_dev, disk);
  1303. bch_cached_dev_detach(dc);
  1304. if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
  1305. conditional_stop_bcache_device(c, d, dc);
  1306. } else {
  1307. bcache_device_stop(d);
  1308. }
  1309. }
  1310. mutex_unlock(&bch_register_lock);
  1311. continue_at(cl, cache_set_flush, system_wq);
  1312. }
  1313. void bch_cache_set_stop(struct cache_set *c)
  1314. {
  1315. if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
  1316. closure_queue(&c->caching);
  1317. }
  1318. void bch_cache_set_unregister(struct cache_set *c)
  1319. {
  1320. set_bit(CACHE_SET_UNREGISTERING, &c->flags);
  1321. bch_cache_set_stop(c);
  1322. }
  1323. #define alloc_bucket_pages(gfp, c) \
  1324. ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
  1325. struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
  1326. {
  1327. int iter_size;
  1328. struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
  1329. if (!c)
  1330. return NULL;
  1331. __module_get(THIS_MODULE);
  1332. closure_init(&c->cl, NULL);
  1333. set_closure_fn(&c->cl, cache_set_free, system_wq);
  1334. closure_init(&c->caching, &c->cl);
  1335. set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
  1336. /* Maybe create continue_at_noreturn() and use it here? */
  1337. closure_set_stopped(&c->cl);
  1338. closure_put(&c->cl);
  1339. kobject_init(&c->kobj, &bch_cache_set_ktype);
  1340. kobject_init(&c->internal, &bch_cache_set_internal_ktype);
  1341. bch_cache_accounting_init(&c->accounting, &c->cl);
  1342. memcpy(c->sb.set_uuid, sb->set_uuid, 16);
  1343. c->sb.block_size = sb->block_size;
  1344. c->sb.bucket_size = sb->bucket_size;
  1345. c->sb.nr_in_set = sb->nr_in_set;
  1346. c->sb.last_mount = sb->last_mount;
  1347. c->bucket_bits = ilog2(sb->bucket_size);
  1348. c->block_bits = ilog2(sb->block_size);
  1349. c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
  1350. c->devices_max_used = 0;
  1351. atomic_set(&c->attached_dev_nr, 0);
  1352. c->btree_pages = bucket_pages(c);
  1353. if (c->btree_pages > BTREE_MAX_PAGES)
  1354. c->btree_pages = max_t(int, c->btree_pages / 4,
  1355. BTREE_MAX_PAGES);
  1356. sema_init(&c->sb_write_mutex, 1);
  1357. mutex_init(&c->bucket_lock);
  1358. init_waitqueue_head(&c->btree_cache_wait);
  1359. spin_lock_init(&c->btree_cannibalize_lock);
  1360. init_waitqueue_head(&c->bucket_wait);
  1361. init_waitqueue_head(&c->gc_wait);
  1362. sema_init(&c->uuid_write_mutex, 1);
  1363. spin_lock_init(&c->btree_gc_time.lock);
  1364. spin_lock_init(&c->btree_split_time.lock);
  1365. spin_lock_init(&c->btree_read_time.lock);
  1366. bch_moving_init_cache_set(c);
  1367. INIT_LIST_HEAD(&c->list);
  1368. INIT_LIST_HEAD(&c->cached_devs);
  1369. INIT_LIST_HEAD(&c->btree_cache);
  1370. INIT_LIST_HEAD(&c->btree_cache_freeable);
  1371. INIT_LIST_HEAD(&c->btree_cache_freed);
  1372. INIT_LIST_HEAD(&c->data_buckets);
  1373. iter_size = (sb->bucket_size / sb->block_size + 1) *
  1374. sizeof(struct btree_iter_set);
  1375. if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
  1376. mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
  1377. mempool_init_kmalloc_pool(&c->bio_meta, 2,
  1378. sizeof(struct bbio) + sizeof(struct bio_vec) *
  1379. bucket_pages(c)) ||
  1380. mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
  1381. bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
  1382. BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
  1383. !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
  1384. !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
  1385. WQ_MEM_RECLAIM, 0)) ||
  1386. bch_journal_alloc(c) ||
  1387. bch_btree_cache_alloc(c) ||
  1388. bch_open_buckets_alloc(c) ||
  1389. bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
  1390. goto err;
  1391. c->congested_read_threshold_us = 2000;
  1392. c->congested_write_threshold_us = 20000;
  1393. c->error_limit = DEFAULT_IO_ERROR_LIMIT;
  1394. WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
  1395. return c;
  1396. err:
  1397. bch_cache_set_unregister(c);
  1398. return NULL;
  1399. }
  1400. static int run_cache_set(struct cache_set *c)
  1401. {
  1402. const char *err = "cannot allocate memory";
  1403. struct cached_dev *dc, *t;
  1404. struct cache *ca;
  1405. struct closure cl;
  1406. unsigned int i;
  1407. LIST_HEAD(journal);
  1408. struct journal_replay *l;
  1409. closure_init_stack(&cl);
  1410. for_each_cache(ca, c, i)
  1411. c->nbuckets += ca->sb.nbuckets;
  1412. set_gc_sectors(c);
  1413. if (CACHE_SYNC(&c->sb)) {
  1414. struct bkey *k;
  1415. struct jset *j;
  1416. err = "cannot allocate memory for journal";
  1417. if (bch_journal_read(c, &journal))
  1418. goto err;
  1419. pr_debug("btree_journal_read() done");
  1420. err = "no journal entries found";
  1421. if (list_empty(&journal))
  1422. goto err;
  1423. j = &list_entry(journal.prev, struct journal_replay, list)->j;
  1424. err = "IO error reading priorities";
  1425. for_each_cache(ca, c, i)
  1426. prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
  1427. /*
  1428. * If prio_read() fails it'll call cache_set_error and we'll
  1429. * tear everything down right away, but if we perhaps checked
  1430. * sooner we could avoid journal replay.
  1431. */
  1432. k = &j->btree_root;
  1433. err = "bad btree root";
  1434. if (__bch_btree_ptr_invalid(c, k))
  1435. goto err;
  1436. err = "error reading btree root";
  1437. c->root = bch_btree_node_get(c, NULL, k,
  1438. j->btree_level,
  1439. true, NULL);
  1440. if (IS_ERR_OR_NULL(c->root))
  1441. goto err;
  1442. list_del_init(&c->root->list);
  1443. rw_unlock(true, c->root);
  1444. err = uuid_read(c, j, &cl);
  1445. if (err)
  1446. goto err;
  1447. err = "error in recovery";
  1448. if (bch_btree_check(c))
  1449. goto err;
  1450. bch_journal_mark(c, &journal);
  1451. bch_initial_gc_finish(c);
  1452. pr_debug("btree_check() done");
  1453. /*
  1454. * bcache_journal_next() can't happen sooner, or
  1455. * btree_gc_finish() will give spurious errors about last_gc >
  1456. * gc_gen - this is a hack but oh well.
  1457. */
  1458. bch_journal_next(&c->journal);
  1459. err = "error starting allocator thread";
  1460. for_each_cache(ca, c, i)
  1461. if (bch_cache_allocator_start(ca))
  1462. goto err;
  1463. /*
  1464. * First place it's safe to allocate: btree_check() and
  1465. * btree_gc_finish() have to run before we have buckets to
  1466. * allocate, and bch_bucket_alloc_set() might cause a journal
  1467. * entry to be written so bcache_journal_next() has to be called
  1468. * first.
  1469. *
  1470. * If the uuids were in the old format we have to rewrite them
  1471. * before the next journal entry is written:
  1472. */
  1473. if (j->version < BCACHE_JSET_VERSION_UUID)
  1474. __uuid_write(c);
  1475. err = "bcache: replay journal failed";
  1476. if (bch_journal_replay(c, &journal))
  1477. goto err;
  1478. } else {
  1479. pr_notice("invalidating existing data");
  1480. for_each_cache(ca, c, i) {
  1481. unsigned int j;
  1482. ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
  1483. 2, SB_JOURNAL_BUCKETS);
  1484. for (j = 0; j < ca->sb.keys; j++)
  1485. ca->sb.d[j] = ca->sb.first_bucket + j;
  1486. }
  1487. bch_initial_gc_finish(c);
  1488. err = "error starting allocator thread";
  1489. for_each_cache(ca, c, i)
  1490. if (bch_cache_allocator_start(ca))
  1491. goto err;
  1492. mutex_lock(&c->bucket_lock);
  1493. for_each_cache(ca, c, i)
  1494. bch_prio_write(ca, true);
  1495. mutex_unlock(&c->bucket_lock);
  1496. err = "cannot allocate new UUID bucket";
  1497. if (__uuid_write(c))
  1498. goto err;
  1499. err = "cannot allocate new btree root";
  1500. c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
  1501. if (IS_ERR_OR_NULL(c->root))
  1502. goto err;
  1503. mutex_lock(&c->root->write_lock);
  1504. bkey_copy_key(&c->root->key, &MAX_KEY);
  1505. bch_btree_node_write(c->root, &cl);
  1506. mutex_unlock(&c->root->write_lock);
  1507. bch_btree_set_root(c->root);
  1508. rw_unlock(true, c->root);
  1509. /*
  1510. * We don't want to write the first journal entry until
  1511. * everything is set up - fortunately journal entries won't be
  1512. * written until the SET_CACHE_SYNC() here:
  1513. */
  1514. SET_CACHE_SYNC(&c->sb, true);
  1515. bch_journal_next(&c->journal);
  1516. bch_journal_meta(c, &cl);
  1517. }
  1518. err = "error starting gc thread";
  1519. if (bch_gc_thread_start(c))
  1520. goto err;
  1521. closure_sync(&cl);
  1522. c->sb.last_mount = (u32)ktime_get_real_seconds();
  1523. bcache_write_super(c);
  1524. list_for_each_entry_safe(dc, t, &uncached_devices, list)
  1525. bch_cached_dev_attach(dc, c, NULL);
  1526. flash_devs_run(c);
  1527. set_bit(CACHE_SET_RUNNING, &c->flags);
  1528. return 0;
  1529. err:
  1530. while (!list_empty(&journal)) {
  1531. l = list_first_entry(&journal, struct journal_replay, list);
  1532. list_del(&l->list);
  1533. kfree(l);
  1534. }
  1535. closure_sync(&cl);
  1536. /* XXX: test this, it's broken */
  1537. bch_cache_set_error(c, "%s", err);
  1538. return -EIO;
  1539. }
  1540. static bool can_attach_cache(struct cache *ca, struct cache_set *c)
  1541. {
  1542. return ca->sb.block_size == c->sb.block_size &&
  1543. ca->sb.bucket_size == c->sb.bucket_size &&
  1544. ca->sb.nr_in_set == c->sb.nr_in_set;
  1545. }
  1546. static const char *register_cache_set(struct cache *ca)
  1547. {
  1548. char buf[12];
  1549. const char *err = "cannot allocate memory";
  1550. struct cache_set *c;
  1551. list_for_each_entry(c, &bch_cache_sets, list)
  1552. if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
  1553. if (c->cache[ca->sb.nr_this_dev])
  1554. return "duplicate cache set member";
  1555. if (!can_attach_cache(ca, c))
  1556. return "cache sb does not match set";
  1557. if (!CACHE_SYNC(&ca->sb))
  1558. SET_CACHE_SYNC(&c->sb, false);
  1559. goto found;
  1560. }
  1561. c = bch_cache_set_alloc(&ca->sb);
  1562. if (!c)
  1563. return err;
  1564. err = "error creating kobject";
  1565. if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
  1566. kobject_add(&c->internal, &c->kobj, "internal"))
  1567. goto err;
  1568. if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
  1569. goto err;
  1570. bch_debug_init_cache_set(c);
  1571. list_add(&c->list, &bch_cache_sets);
  1572. found:
  1573. sprintf(buf, "cache%i", ca->sb.nr_this_dev);
  1574. if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
  1575. sysfs_create_link(&c->kobj, &ca->kobj, buf))
  1576. goto err;
  1577. /*
  1578. * A special case is both ca->sb.seq and c->sb.seq are 0,
  1579. * such condition happens on a new created cache device whose
  1580. * super block is never flushed yet. In this case c->sb.version
  1581. * and other members should be updated too, otherwise we will
  1582. * have a mistaken super block version in cache set.
  1583. */
  1584. if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
  1585. c->sb.version = ca->sb.version;
  1586. memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
  1587. c->sb.flags = ca->sb.flags;
  1588. c->sb.seq = ca->sb.seq;
  1589. pr_debug("set version = %llu", c->sb.version);
  1590. }
  1591. kobject_get(&ca->kobj);
  1592. ca->set = c;
  1593. ca->set->cache[ca->sb.nr_this_dev] = ca;
  1594. c->cache_by_alloc[c->caches_loaded++] = ca;
  1595. if (c->caches_loaded == c->sb.nr_in_set) {
  1596. err = "failed to run cache set";
  1597. if (run_cache_set(c) < 0)
  1598. goto err;
  1599. }
  1600. return NULL;
  1601. err:
  1602. bch_cache_set_unregister(c);
  1603. return err;
  1604. }
  1605. /* Cache device */
  1606. void bch_cache_release(struct kobject *kobj)
  1607. {
  1608. struct cache *ca = container_of(kobj, struct cache, kobj);
  1609. unsigned int i;
  1610. if (ca->set) {
  1611. BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
  1612. ca->set->cache[ca->sb.nr_this_dev] = NULL;
  1613. }
  1614. free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
  1615. kfree(ca->prio_buckets);
  1616. vfree(ca->buckets);
  1617. free_heap(&ca->heap);
  1618. free_fifo(&ca->free_inc);
  1619. for (i = 0; i < RESERVE_NR; i++)
  1620. free_fifo(&ca->free[i]);
  1621. if (ca->sb_bio.bi_inline_vecs[0].bv_page)
  1622. put_page(bio_first_page_all(&ca->sb_bio));
  1623. if (!IS_ERR_OR_NULL(ca->bdev))
  1624. blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
  1625. kfree(ca);
  1626. module_put(THIS_MODULE);
  1627. }
  1628. static int cache_alloc(struct cache *ca)
  1629. {
  1630. size_t free;
  1631. size_t btree_buckets;
  1632. struct bucket *b;
  1633. __module_get(THIS_MODULE);
  1634. kobject_init(&ca->kobj, &bch_cache_ktype);
  1635. bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
  1636. /*
  1637. * when ca->sb.njournal_buckets is not zero, journal exists,
  1638. * and in bch_journal_replay(), tree node may split,
  1639. * so bucket of RESERVE_BTREE type is needed,
  1640. * the worst situation is all journal buckets are valid journal,
  1641. * and all the keys need to replay,
  1642. * so the number of RESERVE_BTREE type buckets should be as much
  1643. * as journal buckets
  1644. */
  1645. btree_buckets = ca->sb.njournal_buckets ?: 8;
  1646. free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
  1647. if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
  1648. !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
  1649. !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
  1650. !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
  1651. !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
  1652. !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
  1653. !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
  1654. ca->sb.nbuckets))) ||
  1655. !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
  1656. prio_buckets(ca), 2),
  1657. GFP_KERNEL)) ||
  1658. !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
  1659. return -ENOMEM;
  1660. ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
  1661. for_each_bucket(b, ca)
  1662. atomic_set(&b->pin, 0);
  1663. return 0;
  1664. }
  1665. static int register_cache(struct cache_sb *sb, struct page *sb_page,
  1666. struct block_device *bdev, struct cache *ca)
  1667. {
  1668. const char *err = NULL; /* must be set for any error case */
  1669. int ret = 0;
  1670. bdevname(bdev, ca->cache_dev_name);
  1671. memcpy(&ca->sb, sb, sizeof(struct cache_sb));
  1672. ca->bdev = bdev;
  1673. ca->bdev->bd_holder = ca;
  1674. bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
  1675. bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
  1676. get_page(sb_page);
  1677. if (blk_queue_discard(bdev_get_queue(bdev)))
  1678. ca->discard = CACHE_DISCARD(&ca->sb);
  1679. ret = cache_alloc(ca);
  1680. if (ret != 0) {
  1681. blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
  1682. if (ret == -ENOMEM)
  1683. err = "cache_alloc(): -ENOMEM";
  1684. else
  1685. err = "cache_alloc(): unknown error";
  1686. goto err;
  1687. }
  1688. if (kobject_add(&ca->kobj,
  1689. &part_to_dev(bdev->bd_part)->kobj,
  1690. "bcache")) {
  1691. err = "error calling kobject_add";
  1692. ret = -ENOMEM;
  1693. goto out;
  1694. }
  1695. mutex_lock(&bch_register_lock);
  1696. err = register_cache_set(ca);
  1697. mutex_unlock(&bch_register_lock);
  1698. if (err) {
  1699. ret = -ENODEV;
  1700. goto out;
  1701. }
  1702. pr_info("registered cache device %s", ca->cache_dev_name);
  1703. out:
  1704. kobject_put(&ca->kobj);
  1705. err:
  1706. if (err)
  1707. pr_notice("error %s: %s", ca->cache_dev_name, err);
  1708. return ret;
  1709. }
  1710. /* Global interfaces/init */
  1711. static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
  1712. const char *buffer, size_t size);
  1713. kobj_attribute_write(register, register_bcache);
  1714. kobj_attribute_write(register_quiet, register_bcache);
  1715. static bool bch_is_open_backing(struct block_device *bdev)
  1716. {
  1717. struct cache_set *c, *tc;
  1718. struct cached_dev *dc, *t;
  1719. list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
  1720. list_for_each_entry_safe(dc, t, &c->cached_devs, list)
  1721. if (dc->bdev == bdev)
  1722. return true;
  1723. list_for_each_entry_safe(dc, t, &uncached_devices, list)
  1724. if (dc->bdev == bdev)
  1725. return true;
  1726. return false;
  1727. }
  1728. static bool bch_is_open_cache(struct block_device *bdev)
  1729. {
  1730. struct cache_set *c, *tc;
  1731. struct cache *ca;
  1732. unsigned int i;
  1733. list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
  1734. for_each_cache(ca, c, i)
  1735. if (ca->bdev == bdev)
  1736. return true;
  1737. return false;
  1738. }
  1739. static bool bch_is_open(struct block_device *bdev)
  1740. {
  1741. return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
  1742. }
  1743. static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
  1744. const char *buffer, size_t size)
  1745. {
  1746. ssize_t ret = size;
  1747. const char *err = "cannot allocate memory";
  1748. char *path = NULL;
  1749. struct cache_sb *sb = NULL;
  1750. struct block_device *bdev = NULL;
  1751. struct page *sb_page = NULL;
  1752. if (!try_module_get(THIS_MODULE))
  1753. return -EBUSY;
  1754. path = kstrndup(buffer, size, GFP_KERNEL);
  1755. if (!path)
  1756. goto err;
  1757. sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
  1758. if (!sb)
  1759. goto err;
  1760. err = "failed to open device";
  1761. bdev = blkdev_get_by_path(strim(path),
  1762. FMODE_READ|FMODE_WRITE|FMODE_EXCL,
  1763. sb);
  1764. if (IS_ERR(bdev)) {
  1765. if (bdev == ERR_PTR(-EBUSY)) {
  1766. bdev = lookup_bdev(strim(path));
  1767. mutex_lock(&bch_register_lock);
  1768. if (!IS_ERR(bdev) && bch_is_open(bdev))
  1769. err = "device already registered";
  1770. else
  1771. err = "device busy";
  1772. mutex_unlock(&bch_register_lock);
  1773. if (!IS_ERR(bdev))
  1774. bdput(bdev);
  1775. if (attr == &ksysfs_register_quiet)
  1776. goto out;
  1777. }
  1778. goto err;
  1779. }
  1780. err = "failed to set blocksize";
  1781. if (set_blocksize(bdev, 4096))
  1782. goto err_close;
  1783. err = read_super(sb, bdev, &sb_page);
  1784. if (err)
  1785. goto err_close;
  1786. err = "failed to register device";
  1787. if (SB_IS_BDEV(sb)) {
  1788. struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
  1789. if (!dc)
  1790. goto err_close;
  1791. mutex_lock(&bch_register_lock);
  1792. register_bdev(sb, sb_page, bdev, dc);
  1793. mutex_unlock(&bch_register_lock);
  1794. } else {
  1795. struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
  1796. if (!ca)
  1797. goto err_close;
  1798. if (register_cache(sb, sb_page, bdev, ca) != 0)
  1799. goto err;
  1800. }
  1801. out:
  1802. if (sb_page)
  1803. put_page(sb_page);
  1804. kfree(sb);
  1805. kfree(path);
  1806. module_put(THIS_MODULE);
  1807. return ret;
  1808. err_close:
  1809. blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
  1810. err:
  1811. pr_info("error %s: %s", path, err);
  1812. ret = -EINVAL;
  1813. goto out;
  1814. }
  1815. static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
  1816. {
  1817. if (code == SYS_DOWN ||
  1818. code == SYS_HALT ||
  1819. code == SYS_POWER_OFF) {
  1820. DEFINE_WAIT(wait);
  1821. unsigned long start = jiffies;
  1822. bool stopped = false;
  1823. struct cache_set *c, *tc;
  1824. struct cached_dev *dc, *tdc;
  1825. mutex_lock(&bch_register_lock);
  1826. if (list_empty(&bch_cache_sets) &&
  1827. list_empty(&uncached_devices))
  1828. goto out;
  1829. pr_info("Stopping all devices:");
  1830. list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
  1831. bch_cache_set_stop(c);
  1832. list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
  1833. bcache_device_stop(&dc->disk);
  1834. /* What's a condition variable? */
  1835. while (1) {
  1836. long timeout = start + 2 * HZ - jiffies;
  1837. stopped = list_empty(&bch_cache_sets) &&
  1838. list_empty(&uncached_devices);
  1839. if (timeout < 0 || stopped)
  1840. break;
  1841. prepare_to_wait(&unregister_wait, &wait,
  1842. TASK_UNINTERRUPTIBLE);
  1843. mutex_unlock(&bch_register_lock);
  1844. schedule_timeout(timeout);
  1845. mutex_lock(&bch_register_lock);
  1846. }
  1847. finish_wait(&unregister_wait, &wait);
  1848. if (stopped)
  1849. pr_info("All devices stopped");
  1850. else
  1851. pr_notice("Timeout waiting for devices to be closed");
  1852. out:
  1853. mutex_unlock(&bch_register_lock);
  1854. }
  1855. return NOTIFY_DONE;
  1856. }
  1857. static struct notifier_block reboot = {
  1858. .notifier_call = bcache_reboot,
  1859. .priority = INT_MAX, /* before any real devices */
  1860. };
  1861. static void bcache_exit(void)
  1862. {
  1863. bch_debug_exit();
  1864. bch_request_exit();
  1865. if (bcache_kobj)
  1866. kobject_put(bcache_kobj);
  1867. if (bcache_wq)
  1868. destroy_workqueue(bcache_wq);
  1869. if (bch_journal_wq)
  1870. destroy_workqueue(bch_journal_wq);
  1871. if (bcache_major)
  1872. unregister_blkdev(bcache_major, "bcache");
  1873. unregister_reboot_notifier(&reboot);
  1874. mutex_destroy(&bch_register_lock);
  1875. }
  1876. static int __init bcache_init(void)
  1877. {
  1878. static const struct attribute *files[] = {
  1879. &ksysfs_register.attr,
  1880. &ksysfs_register_quiet.attr,
  1881. NULL
  1882. };
  1883. mutex_init(&bch_register_lock);
  1884. init_waitqueue_head(&unregister_wait);
  1885. register_reboot_notifier(&reboot);
  1886. bcache_major = register_blkdev(0, "bcache");
  1887. if (bcache_major < 0) {
  1888. unregister_reboot_notifier(&reboot);
  1889. mutex_destroy(&bch_register_lock);
  1890. return bcache_major;
  1891. }
  1892. bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
  1893. if (!bcache_wq)
  1894. goto err;
  1895. bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
  1896. if (!bch_journal_wq)
  1897. goto err;
  1898. bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
  1899. if (!bcache_kobj)
  1900. goto err;
  1901. if (bch_request_init() ||
  1902. sysfs_create_files(bcache_kobj, files))
  1903. goto err;
  1904. bch_debug_init(bcache_kobj);
  1905. closure_debug_init();
  1906. return 0;
  1907. err:
  1908. bcache_exit();
  1909. return -ENOMEM;
  1910. }
  /* Module entry and exit points. */
  module_init(bcache_init);
  module_exit(bcache_exit);