drbd_bitmap.c

  1. /*
  2. drbd_bitmap.c
  3. This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
  4. Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
  5. Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
  6. Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  7. drbd is free software; you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation; either version 2, or (at your option)
  10. any later version.
  11. drbd is distributed in the hope that it will be useful,
  12. but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. GNU General Public License for more details.
  15. You should have received a copy of the GNU General Public License
  16. along with drbd; see the file COPYING. If not, write to
  17. the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  20. #include <linux/bitmap.h>
  21. #include <linux/vmalloc.h>
  22. #include <linux/string.h>
  23. #include <linux/drbd.h>
  24. #include <linux/slab.h>
  25. #include <linux/highmem.h>
  26. #include "drbd_int.h"
  27. /* OPAQUE outside this file!
  28. * interface defined in drbd_int.h
  29. * convention:
  30. * function name drbd_bm_... => used elsewhere, "public".
  31. * function name bm_... => internal to implementation, "private".
  32. */
  33. /*
  34. * LIMITATIONS:
  35. * We want to support >= peta byte of backend storage, while for now still using
  36. * a granularity of one bit per 4KiB of storage.
  37. * 1 << 50 bytes backend storage (1 PiB)
  38. * 1 << (50 - 12) bits needed
  39. * 38 --> we need u64 to index and count bits
  40. * 1 << (38 - 3) bitmap bytes needed
  41. * 35 --> we still need u64 to index and count bytes
  42. * (that's 32 GiB of bitmap for 1 PiB storage)
  43. * 1 << (35 - 2) 32bit longs needed
  44. * 33 --> we'd even need u64 to index and count 32bit long words.
  45. * 1 << (35 - 3) 64bit longs needed
  46. * 32 --> we could get away with a 32bit unsigned int to index and count
  47. * 64bit long words, but I rather stay with unsigned long for now.
  48. * We probably should neither count nor point to bytes or long words
  49. * directly, but either by bitnumber, or by page index and offset.
  50. * 1 << (35 - 12)
  51. * 23 --> we need that many 4KiB pages of bitmap.
  52. * 1 << (23 + 3) --> on a 64bit arch,
  53. * we need 64 MiB to store the array of page pointers.
  54. *
  55. * Because I'm lazy, and because the resulting patch was too large, too ugly
  56. * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
  57. * (1 << 32) bits * 4k storage.
  58. *
  59. * bitmap storage and IO:
  60. * Bitmap is stored little endian on disk, and is kept little endian in
  61. * core memory. Currently we still hold the full bitmap in core as long
  62. * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
  63. * seems excessive.
  64. *
  65. * We plan to reduce the amount of in-core bitmap pages by paging them in
  66. * and out against their on-disk location as necessary, but need to make
  67. * sure we don't cause too much meta data IO, and must not deadlock in
  68. * tight memory situations. This needs some more work.
  69. */
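/*
 * Editor's note: a standalone sketch of the size arithmetic above, for the
 * 1 PiB case. Not part of the driver; the helper name is hypothetical, and
 * it assumes 4KiB pages and 8-byte page pointers (64bit arch).
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t bm_example_pages_needed(uint64_t backend_bytes)
{
	uint64_t bits  = backend_bytes >> 12;	/* one bit per 4KiB of storage */
	uint64_t bytes = bits >> 3;		/* eight bits per bitmap byte  */

	return (bytes + 4095) >> 12;		/* round up to 4KiB pages      */
}

static void bm_example_check(void)
{
	/* 1 << 50 bytes backend => 1 << 38 bits => 1 << 35 bitmap bytes
	 * => 1 << 23 bitmap pages, whose page pointers alone need 64 MiB. */
	assert(bm_example_pages_needed(1ULL << 50) == 1ULL << 23);
	assert((1ULL << 23) * 8 == 64ULL << 20);
}
#endif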
  70. /*
  71. * NOTE
  72. * Access to the *bm_pages is protected by bm_lock.
  73. * It is safe to read the other members within the lock.
  74. *
  75. * drbd_bm_set_bits is called from bio_endio callbacks;
  76. * we may be called with irq already disabled,
  77. * so we need spin_lock_irqsave().
  78. * And we need the kmap_atomic.
  79. */
  80. struct drbd_bitmap {
  81. struct page **bm_pages;
  82. spinlock_t bm_lock;
  83. /* exclusively to be used by __al_write_transaction(),
  84. * drbd_bm_mark_for_writeout() and
  85. * drbd_bm_write_hinted() -> bm_rw() called from there.
  86. */
  87. unsigned int n_bitmap_hints;
  88. unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
  89. /* see LIMITATIONS: above */
  90. unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
  91. unsigned long bm_bits;
  92. size_t bm_words;
  93. size_t bm_number_of_pages;
  94. sector_t bm_dev_capacity;
  95. struct mutex bm_change; /* serializes resize operations */
  96. wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
  97. enum bm_flag bm_flags;
  98. /* debugging aid, in case we are still racy somewhere */
  99. char *bm_why;
  100. struct task_struct *bm_task;
  101. };
  102. #define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
  103. static void __bm_print_lock_info(struct drbd_device *device, const char *func)
  104. {
  105. struct drbd_bitmap *b = device->bitmap;
  106. if (!__ratelimit(&drbd_ratelimit_state))
  107. return;
  108. drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
  109. current->comm, task_pid_nr(current),
  110. func, b->bm_why ?: "?",
  111. b->bm_task->comm, task_pid_nr(b->bm_task));
  112. }
  113. void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
  114. {
  115. struct drbd_bitmap *b = device->bitmap;
  116. int trylock_failed;
  117. if (!b) {
  118. drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
  119. return;
  120. }
  121. trylock_failed = !mutex_trylock(&b->bm_change);
  122. if (trylock_failed) {
  123. drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
  124. current->comm, task_pid_nr(current),
  125. why, b->bm_why ?: "?",
  126. b->bm_task->comm, task_pid_nr(b->bm_task));
  127. mutex_lock(&b->bm_change);
  128. }
  129. if (BM_LOCKED_MASK & b->bm_flags)
  130. drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
  131. b->bm_flags |= flags & BM_LOCKED_MASK;
  132. b->bm_why = why;
  133. b->bm_task = current;
  134. }
  135. void drbd_bm_unlock(struct drbd_device *device)
  136. {
  137. struct drbd_bitmap *b = device->bitmap;
  138. if (!b) {
  139. drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
  140. return;
  141. }
  142. if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
  143. drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
  144. b->bm_flags &= ~BM_LOCKED_MASK;
  145. b->bm_why = NULL;
  146. b->bm_task = NULL;
  147. mutex_unlock(&b->bm_change);
  148. }
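/*
 * Editor's note: a hypothetical caller, to show the intended bracket for
 * whole-bitmap operations under the locking rules described above. Not part
 * of the driver.
 */
#if 0
	drbd_bm_lock(device, "example bulk operation", BM_LOCKED_MASK);
	/* ... bulk bitmap work, e.g. drbd_bm_write(device) ... */
	drbd_bm_unlock(device);
#endif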
  149. /* we store some "meta" info about our pages in page->private */
  150. /* at a granularity of 4k storage per bitmap bit:
  151. * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
  152. * 1<<38 bits,
  153. * 1<<23 4k bitmap pages.
  154. * Use 24 bits as page index, covers 2 peta byte storage
  155. * at a granularity of 4k per bit.
  156. * Used to report the failed page idx on io error from the endio handlers.
  157. */
  158. #define BM_PAGE_IDX_MASK ((1UL<<24)-1)
  159. /* this page is currently read in, or written back */
  160. #define BM_PAGE_IO_LOCK 31
  161. /* if there has been an IO error for this page */
  162. #define BM_PAGE_IO_ERROR 30
  163. /* this is to be able to intelligently skip disk IO,
  164. * set if bits have been set since last IO. */
  165. #define BM_PAGE_NEED_WRITEOUT 29
  166. /* to mark for lazy writeout once syncer cleared all clearable bits,
  167. * set if bits have been cleared since last IO. */
  168. #define BM_PAGE_LAZY_WRITEOUT 28
  169. /* pages marked with this "HINT" will be considered for writeout
  170. * on activity log transactions */
  171. #define BM_PAGE_HINT_WRITEOUT 27
  172. /* store_page_idx uses non-atomic assignment. It is only used directly after
  173. * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
  174. * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
  175. * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
  176. * requires it all to be atomic as well. */
  177. static void bm_store_page_idx(struct page *page, unsigned long idx)
  178. {
  179. BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
  180. set_page_private(page, idx);
  181. }
  182. static unsigned long bm_page_to_idx(struct page *page)
  183. {
  184. return page_private(page) & BM_PAGE_IDX_MASK;
  185. }
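/*
 * Editor's note: a standalone sketch (not part of the driver) of how the
 * 24 bit page index and the flag bits above share the single page_private()
 * word; the values are hypothetical.
 */
#if 0
#include <assert.h>

static void bm_example_page_private_packing(void)
{
	unsigned long priv = 0;

	priv  = 4711UL & BM_PAGE_IDX_MASK;	/* low 24 bits: page index  */
	priv |= 1UL << BM_PAGE_NEED_WRITEOUT;	/* bit 29: one of the flags */

	assert((priv & BM_PAGE_IDX_MASK) == 4711UL);	/* index unharmed */
	assert(priv & (1UL << BM_PAGE_NEED_WRITEOUT));	/* flag unharmed  */
}
#endif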
  186. /* As it is very unlikely that the same page is under IO from more than one
  187. * context, we can get away with a bit per page and one wait queue per bitmap.
  188. */
  189. static void bm_page_lock_io(struct drbd_device *device, int page_nr)
  190. {
  191. struct drbd_bitmap *b = device->bitmap;
  192. void *addr = &page_private(b->bm_pages[page_nr]);
  193. wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
  194. }
  195. static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
  196. {
  197. struct drbd_bitmap *b = device->bitmap;
  198. void *addr = &page_private(b->bm_pages[page_nr]);
  199. clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
  200. wake_up(&device->bitmap->bm_io_wait);
  201. }
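/*
 * Editor's note: the pair above builds a sleeping per-page IO lock out of a
 * single flag bit plus the one shared wait queue. A hypothetical caller
 * (not part of the driver) would bracket page IO like this:
 */
#if 0
	bm_page_lock_io(device, page_nr);	/* sleeps until the bit is won   */
	/* ... submit and complete IO on bm_pages[page_nr] ... */
	bm_page_unlock_io(device, page_nr);	/* clears the bit, wakes waiters */
#endif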
  202. /* set _before_ submit_io, so it may be reset due to being changed
  203. * while this page is in flight... will get submitted later again */
  204. static void bm_set_page_unchanged(struct page *page)
  205. {
  206. /* use cmpxchg? */
  207. clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
  208. clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
  209. }
  210. static void bm_set_page_need_writeout(struct page *page)
  211. {
  212. set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
  213. }
  214. void drbd_bm_reset_al_hints(struct drbd_device *device)
  215. {
  216. device->bitmap->n_bitmap_hints = 0;
  217. }
  218. /**
  219. * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
  220. * @device: DRBD device.
  221. * @page_nr: the bitmap page to mark with the "hint" flag
  222. *
  223. * From within an activity log transaction, we mark a few pages with these
  224. * hints, then call drbd_bm_write_hinted(), which will only write out changed
  225. * pages which are flagged with this mark.
  226. */
  227. void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
  228. {
  229. struct drbd_bitmap *b = device->bitmap;
  230. struct page *page;
  231. if (page_nr >= device->bitmap->bm_number_of_pages) {
  232. drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
  233. page_nr, (int)device->bitmap->bm_number_of_pages);
  234. return;
  235. }
  236. page = device->bitmap->bm_pages[page_nr];
  237. BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
  238. if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
  239. b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
  240. }
  241. static int bm_test_page_unchanged(struct page *page)
  242. {
  243. volatile const unsigned long *addr = &page_private(page);
  244. return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
  245. }
  246. static void bm_set_page_io_err(struct page *page)
  247. {
  248. set_bit(BM_PAGE_IO_ERROR, &page_private(page));
  249. }
  250. static void bm_clear_page_io_err(struct page *page)
  251. {
  252. clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
  253. }
  254. static void bm_set_page_lazy_writeout(struct page *page)
  255. {
  256. set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
  257. }
  258. static int bm_test_page_lazy_writeout(struct page *page)
  259. {
  260. return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
  261. }
  262. /* on a 32bit box, this would allow for exactly (2<<38) bits. */
  263. static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
  264. {
  265. /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
  266. unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
  267. BUG_ON(page_nr >= b->bm_number_of_pages);
  268. return page_nr;
  269. }
  270. static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
  271. {
  272. /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
  273. unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
  274. BUG_ON(page_nr >= b->bm_number_of_pages);
  275. return page_nr;
  276. }
  277. static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
  278. {
  279. struct page *page = b->bm_pages[idx];
  280. return (unsigned long *) kmap_atomic(page);
  281. }
  282. static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
  283. {
  284. return __bm_map_pidx(b, idx);
  285. }
  286. static void __bm_unmap(unsigned long *p_addr)
  287. {
  288. kunmap_atomic(p_addr);
  289. }
  290. static void bm_unmap(unsigned long *p_addr)
  291. {
  292. __bm_unmap(p_addr);
  293. }
  294. /* long word offset of _bitmap_ sector */
  295. #define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
  296. /* word offset from start of bitmap to word number _in_page_
  297. * modulo longs per page
  298. #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
  299. hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
  300. so do it explicitly:
  301. */
  302. #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
  303. /* Long words per page */
  304. #define LWPP (PAGE_SIZE/sizeof(long))
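/*
 * Editor's note: worked values for the macros above (not part of the
 * driver), assuming 4KiB pages, 64bit longs (LN2_BPL == 6), and
 * BM_EXT_SHIFT - BM_BLOCK_SHIFT == 12, i.e. one 512 byte bitmap sector per
 * bitmap extent as described near drbd_bm_e_weight() below.
 */
#if 0
#include <assert.h>

static void bm_example_word_geometry(void)
{
	assert(LWPP == 512);		/* 4096 bytes / 8 bytes per long        */
	assert(MLPP(600) == 88);	/* 600 & 511: word offset in its page   */
	assert(S2W(1) == 64);		/* one 512 byte bitmap sector, 64 longs */
}
#endif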
  305. /*
  306. * actually most functions herein should take a struct drbd_bitmap*, not a
  307. * struct drbd_device*, but for the debug macros I like to have the device around
  308. * to be able to report device-specific messages.
  309. */
  310. static void bm_free_pages(struct page **pages, unsigned long number)
  311. {
  312. unsigned long i;
  313. if (!pages)
  314. return;
  315. for (i = 0; i < number; i++) {
  316. if (!pages[i]) {
  317. pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
  318. i, number);
  319. continue;
  320. }
  321. __free_page(pages[i]);
  322. pages[i] = NULL;
  323. }
  324. }
  325. static inline void bm_vk_free(void *ptr)
  326. {
  327. kvfree(ptr);
  328. }
  329. /*
  330. * "have" and "want" are NUMBER OF PAGES.
  331. */
  332. static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
  333. {
  334. struct page **old_pages = b->bm_pages;
  335. struct page **new_pages, *page;
  336. unsigned int i, bytes;
  337. unsigned long have = b->bm_number_of_pages;
  338. BUG_ON(have == 0 && old_pages != NULL);
  339. BUG_ON(have != 0 && old_pages == NULL);
  340. if (have == want)
  341. return old_pages;
  342. /* Trying kmalloc first, falling back to vmalloc.
  343. * GFP_NOIO, as this is called while drbd IO is "suspended",
  344. * and during resize or attach on diskless Primary,
  345. * we must not block on IO to ourselves.
  346. * Context is receiver thread or drbdsetup. */
  347. bytes = sizeof(struct page *)*want;
  348. new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
  349. if (!new_pages) {
  350. new_pages = __vmalloc(bytes,
  351. GFP_NOIO | __GFP_ZERO,
  352. PAGE_KERNEL);
  353. if (!new_pages)
  354. return NULL;
  355. }
  356. if (want >= have) {
  357. for (i = 0; i < have; i++)
  358. new_pages[i] = old_pages[i];
  359. for (; i < want; i++) {
  360. page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
  361. if (!page) {
  362. bm_free_pages(new_pages + have, i - have);
  363. bm_vk_free(new_pages);
  364. return NULL;
  365. }
  366. /* we want to know which page it is
  367. * from the endio handlers */
  368. bm_store_page_idx(page, i);
  369. new_pages[i] = page;
  370. }
  371. } else {
  372. for (i = 0; i < want; i++)
  373. new_pages[i] = old_pages[i];
  374. /* NOT HERE, we are outside the spinlock!
  375. bm_free_pages(old_pages + want, have - want);
  376. */
  377. }
  378. return new_pages;
  379. }
  380. /*
  381. * allocates the drbd_bitmap and stores it in device->bitmap.
  382. */
  383. int drbd_bm_init(struct drbd_device *device)
  384. {
  385. struct drbd_bitmap *b = device->bitmap;
  386. WARN_ON(b != NULL);
  387. b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
  388. if (!b)
  389. return -ENOMEM;
  390. spin_lock_init(&b->bm_lock);
  391. mutex_init(&b->bm_change);
  392. init_waitqueue_head(&b->bm_io_wait);
  393. device->bitmap = b;
  394. return 0;
  395. }
  396. sector_t drbd_bm_capacity(struct drbd_device *device)
  397. {
  398. if (!expect(device->bitmap))
  399. return 0;
  400. return device->bitmap->bm_dev_capacity;
  401. }
  402. /* called on driver unload. TODO: call when a device is destroyed.
  403. */
  404. void drbd_bm_cleanup(struct drbd_device *device)
  405. {
  406. if (!expect(device->bitmap))
  407. return;
  408. bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
  409. bm_vk_free(device->bitmap->bm_pages);
  410. kfree(device->bitmap);
  411. device->bitmap = NULL;
  412. }
  413. /*
  414. * since (b->bm_bits % BITS_PER_LONG) != 0,
  415. * this masks out the remaining bits.
  416. * Returns the number of bits cleared.
  417. */
  418. #ifndef BITS_PER_PAGE
  419. #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
  420. #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
  421. #else
  422. # if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
  423. # error "ambiguous BITS_PER_PAGE"
  424. # endif
  425. #endif
  426. #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
  427. static int bm_clear_surplus(struct drbd_bitmap *b)
  428. {
  429. unsigned long mask;
  430. unsigned long *p_addr, *bm;
  431. int tmp;
  432. int cleared = 0;
  433. /* number of bits modulo bits per page */
  434. tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
  435. /* mask the used bits of the word containing the last bit */
  436. mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
  437. /* bitmap is always stored little endian,
  438. * on disk and in core memory alike */
  439. mask = cpu_to_lel(mask);
  440. p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
  441. bm = p_addr + (tmp/BITS_PER_LONG);
  442. if (mask) {
  443. /* If mask != 0, we are not exactly aligned, so bm now points
  444. * to the long containing the last bit.
  445. * If mask == 0, bm already points to the word immediately
  446. * after the last (long word aligned) bit. */
  447. cleared = hweight_long(*bm & ~mask);
  448. *bm &= mask;
  449. bm++;
  450. }
  451. if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
  452. /* on a 32bit arch, we may need to zero out
  453. * a padding long to align with a 64bit remote */
  454. cleared += hweight_long(*bm);
  455. *bm = 0;
  456. }
  457. bm_unmap(p_addr);
  458. return cleared;
  459. }
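/*
 * Editor's note: a worked example (not part of the driver) of the surplus
 * masking above, for a hypothetical bitmap with bm_bits == 1000 on a 64bit
 * arch.
 */
#if 0
#include <assert.h>

static void bm_example_surplus_mask(void)
{
	unsigned long bm_bits = 1000;
	unsigned long tmp  = bm_bits & BITS_PER_PAGE_MASK;	/* 1000 */
	unsigned long mask = (1UL << (tmp & BITS_PER_LONG_MASK)) - 1;

	/* bits 0..39 of long word 15 are valid; 40..63 are surplus */
	assert(tmp / BITS_PER_LONG == 15);
	assert(mask == (1UL << 40) - 1);
}
#endif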
  460. static void bm_set_surplus(struct drbd_bitmap *b)
  461. {
  462. unsigned long mask;
  463. unsigned long *p_addr, *bm;
  464. int tmp;
  465. /* number of bits modulo bits per page */
  466. tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
  467. /* mask the used bits of the word containing the last bit */
  468. mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
  469. /* bitmap is always stored little endian,
  470. * on disk and in core memory alike */
  471. mask = cpu_to_lel(mask);
  472. p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
  473. bm = p_addr + (tmp/BITS_PER_LONG);
  474. if (mask) {
  475. /* If mask != 0, we are not exactly aligned, so bm now points
  476. * to the long containing the last bit.
  477. * If mask == 0, bm already points to the word immediately
  478. * after the last (long word aligned) bit. */
  479. *bm |= ~mask;
  480. bm++;
  481. }
  482. if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
  483. /* on a 32bit arch, we may need to zero out
  484. * a padding long to align with a 64bit remote */
  485. *bm = ~0UL;
  486. }
  487. bm_unmap(p_addr);
  488. }
  489. /* you better not modify the bitmap while this is running,
  490. * or its results will be stale */
  491. static unsigned long bm_count_bits(struct drbd_bitmap *b)
  492. {
  493. unsigned long *p_addr;
  494. unsigned long bits = 0;
  495. unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
  496. int idx, last_word;
  497. /* all but last page */
  498. for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
  499. p_addr = __bm_map_pidx(b, idx);
  500. bits += bitmap_weight(p_addr, BITS_PER_PAGE);
  501. __bm_unmap(p_addr);
  502. cond_resched();
  503. }
  504. /* last (or only) page */
  505. last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
  506. p_addr = __bm_map_pidx(b, idx);
  507. bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
  508. p_addr[last_word] &= cpu_to_lel(mask);
  509. bits += hweight_long(p_addr[last_word]);
  510. /* 32bit arch, may have an unused padding long */
  511. if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
  512. p_addr[last_word+1] = 0;
  513. __bm_unmap(p_addr);
  514. return bits;
  515. }
  516. /* offset and len in long words.*/
  517. static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
  518. {
  519. unsigned long *p_addr, *bm;
  520. unsigned int idx;
  521. size_t do_now, end;
  522. end = offset + len;
  523. if (end > b->bm_words) {
  524. pr_alert("bm_memset end > bm_words\n");
  525. return;
  526. }
  527. while (offset < end) {
  528. do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
  529. idx = bm_word_to_page_idx(b, offset);
  530. p_addr = bm_map_pidx(b, idx);
  531. bm = p_addr + MLPP(offset);
  532. if (bm+do_now > p_addr + LWPP) {
  533. pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
  534. p_addr, bm, (int)do_now);
  535. } else
  536. memset(bm, c, do_now * sizeof(long));
  537. bm_unmap(p_addr);
  538. bm_set_page_need_writeout(b->bm_pages[idx]);
  539. offset += do_now;
  540. }
  541. }
  542. /* For the layout, see comment above drbd_md_set_sector_offsets(). */
  543. static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
  544. {
  545. u64 bitmap_sectors;
  546. if (ldev->md.al_offset == 8)
  547. bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
  548. else
  549. bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
  550. return bitmap_sectors << (9 + 3);
  551. }
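/*
 * Editor's note: the shift above converts 512 byte sectors to bits. A
 * standalone example (not part of the driver) with hypothetical numbers:
 */
#if 0
#include <assert.h>

static void bm_example_on_disk_bits(void)
{
	unsigned long long bitmap_sectors = 4096;
	unsigned long long bits = bitmap_sectors << (9 + 3);

	assert(bits == 16777216ULL);		/* 4096 sectors * 512 B * 8 */
	/* at one bit per 4KiB of storage, room to cover a 64 GiB device */
	assert(bits << 12 == 64ULL << 30);
}
#endif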
  552. /*
  553. * make sure the bitmap has enough room for the attached storage,
  554. * if necessary, resize.
  555. * called whenever we may have changed the device size.
  556. * returns -ENOMEM if we could not allocate enough memory, 0 on success.
  557. * In case this is actually a resize, we copy the old bitmap into the new one.
  558. * Otherwise, the bitmap is initialized to all bits set.
  559. */
  560. int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
  561. {
  562. struct drbd_bitmap *b = device->bitmap;
  563. unsigned long bits, words, owords, obits;
  564. unsigned long want, have, onpages; /* number of pages */
  565. struct page **npages, **opages = NULL;
  566. int err = 0;
  567. bool growing;
  568. if (!expect(b))
  569. return -ENOMEM;
  570. drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
  571. drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
  572. (unsigned long long)capacity);
  573. if (capacity == b->bm_dev_capacity)
  574. goto out;
  575. if (capacity == 0) {
  576. spin_lock_irq(&b->bm_lock);
  577. opages = b->bm_pages;
  578. onpages = b->bm_number_of_pages;
  579. owords = b->bm_words;
  580. b->bm_pages = NULL;
  581. b->bm_number_of_pages =
  582. b->bm_set =
  583. b->bm_bits =
  584. b->bm_words =
  585. b->bm_dev_capacity = 0;
  586. spin_unlock_irq(&b->bm_lock);
  587. bm_free_pages(opages, onpages);
  588. bm_vk_free(opages);
  589. goto out;
  590. }
  591. bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
  592. /* if we would use
  593. words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
  594. a 32bit host could present the wrong number of words
  595. to a 64bit host.
  596. */
  597. words = ALIGN(bits, 64) >> LN2_BPL;
  598. if (get_ldev(device)) {
  599. u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
  600. put_ldev(device);
  601. if (bits > bits_on_disk) {
  602. drbd_info(device, "bits = %lu\n", bits);
  603. drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
  604. err = -ENOSPC;
  605. goto out;
  606. }
  607. }
  608. want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
  609. have = b->bm_number_of_pages;
  610. if (want == have) {
  611. D_ASSERT(device, b->bm_pages != NULL);
  612. npages = b->bm_pages;
  613. } else {
  614. if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
  615. npages = NULL;
  616. else
  617. npages = bm_realloc_pages(b, want);
  618. }
  619. if (!npages) {
  620. err = -ENOMEM;
  621. goto out;
  622. }
  623. spin_lock_irq(&b->bm_lock);
  624. opages = b->bm_pages;
  625. owords = b->bm_words;
  626. obits = b->bm_bits;
  627. growing = bits > obits;
  628. if (opages && growing && set_new_bits)
  629. bm_set_surplus(b);
  630. b->bm_pages = npages;
  631. b->bm_number_of_pages = want;
  632. b->bm_bits = bits;
  633. b->bm_words = words;
  634. b->bm_dev_capacity = capacity;
  635. if (growing) {
  636. if (set_new_bits) {
  637. bm_memset(b, owords, 0xff, words-owords);
  638. b->bm_set += bits - obits;
  639. } else
  640. bm_memset(b, owords, 0x00, words-owords);
  641. }
  642. if (want < have) {
  643. /* implicit: (opages != NULL) && (opages != npages) */
  644. bm_free_pages(opages + want, have - want);
  645. }
  646. (void)bm_clear_surplus(b);
  647. spin_unlock_irq(&b->bm_lock);
  648. if (opages != npages)
  649. bm_vk_free(opages);
  650. if (!growing)
  651. b->bm_set = bm_count_bits(b);
  652. drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
  653. out:
  654. drbd_bm_unlock(device);
  655. return err;
  656. }
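/*
 * Editor's note: the sizing chain used above, as a standalone sketch (not
 * part of the driver) for a hypothetical 1 TiB device, assuming 512 byte
 * sectors, BM_SECT_PER_BIT == 8 (one bit per 4KiB), 64bit longs and 4KiB
 * pages.
 */
#if 0
#include <assert.h>

static void bm_example_resize_math(void)
{
	unsigned long long capacity = 1ULL << 31;	/* sectors: 1 TiB */
	unsigned long long bits  = capacity / 8;	/* 1 << 28 bits   */
	unsigned long long words = bits / 64;		/* 1 << 22 longs  */
	unsigned long long pages = (words * 8) >> 12;	/* 1 << 13 pages  */

	assert(bits  == 1ULL << 28);
	assert(words == 1ULL << 22);
	assert(pages == 1ULL << 13);	/* a 32 MiB in-core bitmap */
}
#endif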
  657. /* inherently racy:
  658. * if not protected by other means, return value may be out of date when
  659. * leaving this function...
  660. * we still need to lock it, since it is important that this returns
  661. * bm_set == 0 precisely.
  662. *
  663. * maybe bm_set should be atomic_t ?
  664. */
  665. unsigned long _drbd_bm_total_weight(struct drbd_device *device)
  666. {
  667. struct drbd_bitmap *b = device->bitmap;
  668. unsigned long s;
  669. unsigned long flags;
  670. if (!expect(b))
  671. return 0;
  672. if (!expect(b->bm_pages))
  673. return 0;
  674. spin_lock_irqsave(&b->bm_lock, flags);
  675. s = b->bm_set;
  676. spin_unlock_irqrestore(&b->bm_lock, flags);
  677. return s;
  678. }
  679. unsigned long drbd_bm_total_weight(struct drbd_device *device)
  680. {
  681. unsigned long s;
  682. /* if I don't have a disk, I don't know about out-of-sync status */
  683. if (!get_ldev_if_state(device, D_NEGOTIATING))
  684. return 0;
  685. s = _drbd_bm_total_weight(device);
  686. put_ldev(device);
  687. return s;
  688. }
  689. size_t drbd_bm_words(struct drbd_device *device)
  690. {
  691. struct drbd_bitmap *b = device->bitmap;
  692. if (!expect(b))
  693. return 0;
  694. if (!expect(b->bm_pages))
  695. return 0;
  696. return b->bm_words;
  697. }
  698. unsigned long drbd_bm_bits(struct drbd_device *device)
  699. {
  700. struct drbd_bitmap *b = device->bitmap;
  701. if (!expect(b))
  702. return 0;
  703. return b->bm_bits;
  704. }
  705. /* merge number words from buffer into the bitmap starting at offset.
  706. * buffer[i] is expected to be little endian unsigned long.
  707. * bitmap must be locked by drbd_bm_lock.
  708. * currently only used from receive_bitmap.
  709. */
  710. void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
  711. unsigned long *buffer)
  712. {
  713. struct drbd_bitmap *b = device->bitmap;
  714. unsigned long *p_addr, *bm;
  715. unsigned long word, bits;
  716. unsigned int idx;
  717. size_t end, do_now;
  718. end = offset + number;
  719. if (!expect(b))
  720. return;
  721. if (!expect(b->bm_pages))
  722. return;
  723. if (number == 0)
  724. return;
  725. WARN_ON(offset >= b->bm_words);
  726. WARN_ON(end > b->bm_words);
  727. spin_lock_irq(&b->bm_lock);
  728. while (offset < end) {
  729. do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
  730. idx = bm_word_to_page_idx(b, offset);
  731. p_addr = bm_map_pidx(b, idx);
  732. bm = p_addr + MLPP(offset);
  733. offset += do_now;
  734. while (do_now--) {
  735. bits = hweight_long(*bm);
  736. word = *bm | *buffer++;
  737. *bm++ = word;
  738. b->bm_set += hweight_long(word) - bits;
  739. }
  740. bm_unmap(p_addr);
  741. bm_set_page_need_writeout(b->bm_pages[idx]);
  742. }
  743. /* with 32bit <-> 64bit cross-platform connect
  744. * this is only correct for current usage,
  745. * where we _know_ that we are 64 bit aligned,
  746. * and know that this function is used in this way, too...
  747. */
  748. if (end == b->bm_words)
  749. b->bm_set -= bm_clear_surplus(b);
  750. spin_unlock_irq(&b->bm_lock);
  751. }
  752. /* copy number words from the bitmap starting at offset into the buffer.
  753. * buffer[i] will be little endian unsigned long.
  754. */
  755. void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
  756. unsigned long *buffer)
  757. {
  758. struct drbd_bitmap *b = device->bitmap;
  759. unsigned long *p_addr, *bm;
  760. size_t end, do_now;
  761. end = offset + number;
  762. if (!expect(b))
  763. return;
  764. if (!expect(b->bm_pages))
  765. return;
  766. spin_lock_irq(&b->bm_lock);
  767. if ((offset >= b->bm_words) ||
  768. (end > b->bm_words) ||
  769. (number <= 0))
  770. drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
  771. (unsigned long) offset,
  772. (unsigned long) number,
  773. (unsigned long) b->bm_words);
  774. else {
  775. while (offset < end) {
  776. do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
  777. p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
  778. bm = p_addr + MLPP(offset);
  779. offset += do_now;
  780. while (do_now--)
  781. *buffer++ = *bm++;
  782. bm_unmap(p_addr);
  783. }
  784. }
  785. spin_unlock_irq(&b->bm_lock);
  786. }
  787. /* set all bits in the bitmap */
  788. void drbd_bm_set_all(struct drbd_device *device)
  789. {
  790. struct drbd_bitmap *b = device->bitmap;
  791. if (!expect(b))
  792. return;
  793. if (!expect(b->bm_pages))
  794. return;
  795. spin_lock_irq(&b->bm_lock);
  796. bm_memset(b, 0, 0xff, b->bm_words);
  797. (void)bm_clear_surplus(b);
  798. b->bm_set = b->bm_bits;
  799. spin_unlock_irq(&b->bm_lock);
  800. }
  801. /* clear all bits in the bitmap */
  802. void drbd_bm_clear_all(struct drbd_device *device)
  803. {
  804. struct drbd_bitmap *b = device->bitmap;
  805. if (!expect(b))
  806. return;
  807. if (!expect(b->bm_pages))
  808. return;
  809. spin_lock_irq(&b->bm_lock);
  810. bm_memset(b, 0, 0, b->bm_words);
  811. b->bm_set = 0;
  812. spin_unlock_irq(&b->bm_lock);
  813. }
  814. static void drbd_bm_aio_ctx_destroy(struct kref *kref)
  815. {
  816. struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
  817. unsigned long flags;
  818. spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
  819. list_del(&ctx->list);
  820. spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
  821. put_ldev(ctx->device);
  822. kfree(ctx);
  823. }
  824. /* bv_page may be a copy, or may be the original */
  825. static void drbd_bm_endio(struct bio *bio)
  826. {
  827. struct drbd_bm_aio_ctx *ctx = bio->bi_private;
  828. struct drbd_device *device = ctx->device;
  829. struct drbd_bitmap *b = device->bitmap;
  830. unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
  831. if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
  832. !bm_test_page_unchanged(b->bm_pages[idx]))
  833. drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
  834. if (bio->bi_status) {
  835. /* ctx error will hold the completed-last non-zero error code,
  836. * in case error codes differ. */
  837. ctx->error = blk_status_to_errno(bio->bi_status);
  838. bm_set_page_io_err(b->bm_pages[idx]);
  839. /* Not identical to on disk version of it.
  840. * Is BM_PAGE_IO_ERROR enough? */
  841. if (__ratelimit(&drbd_ratelimit_state))
  842. drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
  843. bio->bi_status, idx);
  844. } else {
  845. bm_clear_page_io_err(b->bm_pages[idx]);
  846. dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
  847. }
  848. bm_page_unlock_io(device, idx);
  849. if (ctx->flags & BM_AIO_COPY_PAGES)
  850. mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
  851. bio_put(bio);
  852. if (atomic_dec_and_test(&ctx->in_flight)) {
  853. ctx->done = 1;
  854. wake_up(&device->misc_wait);
  855. kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
  856. }
  857. }
  858. static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
  859. {
  860. struct bio *bio = bio_alloc_drbd(GFP_NOIO);
  861. struct drbd_device *device = ctx->device;
  862. struct drbd_bitmap *b = device->bitmap;
  863. struct page *page;
  864. unsigned int len;
  865. unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE;
  866. sector_t on_disk_sector =
  867. device->ldev->md.md_offset + device->ldev->md.bm_offset;
  868. on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
  869. /* this might happen with very small
  870. * flexible external meta data device,
  871. * or with PAGE_SIZE > 4k */
  872. len = min_t(unsigned int, PAGE_SIZE,
  873. (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
  874. /* serialize IO on this page */
  875. bm_page_lock_io(device, page_nr);
  876. /* before memcpy and submit,
  877. * so it can be redirtied any time */
  878. bm_set_page_unchanged(b->bm_pages[page_nr]);
  879. if (ctx->flags & BM_AIO_COPY_PAGES) {
  880. page = mempool_alloc(&drbd_md_io_page_pool,
  881. GFP_NOIO | __GFP_HIGHMEM);
  882. copy_highpage(page, b->bm_pages[page_nr]);
  883. bm_store_page_idx(page, page_nr);
  884. } else
  885. page = b->bm_pages[page_nr];
  886. bio_set_dev(bio, device->ldev->md_bdev);
  887. bio->bi_iter.bi_sector = on_disk_sector;
  888. /* bio_add_page of a single page to an empty bio will always succeed,
  889. * according to the API. Do we want to assert that? */
  890. bio_add_page(bio, page, len, 0);
  891. bio->bi_private = ctx;
  892. bio->bi_end_io = drbd_bm_endio;
  893. bio_set_op_attrs(bio, op, 0);
  894. if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
  895. bio_io_error(bio);
  896. } else {
  897. submit_bio(bio);
  898. /* this should not count as user activity and cause the
  899. * resync to throttle -- see drbd_rs_should_slow_down(). */
  900. atomic_add(len >> 9, &device->rs_sect_ev);
  901. }
  902. }
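/*
 * Editor's note: the on-disk placement computed above, as a standalone
 * example (not part of the driver) with hypothetical offsets, 512 byte
 * sectors and 4KiB pages (eight sectors per bitmap page).
 */
#if 0
#include <assert.h>

static void bm_example_on_disk_sector(void)
{
	unsigned long long md_offset = 1024, bm_offset = 8;	/* hypothetical */
	unsigned int page_nr = 5;
	unsigned long long sector = md_offset + bm_offset
		+ ((unsigned long long)page_nr << (12 - 9));

	assert(sector == 1072);		/* 1024 + 8 + 5 * 8 sectors */
}
#endif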
  903. /*
  904. * bm_rw: read/write the whole bitmap from/to its on disk location.
  905. */
  906. static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
  907. {
  908. struct drbd_bm_aio_ctx *ctx;
  909. struct drbd_bitmap *b = device->bitmap;
  910. unsigned int num_pages, i, count = 0;
  911. unsigned long now;
  912. char ppb[10];
  913. int err = 0;
  914. /*
  915. * We are protected against bitmap disappearing/resizing by holding an
  916. * ldev reference (caller must have called get_ldev()).
  917. * For read/write, we are protected against changes to the bitmap by
  918. * the bitmap lock (see drbd_bitmap_io).
  919. * For lazy writeout, we don't care for ongoing changes to the bitmap,
  920. * as we submit copies of pages anyways.
  921. */
  922. ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
  923. if (!ctx)
  924. return -ENOMEM;
  925. *ctx = (struct drbd_bm_aio_ctx) {
  926. .device = device,
  927. .start_jif = jiffies,
  928. .in_flight = ATOMIC_INIT(1),
  929. .done = 0,
  930. .flags = flags,
  931. .error = 0,
  932. .kref = KREF_INIT(2),
  933. };
  934. if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */
  935. drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
  936. kfree(ctx);
  937. return -ENODEV;
  938. }
  939. /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
  940. drbd_adm_attach(), after device->ldev was assigned. */
  941. if (0 == (ctx->flags & ~BM_AIO_READ))
  942. WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
  943. spin_lock_irq(&device->resource->req_lock);
  944. list_add_tail(&ctx->list, &device->pending_bitmap_io);
  945. spin_unlock_irq(&device->resource->req_lock);
  946. num_pages = b->bm_number_of_pages;
  947. now = jiffies;
  948. /* let the layers below us try to merge these bios... */
  949. if (flags & BM_AIO_READ) {
  950. for (i = 0; i < num_pages; i++) {
  951. atomic_inc(&ctx->in_flight);
  952. bm_page_io_async(ctx, i);
  953. ++count;
  954. cond_resched();
  955. }
  956. } else if (flags & BM_AIO_WRITE_HINTED) {
  957. /* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
  958. unsigned int hint;
  959. for (hint = 0; hint < b->n_bitmap_hints; hint++) {
  960. i = b->al_bitmap_hints[hint];
  961. if (i >= num_pages) /* == -1U: no hint here. */
  962. continue;
  963. /* Several AL-extents may point to the same page. */
  964. if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
  965. &page_private(b->bm_pages[i])))
  966. continue;
  967. /* Has it even changed? */
  968. if (bm_test_page_unchanged(b->bm_pages[i]))
  969. continue;
  970. atomic_inc(&ctx->in_flight);
  971. bm_page_io_async(ctx, i);
  972. ++count;
  973. }
  974. } else {
  975. for (i = 0; i < num_pages; i++) {
  976. /* ignore completely unchanged pages */
  977. if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
  978. break;
  979. if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
  980. bm_test_page_unchanged(b->bm_pages[i])) {
  981. dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
  982. continue;
  983. }
  984. /* during lazy writeout,
  985. * ignore those pages not marked for lazy writeout. */
  986. if (lazy_writeout_upper_idx &&
  987. !bm_test_page_lazy_writeout(b->bm_pages[i])) {
  988. dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
  989. continue;
  990. }
  991. atomic_inc(&ctx->in_flight);
  992. bm_page_io_async(ctx, i);
  993. ++count;
  994. cond_resched();
  995. }
  996. }
  997. /*
  998. * We initialize ctx->in_flight to one to make sure drbd_bm_endio
  999. * will not set ctx->done early, and decrement / test it here. If there
  1000. * are still some bios in flight, we need to wait for them here.
  1001. * If all IO is done already (or nothing had been submitted), there is
  1002. * no need to wait. Still, we need to put the kref associated with the
  1003. * "in_flight reached zero, all done" event.
  1004. */
  1005. if (!atomic_dec_and_test(&ctx->in_flight))
  1006. wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
  1007. else
  1008. kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
  1009. /* summary for global bitmap IO */
  1010. if (flags == 0 || flags == BM_AIO_READ) {
  1011. unsigned int ms = jiffies_to_msecs(jiffies - now);
  1012. if (ms > 5) {
  1013. drbd_info(device, "bitmap %s of %u pages took %u ms\n",
  1014. (flags & BM_AIO_READ) ? "READ" : "WRITE",
  1015. count, ms);
  1016. }
  1017. }
  1018. if (ctx->error) {
  1019. drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
  1020. drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
  1021. err = -EIO; /* ctx->error ? */
  1022. }
  1023. if (atomic_read(&ctx->in_flight))
  1024. err = -EIO; /* Disk timeout/force-detach during IO... */
  1025. now = jiffies;
  1026. if (flags & BM_AIO_READ) {
  1027. b->bm_set = bm_count_bits(b);
  1028. drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
  1029. jiffies - now);
  1030. }
  1031. now = b->bm_set;
  1032. if ((flags & ~BM_AIO_READ) == 0)
  1033. drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
  1034. ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
  1035. kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
  1036. return err;
  1037. }
  1038. /**
  1039. * drbd_bm_read() - Read the whole bitmap from its on disk location.
  1040. * @device: DRBD device.
  1041. */
  1042. int drbd_bm_read(struct drbd_device *device) __must_hold(local)
  1043. {
  1044. return bm_rw(device, BM_AIO_READ, 0);
  1045. }
  1046. /**
  1047. * drbd_bm_write() - Write the whole bitmap to its on disk location.
  1048. * @device: DRBD device.
  1049. *
  1050. * Will only write pages that have changed since last IO.
  1051. */
  1052. int drbd_bm_write(struct drbd_device *device) __must_hold(local)
  1053. {
  1054. return bm_rw(device, 0, 0);
  1055. }
  1056. /**
  1057. * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
  1058. * @device: DRBD device.
  1059. *
  1060. * Will write all pages.
  1061. */
  1062. int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
  1063. {
  1064. return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
  1065. }
  1066. /**
  1067. * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  1068. * @device: DRBD device.
  1069. * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
  1070. */
  1071. int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
  1072. {
  1073. return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
  1074. }
  1075. /**
  1076. * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
  1077. * @device: DRBD device.
  1078. *
  1079. * Will only write pages that have changed since last IO.
  1080. * In contrast to drbd_bm_write(), this will copy the bitmap pages
  1081. * to temporary writeout pages. It is intended to trigger a full write-out
  1082. * while still allowing the bitmap to change, for example if a resync or online
  1083. * verify is aborted due to a failed peer disk, while local IO continues, or
  1084. * pending resync acks are still being processed.
  1085. */
  1086. int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
  1087. {
  1088. return bm_rw(device, BM_AIO_COPY_PAGES, 0);
  1089. }
  1090. /**
  1091. * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
  1092. * @device: DRBD device.
  1093. */
  1094. int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
  1095. {
  1096. return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
  1097. }
  1098. /* NOTE
  1099. * find_first_bit returns int, we return unsigned long.
  1100. * For this to work on 32bit arch with bitnumbers > (1<<32),
  1101. * we'd need to return u64, and get a whole lot of other places
  1102. * fixed where we still use unsigned long.
  1103. *
  1104. * this returns a bit number, NOT a sector!
  1105. */
  1106. static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
  1107. const int find_zero_bit)
  1108. {
  1109. struct drbd_bitmap *b = device->bitmap;
  1110. unsigned long *p_addr;
  1111. unsigned long bit_offset;
  1112. unsigned i;
  1113. if (bm_fo > b->bm_bits) {
  1114. drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
  1115. bm_fo = DRBD_END_OF_BITMAP;
  1116. } else {
  1117. while (bm_fo < b->bm_bits) {
  1118. /* bit offset of the first bit in the page */
  1119. bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
  1120. p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
  1121. if (find_zero_bit)
  1122. i = find_next_zero_bit_le(p_addr,
  1123. PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
  1124. else
  1125. i = find_next_bit_le(p_addr,
  1126. PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
  1127. __bm_unmap(p_addr);
  1128. if (i < PAGE_SIZE*8) {
  1129. bm_fo = bit_offset + i;
  1130. if (bm_fo >= b->bm_bits)
  1131. break;
  1132. goto found;
  1133. }
  1134. bm_fo = bit_offset + PAGE_SIZE*8;
  1135. }
  1136. bm_fo = DRBD_END_OF_BITMAP;
  1137. }
  1138. found:
  1139. return bm_fo;
  1140. }
  1141. static unsigned long bm_find_next(struct drbd_device *device,
  1142. unsigned long bm_fo, const int find_zero_bit)
  1143. {
  1144. struct drbd_bitmap *b = device->bitmap;
  1145. unsigned long i = DRBD_END_OF_BITMAP;
  1146. if (!expect(b))
  1147. return i;
  1148. if (!expect(b->bm_pages))
  1149. return i;
  1150. spin_lock_irq(&b->bm_lock);
  1151. if (BM_DONT_TEST & b->bm_flags)
  1152. bm_print_lock_info(device);
  1153. i = __bm_find_next(device, bm_fo, find_zero_bit);
  1154. spin_unlock_irq(&b->bm_lock);
  1155. return i;
  1156. }
  1157. unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
  1158. {
  1159. return bm_find_next(device, bm_fo, 0);
  1160. }
  1161. #if 0
  1162. /* not yet needed for anything. */
  1163. unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
  1164. {
  1165. return bm_find_next(device, bm_fo, 1);
  1166. }
  1167. #endif
  1168. /* does not spin_lock_irqsave.
  1169. * you must take drbd_bm_lock() first */
  1170. unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
  1171. {
  1172. /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
  1173. return __bm_find_next(device, bm_fo, 0);
  1174. }
  1175. unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
  1176. {
  1177. /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
  1178. return __bm_find_next(device, bm_fo, 1);
  1179. }
  1180. /* returns number of bits actually changed.
  1181. * for val != 0, we change 0 -> 1, return code positive
  1182. * for val == 0, we change 1 -> 0, return code negative
  1183. * wants bitnr, not sector.
  1184. * expected to be called for only a few bits (e - s about BITS_PER_LONG).
  1185. * Must hold bitmap lock already. */
  1186. static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
  1187. unsigned long e, int val)
  1188. {
  1189. struct drbd_bitmap *b = device->bitmap;
  1190. unsigned long *p_addr = NULL;
  1191. unsigned long bitnr;
  1192. unsigned int last_page_nr = -1U;
  1193. int c = 0;
  1194. int changed_total = 0;
  1195. if (e >= b->bm_bits) {
  1196. drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
  1197. s, e, b->bm_bits);
  1198. e = b->bm_bits ? b->bm_bits -1 : 0;
  1199. }
  1200. for (bitnr = s; bitnr <= e; bitnr++) {
  1201. unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
  1202. if (page_nr != last_page_nr) {
  1203. if (p_addr)
  1204. __bm_unmap(p_addr);
  1205. if (c < 0)
  1206. bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
  1207. else if (c > 0)
  1208. bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
  1209. changed_total += c;
  1210. c = 0;
  1211. p_addr = __bm_map_pidx(b, page_nr);
  1212. last_page_nr = page_nr;
  1213. }
  1214. if (val)
  1215. c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
  1216. else
  1217. c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
  1218. }
  1219. if (p_addr)
  1220. __bm_unmap(p_addr);
  1221. if (c < 0)
  1222. bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
  1223. else if (c > 0)
  1224. bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
  1225. changed_total += c;
  1226. b->bm_set += changed_total;
  1227. return changed_total;
  1228. }
  1229. /* returns number of bits actually changed.
  1230. * for val != 0, we change 0 -> 1, return code positive
  1231. * for val == 0, we change 1 -> 0, return code negative
  1232. * wants bitnr, not sector */
  1233. static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
  1234. const unsigned long e, int val)
  1235. {
  1236. unsigned long flags;
  1237. struct drbd_bitmap *b = device->bitmap;
  1238. int c = 0;
  1239. if (!expect(b))
  1240. return 1;
  1241. if (!expect(b->bm_pages))
  1242. return 0;
  1243. spin_lock_irqsave(&b->bm_lock, flags);
  1244. if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
  1245. bm_print_lock_info(device);
  1246. c = __bm_change_bits_to(device, s, e, val);
  1247. spin_unlock_irqrestore(&b->bm_lock, flags);
  1248. return c;
  1249. }
  1250. /* returns number of bits changed 0 -> 1 */
  1251. int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
  1252. {
  1253. return bm_change_bits_to(device, s, e, 1);
  1254. }
  1255. /* returns number of bits changed 1 -> 0 */
  1256. int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
  1257. {
  1258. return -bm_change_bits_to(device, s, e, 0);
  1259. }
  1260. /* sets all bits in full words,
  1261. * from first_word up to, but not including, last_word */
  1262. static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
  1263. int page_nr, int first_word, int last_word)
  1264. {
  1265. int i;
  1266. int bits;
  1267. int changed = 0;
  1268. unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
  1269. /* I think it is more cache-line friendly to hweight_long() each word and then set it to ~0UL,
  1270. * than to first bitmap_weight() all words, then bitmap_fill() all words */
  1271. for (i = first_word; i < last_word; i++) {
  1272. bits = hweight_long(paddr[i]);
  1273. paddr[i] = ~0UL;
  1274. changed += BITS_PER_LONG - bits;
  1275. }
  1276. kunmap_atomic(paddr);
  1277. if (changed) {
  1278. /* We only need lazy writeout, the information is still in the
  1279. * remote bitmap as well, and is reconstructed during the next
  1280. * bitmap exchange, if lost locally due to a crash. */
  1281. bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
  1282. b->bm_set += changed;
  1283. }
  1284. }
  1285. /* Same thing as drbd_bm_set_bits,
  1286. * but more efficient for a large bit range.
  1287. * You must first drbd_bm_lock().
  1288. * Can be called to set the whole bitmap in one go.
  1289. * Sets bits from s to e _inclusive_. */
  1290. void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
  1291. {
  1292. /* First set_bit from the first bit (s)
  1293. * up to the next long boundary (sl),
  1294. * then assign full words up to the last long boundary (el),
  1295. * then set_bit up to and including the last bit (e).
  1296. *
  1297. * Do not use memset, because we must account for changes,
  1298. * so we need to loop over the words with hweight() anyways.
  1299. */
  1300. struct drbd_bitmap *b = device->bitmap;
  1301. unsigned long sl = ALIGN(s,BITS_PER_LONG);
  1302. unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
  1303. int first_page;
  1304. int last_page;
  1305. int page_nr;
  1306. int first_word;
  1307. int last_word;
  1308. if (e - s <= 3*BITS_PER_LONG) {
  1309. /* don't bother; el and sl may even be wrong. */
  1310. spin_lock_irq(&b->bm_lock);
  1311. __bm_change_bits_to(device, s, e, 1);
  1312. spin_unlock_irq(&b->bm_lock);
  1313. return;
  1314. }
  1315. /* difference is large enough that we can trust sl and el */
  1316. spin_lock_irq(&b->bm_lock);
  1317. /* bits filling the current long */
  1318. if (sl)
  1319. __bm_change_bits_to(device, s, sl-1, 1);
  1320. first_page = sl >> (3 + PAGE_SHIFT);
  1321. last_page = el >> (3 + PAGE_SHIFT);
  1322. /* MLPP: modulo longs per page */
  1323. /* LWPP: long words per page */
  1324. first_word = MLPP(sl >> LN2_BPL);
  1325. last_word = LWPP;
  1326. /* first and full pages, unless first page == last page */
  1327. for (page_nr = first_page; page_nr < last_page; page_nr++) {
  1328. bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
  1329. spin_unlock_irq(&b->bm_lock);
  1330. cond_resched();
  1331. first_word = 0;
  1332. spin_lock_irq(&b->bm_lock);
  1333. }
  1334. /* last page (respectively only page, for first page == last page) */
  1335. last_word = MLPP(el >> LN2_BPL);
  1336. /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
  1337. * ==> e = 32767, el = 32768, last_page = 1,
  1338. * and now last_word = 0.
  1339. * We do not want to touch last_page in this case,
  1340. * as we did not allocate it, it is not present in bitmap->bm_pages.
  1341. */
  1342. if (last_word)
  1343. bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
  1344. /* possibly trailing bits.
  1345. * example: (e & 63) == 63, el will be e+1.
  1346. * if that even was the very last bit,
  1347. * it would trigger an assert in __bm_change_bits_to()
  1348. */
  1349. if (el <= e)
  1350. __bm_change_bits_to(device, el, e, 1);
  1351. spin_unlock_irq(&b->bm_lock);
  1352. }
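/*
 * Editor's note: how s and e split into the head (s..sl-1), full-word body
 * (sl..el-1) and tail (el..e) ranges handled above; a standalone sketch
 * (not part of the driver) with hypothetical bit numbers on a 64bit arch.
 */
#if 0
#include <assert.h>

static void bm_example_set_bits_split(void)
{
	unsigned long s = 100, e = 1000;
	unsigned long sl = (s + 63) & ~63UL;	/* ALIGN(s, 64)            */
	unsigned long el = (e + 1) & ~63UL;	/* last long word boundary */

	assert(sl == 128);	/* head: bits 100..127 set one by one      */
	assert(el == 960);	/* body: words for bits 128..959 set to ~0 */
	/* tail: bits 960..1000 again set one by one */
}
#endif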
  1353. /* returns bit state
  1354. * wants bitnr, NOT sector.
  1355. * inherently racy... area needs to be locked by means of {al,rs}_lru
  1356. * 1 ... bit set
  1357. * 0 ... bit not set
  1358. * -1 ... first out of bounds access, stop testing for bits!
  1359. */
  1360. int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
  1361. {
  1362. unsigned long flags;
  1363. struct drbd_bitmap *b = device->bitmap;
  1364. unsigned long *p_addr;
  1365. int i;
  1366. if (!expect(b))
  1367. return 0;
  1368. if (!expect(b->bm_pages))
  1369. return 0;
  1370. spin_lock_irqsave(&b->bm_lock, flags);
  1371. if (BM_DONT_TEST & b->bm_flags)
  1372. bm_print_lock_info(device);
  1373. if (bitnr < b->bm_bits) {
  1374. p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
  1375. i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
  1376. bm_unmap(p_addr);
  1377. } else if (bitnr == b->bm_bits) {
  1378. i = -1;
  1379. } else { /* (bitnr > b->bm_bits) */
  1380. drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
  1381. i = 0;
  1382. }
  1383. spin_unlock_irqrestore(&b->bm_lock, flags);
  1384. return i;
  1385. }
  1386. /* returns number of bits set in the range [s, e] */
  1387. int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
  1388. {
  1389. unsigned long flags;
  1390. struct drbd_bitmap *b = device->bitmap;
  1391. unsigned long *p_addr = NULL;
  1392. unsigned long bitnr;
  1393. unsigned int page_nr = -1U;
  1394. int c = 0;
  1395. /* If this is called without a bitmap, that is a bug. But just to be
  1396. * robust in case we screwed up elsewhere, in that case pretend there
  1397. * was one dirty bit in the requested area, so we won't try to do a
  1398. * local read there (no bitmap probably implies no disk) */
  1399. if (!expect(b))
  1400. return 1;
  1401. if (!expect(b->bm_pages))
  1402. return 1;
  1403. spin_lock_irqsave(&b->bm_lock, flags);
  1404. if (BM_DONT_TEST & b->bm_flags)
  1405. bm_print_lock_info(device);
  1406. for (bitnr = s; bitnr <= e; bitnr++) {
  1407. unsigned int idx = bm_bit_to_page_idx(b, bitnr);
  1408. if (page_nr != idx) {
  1409. page_nr = idx;
  1410. if (p_addr)
  1411. bm_unmap(p_addr);
  1412. p_addr = bm_map_pidx(b, idx);
  1413. }
  1414. if (expect(bitnr < b->bm_bits))
  1415. c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
  1416. else
  1417. drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
  1418. }
  1419. if (p_addr)
  1420. bm_unmap(p_addr);
  1421. spin_unlock_irqrestore(&b->bm_lock, flags);
  1422. return c;
  1423. }
  1424. /* inherently racy...
  1425. * return value may be already out-of-date when this function returns.
  1426. * but the general usage is that this is only used during a cstate when bits are
  1427. * only cleared, not set, and we typically only care for the case when the return
  1428. * value is zero, or we already "locked" this "bitmap extent" by other means.
  1429. *
  1430. * enr is bm-extent number, since we chose to name one sector (512 bytes)
  1431. * worth of the bitmap a "bitmap extent".
  1432. *
  1433. * TODO
  1434. * I think since we use it like a reference count, we should use the real
  1435. * reference count of some bitmap extent element from some lru instead...
  1436. *
  1437. */
  1438. int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
  1439. {
  1440. struct drbd_bitmap *b = device->bitmap;
  1441. int count, s, e;
  1442. unsigned long flags;
  1443. unsigned long *p_addr, *bm;
  1444. if (!expect(b))
  1445. return 0;
  1446. if (!expect(b->bm_pages))
  1447. return 0;
  1448. spin_lock_irqsave(&b->bm_lock, flags);
  1449. if (BM_DONT_TEST & b->bm_flags)
  1450. bm_print_lock_info(device);
  1451. s = S2W(enr);
  1452. e = min((size_t)S2W(enr+1), b->bm_words);
  1453. count = 0;
  1454. if (s < b->bm_words) {
  1455. int n = e-s;
  1456. p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
  1457. bm = p_addr + MLPP(s);
  1458. count += bitmap_weight(bm, n * BITS_PER_LONG);
  1459. bm_unmap(p_addr);
  1460. } else {
  1461. drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
  1462. }
  1463. spin_unlock_irqrestore(&b->bm_lock, flags);
  1464. return count;
  1465. }