dm-writecache.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/dax.h>
#include <linux/pfn_t.h>
#include <linux/libnvdimm.h>
#include <linux/delay.h>
#include "dm-io-tracker.h"

#define DM_MSG_PREFIX "writecache"

#define HIGH_WATERMARK			50
#define LOW_WATERMARK			45
#define MAX_WRITEBACK_JOBS		min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY			16
#define WRITEBACK_LATENCY		64
#define AUTOCOMMIT_BLOCKS_SSD		65536
#define AUTOCOMMIT_BLOCKS_PMEM		64
#define AUTOCOMMIT_MSEC			1000
#define MAX_AGE_DIV			16
#define MAX_AGE_UNSPECIFIED		-1UL
#define PAUSE_WRITEBACK			(HZ * 3)

#define BITMAP_GRANULARITY	65536
#if BITMAP_GRANULARITY < PAGE_SIZE
#undef BITMAP_GRANULARITY
#define BITMAP_GRANULARITY	PAGE_SIZE
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

#ifdef DM_WRITECACHE_HAS_PMEM
#define pmem_assign(dest, src)					\
do {								\
	typeof(dest) uniq = (src);				\
	memcpy_flushcache(&(dest), &uniq, sizeof(dest));	\
} while (0)
#else
#define pmem_assign(dest, src)	((dest) = (src))
#endif

#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
#endif

#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
#define MEMORY_SUPERBLOCK_VERSION	1
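
/*
 * On-media metadata layout: a superblock header followed by one
 * wc_memory_entry per cache block.  In persistent-memory mode these
 * structures live directly in the DAX-mapped memory; in SSD mode the
 * same layout is kept in a RAM copy and synchronized via dm-io.
 */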
struct wc_memory_entry {
	__le64 original_sector;
	__le64 seq_count;
};

struct wc_memory_superblock {
	union {
		struct {
			__le32 magic;
			__le32 version;
			__le32 block_size;
			__le32 pad;
			__le64 n_blocks;
			__le64 seq_count;
		};
		__le64 padding[8];
	};
	struct wc_memory_entry entries[];
};

struct wc_entry {
	struct rb_node rb_node;
	struct list_head lru;
	unsigned short wc_list_contiguous;
#if BITS_PER_LONG == 64
	bool write_in_progress : 1;
	unsigned long index : 47;
#else
	bool write_in_progress;
	unsigned long index;
#endif
	unsigned long age;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	uint64_t original_sector;
	uint64_t seq_count;
#endif
};

#ifdef DM_WRITECACHE_HAS_PMEM
#define WC_MODE_PMEM(wc)	((wc)->pmem_mode)
#define WC_MODE_FUA(wc)		((wc)->writeback_fua)
#else
#define WC_MODE_PMEM(wc)	false
#define WC_MODE_FUA(wc)		false
#endif
#define WC_MODE_SORT_FREELIST(wc)	(!WC_MODE_PMEM(wc))

struct dm_writecache {
	struct mutex lock;
	struct list_head lru;
	union {
		struct list_head freelist;
		struct {
			struct rb_root freetree;
			struct wc_entry *current_free;
		};
	};
	struct rb_root tree;

	size_t freelist_size;
	size_t writeback_size;
	size_t freelist_high_watermark;
	size_t freelist_low_watermark;
	unsigned long max_age;
	unsigned long pause;

	unsigned int uncommitted_blocks;
	unsigned int autocommit_blocks;
	unsigned int max_writeback_jobs;

	int error;

	unsigned long autocommit_jiffies;
	struct timer_list autocommit_timer;
	struct wait_queue_head freelist_wait;

	struct timer_list max_age_timer;

	atomic_t bio_in_progress[2];
	struct wait_queue_head bio_in_progress_wait[2];

	struct dm_target *ti;
	struct dm_dev *dev;
	struct dm_dev *ssd_dev;
	sector_t start_sector;
	void *memory_map;
	uint64_t memory_map_size;
	size_t metadata_sectors;
	size_t n_blocks;
	uint64_t seq_count;
	sector_t data_device_sectors;
	void *block_start;
	struct wc_entry *entries;
	unsigned int block_size;
	unsigned char block_size_bits;

	bool pmem_mode:1;
	bool writeback_fua:1;

	bool overwrote_committed:1;
	bool memory_vmapped:1;

	bool start_sector_set:1;
	bool high_wm_percent_set:1;
	bool low_wm_percent_set:1;
	bool max_writeback_jobs_set:1;
	bool autocommit_blocks_set:1;
	bool autocommit_time_set:1;
	bool max_age_set:1;
	bool writeback_fua_set:1;
	bool flush_on_suspend:1;
	bool cleaner:1;
	bool cleaner_set:1;
	bool metadata_only:1;
	bool pause_set:1;

	unsigned int high_wm_percent_value;
	unsigned int low_wm_percent_value;
	unsigned int autocommit_time_value;
	unsigned int max_age_value;
	unsigned int pause_value;

	unsigned int writeback_all;
	struct workqueue_struct *writeback_wq;
	struct work_struct writeback_work;
	struct work_struct flush_work;

	struct dm_io_tracker iot;

	struct dm_io_client *dm_io;

	raw_spinlock_t endio_list_lock;
	struct list_head endio_list;
	struct task_struct *endio_thread;

	struct task_struct *flush_thread;
	struct bio_list flush_list;

	struct dm_kcopyd_client *dm_kcopyd;
	unsigned long *dirty_bitmap;
	unsigned int dirty_bitmap_size;

	struct bio_set bio_set;
	mempool_t copy_pool;

	struct {
		unsigned long long reads;
		unsigned long long read_hits;
		unsigned long long writes;
		unsigned long long write_hits_uncommitted;
		unsigned long long write_hits_committed;
		unsigned long long writes_around;
		unsigned long long writes_allocate;
		unsigned long long writes_blocked_on_freelist;
		unsigned long long flushes;
		unsigned long long discards;
	} stats;
};

#define WB_LIST_INLINE		16

struct writeback_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry **wc_list;
	unsigned int wc_list_n;
	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
	struct bio bio;
};

struct copy_struct {
	struct list_head endio_entry;
	struct dm_writecache *wc;
	struct wc_entry *e;
	unsigned int n_entries;
	int error;
};

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
					"A percentage of time allocated for data copying");

static void wc_lock(struct dm_writecache *wc)
{
	mutex_lock(&wc->lock);
}

static void wc_unlock(struct dm_writecache *wc)
{
	mutex_unlock(&wc->lock);
}
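
/*
 * Map the cache device's persistent memory into the kernel address
 * space via DAX.  If dax_direct_access() cannot return the whole range
 * as one contiguous mapping, fall back to collecting the individual
 * pages and vmap()ing them.
 */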
#ifdef DM_WRITECACHE_HAS_PMEM
static int persistent_memory_claim(struct dm_writecache *wc)
{
	int r;
	loff_t s;
	long p, da;
	pfn_t pfn;
	int id;
	struct page **pages;
	sector_t offset;

	wc->memory_vmapped = false;

	s = wc->memory_map_size;
	p = s >> PAGE_SHIFT;
	if (!p) {
		r = -EINVAL;
		goto err1;
	}
	if (p != s >> PAGE_SHIFT) {
		r = -EOVERFLOW;
		goto err1;
	}

	offset = get_start_sect(wc->ssd_dev->bdev);
	if (offset & (PAGE_SIZE / 512 - 1)) {
		r = -EINVAL;
		goto err1;
	}
	offset >>= PAGE_SHIFT - 9;

	id = dax_read_lock();

	da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, DAX_ACCESS,
			&wc->memory_map, &pfn);
	if (da < 0) {
		wc->memory_map = NULL;
		r = da;
		goto err2;
	}
	if (!pfn_t_has_page(pfn)) {
		wc->memory_map = NULL;
		r = -EOPNOTSUPP;
		goto err2;
	}
	if (da != p) {
		long i;

		wc->memory_map = NULL;
		pages = vmalloc_array(p, sizeof(struct page *));
		if (!pages) {
			r = -ENOMEM;
			goto err2;
		}
		i = 0;
		do {
			long daa;

			daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i,
					p - i, DAX_ACCESS, NULL, &pfn);
			if (daa <= 0) {
				r = daa ? daa : -EINVAL;
				goto err3;
			}
			if (!pfn_t_has_page(pfn)) {
				r = -EOPNOTSUPP;
				goto err3;
			}
			while (daa-- && i < p) {
				pages[i++] = pfn_t_to_page(pfn);
				pfn.val++;
				if (!(i & 15))
					cond_resched();
			}
		} while (i < p);
		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
		if (!wc->memory_map) {
			r = -ENOMEM;
			goto err3;
		}
		vfree(pages);
		wc->memory_vmapped = true;
	}

	dax_read_unlock(id);

	wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
	wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;

	return 0;
err3:
	vfree(pages);
err2:
	dax_read_unlock(id);
err1:
	return r;
}
#else
static int persistent_memory_claim(struct dm_writecache *wc)
{
	return -EOPNOTSUPP;
}
#endif

static void persistent_memory_release(struct dm_writecache *wc)
{
	if (wc->memory_vmapped)
		vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
}

static struct page *persistent_memory_page(void *addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	else
		return virt_to_page(addr);
}

static unsigned int persistent_memory_page_offset(void *addr)
{
	return (unsigned long)addr & (PAGE_SIZE - 1);
}

static void persistent_memory_flush_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		flush_kernel_vmap_range(ptr, size);
}

static void persistent_memory_invalidate_cache(void *ptr, size_t size)
{
	if (is_vmalloc_addr(ptr))
		invalidate_kernel_vmap_range(ptr, size);
}

static struct wc_memory_superblock *sb(struct dm_writecache *wc)
{
	return wc->memory_map;
}

static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	return &sb(wc)->entries[e->index];
}

static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
{
	return (char *)wc->block_start + (e->index << wc->block_size_bits);
}

static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
{
	return wc->start_sector + wc->metadata_sectors +
		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
}

static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->original_sector;
#else
	return le64_to_cpu(memory_entry(wc, e)->original_sector);
#endif
}

static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	return e->seq_count;
#else
	return le64_to_cpu(memory_entry(wc, e)->seq_count);
#endif
}

static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->seq_count = -1;
#endif
	pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
}

static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
					    uint64_t original_sector, uint64_t seq_count)
{
	struct wc_memory_entry me;

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	e->original_sector = original_sector;
	e->seq_count = seq_count;
#endif
	me.original_sector = cpu_to_le64(original_sector);
	me.seq_count = cpu_to_le64(seq_count);
	pmem_assign(*memory_entry(wc, e), me);
}

#define writecache_error(wc, err, msg, arg...)				\
do {									\
	if (!cmpxchg(&(wc)->error, 0, err))				\
		DMERR(msg, ##arg);					\
	wake_up(&(wc)->freelist_wait);					\
} while (0)

#define writecache_has_error(wc)	(unlikely(READ_ONCE((wc)->error)))

static void writecache_flush_all_metadata(struct dm_writecache *wc)
{
	if (!WC_MODE_PMEM(wc))
		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
}

static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
{
	if (!WC_MODE_PMEM(wc))
		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
			  wc->dirty_bitmap);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);

struct io_notify {
	struct dm_writecache *wc;
	struct completion c;
	atomic_t count;
};

static void writecache_notify_io(unsigned long error, void *context)
{
	struct io_notify *endio = context;

	if (unlikely(error != 0))
		writecache_error(endio->wc, -EIO, "error writing metadata");
	BUG_ON(atomic_read(&endio->count) <= 0);
	if (atomic_dec_and_test(&endio->count))
		complete(&endio->c);
}

static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
	wait_event(wc->bio_in_progress_wait[direction],
		   !atomic_read(&wc->bio_in_progress[direction]));
}
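
/*
 * SSD mode: walk the dirty bitmap, write each run of dirty
 * BITMAP_GRANULARITY-sized metadata regions with async dm-io, wait for
 * all writes to complete, then issue a disk cache flush and clear the
 * bitmap.
 */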
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	struct dm_io_region region;
	struct dm_io_request req;
	struct io_notify endio = {
		wc,
		COMPLETION_INITIALIZER_ONSTACK(endio.c),
		ATOMIC_INIT(1),
	};
	unsigned int bitmap_bits = wc->dirty_bitmap_size * 8;
	unsigned int i = 0;

	while (1) {
		unsigned int j;

		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
		if (unlikely(i == bitmap_bits))
			break;
		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);

		region.bdev = wc->ssd_dev->bdev;
		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);

		if (unlikely(region.sector >= wc->metadata_sectors))
			break;
		if (unlikely(region.sector + region.count > wc->metadata_sectors))
			region.count = wc->metadata_sectors - region.sector;

		region.sector += wc->start_sector;
		atomic_inc(&endio.count);
		req.bi_opf = REQ_OP_WRITE | REQ_SYNC;
		req.mem.type = DM_IO_VMA;
		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
		req.client = wc->dm_io;
		req.notify.fn = writecache_notify_io;
		req.notify.context = &endio;

		/* writing via async dm-io (implied by notify.fn above) won't return an error */
		(void) dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
		i = j;
	}

	writecache_notify_io(0, &endio);
	wait_for_completion_io(&endio.c);

	if (wait_for_ios)
		writecache_wait_for_ios(wc, WRITE);

	writecache_disk_flush(wc, wc->ssd_dev);

	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}

static void ssd_commit_superblock(struct dm_writecache *wc)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = 0;
	region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;

	if (unlikely(region.sector + region.count > wc->metadata_sectors))
		region.count = wc->metadata_sectors - region.sector;

	region.sector += wc->start_sector;

	req.bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;
	req.notify.context = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error writing superblock");
}

static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{
	if (WC_MODE_PMEM(wc))
		pmem_wmb();
	else
		ssd_commit_flushed(wc, wait_for_ios);
}

static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
{
	int r;
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = dev->bdev;
	region.sector = 0;
	region.count = 0;
	req.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	req.mem.type = DM_IO_KMEM;
	req.mem.ptr.addr = NULL;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	r = dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
	if (unlikely(r))
		writecache_error(wc, r, "error flushing metadata: %d", r);
}

#define WFE_RETURN_FOLLOWING	1
#define WFE_LOWEST_SEQ		2
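
/*
 * Look up the cache entry for @block in the rb-tree.  With
 * WFE_RETURN_FOLLOWING, a miss returns the entry with the next higher
 * original sector instead of NULL; WFE_LOWEST_SEQ selects the entry
 * with the lowest sequence count when several entries share the same
 * original sector.
 */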
static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
					      uint64_t block, int flags)
{
	struct wc_entry *e;
	struct rb_node *node = wc->tree.rb_node;

	if (unlikely(!node))
		return NULL;

	while (1) {
		e = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e) == block)
			break;

		node = (read_original_sector(wc, e) >= block ?
			e->rb_node.rb_left : e->rb_node.rb_right);
		if (unlikely(!node)) {
			if (!(flags & WFE_RETURN_FOLLOWING))
				return NULL;
			if (read_original_sector(wc, e) >= block)
				return e;

			node = rb_next(&e->rb_node);
			if (unlikely(!node))
				return NULL;

			e = container_of(node, struct wc_entry, rb_node);
			return e;
		}
	}

	while (1) {
		struct wc_entry *e2;

		if (flags & WFE_LOWEST_SEQ)
			node = rb_prev(&e->rb_node);
		else
			node = rb_next(&e->rb_node);
		if (unlikely(!node))
			return e;
		e2 = container_of(node, struct wc_entry, rb_node);
		if (read_original_sector(wc, e2) != block)
			return e;
		e = e2;
	}
}

static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
{
	struct wc_entry *e;
	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;

	while (*node) {
		e = container_of(*node, struct wc_entry, rb_node);
		parent = &e->rb_node;
		if (read_original_sector(wc, e) > read_original_sector(wc, ins))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}
	rb_link_node(&ins->rb_node, parent, node);
	rb_insert_color(&ins->rb_node, &wc->tree);
	list_add(&ins->lru, &wc->lru);
	ins->age = jiffies;
}

static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
{
	list_del(&e->lru);
	rb_erase(&e->rb_node, &wc->tree);
}

static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
{
	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;

		if (unlikely(!*node))
			wc->current_free = e;
		while (*node) {
			parent = *node;
			if (&e->rb_node < *node)
				node = &parent->rb_left;
			else
				node = &parent->rb_right;
		}
		rb_link_node(&e->rb_node, parent, node);
		rb_insert_color(&e->rb_node, &wc->freetree);
	} else {
		list_add_tail(&e->lru, &wc->freelist);
	}
	wc->freelist_size++;
}

static inline void writecache_verify_watermark(struct dm_writecache *wc)
{
	if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
		queue_work(wc->writeback_wq, &wc->writeback_work);
}

static void writecache_max_age_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);

	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
		queue_work(wc->writeback_wq, &wc->writeback_work);
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
	}
}

static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
	struct wc_entry *e;

	if (WC_MODE_SORT_FREELIST(wc)) {
		struct rb_node *next;

		if (unlikely(!wc->current_free))
			return NULL;
		e = wc->current_free;
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		next = rb_next(&e->rb_node);
		rb_erase(&e->rb_node, &wc->freetree);
		if (unlikely(!next))
			next = rb_first(&wc->freetree);
		wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
	} else {
		if (unlikely(list_empty(&wc->freelist)))
			return NULL;
		e = container_of(wc->freelist.next, struct wc_entry, lru);
		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
			return NULL;
		list_del(&e->lru);
	}
	wc->freelist_size--;

	writecache_verify_watermark(wc);

	return e;
}

static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_unlink(wc, e);
	writecache_add_to_freelist(wc, e);
	clear_seq_count(wc, e);
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (unlikely(waitqueue_active(&wc->freelist_wait)))
		wake_up(&wc->freelist_wait);
}

static void writecache_wait_on_freelist(struct dm_writecache *wc)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
	wc_unlock(wc);
	io_schedule();
	finish_wait(&wc->freelist_wait, &wait);
	wc_lock(wc);
}

static void writecache_poison_lists(struct dm_writecache *wc)
{
	/*
	 * Catch incorrect access to these values while the device is suspended.
	 */
	memset(&wc->tree, -1, sizeof(wc->tree));
	wc->lru.next = LIST_POISON1;
	wc->lru.prev = LIST_POISON2;
	wc->freelist.next = LIST_POISON1;
	wc->freelist.prev = LIST_POISON2;
}

static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
{
	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
	if (WC_MODE_PMEM(wc))
		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
}

static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
{
	return read_seq_count(wc, e) < wc->seq_count;
}
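
/*
 * Commit all uncommitted entries: flush their metadata (and, in pmem
 * mode, their data), then bump and persist the superblock seq_count so
 * the entries become committed, and finally free any older entries
 * that the newly committed ones supersede.
 */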
static void writecache_flush(struct dm_writecache *wc)
{
	struct wc_entry *e, *e2;
	bool need_flush_after_free;

	wc->uncommitted_blocks = 0;
	del_timer(&wc->autocommit_timer);

	if (list_empty(&wc->lru))
		return;

	e = container_of(wc->lru.next, struct wc_entry, lru);
	if (writecache_entry_is_committed(wc, e)) {
		if (wc->overwrote_committed) {
			writecache_wait_for_ios(wc, WRITE);
			writecache_disk_flush(wc, wc->ssd_dev);
			wc->overwrote_committed = false;
		}
		return;
	}
	while (1) {
		writecache_flush_entry(wc, e);
		if (unlikely(e->lru.next == &wc->lru))
			break;
		e2 = container_of(e->lru.next, struct wc_entry, lru);
		if (writecache_entry_is_committed(wc, e2))
			break;
		e = e2;
		cond_resched();
	}
	writecache_commit_flushed(wc, true);

	wc->seq_count++;
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
	if (WC_MODE_PMEM(wc))
		writecache_commit_flushed(wc, false);
	else
		ssd_commit_superblock(wc);

	wc->overwrote_committed = false;

	need_flush_after_free = false;
	while (1) {
		/* Free another committed entry with lower seq-count */
		struct rb_node *rb_node = rb_prev(&e->rb_node);

		if (rb_node) {
			e2 = container_of(rb_node, struct wc_entry, rb_node);
			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
			    likely(!e2->write_in_progress)) {
				writecache_free_entry(wc, e2);
				need_flush_after_free = true;
			}
		}
		if (unlikely(e->lru.prev == &wc->lru))
			break;
		e = container_of(e->lru.prev, struct wc_entry, lru);
		cond_resched();
	}

	if (need_flush_after_free)
		writecache_commit_flushed(wc, false);
}

static void writecache_flush_work(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);

	wc_lock(wc);
	writecache_flush(wc);
	wc_unlock(wc);
}

static void writecache_autocommit_timer(struct timer_list *t)
{
	struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);

	if (!writecache_has_error(wc))
		queue_work(wc->writeback_wq, &wc->flush_work);
}

static void writecache_schedule_autocommit(struct dm_writecache *wc)
{
	if (!timer_pending(&wc->autocommit_timer))
		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
}
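
/*
 * Drop all cache entries that fall inside the discarded range
 * [start, end).  In SSD mode, in-flight bios are drained first because
 * they may still target the cache blocks being freed.
 */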
static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
{
	struct wc_entry *e;
	bool discarded_something = false;

	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
	if (unlikely(!e))
		return;

	while (read_original_sector(wc, e) < end) {
		struct rb_node *node = rb_next(&e->rb_node);

		if (likely(!e->write_in_progress)) {
			if (!discarded_something) {
				if (!WC_MODE_PMEM(wc)) {
					writecache_wait_for_ios(wc, READ);
					writecache_wait_for_ios(wc, WRITE);
				}
				discarded_something = true;
			}
			if (!writecache_entry_is_committed(wc, e))
				wc->uncommitted_blocks--;
			writecache_free_entry(wc, e);
		}

		if (unlikely(!node))
			break;

		e = container_of(node, struct wc_entry, rb_node);
	}

	if (discarded_something)
		writecache_commit_flushed(wc, false);
}

static bool writecache_wait_for_writeback(struct dm_writecache *wc)
{
	if (wc->writeback_size) {
		writecache_wait_on_freelist(wc);
		return true;
	}
	return false;
}

static void writecache_suspend(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	bool flush_on_suspend;

	del_timer_sync(&wc->autocommit_timer);
	del_timer_sync(&wc->max_age_timer);

	wc_lock(wc);
	writecache_flush(wc);
	flush_on_suspend = wc->flush_on_suspend;
	if (flush_on_suspend) {
		wc->flush_on_suspend = false;
		wc->writeback_all++;
		queue_work(wc->writeback_wq, &wc->writeback_work);
	}
	wc_unlock(wc);

	drain_workqueue(wc->writeback_wq);

	wc_lock(wc);
	if (flush_on_suspend)
		wc->writeback_all--;
	while (writecache_wait_for_writeback(wc))
		;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	writecache_poison_lists(wc);
	wc_unlock(wc);
}

static int writecache_alloc_entries(struct dm_writecache *wc)
{
	size_t b;

	if (wc->entries)
		return 0;
	wc->entries = vmalloc_array(wc->n_blocks, sizeof(struct wc_entry));
	if (!wc->entries)
		return -ENOMEM;
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		e->index = b;
		e->write_in_progress = false;
		cond_resched();
	}

	return 0;
}

static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
{
	struct dm_io_region region;
	struct dm_io_request req;

	region.bdev = wc->ssd_dev->bdev;
	region.sector = wc->start_sector;
	region.count = n_sectors;
	req.bi_opf = REQ_OP_READ | REQ_SYNC;
	req.mem.type = DM_IO_VMA;
	req.mem.ptr.vma = (char *)wc->memory_map;
	req.client = wc->dm_io;
	req.notify.fn = NULL;

	return dm_io(&req, 1, &region, NULL, IOPRIO_DEFAULT);
}
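
/*
 * Rebuild the in-core rb-tree, LRU and freelist from the on-media
 * metadata.  Uncommitted entries go to the freelist; when two entries
 * claim the same original sector, the one with the lower seq_count is
 * discarded.
 */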
static void writecache_resume(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;
	size_t b;
	bool need_flush = false;
	__le64 sb_seq_count;
	int r;

	wc_lock(wc);

	wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);

	if (WC_MODE_PMEM(wc)) {
		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
	} else {
		r = writecache_read_metadata(wc, wc->metadata_sectors);
		if (r) {
			size_t sb_entries_offset;

			writecache_error(wc, r, "unable to read metadata: %d", r);
			sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
			memset((char *)wc->memory_map + sb_entries_offset, -1,
			       (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
		}
	}

	wc->tree = RB_ROOT;
	INIT_LIST_HEAD(&wc->lru);
	if (WC_MODE_SORT_FREELIST(wc)) {
		wc->freetree = RB_ROOT;
		wc->current_free = NULL;
	} else {
		INIT_LIST_HEAD(&wc->freelist);
	}
	wc->freelist_size = 0;

	r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
			      sizeof(uint64_t));
	if (r) {
		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
		sb_seq_count = cpu_to_le64(0);
	}
	wc->seq_count = le64_to_cpu(sb_seq_count);

#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];
		struct wc_memory_entry wme;

		if (writecache_has_error(wc)) {
			e->original_sector = -1;
			e->seq_count = -1;
			continue;
		}
		r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
				      sizeof(struct wc_memory_entry));
		if (r) {
			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
					 (unsigned long)b, r);
			e->original_sector = -1;
			e->seq_count = -1;
		} else {
			e->original_sector = le64_to_cpu(wme.original_sector);
			e->seq_count = le64_to_cpu(wme.seq_count);
		}
		cond_resched();
	}
#endif
	for (b = 0; b < wc->n_blocks; b++) {
		struct wc_entry *e = &wc->entries[b];

		if (!writecache_entry_is_committed(wc, e)) {
			if (read_seq_count(wc, e) != -1) {
erase_this:
				clear_seq_count(wc, e);
				need_flush = true;
			}
			writecache_add_to_freelist(wc, e);
		} else {
			struct wc_entry *old;

			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
			if (!old) {
				writecache_insert_entry(wc, e);
			} else {
				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
					writecache_error(wc, -EINVAL,
						 "two identical entries, position %llu, sector %llu, sequence %llu",
						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
						 (unsigned long long)read_seq_count(wc, e));
				}
				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
					goto erase_this;
				} else {
					writecache_free_entry(wc, old);
					writecache_insert_entry(wc, e);
					need_flush = true;
				}
			}
		}
		cond_resched();
	}

	if (need_flush) {
		writecache_flush_all_metadata(wc);
		writecache_commit_flushed(wc, false);
	}

	writecache_verify_watermark(wc);

	if (wc->max_age != MAX_AGE_UNSPECIFIED)
		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);

	wc_unlock(wc);
}

static int process_flush_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	if (dm_suspended(wc->ti)) {
		wc_unlock(wc);
		return -EBUSY;
	}
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}

	writecache_flush(wc);
	wc->writeback_all++;
	queue_work(wc->writeback_wq, &wc->writeback_work);
	wc_unlock(wc);

	flush_workqueue(wc->writeback_wq);

	wc_lock(wc);
	wc->writeback_all--;
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return -EIO;
	}
	wc_unlock(wc);

	return 0;
}

static int process_flush_on_suspend_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	wc->flush_on_suspend = true;
	wc_unlock(wc);

	return 0;
}

static void activate_cleaner(struct dm_writecache *wc)
{
	wc->flush_on_suspend = true;
	wc->cleaner = true;
	wc->freelist_high_watermark = wc->n_blocks;
	wc->freelist_low_watermark = wc->n_blocks;
}

static int process_cleaner_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	activate_cleaner(wc);
	if (!dm_suspended(wc->ti))
		writecache_verify_watermark(wc);
	wc_unlock(wc);

	return 0;
}

static int process_clear_stats_mesg(unsigned int argc, char **argv, struct dm_writecache *wc)
{
	if (argc != 1)
		return -EINVAL;

	wc_lock(wc);
	memset(&wc->stats, 0, sizeof(wc->stats));
	wc_unlock(wc);

	return 0;
}

static int writecache_message(struct dm_target *ti, unsigned int argc, char **argv,
			      char *result, unsigned int maxlen)
{
	int r = -EINVAL;
	struct dm_writecache *wc = ti->private;

	if (!strcasecmp(argv[0], "flush"))
		r = process_flush_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "flush_on_suspend"))
		r = process_flush_on_suspend_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "cleaner"))
		r = process_cleaner_mesg(argc, argv, wc);
	else if (!strcasecmp(argv[0], "clear_stats"))
		r = process_clear_stats_mesg(argc, argv, wc);
	else
		DMERR("unrecognised message received: %s", argv[0]);

	return r;
}

static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
{
	/*
	 * clflushopt performs better with block size 1024, 2048, 4096
	 * non-temporal stores perform better with block size 512
	 *
	 * block size	512	1024	2048	4096
	 * movnti	496 MB/s  642 MB/s  725 MB/s  744 MB/s
	 * clflushopt	373 MB/s  688 MB/s  1.1 GB/s  1.2 GB/s
	 *
	 * We see that movnti performs better for 512-byte blocks, and
	 * clflushopt performs better for 1024-byte and larger blocks. So, we
	 * prefer clflushopt for sizes >= 768.
	 *
	 * NOTE: this happens to be the case now (with dm-writecache's single
	 * threaded model) but re-evaluate this once memcpy_flushcache() is
	 * enabled to use movdir64b which might invalidate this performance
	 * advantage seen with cache-allocating-writes plus flushing.
	 */
#ifdef CONFIG_X86
	if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
	    likely(boot_cpu_data.x86_clflush_size == 64) &&
	    likely(size >= 768)) {
		do {
			memcpy((void *)dest, (void *)source, 64);
			clflushopt((void *)dest);
			dest += 64;
			source += 64;
			size -= 64;
		} while (size >= 64);
		return;
	}
#endif
	memcpy_flushcache(dest, source, size);
}
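
/*
 * Copy one cache block between a bio's pages and persistent memory.
 * Reads go through copy_mc_to_kernel() so that a machine check on bad
 * pmem is reported as an I/O error instead of crashing the kernel.
 */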
static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
{
	void *buf;
	unsigned int size;
	int rw = bio_data_dir(bio);
	unsigned int remaining_size = wc->block_size;

	do {
		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);

		buf = bvec_kmap_local(&bv);
		size = bv.bv_len;
		if (unlikely(size > remaining_size))
			size = remaining_size;

		if (rw == READ) {
			int r;

			r = copy_mc_to_kernel(buf, data, size);
			flush_dcache_page(bio_page(bio));
			if (unlikely(r)) {
				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
				bio->bi_status = BLK_STS_IOERR;
			}
		} else {
			flush_dcache_page(bio_page(bio));
			memcpy_flushcache_optimized(data, buf, size);
		}

		kunmap_local(buf);

		data = (char *)data + size;
		remaining_size -= size;
		bio_advance(bio, size);
	} while (unlikely(remaining_size));
}

static int writecache_flush_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct bio *bio;

		wc_lock(wc);
		bio = bio_list_pop(&wc->flush_list);
		if (!bio) {
			set_current_state(TASK_INTERRUPTIBLE);
			wc_unlock(wc);

			if (unlikely(kthread_should_stop())) {
				set_current_state(TASK_RUNNING);
				break;
			}

			schedule();
			continue;
		}

		if (bio_op(bio) == REQ_OP_DISCARD) {
			writecache_discard(wc, bio->bi_iter.bi_sector,
					   bio_end_sector(bio));
			wc_unlock(wc);
			bio_set_dev(bio, wc->dev->bdev);
			submit_bio_noacct(bio);
		} else {
			writecache_flush(wc);
			wc_unlock(wc);
			if (writecache_has_error(wc))
				bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		}
	}

	return 0;
}

static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
{
	if (bio_list_empty(&wc->flush_list))
		wake_up_process(wc->flush_thread);
	bio_list_add(&wc->flush_list, bio);
}

enum wc_map_op {
	WC_MAP_SUBMIT,
	WC_MAP_REMAP,
	WC_MAP_REMAP_ORIGIN,
	WC_MAP_RETURN,
	WC_MAP_ERROR,
};

static void writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
					struct wc_entry *e)
{
	if (e) {
		sector_t next_boundary =
			read_original_sector(wc, e) - bio->bi_iter.bi_sector;
		if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
			dm_accept_partial_bio(bio, next_boundary);
	}
}

static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
{
	enum wc_map_op map_op;
	struct wc_entry *e;

read_next_block:
	wc->stats.reads++;
	e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
	if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
		wc->stats.read_hits++;
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			if (bio->bi_iter.bi_size)
				goto read_next_block;
			map_op = WC_MAP_SUBMIT;
		} else {
			dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
			bio_set_dev(bio, wc->ssd_dev->bdev);
			bio->bi_iter.bi_sector = cache_sector(wc, e);
			if (!writecache_entry_is_committed(wc, e))
				writecache_wait_for_ios(wc, WRITE);
			map_op = WC_MAP_REMAP;
		}
	} else {
		writecache_map_remap_origin(wc, bio, e);
		wc->stats.reads += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;
		map_op = WC_MAP_REMAP_ORIGIN;
	}

	return map_op;
}
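
/*
 * SSD write path: try to extend the bio over consecutive cache blocks
 * (freshly allocated ones, or already-mapped contiguous entries when
 * search_used is set) so that one remapped bio covers as much of the
 * write as possible.
 */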
static void writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
				    struct wc_entry *e, bool search_used)
{
	unsigned int bio_size = wc->block_size;
	sector_t start_cache_sec = cache_sector(wc, e);
	sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);

	while (bio_size < bio->bi_iter.bi_size) {
		if (!search_used) {
			struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);

			if (!f)
				break;
			write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
							(bio_size >> SECTOR_SHIFT), wc->seq_count);
			writecache_insert_entry(wc, f);
			wc->uncommitted_blocks++;
		} else {
			struct wc_entry *f;
			struct rb_node *next = rb_next(&e->rb_node);

			if (!next)
				break;
			f = container_of(next, struct wc_entry, rb_node);
			if (f != e + 1)
				break;
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(f->write_in_progress))
				break;
			if (writecache_entry_is_committed(wc, f))
				wc->overwrote_committed = true;
			e = f;
		}
		bio_size += wc->block_size;
		current_cache_sec += wc->block_size >> SECTOR_SHIFT;
	}

	bio_set_dev(bio, wc->ssd_dev->bdev);
	bio->bi_iter.bi_sector = start_cache_sec;
	dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);

	wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
	wc->stats.writes_allocate += (bio->bi_iter.bi_size - wc->block_size) >> wc->block_size_bits;

	if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
		wc->uncommitted_blocks = 0;
		queue_work(wc->writeback_wq, &wc->flush_work);
	} else {
		writecache_schedule_autocommit(wc);
	}
}
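
/*
 * Write path: overwrite an uncommitted entry in place when one exists,
 * otherwise allocate a new entry from the freelist.  If the freelist
 * is empty, either pass the write through to the origin device (when
 * possible in SSD mode) or sleep until writeback frees an entry.
 */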
static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
{
	struct wc_entry *e;

	do {
		bool found_entry = false;
		bool search_used = false;

		if (writecache_has_error(wc)) {
			wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
			return WC_MAP_ERROR;
		}
		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
		if (e) {
			if (!writecache_entry_is_committed(wc, e)) {
				wc->stats.write_hits_uncommitted++;
				search_used = true;
				goto bio_copy;
			}
			wc->stats.write_hits_committed++;
			if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
				wc->overwrote_committed = true;
				search_used = true;
				goto bio_copy;
			}
			found_entry = true;
		} else {
			if (unlikely(wc->cleaner) ||
			    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
				goto direct_write;
		}
		e = writecache_pop_from_freelist(wc, (sector_t)-1);
		if (unlikely(!e)) {
			if (!WC_MODE_PMEM(wc) && !found_entry) {
direct_write:
				e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
				writecache_map_remap_origin(wc, bio, e);
				wc->stats.writes_around += bio->bi_iter.bi_size >> wc->block_size_bits;
				wc->stats.writes += bio->bi_iter.bi_size >> wc->block_size_bits;
				return WC_MAP_REMAP_ORIGIN;
			}
			wc->stats.writes_blocked_on_freelist++;
			writecache_wait_on_freelist(wc);
			continue;
		}
		write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
		writecache_insert_entry(wc, e);
		wc->uncommitted_blocks++;
		wc->stats.writes_allocate++;
bio_copy:
		if (WC_MODE_PMEM(wc)) {
			bio_copy_block(wc, bio, memory_data(wc, e));
			wc->stats.writes++;
		} else {
			writecache_bio_copy_ssd(wc, bio, e, search_used);
			return WC_MAP_REMAP;
		}
	} while (bio->bi_iter.bi_size);

	if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);
	else
		writecache_schedule_autocommit(wc);

	return WC_MAP_SUBMIT;
}

static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
{
	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		wc->stats.flushes++;
		writecache_flush(wc);
		if (writecache_has_error(wc))
			return WC_MAP_ERROR;
		else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
			return WC_MAP_REMAP_ORIGIN;
		return WC_MAP_SUBMIT;
	}
	/* SSD: */
	if (dm_bio_get_target_bio_nr(bio))
		return WC_MAP_REMAP_ORIGIN;
	wc->stats.flushes++;
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}

static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
{
	wc->stats.discards += bio->bi_iter.bi_size >> wc->block_size_bits;

	if (writecache_has_error(wc))
		return WC_MAP_ERROR;

	if (WC_MODE_PMEM(wc)) {
		writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
		return WC_MAP_REMAP_ORIGIN;
	}
	/* SSD: */
	writecache_offload_bio(wc, bio);
	return WC_MAP_RETURN;
}
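
/*
 * Target map function: dispatch the bio to the flush, discard, read or
 * write handler and translate the resulting wc_map_op into a
 * DM_MAPIO_* return value.
 */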
static int writecache_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_writecache *wc = ti->private;
	enum wc_map_op map_op;

	bio->bi_private = NULL;

	wc_lock(wc);

	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
		map_op = writecache_map_flush(wc, bio);
		goto done;
	}

	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (unlikely((((unsigned int)bio->bi_iter.bi_sector | bio_sectors(bio)) &
		      (wc->block_size / 512 - 1)) != 0)) {
		DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
		      (unsigned long long)bio->bi_iter.bi_sector,
		      bio->bi_iter.bi_size, wc->block_size);
		map_op = WC_MAP_ERROR;
		goto done;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		map_op = writecache_map_discard(wc, bio);
		goto done;
	}

	if (bio_data_dir(bio) == READ)
		map_op = writecache_map_read(wc, bio);
	else
		map_op = writecache_map_write(wc, bio);
done:
	switch (map_op) {
	case WC_MAP_REMAP_ORIGIN:
		if (likely(wc->pause != 0)) {
			if (bio_op(bio) == REQ_OP_WRITE) {
				dm_iot_io_begin(&wc->iot, 1);
				bio->bi_private = (void *)2;
			}
		}
		bio_set_dev(bio, wc->dev->bdev);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;
	case WC_MAP_REMAP:
		/* make sure that writecache_end_io decrements bio_in_progress: */
		bio->bi_private = (void *)1;
		atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
		wc_unlock(wc);
		return DM_MAPIO_REMAPPED;
	case WC_MAP_SUBMIT:
		wc_unlock(wc);
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	case WC_MAP_RETURN:
		wc_unlock(wc);
		return DM_MAPIO_SUBMITTED;
	case WC_MAP_ERROR:
		wc_unlock(wc);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	default:
		BUG();
		wc_unlock(wc);
		return DM_MAPIO_KILL;
	}
}
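
/*
 * bi_private was set in writecache_map: (void *)1 means the bio was counted
 * in bio_in_progress, (void *)2 means it was accounted in the I/O tracker
 * used by pause_writeback.
 */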
static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
{
	struct dm_writecache *wc = ti->private;

	if (bio->bi_private == (void *)1) {
		int dir = bio_data_dir(bio);

		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
			if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
				wake_up(&wc->bio_in_progress_wait[dir]);
	} else if (bio->bi_private == (void *)2) {
		dm_iot_io_end(&wc->iot, 1);
	}
	return 0;
}
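
/* Report only the origin device; the cache device is managed internally. */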
static int writecache_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn, void *data)
{
	struct dm_writecache *wc = ti->private;

	return fn(ti, wc->dev, 0, ti->len, data);
}
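
/* Raise the queue limits to at least the writecache block size. */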
static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dm_writecache *wc = ti->private;

	if (limits->logical_block_size < wc->block_size)
		limits->logical_block_size = wc->block_size;

	if (limits->physical_block_size < wc->block_size)
		limits->physical_block_size = wc->block_size;

	if (limits->io_min < wc->block_size)
		limits->io_min = wc->block_size;
}
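
/*
 * Completion of a pmem writeback bio: queue the writeback_struct on the
 * endio list and wake the endio thread, which does the actual freeing.
 */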
static void writecache_writeback_endio(struct bio *bio)
{
	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
	struct dm_writecache *wc = wb->wc;
	unsigned long flags;

	raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&wb->endio_entry, &wc->endio_list);
	raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
}
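
/*
 * Completion of an SSD writeback copy (called by kcopyd): record the error,
 * queue the copy_struct on the endio list and wake the endio thread.
 */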
static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
{
	struct copy_struct *c = ptr;
	struct dm_writecache *wc = c->wc;

	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;

	raw_spin_lock_irq(&wc->endio_list_lock);
	if (unlikely(list_empty(&wc->endio_list)))
		wake_up_process(wc->endio_thread);
	list_add_tail(&c->endio_entry, &wc->endio_list);
	raw_spin_unlock_irq(&wc->endio_list_lock);
}
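
/*
 * Process completed pmem writeback bios: free the written entries,
 * periodically committing metadata and dropping the lock (every
 * ENDIO_LATENCY entries) so that other work can make progress.
 */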
static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
{
	unsigned int i;
	struct writeback_struct *wb;
	struct wc_entry *e;
	unsigned long n_walked = 0;

	do {
		wb = list_entry(list->next, struct writeback_struct, endio_entry);
		list_del(&wb->endio_entry);

		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
					 "write error %d", wb->bio.bi_status);
		i = 0;
		do {
			e = wb->wc_list[i];
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);
			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			n_walked++;
			if (unlikely(n_walked >= ENDIO_LATENCY)) {
				writecache_commit_flushed(wc, false);
				wc_unlock(wc);
				wc_lock(wc);
				n_walked = 0;
			}
		} while (++i < wb->wc_list_n);

		if (wb->wc_list != wb->wc_list_inline)
			kfree(wb->wc_list);
		bio_put(&wb->bio);
	} while (!list_empty(list));
}
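
/* Process completed kcopyd copies: free the written entries. */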
static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
{
	struct copy_struct *c;
	struct wc_entry *e;

	do {
		c = list_entry(list->next, struct copy_struct, endio_entry);
		list_del(&c->endio_entry);

		if (unlikely(c->error))
			writecache_error(wc, c->error, "copy error");

		e = c->e;
		do {
			BUG_ON(!e->write_in_progress);
			e->write_in_progress = false;
			INIT_LIST_HEAD(&e->lru);
			if (!writecache_has_error(wc))
				writecache_free_entry(wc, e);

			BUG_ON(!wc->writeback_size);
			wc->writeback_size--;
			e++;
		} while (--c->n_entries);
		mempool_free(c, &wc->copy_pool);
	} while (!list_empty(list));
}
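
/*
 * The endio thread takes batches of completed writebacks off the endio
 * list, flushes the origin device (unless FUA writeback already made the
 * writes durable) and frees the corresponding cache entries.
 */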
static int writecache_endio_thread(void *data)
{
	struct dm_writecache *wc = data;

	while (1) {
		struct list_head list;

		raw_spin_lock_irq(&wc->endio_list_lock);
		if (!list_empty(&wc->endio_list))
			goto pop_from_list;
		set_current_state(TASK_INTERRUPTIBLE);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (unlikely(kthread_should_stop())) {
			set_current_state(TASK_RUNNING);
			break;
		}

		schedule();
		continue;

pop_from_list:
		list = wc->endio_list;
		list.next->prev = list.prev->next = &list;
		INIT_LIST_HEAD(&wc->endio_list);
		raw_spin_unlock_irq(&wc->endio_list_lock);

		if (!WC_MODE_FUA(wc))
			writecache_disk_flush(wc, wc->dev);

		wc_lock(wc);

		if (WC_MODE_PMEM(wc)) {
			__writecache_endio_pmem(wc, &list);
		} else {
			__writecache_endio_ssd(wc, &list);
			writecache_wait_for_ios(wc, READ);
		}

		writecache_commit_flushed(wc, false);

		wc_unlock(wc);
	}

	return 0;
}
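
/*
 * Add one cache block to a pmem writeback bio.  Returns true on success;
 * blocks past the end of the origin device are reported as added so that
 * they are consumed without being written.
 */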
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
{
	struct dm_writecache *wc = wb->wc;
	unsigned int block_size = wc->block_size;
	void *address = memory_data(wc, e);

	persistent_memory_flush_cache(address, block_size);

	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
		return true;

	return bio_add_page(&wb->bio, persistent_memory_page(address),
			    block_size, persistent_memory_page_offset(address)) != 0;
}

struct writeback_list {
	struct list_head list;
	size_t size;
};
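
/*
 * If max_writeback_jobs is set, wait until the number of blocks under
 * writeback drops below the limit before issuing more.
 */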
static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
{
	if (unlikely(wc->max_writeback_jobs)) {
		if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
			wc_lock(wc);
			while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
				writecache_wait_on_freelist(wc);
			wc_unlock(wc);
		}
	}
	cond_resched();
}
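
/*
 * Issue pmem writeback: build one bio per run of entries that are
 * contiguous on the origin device and submit it.
 */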
static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct bio *bio;
	struct writeback_struct *wb;
	unsigned int max_pages;

	while (wbl->size) {
		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		max_pages = e->wc_list_contiguous;

		bio = bio_alloc_bioset(wc->dev->bdev, max_pages, REQ_OP_WRITE,
				       GFP_NOIO, &wc->bio_set);
		wb = container_of(bio, struct writeback_struct, bio);
		wb->wc = wc;
		bio->bi_end_io = writecache_writeback_endio;
		bio->bi_iter.bi_sector = read_original_sector(wc, e);

		if (unlikely(max_pages > WB_LIST_INLINE))
			wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
						    GFP_NOIO | __GFP_NORETRY |
						    __GFP_NOMEMALLOC | __GFP_NOWARN);

		if (likely(max_pages <= WB_LIST_INLINE) || unlikely(!wb->wc_list)) {
			wb->wc_list = wb->wc_list_inline;
			max_pages = WB_LIST_INLINE;
		}

		BUG_ON(!wc_add_block(wb, e));

		wb->wc_list[0] = e;
		wb->wc_list_n = 1;

		while (wbl->size && wb->wc_list_n < max_pages) {
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			if (read_original_sector(wc, f) !=
			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (!wc_add_block(wb, f))
				break;
			wbl->size--;
			list_del(&f->lru);
			wb->wc_list[wb->wc_list_n++] = f;
			e = f;
		}

		if (WC_MODE_FUA(wc))
			bio->bi_opf |= REQ_FUA;
		if (writecache_has_error(wc)) {
			bio->bi_status = BLK_STS_IOERR;
			bio_endio(bio);
		} else if (unlikely(!bio_sectors(bio))) {
			bio->bi_status = BLK_STS_OK;
			bio_endio(bio);
		} else {
			submit_bio(bio);
		}

		__writeback_throttle(wc, wbl);
	}
}
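
/*
 * Issue SSD writeback: hand each run of entries that is contiguous both in
 * the cache and on the origin device to kcopyd as a single copy, clamping
 * the copy at the end of the origin device.
 */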
static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
{
	struct wc_entry *e, *f;
	struct dm_io_region from, to;
	struct copy_struct *c;

	while (wbl->size) {
		unsigned int n_sectors;

		wbl->size--;
		e = container_of(wbl->list.prev, struct wc_entry, lru);
		list_del(&e->lru);

		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);

		from.bdev = wc->ssd_dev->bdev;
		from.sector = cache_sector(wc, e);
		from.count = n_sectors;
		to.bdev = wc->dev->bdev;
		to.sector = read_original_sector(wc, e);
		to.count = n_sectors;

		c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
		c->wc = wc;
		c->e = e;
		c->n_entries = e->wc_list_contiguous;

		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
			wbl->size--;
			f = container_of(wbl->list.prev, struct wc_entry, lru);
			BUG_ON(f != e + 1);
			list_del(&f->lru);
			e = f;
		}

		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
			if (to.sector >= wc->data_device_sectors) {
				writecache_copy_endio(0, 0, c);
				continue;
			}
			from.count = to.count = wc->data_device_sectors - to.sector;
		}

		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);

		__writeback_throttle(wc, wbl);
	}
}
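
/*
 * The writeback work: optionally wait for the device to become idle
 * (pause_writeback), then collect entries from the tail of the LRU (or walk
 * the whole tree if writeback_all is set), coalesce adjacent blocks and
 * pass them to the pmem or SSD writeback path.
 */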
static void writecache_writeback(struct work_struct *work)
{
	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
	struct blk_plug plug;
	struct wc_entry *f, *g, *e = NULL;
	struct rb_node *node, *next_node;
	struct list_head skipped;
	struct writeback_list wbl;
	unsigned long n_walked;

	if (!WC_MODE_PMEM(wc)) {
		/* Wait for any active kcopyd work on behalf of ssd writeback */
		dm_kcopyd_client_flush(wc->dm_kcopyd);
	}

	if (likely(wc->pause != 0)) {
		while (1) {
			unsigned long idle;

			if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
			    unlikely(dm_suspended(wc->ti)))
				break;
			idle = dm_iot_idle_time(&wc->iot);
			if (idle >= wc->pause)
				break;
			idle = wc->pause - idle;
			if (idle > HZ)
				idle = HZ;
			schedule_timeout_idle(idle);
		}
	}

	wc_lock(wc);
restart:
	if (writecache_has_error(wc)) {
		wc_unlock(wc);
		return;
	}

	if (unlikely(wc->writeback_all)) {
		if (writecache_wait_for_writeback(wc))
			goto restart;
	}

	if (wc->overwrote_committed)
		writecache_wait_for_ios(wc, WRITE);

	n_walked = 0;
	INIT_LIST_HEAD(&skipped);
	INIT_LIST_HEAD(&wbl.list);
	wbl.size = 0;
	while (!list_empty(&wc->lru) &&
	       (wc->writeback_all ||
		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
		n_walked++;
		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
		    likely(!wc->writeback_all)) {
			if (likely(!dm_suspended(wc->ti)))
				queue_work(wc->writeback_wq, &wc->writeback_work);
			break;
		}

		if (unlikely(wc->writeback_all)) {
			if (unlikely(!e)) {
				writecache_flush(wc);
				e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
			} else
				e = g;
		} else
			e = container_of(wc->lru.prev, struct wc_entry, lru);
		BUG_ON(e->write_in_progress);
		if (unlikely(!writecache_entry_is_committed(wc, e)))
			writecache_flush(wc);

		node = rb_prev(&e->rb_node);
		if (node) {
			f = container_of(node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, f) ==
				     read_original_sector(wc, e))) {
				BUG_ON(!f->write_in_progress);
				list_move(&e->lru, &skipped);
				cond_resched();
				continue;
			}
		}
		wc->writeback_size++;
		list_move(&e->lru, &wbl.list);
		wbl.size++;
		e->write_in_progress = true;
		e->wc_list_contiguous = 1;

		f = e;

		while (1) {
			next_node = rb_next(&f->rb_node);
			if (unlikely(!next_node))
				break;
			g = container_of(next_node, struct wc_entry, rb_node);
			if (unlikely(read_original_sector(wc, g) ==
			    read_original_sector(wc, f))) {
				f = g;
				continue;
			}
			if (read_original_sector(wc, g) !=
			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
				break;
			if (unlikely(g->write_in_progress))
				break;
			if (unlikely(!writecache_entry_is_committed(wc, g)))
				break;

			if (!WC_MODE_PMEM(wc)) {
				if (g != f + 1)
					break;
			}

			n_walked++;
			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
			//	break;

			wc->writeback_size++;
			list_move(&g->lru, &wbl.list);
			wbl.size++;
			g->write_in_progress = true;
			g->wc_list_contiguous = BIO_MAX_VECS;
			f = g;
			e->wc_list_contiguous++;
			if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
				if (unlikely(wc->writeback_all)) {
					next_node = rb_next(&f->rb_node);
					if (likely(next_node))
						g = container_of(next_node, struct wc_entry, rb_node);
				}
				break;
			}
		}
		cond_resched();
	}

	if (!list_empty(&skipped)) {
		list_splice_tail(&skipped, &wc->lru);
		/*
		 * If we didn't make any progress, we must wait until some
		 * writeback finishes to avoid burning CPU in a loop
		 */
		if (unlikely(!wbl.size))
			writecache_wait_for_writeback(wc);
	}

	wc_unlock(wc);

	blk_start_plug(&plug);

	if (WC_MODE_PMEM(wc))
		__writecache_writeback_pmem(wc, &wbl);
	else
		__writecache_writeback_ssd(wc, &wbl);

	blk_finish_plug(&plug);

	if (unlikely(wc->writeback_all)) {
		wc_lock(wc);
		while (writecache_wait_for_writeback(wc))
			;
		wc_unlock(wc);
	}
}
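
/*
 * Split the cache device into a metadata area (superblock plus one
 * wc_memory_entry per block) and a data area, computing the largest
 * n_blocks that fits in device_size.
 */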
static int calculate_memory_size(uint64_t device_size, unsigned int block_size,
				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
{
	uint64_t n_blocks, offset;
	struct wc_entry e;

	n_blocks = device_size;
	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));

	while (1) {
		if (!n_blocks)
			return -ENOSPC;
		/* Verify the following entries[n_blocks] won't overflow */
		if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
				 sizeof(struct wc_memory_entry)))
			return -EFBIG;
		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
		if (offset + n_blocks * block_size <= device_size)
			break;
		n_blocks--;
	}

	/* check if the bit field overflows */
	e.index = n_blocks;
	if (e.index != n_blocks)
		return -EFBIG;

	if (n_blocks_p)
		*n_blocks_p = n_blocks;
	if (n_metadata_blocks_p)
		*n_metadata_blocks_p = offset >> __ffs(block_size);
	return 0;
}
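
/*
 * Format a blank cache device: write the superblock fields and invalidate
 * all entries, committing the magic number last so that an interrupted
 * initialization is not mistaken for a valid cache.
 */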
static int init_memory(struct dm_writecache *wc)
{
	size_t b;
	int r;

	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
	if (r)
		return r;

	r = writecache_alloc_entries(wc);
	if (r)
		return r;

	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

	for (b = 0; b < wc->n_blocks; b++) {
		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
		cond_resched();
	}

	writecache_flush_all_metadata(wc);
	writecache_commit_flushed(wc, false);
	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
	writecache_flush_region(wc, &sb(wc)->magic, sizeof(sb(wc)->magic));
	writecache_commit_flushed(wc, false);

	return 0;
}
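
/* Tear down in reverse order of construction; also used for ctr error cleanup. */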
static void writecache_dtr(struct dm_target *ti)
{
	struct dm_writecache *wc = ti->private;

	if (!wc)
		return;

	if (wc->endio_thread)
		kthread_stop(wc->endio_thread);

	if (wc->flush_thread)
		kthread_stop(wc->flush_thread);

	bioset_exit(&wc->bio_set);

	mempool_exit(&wc->copy_pool);

	if (wc->writeback_wq)
		destroy_workqueue(wc->writeback_wq);

	if (wc->dev)
		dm_put_device(ti, wc->dev);

	if (wc->ssd_dev)
		dm_put_device(ti, wc->ssd_dev);

	vfree(wc->entries);

	if (wc->memory_map) {
		if (WC_MODE_PMEM(wc))
			persistent_memory_release(wc);
		else
			vfree(wc->memory_map);
	}

	if (wc->dm_kcopyd)
		dm_kcopyd_client_destroy(wc->dm_kcopyd);

	if (wc->dm_io)
		dm_io_client_destroy(wc->dm_io);

	vfree(wc->dirty_bitmap);

	kfree(wc);
}
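
/*
 * Construct the target:
 * writecache <p|s> <origin dev> <cache dev> <block size> <#opt args> [args...]
 */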
static int writecache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_writecache *wc;
	struct dm_arg_set as;
	const char *string;
	unsigned int opt_params;
	size_t offset, data_size;
	int i, r;
	char dummy;
	int high_wm_percent = HIGH_WATERMARK;
	int low_wm_percent = LOW_WATERMARK;
	uint64_t x;
	struct wc_memory_superblock s;

	static struct dm_arg _args[] = {
		{0, 18, "Invalid number of feature args"},
	};

	as.argc = argc;
	as.argv = argv;

	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
	if (!wc) {
		ti->error = "Cannot allocate writecache structure";
		r = -ENOMEM;
		goto bad;
	}
	ti->private = wc;
	wc->ti = ti;
	mutex_init(&wc->lock);
	wc->max_age = MAX_AGE_UNSPECIFIED;
	writecache_poison_lists(wc);
	init_waitqueue_head(&wc->freelist_wait);
	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);

	for (i = 0; i < 2; i++) {
		atomic_set(&wc->bio_in_progress[i], 0);
		init_waitqueue_head(&wc->bio_in_progress_wait[i]);
	}

	wc->dm_io = dm_io_client_create();
	if (IS_ERR(wc->dm_io)) {
		r = PTR_ERR(wc->dm_io);
		ti->error = "Unable to allocate dm-io client";
		wc->dm_io = NULL;
		goto bad;
	}

	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
	if (!wc->writeback_wq) {
		r = -ENOMEM;
		ti->error = "Could not allocate writeback workqueue";
		goto bad;
	}
	INIT_WORK(&wc->writeback_work, writecache_writeback);
	INIT_WORK(&wc->flush_work, writecache_flush_work);

	dm_iot_init(&wc->iot);

	raw_spin_lock_init(&wc->endio_list_lock);
	INIT_LIST_HEAD(&wc->endio_list);
	wc->endio_thread = kthread_run(writecache_endio_thread, wc, "writecache_endio");
	if (IS_ERR(wc->endio_thread)) {
		r = PTR_ERR(wc->endio_thread);
		wc->endio_thread = NULL;
		ti->error = "Couldn't spawn endio thread";
		goto bad;
	}

	/*
	 * Parse the mode (pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	if (!strcasecmp(string, "s")) {
		wc->pmem_mode = false;
	} else if (!strcasecmp(string, "p")) {
#ifdef DM_WRITECACHE_HAS_PMEM
		wc->pmem_mode = true;
		wc->writeback_fua = true;
#else
		/*
		 * If the architecture doesn't support persistent memory or
		 * the kernel doesn't support any DAX drivers, this driver can
		 * only be used in SSD-only mode.
		 */
		r = -EOPNOTSUPP;
		ti->error = "Persistent memory or DAX not supported on this system";
		goto bad;
#endif
	} else {
		goto bad_arguments;
	}

	if (WC_MODE_PMEM(wc)) {
		r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
				offsetof(struct writeback_struct, bio),
				BIOSET_NEED_BVECS);
		if (r) {
			ti->error = "Could not allocate bio set";
			goto bad;
		}
	} else {
		wc->pause = PAUSE_WRITEBACK;
		r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
		if (r) {
			ti->error = "Could not allocate mempool";
			goto bad;
		}
	}

	/*
	 * Parse the origin data device
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
	if (r) {
		ti->error = "Origin data device lookup failed";
		goto bad;
	}

	/*
	 * Parse cache data device (be it pmem or ssd)
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;

	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
	if (r) {
		ti->error = "Cache data device lookup failed";
		goto bad;
	}
	wc->memory_map_size = bdev_nr_bytes(wc->ssd_dev->bdev);

	/*
	 * Parse the cache block size
	 */
	string = dm_shift_arg(&as);
	if (!string)
		goto bad_arguments;
	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
	    (wc->block_size & (wc->block_size - 1))) {
		r = -EINVAL;
		ti->error = "Invalid block size";
		goto bad;
	}
	if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
	    wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
		r = -EINVAL;
		ti->error = "Block size is smaller than device logical block size";
		goto bad;
	}
	wc->block_size_bits = __ffs(wc->block_size);

	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);

	/*
	 * Parse optional arguments
	 */
	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
	if (r)
		goto bad;

	while (opt_params) {
		string = dm_shift_arg(&as), opt_params--;
		if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
			unsigned long long start_sector;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
				goto invalid_optional;
			wc->start_sector = start_sector;
			wc->start_sector_set = true;
			if (wc->start_sector != start_sector ||
			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
				goto invalid_optional;
		} else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (high_wm_percent < 0 || high_wm_percent > 100)
				goto invalid_optional;
			wc->high_wm_percent_value = high_wm_percent;
			wc->high_wm_percent_set = true;
		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
				goto invalid_optional;
			if (low_wm_percent < 0 || low_wm_percent > 100)
				goto invalid_optional;
			wc->low_wm_percent_value = low_wm_percent;
			wc->low_wm_percent_set = true;
		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
				goto invalid_optional;
			wc->max_writeback_jobs_set = true;
		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
				goto invalid_optional;
			wc->autocommit_blocks_set = true;
		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
			unsigned int autocommit_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
				goto invalid_optional;
			if (autocommit_msecs > 3600000)
				goto invalid_optional;
			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
			wc->autocommit_time_value = autocommit_msecs;
			wc->autocommit_time_set = true;
		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
			unsigned int max_age_msecs;

			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
				goto invalid_optional;
			if (max_age_msecs > 86400000)
				goto invalid_optional;
			wc->max_age = msecs_to_jiffies(max_age_msecs);
			wc->max_age_set = true;
			wc->max_age_value = max_age_msecs;
		} else if (!strcasecmp(string, "cleaner")) {
			wc->cleaner_set = true;
			wc->cleaner = true;
		} else if (!strcasecmp(string, "fua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = true;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "nofua")) {
			if (WC_MODE_PMEM(wc)) {
				wc->writeback_fua = false;
				wc->writeback_fua_set = true;
			} else
				goto invalid_optional;
		} else if (!strcasecmp(string, "metadata_only")) {
			wc->metadata_only = true;
		} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
			unsigned int pause_msecs;

			if (WC_MODE_PMEM(wc))
				goto invalid_optional;
			string = dm_shift_arg(&as), opt_params--;
			if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
				goto invalid_optional;
			if (pause_msecs > 60000)
				goto invalid_optional;
			wc->pause = msecs_to_jiffies(pause_msecs);
			wc->pause_set = true;
			wc->pause_value = pause_msecs;
		} else {
invalid_optional:
			r = -EINVAL;
			ti->error = "Invalid optional argument";
			goto bad;
		}
	}

	if (high_wm_percent < low_wm_percent) {
		r = -EINVAL;
		ti->error = "High watermark must be greater than or equal to low watermark";
		goto bad;
	}

	if (WC_MODE_PMEM(wc)) {
		if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
			r = -EOPNOTSUPP;
			ti->error = "Asynchronous persistent memory not supported as pmem cache";
			goto bad;
		}

		r = persistent_memory_claim(wc);
		if (r) {
			ti->error = "Unable to map persistent memory for cache";
			goto bad;
		}
	} else {
		size_t n_blocks, n_metadata_blocks;
		uint64_t n_bitmap_bits;

		wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;

		bio_list_init(&wc->flush_list);
		wc->flush_thread = kthread_run(writecache_flush_thread, wc, "dm_writecache_flush");
		if (IS_ERR(wc->flush_thread)) {
			r = PTR_ERR(wc->flush_thread);
			wc->flush_thread = NULL;
			ti->error = "Couldn't spawn flush thread";
			goto bad;
		}

		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
					  &n_blocks, &n_metadata_blocks);
		if (r) {
			ti->error = "Invalid device size";
			goto bad;
		}

		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
				 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;

		/* this is a limitation of the test_bit functions */
		if (n_bitmap_bits > 1U << 31) {
			r = -EFBIG;
			ti->error = "Invalid device size";
			goto bad;
		}

		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
		if (!wc->memory_map) {
			r = -ENOMEM;
			ti->error = "Unable to allocate memory for metadata";
			goto bad;
		}

		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
		if (IS_ERR(wc->dm_kcopyd)) {
			r = PTR_ERR(wc->dm_kcopyd);
			ti->error = "Unable to allocate dm-kcopyd client";
			wc->dm_kcopyd = NULL;
			goto bad;
		}

		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
			BITS_PER_LONG * sizeof(unsigned long);
		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
		if (!wc->dirty_bitmap) {
			r = -ENOMEM;
			ti->error = "Unable to allocate dirty bitmap";
			goto bad;
		}

		r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
		if (r) {
			ti->error = "Unable to read first block of metadata";
			goto bad;
		}
	}
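
	/* Read the superblock and format the device if it is uninitialized. */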
	r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
	if (r) {
		ti->error = "Hardware memory error when reading superblock";
		goto bad;
	}
	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
		r = init_memory(wc);
		if (r) {
			ti->error = "Unable to initialize device";
			goto bad;
		}
		r = copy_mc_to_kernel(&s, sb(wc),
				      sizeof(struct wc_memory_superblock));
		if (r) {
			ti->error = "Hardware memory error when reading superblock";
			goto bad;
		}
	}

	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
		ti->error = "Invalid magic in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
		ti->error = "Invalid version in the superblock";
		r = -EINVAL;
		goto bad;
	}

	if (le32_to_cpu(s.block_size) != wc->block_size) {
		ti->error = "Block size does not match superblock";
		r = -EINVAL;
		goto bad;
	}

	wc->n_blocks = le64_to_cpu(s.n_blocks);

	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
overflow:
		ti->error = "Overflow in size calculation";
		r = -EINVAL;
		goto bad;
	}
	offset += sizeof(struct wc_memory_superblock);
	if (offset < sizeof(struct wc_memory_superblock))
		goto overflow;
	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
	data_size = wc->n_blocks * (size_t)wc->block_size;
	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
	    (offset + data_size < offset))
		goto overflow;
	if (offset + data_size > wc->memory_map_size) {
		ti->error = "Memory area is too small";
		r = -EINVAL;
		goto bad;
	}

	wc->metadata_sectors = offset >> SECTOR_SHIFT;
	wc->block_start = (char *)sb(wc) + offset;

	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_high_watermark = x;
	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
	x += 50;
	do_div(x, 100);
	wc->freelist_low_watermark = x;

	if (wc->cleaner)
		activate_cleaner(wc);

	r = writecache_alloc_entries(wc);
	if (r) {
		ti->error = "Cannot allocate memory";
		goto bad;
	}

	ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;

	if (WC_MODE_PMEM(wc))
		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);

	return 0;

bad_arguments:
	r = -EINVAL;
	ti->error = "Bad arguments";
bad:
	writecache_dtr(ti);
	return r;
}
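
/*
 * Status: STATUSTYPE_INFO reports error state, block counts and the
 * statistics counters; STATUSTYPE_TABLE reconstructs the constructor line.
 */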
static void writecache_status(struct dm_target *ti, status_type_t type,
			      unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_writecache *wc = ti->private;
	unsigned int extra_args;
	unsigned int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		       writecache_has_error(wc),
		       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
		       (unsigned long long)wc->writeback_size,
		       wc->stats.reads,
		       wc->stats.read_hits,
		       wc->stats.writes,
		       wc->stats.write_hits_uncommitted,
		       wc->stats.write_hits_committed,
		       wc->stats.writes_around,
		       wc->stats.writes_allocate,
		       wc->stats.writes_blocked_on_freelist,
		       wc->stats.flushes,
		       wc->stats.discards);
		break;
	case STATUSTYPE_TABLE:
		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
		       wc->dev->name, wc->ssd_dev->name, wc->block_size);
		extra_args = 0;
		if (wc->start_sector_set)
			extra_args += 2;
		if (wc->high_wm_percent_set)
			extra_args += 2;
		if (wc->low_wm_percent_set)
			extra_args += 2;
		if (wc->max_writeback_jobs_set)
			extra_args += 2;
		if (wc->autocommit_blocks_set)
			extra_args += 2;
		if (wc->autocommit_time_set)
			extra_args += 2;
		if (wc->max_age_set)
			extra_args += 2;
		if (wc->cleaner_set)
			extra_args++;
		if (wc->writeback_fua_set)
			extra_args++;
		if (wc->metadata_only)
			extra_args++;
		if (wc->pause_set)
			extra_args += 2;

		DMEMIT("%u", extra_args);
		if (wc->start_sector_set)
			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
		if (wc->high_wm_percent_set)
			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
		if (wc->low_wm_percent_set)
			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
		if (wc->max_writeback_jobs_set)
			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
		if (wc->autocommit_blocks_set)
			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
		if (wc->autocommit_time_set)
			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
		if (wc->max_age_set)
			DMEMIT(" max_age %u", wc->max_age_value);
		if (wc->cleaner_set)
			DMEMIT(" cleaner");
		if (wc->writeback_fua_set)
			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
		if (wc->metadata_only)
			DMEMIT(" metadata_only");
		if (wc->pause_set)
			DMEMIT(" pause_writeback %u", wc->pause_value);
		break;
	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}
static struct target_type writecache_target = {
	.name			= "writecache",
	.version		= {1, 6, 0},
	.module			= THIS_MODULE,
	.ctr			= writecache_ctr,
	.dtr			= writecache_dtr,
	.status			= writecache_status,
	.postsuspend		= writecache_suspend,
	.resume			= writecache_resume,
	.message		= writecache_message,
	.map			= writecache_map,
	.end_io			= writecache_end_io,
	.iterate_devices	= writecache_iterate_devices,
	.io_hints		= writecache_io_hints,
};
module_dm(writecache);

MODULE_DESCRIPTION(DM_NAME " writecache target");
MODULE_AUTHOR("Mikulas Patocka <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");