blkback.c

  1. /******************************************************************************
  2. *
  3. * Back-end of the driver for virtual block devices. This portion of the
  4. * driver exports a 'unified' block-device interface that can be accessed
  5. * by any operating system that implements a compatible front end. A
  6. * reference front-end implementation can be found in:
  7. * drivers/block/xen-blkfront.c
  8. *
  9. * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
  10. * Copyright (c) 2005, Christopher Clark
  11. *
  12. * This program is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU General Public License version 2
  14. * as published by the Free Software Foundation; or, when distributed
  15. * separately from the Linux kernel or incorporated into other
  16. * software packages, subject to the following license:
  17. *
  18. * Permission is hereby granted, free of charge, to any person obtaining a copy
  19. * of this source file (the "Software"), to deal in the Software without
  20. * restriction, including without limitation the rights to use, copy, modify,
  21. * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  22. * and to permit persons to whom the Software is furnished to do so, subject to
  23. * the following conditions:
  24. *
  25. * The above copyright notice and this permission notice shall be included in
  26. * all copies or substantial portions of the Software.
  27. *
  28. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  29. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  30. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  31. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  32. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  33. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  34. * IN THE SOFTWARE.
  35. */
  36. #define pr_fmt(fmt) "xen-blkback: " fmt
  37. #include <linux/spinlock.h>
  38. #include <linux/kthread.h>
  39. #include <linux/list.h>
  40. #include <linux/delay.h>
  41. #include <linux/freezer.h>
  42. #include <linux/bitmap.h>
  43. #include <xen/events.h>
  44. #include <xen/page.h>
  45. #include <xen/xen.h>
  46. #include <asm/xen/hypervisor.h>
  47. #include <asm/xen/hypercall.h>
  48. #include <xen/balloon.h>
  49. #include <xen/grant_table.h>
  50. #include "common.h"
  51. /*
  52. * Maximum number of unused free pages to keep in the internal buffer.
  53. * Setting this to a value too low will reduce memory used in each backend,
  54. * but can have a performance penalty.
  55. *
  56. * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
  57. * be set to a lower value that might degrade performance on some intensive
  58. * IO workloads.
  59. */
  60. static int xen_blkif_max_buffer_pages = 1024;
  61. module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
  62. MODULE_PARM_DESC(max_buffer_pages,
  63. "Maximum number of free pages to keep in each block backend buffer");
  64. /*
  65. * Maximum number of grants to map persistently in blkback. For maximum
  66. * performance this should be the total number of grants that can be used
  67. * to fill the ring, but since this might become too high, especially with
  68. * the use of indirect descriptors, we set it to a value that provides good
  69. * performance without using too much memory.
  70. *
  71. * When the list of persistent grants is full we clean it up using an LRU
  72. * algorithm.
  73. */
  74. static int xen_blkif_max_pgrants = 1056;
  75. module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
  76. MODULE_PARM_DESC(max_persistent_grants,
  77. "Maximum number of grants to map persistently");
  78. /*
  79. * How long a persistent grant is allowed to remain allocated without being in
  80. * use. The time is in seconds, 0 means indefinitely long.
  81. */
  82. static unsigned int xen_blkif_pgrant_timeout = 60;
  83. module_param_named(persistent_grant_unused_seconds, xen_blkif_pgrant_timeout,
  84. uint, 0644);
  85. MODULE_PARM_DESC(persistent_grant_unused_seconds,
  86. "Time in seconds an unused persistent grant is allowed to "
  87. "remain allocated. Default is 60, 0 means unlimited.");
  88. /*
  89. * Maximum number of rings/queues blkback supports; allow as many queues as there
  90. * are CPUs if the user has not specified a value.
  91. */
  92. unsigned int xenblk_max_queues;
  93. module_param_named(max_queues, xenblk_max_queues, uint, 0644);
  94. MODULE_PARM_DESC(max_queues,
  95. "Maximum number of hardware queues per virtual disk." \
  96. "By default it is the number of online CPUs.");
  97. /*
  98. * Maximum order of pages to be used for the shared ring between front and
  99. * backend, 4KB page granularity is used.
  100. */
  101. unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
  102. module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
  103. MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
  104. /*
  105. * The LRU mechanism to clean the lists of persistent grants needs to
  106. * be executed periodically. The time interval between consecutive executions
  107. * of the purge mechanism is set in ms.
  108. */
  109. #define LRU_INTERVAL 100
  110. /*
  111. * When the persistent grants list is full we will remove unused grants
  112. * from the list. LRU_PERCENT_CLEAN is the percentage of grants to be removed
  113. * at each LRU execution.
  114. */
  115. #define LRU_PERCENT_CLEAN 5
  116. /* Run-time switchable: /sys/module/xen_blkback/parameters/ */
  117. static unsigned int log_stats;
  118. module_param(log_stats, int, 0644);
  119. #define BLKBACK_INVALID_HANDLE (~0)
  120. /* Number of free pages to remove on each call to gnttab_free_pages */
  121. #define NUM_BATCH_FREE_PAGES 10
  122. static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
  123. {
  124. return xen_blkif_pgrant_timeout &&
  125. (jiffies - persistent_gnt->last_used >=
  126. HZ * xen_blkif_pgrant_timeout);
  127. }
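/*
 * Note: the subtraction above is plain unsigned jiffies arithmetic, so the
 * comparison remains correct across a jiffies wraparound (the same property
 * that time_after()/time_before() rely on).
 */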
  128. static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
  129. {
  130. unsigned long flags;
  131. spin_lock_irqsave(&ring->free_pages_lock, flags);
  132. if (list_empty(&ring->free_pages)) {
  133. BUG_ON(ring->free_pages_num != 0);
  134. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  135. return gnttab_alloc_pages(1, page);
  136. }
  137. BUG_ON(ring->free_pages_num == 0);
  138. page[0] = list_first_entry(&ring->free_pages, struct page, lru);
  139. list_del(&page[0]->lru);
  140. ring->free_pages_num--;
  141. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  142. return 0;
  143. }
  144. static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
  145. int num)
  146. {
  147. unsigned long flags;
  148. int i;
  149. spin_lock_irqsave(&ring->free_pages_lock, flags);
  150. for (i = 0; i < num; i++)
  151. list_add(&page[i]->lru, &ring->free_pages);
  152. ring->free_pages_num += num;
  153. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  154. }
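/*
 * get_free_page()/put_free_pages() implement a small per-ring page pool:
 * pages are taken from ring->free_pages when available and only allocated
 * via gnttab_alloc_pages() when the pool is empty; released pages go back
 * to the pool instead of being freed, and shrink_free_pagepool() below
 * trims the pool down to a given size.
 */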
  155. static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
  156. {
  157. /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
  158. struct page *page[NUM_BATCH_FREE_PAGES];
  159. unsigned int num_pages = 0;
  160. unsigned long flags;
  161. spin_lock_irqsave(&ring->free_pages_lock, flags);
  162. while (ring->free_pages_num > num) {
  163. BUG_ON(list_empty(&ring->free_pages));
  164. page[num_pages] = list_first_entry(&ring->free_pages,
  165. struct page, lru);
  166. list_del(&page[num_pages]->lru);
  167. ring->free_pages_num--;
  168. if (++num_pages == NUM_BATCH_FREE_PAGES) {
  169. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  170. gnttab_free_pages(num_pages, page);
  171. spin_lock_irqsave(&ring->free_pages_lock, flags);
  172. num_pages = 0;
  173. }
  174. }
  175. spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  176. if (num_pages != 0)
  177. gnttab_free_pages(num_pages, page);
  178. }
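/*
 * Note that shrink_free_pagepool() drops free_pages_lock around each
 * gnttab_free_pages() call, so the irq-disabled spinlock is never held
 * across the grant-table free operation.
 */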
  179. #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
  180. static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
  181. static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
  182. struct blkif_request *req,
  183. struct pending_req *pending_req);
  184. static void make_response(struct xen_blkif_ring *ring, u64 id,
  185. unsigned short op, int st);
  186. #define foreach_grant_safe(pos, n, rbtree, node) \
  187. for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
  188. (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
  189. &(pos)->node != NULL; \
  190. (pos) = container_of(n, typeof(*(pos)), node), \
  191. (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
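/*
 * foreach_grant_safe() is a deletion-safe walk of the persistent-grant
 * rb-tree: the successor 'n' is fetched before the loop body runs, so the
 * body may rb_erase() and free the current node, in the same spirit as
 * list_for_each_entry_safe().
 */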
  192. /*
  193. * We don't need locking around the persistent grant helpers
  194. * because blkback uses a single thread for each backend, so we
  195. * can be sure that these functions will never be called recursively.
  196. *
  197. * The only exception to that is put_persistent_gnt, which can be called
  198. * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
  199. * bit operations to modify the flags of a persistent grant and to count
  200. * the number of used grants.
  201. */
  202. static int add_persistent_gnt(struct xen_blkif_ring *ring,
  203. struct persistent_gnt *persistent_gnt)
  204. {
  205. struct rb_node **new = NULL, *parent = NULL;
  206. struct persistent_gnt *this;
  207. struct xen_blkif *blkif = ring->blkif;
  208. if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
  209. if (!blkif->vbd.overflow_max_grants)
  210. blkif->vbd.overflow_max_grants = 1;
  211. return -EBUSY;
  212. }
  213. /* Figure out where to put new node */
  214. new = &ring->persistent_gnts.rb_node;
  215. while (*new) {
  216. this = container_of(*new, struct persistent_gnt, node);
  217. parent = *new;
  218. if (persistent_gnt->gnt < this->gnt)
  219. new = &((*new)->rb_left);
  220. else if (persistent_gnt->gnt > this->gnt)
  221. new = &((*new)->rb_right);
  222. else {
  223. pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
  224. return -EINVAL;
  225. }
  226. }
  227. persistent_gnt->active = true;
  228. /* Add new node and rebalance tree. */
  229. rb_link_node(&(persistent_gnt->node), parent, new);
  230. rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
  231. ring->persistent_gnt_c++;
  232. atomic_inc(&ring->persistent_gnt_in_use);
  233. return 0;
  234. }
  235. static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
  236. grant_ref_t gref)
  237. {
  238. struct persistent_gnt *data;
  239. struct rb_node *node = NULL;
  240. node = ring->persistent_gnts.rb_node;
  241. while (node) {
  242. data = container_of(node, struct persistent_gnt, node);
  243. if (gref < data->gnt)
  244. node = node->rb_left;
  245. else if (gref > data->gnt)
  246. node = node->rb_right;
  247. else {
  248. if (data->active) {
  249. pr_alert_ratelimited("requesting a grant already in use\n");
  250. return NULL;
  251. }
  252. data->active = true;
  253. atomic_inc(&ring->persistent_gnt_in_use);
  254. return data;
  255. }
  256. }
  257. return NULL;
  258. }
  259. static void put_persistent_gnt(struct xen_blkif_ring *ring,
  260. struct persistent_gnt *persistent_gnt)
  261. {
  262. if (!persistent_gnt->active)
  263. pr_alert_ratelimited("freeing a grant already unused\n");
  264. persistent_gnt->last_used = jiffies;
  265. persistent_gnt->active = false;
  266. atomic_dec(&ring->persistent_gnt_in_use);
  267. }
  268. static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
  269. unsigned int num)
  270. {
  271. struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  272. struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  273. struct persistent_gnt *persistent_gnt;
  274. struct rb_node *n;
  275. int segs_to_unmap = 0;
  276. struct gntab_unmap_queue_data unmap_data;
  277. unmap_data.pages = pages;
  278. unmap_data.unmap_ops = unmap;
  279. unmap_data.kunmap_ops = NULL;
  280. foreach_grant_safe(persistent_gnt, n, root, node) {
  281. BUG_ON(persistent_gnt->handle ==
  282. BLKBACK_INVALID_HANDLE);
  283. gnttab_set_unmap_op(&unmap[segs_to_unmap],
  284. (unsigned long) pfn_to_kaddr(page_to_pfn(
  285. persistent_gnt->page)),
  286. GNTMAP_host_map,
  287. persistent_gnt->handle);
  288. pages[segs_to_unmap] = persistent_gnt->page;
  289. if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
  290. !rb_next(&persistent_gnt->node)) {
  291. unmap_data.count = segs_to_unmap;
  292. BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  293. put_free_pages(ring, pages, segs_to_unmap);
  294. segs_to_unmap = 0;
  295. }
  296. rb_erase(&persistent_gnt->node, root);
  297. kfree(persistent_gnt);
  298. num--;
  299. }
  300. BUG_ON(num != 0);
  301. }
  302. void xen_blkbk_unmap_purged_grants(struct work_struct *work)
  303. {
  304. struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  305. struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  306. struct persistent_gnt *persistent_gnt;
  307. int segs_to_unmap = 0;
  308. struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
  309. struct gntab_unmap_queue_data unmap_data;
  310. unmap_data.pages = pages;
  311. unmap_data.unmap_ops = unmap;
  312. unmap_data.kunmap_ops = NULL;
  313. while (!list_empty(&ring->persistent_purge_list)) {
  314. persistent_gnt = list_first_entry(&ring->persistent_purge_list,
  315. struct persistent_gnt,
  316. remove_node);
  317. list_del(&persistent_gnt->remove_node);
  318. gnttab_set_unmap_op(&unmap[segs_to_unmap],
  319. vaddr(persistent_gnt->page),
  320. GNTMAP_host_map,
  321. persistent_gnt->handle);
  322. pages[segs_to_unmap] = persistent_gnt->page;
  323. if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
  324. unmap_data.count = segs_to_unmap;
  325. BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  326. put_free_pages(ring, pages, segs_to_unmap);
  327. segs_to_unmap = 0;
  328. }
  329. kfree(persistent_gnt);
  330. }
  331. if (segs_to_unmap > 0) {
  332. unmap_data.count = segs_to_unmap;
  333. BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  334. put_free_pages(ring, pages, segs_to_unmap);
  335. }
  336. }
  337. static void purge_persistent_gnt(struct xen_blkif_ring *ring)
  338. {
  339. struct persistent_gnt *persistent_gnt;
  340. struct rb_node *n;
  341. unsigned int num_clean, total;
  342. bool scan_used = false;
  343. struct rb_root *root;
  344. if (work_busy(&ring->persistent_purge_work)) {
  345. pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
  346. goto out;
  347. }
  348. if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
  349. (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
  350. !ring->blkif->vbd.overflow_max_grants)) {
  351. num_clean = 0;
  352. } else {
  353. num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
  354. num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants +
  355. num_clean;
  356. num_clean = min(ring->persistent_gnt_c, num_clean);
  357. pr_debug("Going to purge at least %u persistent grants\n",
  358. num_clean);
  359. }
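/*
 * Illustrative numbers (not computed by the code itself): with the default
 * xen_blkif_max_pgrants = 1056 and LRU_PERCENT_CLEAN = 5, the base quota is
 * 1056 / 100 * 5 = 50 grants, plus however many grants we are over the
 * limit, capped at persistent_gnt_c.
 */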
  360. /*
  361. * At this point, we can be sure that there will be no calls
  362. * to get_persistent_gnt (because we are executing this code from
  363. * xen_blkif_schedule); there can only be calls to put_persistent_gnt,
  364. * which means that the number of currently used grants will go down,
  365. * but never up, so we will always be able to remove the requested
  366. * number of grants.
  367. */
  368. total = 0;
  369. BUG_ON(!list_empty(&ring->persistent_purge_list));
  370. root = &ring->persistent_gnts;
  371. purge_list:
  372. foreach_grant_safe(persistent_gnt, n, root, node) {
  373. BUG_ON(persistent_gnt->handle ==
  374. BLKBACK_INVALID_HANDLE);
  375. if (persistent_gnt->active)
  376. continue;
  377. if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
  378. continue;
  379. if (scan_used && total >= num_clean)
  380. continue;
  381. rb_erase(&persistent_gnt->node, root);
  382. list_add(&persistent_gnt->remove_node,
  383. &ring->persistent_purge_list);
  384. total++;
  385. }
  386. /*
  387. * Check whether we also need to start cleaning
  388. * grants that were used since the last purge in order to reach
  389. * the requested number.
  390. */
  391. if (!scan_used && total < num_clean) {
  392. pr_debug("Still missing %u purged frames\n", num_clean - total);
  393. scan_used = true;
  394. goto purge_list;
  395. }
  396. if (total) {
  397. ring->persistent_gnt_c -= total;
  398. ring->blkif->vbd.overflow_max_grants = 0;
  399. /* We can defer this work */
  400. schedule_work(&ring->persistent_purge_work);
  401. pr_debug("Purged %u/%u\n", num_clean, total);
  402. }
  403. out:
  404. return;
  405. }
  406. /*
  407. * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
  408. */
  409. static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
  410. {
  411. struct pending_req *req = NULL;
  412. unsigned long flags;
  413. spin_lock_irqsave(&ring->pending_free_lock, flags);
  414. if (!list_empty(&ring->pending_free)) {
  415. req = list_entry(ring->pending_free.next, struct pending_req,
  416. free_list);
  417. list_del(&req->free_list);
  418. }
  419. spin_unlock_irqrestore(&ring->pending_free_lock, flags);
  420. return req;
  421. }
  422. /*
  423. * Return the 'pending_req' structure back to the free pool. We also
  424. * wake up the thread if it was waiting for a free pending_req.
  425. */
  426. static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
  427. {
  428. unsigned long flags;
  429. int was_empty;
  430. spin_lock_irqsave(&ring->pending_free_lock, flags);
  431. was_empty = list_empty(&ring->pending_free);
  432. list_add(&req->free_list, &ring->pending_free);
  433. spin_unlock_irqrestore(&ring->pending_free_lock, flags);
  434. if (was_empty)
  435. wake_up(&ring->pending_free_wq);
  436. }
  437. /*
  438. * Routines for managing virtual block devices (vbds).
  439. */
  440. static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
  441. int operation)
  442. {
  443. struct xen_vbd *vbd = &blkif->vbd;
  444. int rc = -EACCES;
  445. if ((operation != REQ_OP_READ) && vbd->readonly)
  446. goto out;
  447. if (likely(req->nr_sects)) {
  448. blkif_sector_t end = req->sector_number + req->nr_sects;
  449. if (unlikely(end < req->sector_number))
  450. goto out;
  451. if (unlikely(end > vbd_sz(vbd)))
  452. goto out;
  453. }
  454. req->dev = vbd->pdevice;
  455. req->bdev = vbd->bdev;
  456. rc = 0;
  457. out:
  458. return rc;
  459. }
  460. static void xen_vbd_resize(struct xen_blkif *blkif)
  461. {
  462. struct xen_vbd *vbd = &blkif->vbd;
  463. struct xenbus_transaction xbt;
  464. int err;
  465. struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
  466. unsigned long long new_size = vbd_sz(vbd);
  467. pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
  468. blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
  469. pr_info("VBD Resize: new size %llu\n", new_size);
  470. vbd->size = new_size;
  471. again:
  472. err = xenbus_transaction_start(&xbt);
  473. if (err) {
  474. pr_warn("Error starting transaction\n");
  475. return;
  476. }
  477. err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
  478. (unsigned long long)vbd_sz(vbd));
  479. if (err) {
  480. pr_warn("Error writing new size\n");
  481. goto abort;
  482. }
  483. /*
  484. * Write the current state; we will use this to synchronize
  485. * the front-end. If the current state is "connected" the
  486. * front-end will get the new size information online.
  487. */
  488. err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
  489. if (err) {
  490. pr_warn("Error writing the state\n");
  491. goto abort;
  492. }
  493. err = xenbus_transaction_end(xbt, 0);
  494. if (err == -EAGAIN)
  495. goto again;
  496. if (err)
  497. pr_warn("Error ending transaction\n");
  498. return;
  499. abort:
  500. xenbus_transaction_end(xbt, 1);
  501. }
  502. /*
  503. * Notification from the guest OS.
  504. */
  505. static void blkif_notify_work(struct xen_blkif_ring *ring)
  506. {
  507. ring->waiting_reqs = 1;
  508. wake_up(&ring->wq);
  509. }
  510. irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
  511. {
  512. blkif_notify_work(dev_id);
  513. return IRQ_HANDLED;
  514. }
  515. /*
  516. * SCHEDULER FUNCTIONS
  517. */
  518. static void print_stats(struct xen_blkif_ring *ring)
  519. {
  520. pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
  521. " | ds %4llu | pg: %4u/%4d\n",
  522. current->comm, ring->st_oo_req,
  523. ring->st_rd_req, ring->st_wr_req,
  524. ring->st_f_req, ring->st_ds_req,
  525. ring->persistent_gnt_c,
  526. xen_blkif_max_pgrants);
  527. ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
  528. ring->st_rd_req = 0;
  529. ring->st_wr_req = 0;
  530. ring->st_oo_req = 0;
  531. ring->st_ds_req = 0;
  532. }
  533. int xen_blkif_schedule(void *arg)
  534. {
  535. struct xen_blkif_ring *ring = arg;
  536. struct xen_blkif *blkif = ring->blkif;
  537. struct xen_vbd *vbd = &blkif->vbd;
  538. unsigned long timeout;
  539. int ret;
  540. bool do_eoi;
  541. unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
  542. set_freezable();
  543. while (!kthread_should_stop()) {
  544. if (try_to_freeze())
  545. continue;
  546. if (unlikely(vbd->size != vbd_sz(vbd)))
  547. xen_vbd_resize(blkif);
  548. timeout = msecs_to_jiffies(LRU_INTERVAL);
  549. timeout = wait_event_interruptible_timeout(
  550. ring->wq,
  551. ring->waiting_reqs || kthread_should_stop(),
  552. timeout);
  553. if (timeout == 0)
  554. goto purge_gnt_list;
  555. timeout = wait_event_interruptible_timeout(
  556. ring->pending_free_wq,
  557. !list_empty(&ring->pending_free) ||
  558. kthread_should_stop(),
  559. timeout);
  560. if (timeout == 0)
  561. goto purge_gnt_list;
  562. do_eoi = ring->waiting_reqs;
  563. ring->waiting_reqs = 0;
  564. smp_mb(); /* clear flag *before* checking for work */
  565. ret = do_block_io_op(ring, &eoi_flags);
  566. if (ret > 0)
  567. ring->waiting_reqs = 1;
  568. if (ret == -EACCES)
  569. wait_event_interruptible(ring->shutdown_wq,
  570. kthread_should_stop());
  571. if (do_eoi && !ring->waiting_reqs) {
  572. xen_irq_lateeoi(ring->irq, eoi_flags);
  573. eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
  574. }
  575. purge_gnt_list:
  576. if (blkif->vbd.feature_gnt_persistent &&
  577. time_after(jiffies, ring->next_lru)) {
  578. purge_persistent_gnt(ring);
  579. ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
  580. }
  581. /* Shrink if we have more than xen_blkif_max_buffer_pages */
  582. shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
  583. if (log_stats && time_after(jiffies, ring->st_print))
  584. print_stats(ring);
  585. }
  586. /* Drain pending purge work */
  587. flush_work(&ring->persistent_purge_work);
  588. if (log_stats)
  589. print_stats(ring);
  590. ring->xenblkd = NULL;
  591. return 0;
  592. }
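/*
 * Summary of the loop above: xen_blkif_schedule() is the per-ring kernel
 * thread. It waits for work (or the LRU_INTERVAL timeout), processes the
 * ring via do_block_io_op(), signals a late EOI on the event channel only
 * when no further requests were found, and periodically purges persistent
 * grants and shrinks the free-page pool.
 */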
  593. /*
  594. * Remove persistent grants and empty the pool of free pages
  595. */
  596. void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
  597. {
  598. /* Free all persistent grant pages */
  599. if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
  600. free_persistent_gnts(ring, &ring->persistent_gnts,
  601. ring->persistent_gnt_c);
  602. BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
  603. ring->persistent_gnt_c = 0;
  604. /* Since we are shutting down remove all pages from the buffer */
  605. shrink_free_pagepool(ring, 0 /* All */);
  606. }
  607. static unsigned int xen_blkbk_unmap_prepare(
  608. struct xen_blkif_ring *ring,
  609. struct grant_page **pages,
  610. unsigned int num,
  611. struct gnttab_unmap_grant_ref *unmap_ops,
  612. struct page **unmap_pages)
  613. {
  614. unsigned int i, invcount = 0;
  615. for (i = 0; i < num; i++) {
  616. if (pages[i]->persistent_gnt != NULL) {
  617. put_persistent_gnt(ring, pages[i]->persistent_gnt);
  618. continue;
  619. }
  620. if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
  621. continue;
  622. unmap_pages[invcount] = pages[i]->page;
  623. gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
  624. GNTMAP_host_map, pages[i]->handle);
  625. pages[i]->handle = BLKBACK_INVALID_HANDLE;
  626. invcount++;
  627. }
  628. return invcount;
  629. }
  630. static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
  631. {
  632. struct pending_req *pending_req = (struct pending_req *)(data->data);
  633. struct xen_blkif_ring *ring = pending_req->ring;
  634. struct xen_blkif *blkif = ring->blkif;
  635. /* BUG_ON used to reproduce existing behaviour,
  636. but is this the best way to deal with this? */
  637. BUG_ON(result);
  638. put_free_pages(ring, data->pages, data->count);
  639. make_response(ring, pending_req->id,
  640. pending_req->operation, pending_req->status);
  641. free_req(ring, pending_req);
  642. /*
  643. * Make sure the request is freed before releasing blkif,
  644. * or there could be a race between free_req and the
  645. * cleanup done in xen_blkif_free during shutdown.
  646. *
  647. * NB: The fact that we might try to wake up pending_free_wq
  648. * before drain_complete (in case there's a drain going on)
  649. * is not a problem with our current implementation,
  650. * because we can be sure there's no thread waiting on
  651. * pending_free_wq if there's a drain going on, but it has
  652. * to be taken into account if the current model is changed.
  653. */
  654. if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
  655. complete(&blkif->drain_complete);
  656. }
  657. xen_blkif_put(blkif);
  658. }
  659. static void xen_blkbk_unmap_and_respond(struct pending_req *req)
  660. {
  661. struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
  662. struct xen_blkif_ring *ring = req->ring;
  663. struct grant_page **pages = req->segments;
  664. unsigned int invcount;
  665. invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
  666. req->unmap, req->unmap_pages);
  667. work->data = req;
  668. work->done = xen_blkbk_unmap_and_respond_callback;
  669. work->unmap_ops = req->unmap;
  670. work->kunmap_ops = NULL;
  671. work->pages = req->unmap_pages;
  672. work->count = invcount;
  673. gnttab_unmap_refs_async(&req->gnttab_unmap_data);
  674. }
  675. /*
  676. * Unmap the grant references.
  677. *
  678. * This could accumulate ops up to the batch size to reduce the number
  679. * of hypercalls, but since this is only used in error paths there's
  680. * no real need.
  681. */
  682. static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
  683. struct grant_page *pages[],
  684. int num)
  685. {
  686. struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  687. struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  688. unsigned int invcount = 0;
  689. int ret;
  690. while (num) {
  691. unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
  692. invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
  693. unmap, unmap_pages);
  694. if (invcount) {
  695. ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
  696. BUG_ON(ret);
  697. put_free_pages(ring, unmap_pages, invcount);
  698. }
  699. pages += batch;
  700. num -= batch;
  701. }
  702. }
  703. static int xen_blkbk_map(struct xen_blkif_ring *ring,
  704. struct grant_page *pages[],
  705. int num, bool ro)
  706. {
  707. struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  708. struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
  709. struct persistent_gnt *persistent_gnt = NULL;
  710. phys_addr_t addr = 0;
  711. int i, seg_idx, new_map_idx;
  712. int segs_to_map = 0;
  713. int ret = 0;
  714. int last_map = 0, map_until = 0;
  715. int use_persistent_gnts;
  716. struct xen_blkif *blkif = ring->blkif;
  717. use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
  718. /*
  719. * Fill out preq.nr_sects with the proper number of sectors, and set up
  720. * map[..] with the PFN of the page in our domain and the
  721. * corresponding grant reference for each page.
  722. */
  723. again:
  724. for (i = map_until; i < num; i++) {
  725. uint32_t flags;
  726. if (use_persistent_gnts) {
  727. persistent_gnt = get_persistent_gnt(
  728. ring,
  729. pages[i]->gref);
  730. }
  731. if (persistent_gnt) {
  732. /*
  733. * We are using persistent grants and
  734. * the grant is already mapped
  735. */
  736. pages[i]->page = persistent_gnt->page;
  737. pages[i]->persistent_gnt = persistent_gnt;
  738. } else {
  739. if (get_free_page(ring, &pages[i]->page)) {
  740. put_free_pages(ring, pages_to_gnt, segs_to_map);
  741. ret = -ENOMEM;
  742. goto out;
  743. }
  744. addr = vaddr(pages[i]->page);
  745. pages_to_gnt[segs_to_map] = pages[i]->page;
  746. pages[i]->persistent_gnt = NULL;
  747. flags = GNTMAP_host_map;
  748. if (!use_persistent_gnts && ro)
  749. flags |= GNTMAP_readonly;
  750. gnttab_set_map_op(&map[segs_to_map++], addr,
  751. flags, pages[i]->gref,
  752. blkif->domid);
  753. }
  754. map_until = i + 1;
  755. if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
  756. break;
  757. }
  758. if (segs_to_map)
  759. ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
  760. /*
  761. * Now swizzle the MFN in our domain with the MFN from the other domain
  762. * so that when we access vaddr(pending_req,i) it has the contents of
  763. * the page from the other domain.
  764. */
  765. for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
  766. if (!pages[seg_idx]->persistent_gnt) {
  767. /* This is a newly mapped grant */
  768. BUG_ON(new_map_idx >= segs_to_map);
  769. if (unlikely(map[new_map_idx].status != 0)) {
  770. pr_debug("invalid buffer -- could not remap it\n");
  771. put_free_pages(ring, &pages[seg_idx]->page, 1);
  772. pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
  773. ret |= !ret;
  774. goto next;
  775. }
  776. pages[seg_idx]->handle = map[new_map_idx].handle;
  777. } else {
  778. continue;
  779. }
  780. if (use_persistent_gnts &&
  781. ring->persistent_gnt_c < xen_blkif_max_pgrants) {
  782. /*
  783. * We are using persistent grants, the grant is
  784. * not mapped but we might have room for it.
  785. */
  786. persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
  787. GFP_KERNEL);
  788. if (!persistent_gnt) {
  789. /*
  790. * If we don't have enough memory to
  791. * allocate the persistent_gnt struct
  792. * map this grant non-persistently
  793. */
  794. goto next;
  795. }
  796. persistent_gnt->gnt = map[new_map_idx].ref;
  797. persistent_gnt->handle = map[new_map_idx].handle;
  798. persistent_gnt->page = pages[seg_idx]->page;
  799. if (add_persistent_gnt(ring,
  800. persistent_gnt)) {
  801. kfree(persistent_gnt);
  802. persistent_gnt = NULL;
  803. goto next;
  804. }
  805. pages[seg_idx]->persistent_gnt = persistent_gnt;
  806. pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
  807. persistent_gnt->gnt, ring->persistent_gnt_c,
  808. xen_blkif_max_pgrants);
  809. goto next;
  810. }
  811. if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
  812. blkif->vbd.overflow_max_grants = 1;
  813. pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
  814. blkif->domid, blkif->vbd.handle);
  815. }
  816. /*
  817. * We could not map this grant persistently, so use it as
  818. * a non-persistent grant.
  819. */
  820. next:
  821. new_map_idx++;
  822. }
  823. segs_to_map = 0;
  824. last_map = map_until;
  825. if (!ret && map_until != num)
  826. goto again;
  827. out:
  828. for (i = last_map; i < num; i++) {
  829. /* Don't zap current batch's valid persistent grants. */
  830. if (i >= map_until)
  831. pages[i]->persistent_gnt = NULL;
  832. pages[i]->handle = BLKBACK_INVALID_HANDLE;
  833. }
  834. return ret;
  835. }
  836. static int xen_blkbk_map_seg(struct pending_req *pending_req)
  837. {
  838. int rc;
  839. rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
  840. pending_req->nr_segs,
  841. (pending_req->operation != BLKIF_OP_READ));
  842. return rc;
  843. }
  844. static int xen_blkbk_parse_indirect(struct blkif_request *req,
  845. struct pending_req *pending_req,
  846. struct seg_buf seg[],
  847. struct phys_req *preq)
  848. {
  849. struct grant_page **pages = pending_req->indirect_pages;
  850. struct xen_blkif_ring *ring = pending_req->ring;
  851. int indirect_grefs, rc, n, nseg, i;
  852. struct blkif_request_segment *segments = NULL;
  853. nseg = pending_req->nr_segs;
  854. indirect_grefs = INDIRECT_PAGES(nseg);
  855. BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
  856. for (i = 0; i < indirect_grefs; i++)
  857. pages[i]->gref = req->u.indirect.indirect_grefs[i];
  858. rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
  859. if (rc)
  860. goto unmap;
  861. for (n = 0, i = 0; n < nseg; n++) {
  862. uint8_t first_sect, last_sect;
  863. if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
  864. /* Map indirect segments */
  865. if (segments)
  866. kunmap_atomic(segments);
  867. segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
  868. }
  869. i = n % SEGS_PER_INDIRECT_FRAME;
  870. pending_req->segments[n]->gref = segments[i].gref;
  871. first_sect = READ_ONCE(segments[i].first_sect);
  872. last_sect = READ_ONCE(segments[i].last_sect);
  873. if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
  874. rc = -EINVAL;
  875. goto unmap;
  876. }
  877. seg[n].nsec = last_sect - first_sect + 1;
  878. seg[n].offset = first_sect << 9;
  879. preq->nr_sects += seg[n].nsec;
  880. }
  881. unmap:
  882. if (segments)
  883. kunmap_atomic(segments);
  884. xen_blkbk_unmap(ring, pages, indirect_grefs);
  885. return rc;
  886. }
  887. static int dispatch_discard_io(struct xen_blkif_ring *ring,
  888. struct blkif_request *req)
  889. {
  890. int err = 0;
  891. int status = BLKIF_RSP_OKAY;
  892. struct xen_blkif *blkif = ring->blkif;
  893. struct block_device *bdev = blkif->vbd.bdev;
  894. unsigned long secure;
  895. struct phys_req preq;
  896. xen_blkif_get(blkif);
  897. preq.sector_number = req->u.discard.sector_number;
  898. preq.nr_sects = req->u.discard.nr_sectors;
  899. err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
  900. if (err) {
  901. pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
  902. preq.sector_number,
  903. preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
  904. goto fail_response;
  905. }
  906. ring->st_ds_req++;
  907. secure = (blkif->vbd.discard_secure &&
  908. (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
  909. BLKDEV_DISCARD_SECURE : 0;
  910. err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
  911. req->u.discard.nr_sectors,
  912. GFP_KERNEL, secure);
  913. fail_response:
  914. if (err == -EOPNOTSUPP) {
  915. pr_debug("discard op failed, not supported\n");
  916. status = BLKIF_RSP_EOPNOTSUPP;
  917. } else if (err)
  918. status = BLKIF_RSP_ERROR;
  919. make_response(ring, req->u.discard.id, req->operation, status);
  920. xen_blkif_put(blkif);
  921. return err;
  922. }
  923. static int dispatch_other_io(struct xen_blkif_ring *ring,
  924. struct blkif_request *req,
  925. struct pending_req *pending_req)
  926. {
  927. free_req(ring, pending_req);
  928. make_response(ring, req->u.other.id, req->operation,
  929. BLKIF_RSP_EOPNOTSUPP);
  930. return -EIO;
  931. }
  932. static void xen_blk_drain_io(struct xen_blkif_ring *ring)
  933. {
  934. struct xen_blkif *blkif = ring->blkif;
  935. atomic_set(&blkif->drain, 1);
  936. do {
  937. if (atomic_read(&ring->inflight) == 0)
  938. break;
  939. wait_for_completion_interruptible_timeout(
  940. &blkif->drain_complete, HZ);
  941. if (!atomic_read(&blkif->drain))
  942. break;
  943. } while (!kthread_should_stop());
  944. atomic_set(&blkif->drain, 0);
  945. }
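/*
 * xen_blk_drain_io() supports BLKIF_OP_WRITE_BARRIER: dispatch_rw_block_io()
 * calls it before submitting the barrier write, so the barrier is only
 * issued once the ring's in-flight requests have completed (the wait polls
 * drain_complete with a HZ timeout).
 */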
  946. static void __end_block_io_op(struct pending_req *pending_req,
  947. blk_status_t error)
  948. {
  949. /* An error fails the entire request. */
  950. if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
  951. error == BLK_STS_NOTSUPP) {
  952. pr_debug("flush diskcache op failed, not supported\n");
  953. xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
  954. pending_req->status = BLKIF_RSP_EOPNOTSUPP;
  955. } else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
  956. error == BLK_STS_NOTSUPP) {
  957. pr_debug("write barrier op failed, not supported\n");
  958. xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
  959. pending_req->status = BLKIF_RSP_EOPNOTSUPP;
  960. } else if (error) {
  961. pr_debug("Buffer not up-to-date at end of operation,"
  962. " error=%d\n", error);
  963. pending_req->status = BLKIF_RSP_ERROR;
  964. }
  965. /*
  966. * If all of the bio's have completed it is time to unmap
  967. * the grant references associated with 'request' and provide
  968. * the proper response on the ring.
  969. */
  970. if (atomic_dec_and_test(&pending_req->pendcnt))
  971. xen_blkbk_unmap_and_respond(pending_req);
  972. }
  973. /*
  974. * bio callback.
  975. */
  976. static void end_block_io_op(struct bio *bio)
  977. {
  978. __end_block_io_op(bio->bi_private, bio->bi_status);
  979. bio_put(bio);
  980. }
  981. /*
  982. * Function to copy the 'struct blkif_request' from the ring buffer
  983. * (which has the sectors we want, the number of them, grant references, etc.),
  984. * and transmute it to the block API to hand it over to the proper block disk.
  985. */
  986. static int
  987. __do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
  988. {
  989. union blkif_back_rings *blk_rings = &ring->blk_rings;
  990. struct blkif_request req;
  991. struct pending_req *pending_req;
  992. RING_IDX rc, rp;
  993. int more_to_do = 0;
  994. rc = blk_rings->common.req_cons;
  995. rp = blk_rings->common.sring->req_prod;
  996. rmb(); /* Ensure we see queued requests up to 'rp'. */
  997. if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
  998. rc = blk_rings->common.rsp_prod_pvt;
  999. pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
  1000. rp, rc, rp - rc, ring->blkif->vbd.pdevice);
  1001. return -EACCES;
  1002. }
  1003. while (rc != rp) {
  1004. if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
  1005. break;
  1006. /* We've seen a request, so clear spurious eoi flag. */
  1007. *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
  1008. if (kthread_should_stop()) {
  1009. more_to_do = 1;
  1010. break;
  1011. }
  1012. pending_req = alloc_req(ring);
  1013. if (NULL == pending_req) {
  1014. ring->st_oo_req++;
  1015. more_to_do = 1;
  1016. break;
  1017. }
  1018. switch (ring->blkif->blk_protocol) {
  1019. case BLKIF_PROTOCOL_NATIVE:
  1020. memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
  1021. break;
  1022. case BLKIF_PROTOCOL_X86_32:
  1023. blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
  1024. break;
  1025. case BLKIF_PROTOCOL_X86_64:
  1026. blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
  1027. break;
  1028. default:
  1029. BUG();
  1030. }
  1031. blk_rings->common.req_cons = ++rc; /* before make_response() */
  1032. /* Apply all sanity checks to /private copy/ of request. */
  1033. barrier();
  1034. switch (req.operation) {
  1035. case BLKIF_OP_READ:
  1036. case BLKIF_OP_WRITE:
  1037. case BLKIF_OP_WRITE_BARRIER:
  1038. case BLKIF_OP_FLUSH_DISKCACHE:
  1039. case BLKIF_OP_INDIRECT:
  1040. if (dispatch_rw_block_io(ring, &req, pending_req))
  1041. goto done;
  1042. break;
  1043. case BLKIF_OP_DISCARD:
  1044. free_req(ring, pending_req);
  1045. if (dispatch_discard_io(ring, &req))
  1046. goto done;
  1047. break;
  1048. default:
  1049. if (dispatch_other_io(ring, &req, pending_req))
  1050. goto done;
  1051. break;
  1052. }
  1053. /* Yield point for this unbounded loop. */
  1054. cond_resched();
  1055. }
  1056. done:
  1057. return more_to_do;
  1058. }
  1059. static int
  1060. do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
  1061. {
  1062. union blkif_back_rings *blk_rings = &ring->blk_rings;
  1063. int more_to_do;
  1064. do {
  1065. more_to_do = __do_block_io_op(ring, eoi_flags);
  1066. if (more_to_do)
  1067. break;
  1068. RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
  1069. } while (more_to_do);
  1070. return more_to_do;
  1071. }
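/*
 * do_block_io_op() wraps __do_block_io_op() with RING_FINAL_CHECK_FOR_REQUESTS
 * so that a request the frontend queued after the last pass (and for which it
 * therefore sent no notification) is still picked up before the thread goes
 * back to sleep.
 */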
  1072. /*
  1073. * Transmute the 'struct blkif_request' into a proper 'struct bio'
  1074. * and call 'submit_bio' to pass it to the underlying storage.
  1075. */
  1076. static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
  1077. struct blkif_request *req,
  1078. struct pending_req *pending_req)
  1079. {
  1080. struct phys_req preq;
  1081. struct seg_buf *seg = pending_req->seg;
  1082. unsigned int nseg;
  1083. struct bio *bio = NULL;
  1084. struct bio **biolist = pending_req->biolist;
  1085. int i, nbio = 0;
  1086. int operation;
  1087. int operation_flags = 0;
  1088. struct blk_plug plug;
  1089. bool drain = false;
  1090. struct grant_page **pages = pending_req->segments;
  1091. unsigned short req_operation;
  1092. req_operation = req->operation == BLKIF_OP_INDIRECT ?
  1093. req->u.indirect.indirect_op : req->operation;
  1094. if ((req->operation == BLKIF_OP_INDIRECT) &&
  1095. (req_operation != BLKIF_OP_READ) &&
  1096. (req_operation != BLKIF_OP_WRITE)) {
  1097. pr_debug("Invalid indirect operation (%u)\n", req_operation);
  1098. goto fail_response;
  1099. }
  1100. switch (req_operation) {
  1101. case BLKIF_OP_READ:
  1102. ring->st_rd_req++;
  1103. operation = REQ_OP_READ;
  1104. break;
  1105. case BLKIF_OP_WRITE:
  1106. ring->st_wr_req++;
  1107. operation = REQ_OP_WRITE;
  1108. operation_flags = REQ_SYNC | REQ_IDLE;
  1109. break;
  1110. case BLKIF_OP_WRITE_BARRIER:
  1111. drain = true;
  1112. /* fall through */
  1113. case BLKIF_OP_FLUSH_DISKCACHE:
  1114. ring->st_f_req++;
  1115. operation = REQ_OP_WRITE;
  1116. operation_flags = REQ_PREFLUSH;
  1117. break;
  1118. default:
  1119. operation = 0; /* make gcc happy */
  1120. goto fail_response;
  1121. break;
  1122. }
  1123. /* Check that the number of segments is sane. */
  1124. nseg = req->operation == BLKIF_OP_INDIRECT ?
  1125. req->u.indirect.nr_segments : req->u.rw.nr_segments;
  1126. if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
  1127. unlikely((req->operation != BLKIF_OP_INDIRECT) &&
  1128. (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
  1129. unlikely((req->operation == BLKIF_OP_INDIRECT) &&
  1130. (nseg > MAX_INDIRECT_SEGMENTS))) {
  1131. pr_debug("Bad number of segments in request (%d)\n", nseg);
  1132. /* Haven't submitted any bio's yet. */
  1133. goto fail_response;
  1134. }
  1135. preq.nr_sects = 0;
  1136. pending_req->ring = ring;
  1137. pending_req->id = req->u.rw.id;
  1138. pending_req->operation = req_operation;
  1139. pending_req->status = BLKIF_RSP_OKAY;
  1140. pending_req->nr_segs = nseg;
  1141. if (req->operation != BLKIF_OP_INDIRECT) {
  1142. preq.dev = req->u.rw.handle;
  1143. preq.sector_number = req->u.rw.sector_number;
  1144. for (i = 0; i < nseg; i++) {
  1145. pages[i]->gref = req->u.rw.seg[i].gref;
  1146. seg[i].nsec = req->u.rw.seg[i].last_sect -
  1147. req->u.rw.seg[i].first_sect + 1;
  1148. seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
  1149. if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
  1150. (req->u.rw.seg[i].last_sect <
  1151. req->u.rw.seg[i].first_sect))
  1152. goto fail_response;
  1153. preq.nr_sects += seg[i].nsec;
  1154. }
  1155. } else {
  1156. preq.dev = req->u.indirect.handle;
  1157. preq.sector_number = req->u.indirect.sector_number;
  1158. if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
  1159. goto fail_response;
  1160. }
  1161. if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
  1162. pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
  1163. operation == REQ_OP_READ ? "read" : "write",
  1164. preq.sector_number,
  1165. preq.sector_number + preq.nr_sects,
  1166. ring->blkif->vbd.pdevice);
  1167. goto fail_response;
  1168. }
  1169. /*
  1170. * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
  1171. * is set there.
  1172. */
  1173. for (i = 0; i < nseg; i++) {
  1174. if (((int)preq.sector_number|(int)seg[i].nsec) &
  1175. ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
  1176. pr_debug("Misaligned I/O request from domain %d\n",
  1177. ring->blkif->domid);
  1178. goto fail_response;
  1179. }
  1180. }
  1181. /* Wait on all outstanding I/O's and once that has been completed
  1182. * issue the flush.
  1183. */
  1184. if (drain)
  1185. xen_blk_drain_io(pending_req->ring);
  1186. /*
  1187. * If we have failed at this point, we need to undo the M2P override,
  1188. * set gnttab_set_unmap_op on all of the grant references and perform
  1189. * the hypercall to unmap the grants - that is all done in
  1190. * xen_blkbk_unmap.
  1191. */
  1192. if (xen_blkbk_map_seg(pending_req))
  1193. goto fail_flush;
  1194. /*
  1195. * This corresponding xen_blkif_put is done in __end_block_io_op, or
  1196. * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
  1197. */
  1198. xen_blkif_get(ring->blkif);
  1199. atomic_inc(&ring->inflight);
  1200. for (i = 0; i < nseg; i++) {
  1201. while ((bio == NULL) ||
  1202. (bio_add_page(bio,
  1203. pages[i]->page,
  1204. seg[i].nsec << 9,
  1205. seg[i].offset) == 0)) {
  1206. int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
  1207. bio = bio_alloc(GFP_KERNEL, nr_iovecs);
  1208. if (unlikely(bio == NULL))
  1209. goto fail_put_bio;
  1210. biolist[nbio++] = bio;
  1211. bio_set_dev(bio, preq.bdev);
  1212. bio->bi_private = pending_req;
  1213. bio->bi_end_io = end_block_io_op;
  1214. bio->bi_iter.bi_sector = preq.sector_number;
  1215. bio_set_op_attrs(bio, operation, operation_flags);
  1216. }
  1217. preq.sector_number += seg[i].nsec;
  1218. }
  1219. /* This will be hit if the operation was a flush or discard. */
  1220. if (!bio) {
  1221. BUG_ON(operation_flags != REQ_PREFLUSH);
  1222. bio = bio_alloc(GFP_KERNEL, 0);
  1223. if (unlikely(bio == NULL))
  1224. goto fail_put_bio;
  1225. biolist[nbio++] = bio;
  1226. bio_set_dev(bio, preq.bdev);
  1227. bio->bi_private = pending_req;
  1228. bio->bi_end_io = end_block_io_op;
  1229. bio_set_op_attrs(bio, operation, operation_flags);
  1230. }
  1231. atomic_set(&pending_req->pendcnt, nbio);
  1232. blk_start_plug(&plug);
  1233. for (i = 0; i < nbio; i++)
  1234. submit_bio(biolist[i]);
  1235. /* Let the I/Os go.. */
  1236. blk_finish_plug(&plug);
  1237. if (operation == REQ_OP_READ)
  1238. ring->st_rd_sect += preq.nr_sects;
  1239. else if (operation == REQ_OP_WRITE)
  1240. ring->st_wr_sect += preq.nr_sects;
  1241. return 0;
  1242. fail_flush:
  1243. xen_blkbk_unmap(ring, pending_req->segments,
  1244. pending_req->nr_segs);
  1245. fail_response:
  1246. /* Haven't submitted any bio's yet. */
  1247. make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
  1248. free_req(ring, pending_req);
  1249. msleep(1); /* back off a bit */
  1250. return -EIO;
  1251. fail_put_bio:
  1252. for (i = 0; i < nbio; i++)
  1253. bio_put(biolist[i]);
  1254. atomic_set(&pending_req->pendcnt, 1);
  1255. __end_block_io_op(pending_req, BLK_STS_RESOURCE);
  1256. msleep(1); /* back off a bit */
  1257. return -EIO;
  1258. }
  1259. /*
  1260. * Put a response on the ring reporting how the operation fared.
  1261. */
  1262. static void make_response(struct xen_blkif_ring *ring, u64 id,
  1263. unsigned short op, int st)
  1264. {
  1265. struct blkif_response *resp;
  1266. unsigned long flags;
  1267. union blkif_back_rings *blk_rings;
  1268. int notify;
  1269. spin_lock_irqsave(&ring->blk_ring_lock, flags);
  1270. blk_rings = &ring->blk_rings;
  1271. /* Place on the response ring for the relevant domain. */
  1272. switch (ring->blkif->blk_protocol) {
  1273. case BLKIF_PROTOCOL_NATIVE:
  1274. resp = RING_GET_RESPONSE(&blk_rings->native,
  1275. blk_rings->native.rsp_prod_pvt);
  1276. break;
  1277. case BLKIF_PROTOCOL_X86_32:
  1278. resp = RING_GET_RESPONSE(&blk_rings->x86_32,
  1279. blk_rings->x86_32.rsp_prod_pvt);
  1280. break;
  1281. case BLKIF_PROTOCOL_X86_64:
  1282. resp = RING_GET_RESPONSE(&blk_rings->x86_64,
  1283. blk_rings->x86_64.rsp_prod_pvt);
  1284. break;
  1285. default:
  1286. BUG();
  1287. }
  1288. resp->id = id;
  1289. resp->operation = op;
  1290. resp->status = st;
  1291. blk_rings->common.rsp_prod_pvt++;
  1292. RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
  1293. spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
  1294. if (notify)
  1295. notify_remote_via_irq(ring->irq);
  1296. }
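/*
 * make_response() only raises an interrupt when
 * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY() reports that the frontend is
 * actually waiting for one, which keeps event-channel traffic down when the
 * frontend is already polling the ring.
 */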
  1297. static int __init xen_blkif_init(void)
  1298. {
  1299. int rc = 0;
  1300. if (!xen_domain())
  1301. return -ENODEV;
  1302. if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
  1303. pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
  1304. xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
  1305. xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
  1306. }
  1307. if (xenblk_max_queues == 0)
  1308. xenblk_max_queues = num_online_cpus();
  1309. rc = xen_blkif_interface_init();
  1310. if (rc)
  1311. goto failed_init;
  1312. rc = xen_blkif_xenbus_init();
  1313. if (rc)
  1314. goto failed_init;
  1315. failed_init:
  1316. return rc;
  1317. }
  1318. module_init(xen_blkif_init);
  1319. MODULE_LICENSE("Dual BSD/GPL");
  1320. MODULE_ALIAS("xen-backend:vbd");