vhost.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /* Copyright (C) 2009 Red Hat, Inc.
  3. * Copyright (C) 2006 Rusty Russell IBM Corporation
  4. *
  5. * Author: Michael S. Tsirkin <mst@redhat.com>
  6. *
  7. * Inspiration, some code, and most witty comments come from
  8. * Documentation/virtual/lguest/lguest.c, by Rusty Russell
  9. *
  10. * Generic code for virtio server in host kernel.
  11. */
  12. #include <linux/eventfd.h>
  13. #include <linux/vhost.h>
  14. #include <linux/uio.h>
  15. #include <linux/mm.h>
  16. #include <linux/miscdevice.h>
  17. #include <linux/mutex.h>
  18. #include <linux/poll.h>
  19. #include <linux/file.h>
  20. #include <linux/highmem.h>
  21. #include <linux/slab.h>
  22. #include <linux/vmalloc.h>
  23. #include <linux/kthread.h>
  24. #include <linux/cgroup.h>
  25. #include <linux/module.h>
  26. #include <linux/sort.h>
  27. #include <linux/sched/mm.h>
  28. #include <linux/sched/signal.h>
  29. #include <linux/sched/vhost_task.h>
  30. #include <linux/interval_tree_generic.h>
  31. #include <linux/nospec.h>
  32. #include <linux/kcov.h>
  33. #include "vhost.h"
  34. static ushort max_mem_regions = 64;
  35. module_param(max_mem_regions, ushort, 0444);
  36. MODULE_PARM_DESC(max_mem_regions,
  37. "Maximum number of memory regions in memory map. (default: 64)");
  38. static int max_iotlb_entries = 2048;
  39. module_param(max_iotlb_entries, int, 0444);
  40. MODULE_PARM_DESC(max_iotlb_entries,
  41. "Maximum number of iotlb entries. (default: 2048)");
  42. static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
  43. #ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
  44. module_param(fork_from_owner_default, bool, 0444);
  45. MODULE_PARM_DESC(fork_from_owner_default,
  46. "Set task mode as the default(default: Y)");
  47. #endif
  48. enum {
  49. VHOST_MEMORY_F_LOG = 0x1,
  50. };
  51. #define vhost_used_event(vq) ((__virtio16 __user *)&vq->avail->ring[vq->num])
  52. #define vhost_avail_event(vq) ((__virtio16 __user *)&vq->used->ring[vq->num])
  53. #ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
  54. static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
  55. {
  56. vq->user_be = !virtio_legacy_is_little_endian();
  57. }
  58. static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
  59. {
  60. vq->user_be = true;
  61. }
  62. static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
  63. {
  64. vq->user_be = false;
  65. }
  66. static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
  67. {
  68. struct vhost_vring_state s;
  69. if (vq->private_data)
  70. return -EBUSY;
  71. if (copy_from_user(&s, argp, sizeof(s)))
  72. return -EFAULT;
  73. if (s.num != VHOST_VRING_LITTLE_ENDIAN &&
  74. s.num != VHOST_VRING_BIG_ENDIAN)
  75. return -EINVAL;
  76. if (s.num == VHOST_VRING_BIG_ENDIAN)
  77. vhost_enable_cross_endian_big(vq);
  78. else
  79. vhost_enable_cross_endian_little(vq);
  80. return 0;
  81. }
  82. static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
  83. int __user *argp)
  84. {
  85. struct vhost_vring_state s = {
  86. .index = idx,
  87. .num = vq->user_be
  88. };
  89. if (copy_to_user(argp, &s, sizeof(s)))
  90. return -EFAULT;
  91. return 0;
  92. }
  93. static void vhost_init_is_le(struct vhost_virtqueue *vq)
  94. {
  95. /* Note for legacy virtio: user_be is initialized at reset time
  96. * according to the host endianness. If userspace does not set an
  97. * explicit endianness, the default behavior is native endian, as
  98. * expected by legacy virtio.
  99. */
  100. vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1) || !vq->user_be;
  101. }
  102. #else
  103. static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
  104. {
  105. }
  106. static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
  107. {
  108. return -ENOIOCTLCMD;
  109. }
  110. static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
  111. int __user *argp)
  112. {
  113. return -ENOIOCTLCMD;
  114. }
  115. static void vhost_init_is_le(struct vhost_virtqueue *vq)
  116. {
  117. vq->is_le = vhost_has_feature(vq, VIRTIO_F_VERSION_1)
  118. || virtio_legacy_is_little_endian();
  119. }
  120. #endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */
  121. static void vhost_reset_is_le(struct vhost_virtqueue *vq)
  122. {
  123. vhost_init_is_le(vq);
  124. }
  125. struct vhost_flush_struct {
  126. struct vhost_work work;
  127. struct completion wait_event;
  128. };
  129. static void vhost_flush_work(struct vhost_work *work)
  130. {
  131. struct vhost_flush_struct *s;
  132. s = container_of(work, struct vhost_flush_struct, work);
  133. complete(&s->wait_event);
  134. }
  135. static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
  136. poll_table *pt)
  137. {
  138. struct vhost_poll *poll;
  139. poll = container_of(pt, struct vhost_poll, table);
  140. poll->wqh = wqh;
  141. add_wait_queue(wqh, &poll->wait);
  142. }
  143. static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
  144. void *key)
  145. {
  146. struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
  147. struct vhost_work *work = &poll->work;
  148. if (!(key_to_poll(key) & poll->mask))
  149. return 0;
  150. if (!poll->dev->use_worker)
  151. work->fn(work);
  152. else
  153. vhost_poll_queue(poll);
  154. return 0;
  155. }
  156. void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
  157. {
  158. clear_bit(VHOST_WORK_QUEUED, &work->flags);
  159. work->fn = fn;
  160. }
  161. EXPORT_SYMBOL_GPL(vhost_work_init);
  162. /* Init poll structure */
  163. void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
  164. __poll_t mask, struct vhost_dev *dev,
  165. struct vhost_virtqueue *vq)
  166. {
  167. init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
  168. init_poll_funcptr(&poll->table, vhost_poll_func);
  169. poll->mask = mask;
  170. poll->dev = dev;
  171. poll->wqh = NULL;
  172. poll->vq = vq;
  173. vhost_work_init(&poll->work, fn);
  174. }
  175. EXPORT_SYMBOL_GPL(vhost_poll_init);
  176. /* Start polling a file. We add ourselves to file's wait queue. The caller must
  177. * keep a reference to a file until after vhost_poll_stop is called. */
  178. int vhost_poll_start(struct vhost_poll *poll, struct file *file)
  179. {
  180. __poll_t mask;
  181. if (poll->wqh)
  182. return 0;
  183. mask = vfs_poll(file, &poll->table);
  184. if (mask)
  185. vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
  186. if (mask & EPOLLERR) {
  187. vhost_poll_stop(poll);
  188. return -EINVAL;
  189. }
  190. return 0;
  191. }
  192. EXPORT_SYMBOL_GPL(vhost_poll_start);
  193. /* Stop polling a file. After this function returns, it becomes safe to drop the
  194. * file reference. You must also flush afterwards. */
  195. void vhost_poll_stop(struct vhost_poll *poll)
  196. {
  197. if (poll->wqh) {
  198. remove_wait_queue(poll->wqh, &poll->wait);
  199. poll->wqh = NULL;
  200. }
  201. }
  202. EXPORT_SYMBOL_GPL(vhost_poll_stop);
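/* Queue @work on @worker's lockless list and wake the worker, unless the
 * work is already pending (VHOST_WORK_QUEUED already set). */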
  203. static void vhost_worker_queue(struct vhost_worker *worker,
  204. struct vhost_work *work)
  205. {
  206. if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
  207. /* We can only add the work to the list after we're
  208. * sure it was not in the list.
  209. * test_and_set_bit() implies a memory barrier.
  210. */
  211. llist_add(&work->node, &worker->work_list);
  212. worker->ops->wakeup(worker);
  213. }
  214. }
  215. bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
  216. {
  217. struct vhost_worker *worker;
  218. bool queued = false;
  219. rcu_read_lock();
  220. worker = rcu_dereference(vq->worker);
  221. if (worker) {
  222. queued = true;
  223. vhost_worker_queue(worker, work);
  224. }
  225. rcu_read_unlock();
  226. return queued;
  227. }
  228. EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
  229. /**
  230. * __vhost_worker_flush - flush a worker
  231. * @worker: worker to flush
  232. *
  233. * The worker's flush_mutex must be held.
  234. */
  235. static void __vhost_worker_flush(struct vhost_worker *worker)
  236. {
  237. struct vhost_flush_struct flush;
  238. if (!worker->attachment_cnt || worker->killed)
  239. return;
  240. init_completion(&flush.wait_event);
  241. vhost_work_init(&flush.work, vhost_flush_work);
  242. vhost_worker_queue(worker, &flush.work);
  243. /*
  244. * Drop mutex in case our worker is killed and it needs to take the
  245. * mutex to force cleanup.
  246. */
  247. mutex_unlock(&worker->mutex);
  248. wait_for_completion(&flush.wait_event);
  249. mutex_lock(&worker->mutex);
  250. }
  251. static void vhost_worker_flush(struct vhost_worker *worker)
  252. {
  253. mutex_lock(&worker->mutex);
  254. __vhost_worker_flush(worker);
  255. mutex_unlock(&worker->mutex);
  256. }
  257. void vhost_dev_flush(struct vhost_dev *dev)
  258. {
  259. struct vhost_worker *worker;
  260. unsigned long i;
  261. xa_for_each(&dev->worker_xa, i, worker)
  262. vhost_worker_flush(worker);
  263. }
  264. EXPORT_SYMBOL_GPL(vhost_dev_flush);
  265. /* A lockless hint for busy polling code to exit the loop */
  266. bool vhost_vq_has_work(struct vhost_virtqueue *vq)
  267. {
  268. struct vhost_worker *worker;
  269. bool has_work = false;
  270. rcu_read_lock();
  271. worker = rcu_dereference(vq->worker);
  272. if (worker && !llist_empty(&worker->work_list))
  273. has_work = true;
  274. rcu_read_unlock();
  275. return has_work;
  276. }
  277. EXPORT_SYMBOL_GPL(vhost_vq_has_work);
  278. void vhost_poll_queue(struct vhost_poll *poll)
  279. {
  280. vhost_vq_work_queue(poll->vq, &poll->work);
  281. }
  282. EXPORT_SYMBOL_GPL(vhost_poll_queue);
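/* Drop the cached IOTLB translations for the vq's descriptor, avail and
 * used rings so the next access goes through a fresh translation. */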
  283. static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
  284. {
  285. int j;
  286. for (j = 0; j < VHOST_NUM_ADDRS; j++)
  287. vq->meta_iotlb[j] = NULL;
  288. }
  289. static void vhost_vq_meta_reset(struct vhost_dev *d)
  290. {
  291. int i;
  292. for (i = 0; i < d->nvqs; ++i)
  293. __vhost_vq_meta_reset(d->vqs[i]);
  294. }
  295. static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
  296. {
  297. call_ctx->ctx = NULL;
  298. memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
  299. }
  300. bool vhost_vq_is_setup(struct vhost_virtqueue *vq)
  301. {
  302. return vq->avail && vq->desc && vq->used && vhost_vq_access_ok(vq);
  303. }
  304. EXPORT_SYMBOL_GPL(vhost_vq_is_setup);
  305. static void vhost_vq_reset(struct vhost_dev *dev,
  306. struct vhost_virtqueue *vq)
  307. {
  308. vq->num = 1;
  309. vq->desc = NULL;
  310. vq->avail = NULL;
  311. vq->used = NULL;
  312. vq->last_avail_idx = 0;
  313. vq->avail_idx = 0;
  314. vq->last_used_idx = 0;
  315. vq->signalled_used = 0;
  316. vq->signalled_used_valid = false;
  317. vq->used_flags = 0;
  318. vq->log_used = false;
  319. vq->log_addr = -1ull;
  320. vq->private_data = NULL;
  321. vq->acked_features = 0;
  322. vq->acked_backend_features = 0;
  323. vq->log_base = NULL;
  324. vq->error_ctx = NULL;
  325. vq->kick = NULL;
  326. vq->log_ctx = NULL;
  327. vhost_disable_cross_endian(vq);
  328. vhost_reset_is_le(vq);
  329. vq->busyloop_timeout = 0;
  330. vq->umem = NULL;
  331. vq->iotlb = NULL;
  332. rcu_assign_pointer(vq->worker, NULL);
  333. vhost_vring_call_reset(&vq->call_ctx);
  334. __vhost_vq_meta_reset(vq);
  335. }
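/* Main loop for the legacy kthread worker: sleep until work is queued or
 * kthread_stop() is called, then drain and run the work list in order. */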
  336. static int vhost_run_work_kthread_list(void *data)
  337. {
  338. struct vhost_worker *worker = data;
  339. struct vhost_work *work, *work_next;
  340. struct vhost_dev *dev = worker->dev;
  341. struct llist_node *node;
  342. kthread_use_mm(dev->mm);
  343. for (;;) {
  344. /* mb paired w/ kthread_stop */
  345. set_current_state(TASK_INTERRUPTIBLE);
  346. if (kthread_should_stop()) {
  347. __set_current_state(TASK_RUNNING);
  348. break;
  349. }
  350. node = llist_del_all(&worker->work_list);
  351. if (!node)
  352. schedule();
  353. node = llist_reverse_order(node);
  354. /* make sure flag is seen after deletion */
  355. smp_wmb();
  356. llist_for_each_entry_safe(work, work_next, node, node) {
  357. clear_bit(VHOST_WORK_QUEUED, &work->flags);
  358. __set_current_state(TASK_RUNNING);
  359. kcov_remote_start_common(worker->kcov_handle);
  360. work->fn(work);
  361. kcov_remote_stop();
  362. cond_resched();
  363. }
  364. }
  365. kthread_unuse_mm(dev->mm);
  366. return 0;
  367. }
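/* One pass of the vhost_task worker: drain the work list and run each item.
 * Returns true if any work was run. */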
  368. static bool vhost_run_work_list(void *data)
  369. {
  370. struct vhost_worker *worker = data;
  371. struct vhost_work *work, *work_next;
  372. struct llist_node *node;
  373. node = llist_del_all(&worker->work_list);
  374. if (node) {
  375. __set_current_state(TASK_RUNNING);
  376. node = llist_reverse_order(node);
  377. /* make sure flag is seen after deletion */
  378. smp_wmb();
  379. llist_for_each_entry_safe(work, work_next, node, node) {
  380. clear_bit(VHOST_WORK_QUEUED, &work->flags);
  381. kcov_remote_start_common(worker->kcov_handle);
  382. work->fn(work);
  383. kcov_remote_stop();
  384. cond_resched();
  385. }
  386. }
  387. return !!node;
  388. }
  389. static void vhost_worker_killed(void *data)
  390. {
  391. struct vhost_worker *worker = data;
  392. struct vhost_dev *dev = worker->dev;
  393. struct vhost_virtqueue *vq;
  394. int i, attach_cnt = 0;
  395. mutex_lock(&worker->mutex);
  396. worker->killed = true;
  397. for (i = 0; i < dev->nvqs; i++) {
  398. vq = dev->vqs[i];
  399. mutex_lock(&vq->mutex);
  400. if (worker ==
  401. rcu_dereference_check(vq->worker,
  402. lockdep_is_held(&vq->mutex))) {
  403. rcu_assign_pointer(vq->worker, NULL);
  404. attach_cnt++;
  405. }
  406. mutex_unlock(&vq->mutex);
  407. }
  408. worker->attachment_cnt -= attach_cnt;
  409. if (attach_cnt)
  410. synchronize_rcu();
  411. /*
412. * Finish vhost_worker_flush calls and any other work that snuck in
  413. * before the synchronize_rcu.
  414. */
  415. vhost_run_work_list(worker);
  416. mutex_unlock(&worker->mutex);
  417. }
  418. static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
  419. {
  420. kfree(vq->indirect);
  421. vq->indirect = NULL;
  422. kfree(vq->log);
  423. vq->log = NULL;
  424. kfree(vq->heads);
  425. vq->heads = NULL;
  426. }
  427. /* Helper to allocate iovec buffers for all vqs. */
  428. static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
  429. {
  430. struct vhost_virtqueue *vq;
  431. int i;
  432. for (i = 0; i < dev->nvqs; ++i) {
  433. vq = dev->vqs[i];
  434. vq->indirect = kmalloc_array(UIO_MAXIOV,
  435. sizeof(*vq->indirect),
  436. GFP_KERNEL);
  437. vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
  438. GFP_KERNEL);
  439. vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
  440. GFP_KERNEL);
  441. if (!vq->indirect || !vq->log || !vq->heads)
  442. goto err_nomem;
  443. }
  444. return 0;
  445. err_nomem:
  446. for (; i >= 0; --i)
  447. vhost_vq_free_iovecs(dev->vqs[i]);
  448. return -ENOMEM;
  449. }
  450. static void vhost_dev_free_iovecs(struct vhost_dev *dev)
  451. {
  452. int i;
  453. for (i = 0; i < dev->nvqs; ++i)
  454. vhost_vq_free_iovecs(dev->vqs[i]);
  455. }
  456. bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
  457. int pkts, int total_len)
  458. {
  459. struct vhost_dev *dev = vq->dev;
  460. if ((dev->byte_weight && total_len >= dev->byte_weight) ||
  461. pkts >= dev->weight) {
  462. vhost_poll_queue(&vq->poll);
  463. return true;
  464. }
  465. return false;
  466. }
  467. EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
  468. static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
  469. unsigned int num)
  470. {
  471. size_t event __maybe_unused =
  472. vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
  473. return size_add(struct_size(vq->avail, ring, num), event);
  474. }
  475. static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
  476. unsigned int num)
  477. {
  478. size_t event __maybe_unused =
  479. vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
  480. return size_add(struct_size(vq->used, ring, num), event);
  481. }
  482. static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
  483. unsigned int num)
  484. {
  485. return sizeof(*vq->desc) * num;
  486. }
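/* Initialise a vhost device. @weight and @byte_weight bound how many
 * buffers/bytes a handler may process before vhost_exceeds_weight()
 * requeues it; @iov_limit sizes the per-vq log and heads arrays. */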
  487. void vhost_dev_init(struct vhost_dev *dev,
  488. struct vhost_virtqueue **vqs, int nvqs,
  489. int iov_limit, int weight, int byte_weight,
  490. bool use_worker,
  491. int (*msg_handler)(struct vhost_dev *dev, u32 asid,
  492. struct vhost_iotlb_msg *msg))
  493. {
  494. struct vhost_virtqueue *vq;
  495. int i;
  496. dev->vqs = vqs;
  497. dev->nvqs = nvqs;
  498. mutex_init(&dev->mutex);
  499. dev->log_ctx = NULL;
  500. dev->umem = NULL;
  501. dev->iotlb = NULL;
  502. dev->mm = NULL;
  503. dev->iov_limit = iov_limit;
  504. dev->weight = weight;
  505. dev->byte_weight = byte_weight;
  506. dev->use_worker = use_worker;
  507. dev->msg_handler = msg_handler;
  508. dev->fork_owner = fork_from_owner_default;
  509. init_waitqueue_head(&dev->wait);
  510. INIT_LIST_HEAD(&dev->read_list);
  511. INIT_LIST_HEAD(&dev->pending_list);
  512. spin_lock_init(&dev->iotlb_lock);
  513. xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
  514. for (i = 0; i < dev->nvqs; ++i) {
  515. vq = dev->vqs[i];
  516. vq->log = NULL;
  517. vq->indirect = NULL;
  518. vq->heads = NULL;
  519. vq->dev = dev;
  520. mutex_init(&vq->mutex);
  521. vhost_vq_reset(dev, vq);
  522. if (vq->handle_kick)
  523. vhost_poll_init(&vq->poll, vq->handle_kick,
  524. EPOLLIN, dev, vq);
  525. }
  526. }
  527. EXPORT_SYMBOL_GPL(vhost_dev_init);
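/* Typical backend lifecycle (a sketch; vhost-net/scsi/vsock differ in
 * detail): vhost_dev_init() at open time, vhost_dev_set_owner() on the
 * first ioctl that needs an owner, then vhost_dev_stop() and
 * vhost_dev_cleanup() from the release path. */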
  528. /* Caller should have device mutex */
  529. long vhost_dev_check_owner(struct vhost_dev *dev)
  530. {
  531. /* Are you the owner? If not, I don't think you mean to do that */
  532. return dev->mm == current->mm ? 0 : -EPERM;
  533. }
  534. EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
  535. struct vhost_attach_cgroups_struct {
  536. struct vhost_work work;
  537. struct task_struct *owner;
  538. int ret;
  539. };
  540. static void vhost_attach_cgroups_work(struct vhost_work *work)
  541. {
  542. struct vhost_attach_cgroups_struct *s;
  543. s = container_of(work, struct vhost_attach_cgroups_struct, work);
  544. s->ret = cgroup_attach_task_all(s->owner, current);
  545. }
  546. static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
  547. {
  548. struct vhost_attach_cgroups_struct attach;
  549. int saved_cnt;
  550. attach.owner = current;
  551. vhost_work_init(&attach.work, vhost_attach_cgroups_work);
  552. vhost_worker_queue(worker, &attach.work);
  553. mutex_lock(&worker->mutex);
  554. /*
555. * Bypass the attachment_cnt check in __vhost_worker_flush by
556. * temporarily raising it to INT_MAX.
  557. */
  558. saved_cnt = worker->attachment_cnt;
  559. worker->attachment_cnt = INT_MAX;
  560. __vhost_worker_flush(worker);
  561. worker->attachment_cnt = saved_cnt;
  562. mutex_unlock(&worker->mutex);
  563. return attach.ret;
  564. }
  565. /* Caller should have device mutex */
  566. bool vhost_dev_has_owner(struct vhost_dev *dev)
  567. {
  568. return dev->mm;
  569. }
  570. EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
  571. static void vhost_attach_mm(struct vhost_dev *dev)
  572. {
  573. /* No owner, become one */
  574. if (dev->use_worker) {
  575. dev->mm = get_task_mm(current);
  576. } else {
577. /* A vDPA device does not use a worker thread, so there is
578. * no need to hold the address space for the mm. This helps
579. * to avoid a deadlock in the case of mmap(), which may
580. * hold a refcount on the file and depend on the release
581. * method to remove the vma.
  582. */
  583. dev->mm = current->mm;
  584. mmgrab(dev->mm);
  585. }
  586. }
  587. static void vhost_detach_mm(struct vhost_dev *dev)
  588. {
  589. if (!dev->mm)
  590. return;
  591. if (dev->use_worker)
  592. mmput(dev->mm);
  593. else
  594. mmdrop(dev->mm);
  595. dev->mm = NULL;
  596. }
  597. static void vhost_worker_destroy(struct vhost_dev *dev,
  598. struct vhost_worker *worker)
  599. {
  600. if (!worker)
  601. return;
  602. WARN_ON(!llist_empty(&worker->work_list));
  603. xa_erase(&dev->worker_xa, worker->id);
  604. worker->ops->stop(worker);
  605. kfree(worker);
  606. }
  607. static void vhost_workers_free(struct vhost_dev *dev)
  608. {
  609. struct vhost_worker *worker;
  610. unsigned long i;
  611. if (!dev->use_worker)
  612. return;
  613. for (i = 0; i < dev->nvqs; i++)
  614. rcu_assign_pointer(dev->vqs[i]->worker, NULL);
  615. /*
616. * Free the default worker we created and clean up workers that userspace
617. * created but could not clean up itself (it forgot or crashed).
  618. */
  619. xa_for_each(&dev->worker_xa, i, worker)
  620. vhost_worker_destroy(dev, worker);
  621. xa_destroy(&dev->worker_xa);
  622. }
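/* Worker backend ops: the vhost_task backend is used when dev->fork_owner
 * is set (the default), the kthread backend otherwise (see
 * vhost_worker_create()). */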
  623. static void vhost_task_wakeup(struct vhost_worker *worker)
  624. {
  625. return vhost_task_wake(worker->vtsk);
  626. }
  627. static void vhost_kthread_wakeup(struct vhost_worker *worker)
  628. {
  629. wake_up_process(worker->kthread_task);
  630. }
  631. static void vhost_task_do_stop(struct vhost_worker *worker)
  632. {
  633. return vhost_task_stop(worker->vtsk);
  634. }
  635. static void vhost_kthread_do_stop(struct vhost_worker *worker)
  636. {
  637. kthread_stop(worker->kthread_task);
  638. }
  639. static int vhost_task_worker_create(struct vhost_worker *worker,
  640. struct vhost_dev *dev, const char *name)
  641. {
  642. struct vhost_task *vtsk;
  643. u32 id;
  644. int ret;
  645. vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
  646. worker, name);
  647. if (IS_ERR(vtsk))
  648. return PTR_ERR(vtsk);
  649. worker->vtsk = vtsk;
  650. vhost_task_start(vtsk);
  651. ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
  652. if (ret < 0) {
  653. vhost_task_do_stop(worker);
  654. return ret;
  655. }
  656. worker->id = id;
  657. return 0;
  658. }
  659. static int vhost_kthread_worker_create(struct vhost_worker *worker,
  660. struct vhost_dev *dev, const char *name)
  661. {
  662. struct task_struct *task;
  663. u32 id;
  664. int ret;
  665. task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
  666. if (IS_ERR(task))
  667. return PTR_ERR(task);
  668. worker->kthread_task = task;
  669. wake_up_process(task);
  670. ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
  671. if (ret < 0)
  672. goto stop_worker;
  673. ret = vhost_attach_task_to_cgroups(worker);
  674. if (ret)
  675. goto stop_worker;
  676. worker->id = id;
  677. return 0;
  678. stop_worker:
  679. vhost_kthread_do_stop(worker);
  680. return ret;
  681. }
  682. static const struct vhost_worker_ops kthread_ops = {
  683. .create = vhost_kthread_worker_create,
  684. .stop = vhost_kthread_do_stop,
  685. .wakeup = vhost_kthread_wakeup,
  686. };
  687. static const struct vhost_worker_ops vhost_task_ops = {
  688. .create = vhost_task_worker_create,
  689. .stop = vhost_task_do_stop,
  690. .wakeup = vhost_task_wakeup,
  691. };
  692. static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
  693. {
  694. struct vhost_worker *worker;
  695. char name[TASK_COMM_LEN];
  696. int ret;
  697. const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
  698. &kthread_ops;
  699. worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
  700. if (!worker)
  701. return NULL;
  702. worker->dev = dev;
  703. worker->ops = ops;
  704. snprintf(name, sizeof(name), "vhost-%d", current->pid);
  705. mutex_init(&worker->mutex);
  706. init_llist_head(&worker->work_list);
  707. worker->kcov_handle = kcov_common_handle();
  708. ret = ops->create(worker, dev, name);
  709. if (ret < 0)
  710. goto free_worker;
  711. return worker;
  712. free_worker:
  713. kfree(worker);
  714. return NULL;
  715. }
  716. /* Caller must have device mutex */
  717. static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
  718. struct vhost_worker *worker)
  719. {
  720. struct vhost_worker *old_worker;
  721. mutex_lock(&worker->mutex);
  722. if (worker->killed) {
  723. mutex_unlock(&worker->mutex);
  724. return;
  725. }
  726. mutex_lock(&vq->mutex);
  727. old_worker = rcu_dereference_check(vq->worker,
  728. lockdep_is_held(&vq->mutex));
  729. rcu_assign_pointer(vq->worker, worker);
  730. worker->attachment_cnt++;
  731. if (!old_worker) {
  732. mutex_unlock(&vq->mutex);
  733. mutex_unlock(&worker->mutex);
  734. return;
  735. }
  736. mutex_unlock(&vq->mutex);
  737. mutex_unlock(&worker->mutex);
  738. /*
  739. * Take the worker mutex to make sure we see the work queued from
740. * device-wide flushes, which don't use RCU for execution.
  741. */
  742. mutex_lock(&old_worker->mutex);
  743. if (old_worker->killed) {
  744. mutex_unlock(&old_worker->mutex);
  745. return;
  746. }
  747. /*
  748. * We don't want to call synchronize_rcu for every vq during setup
749. * because it would slow down VM startup. If we haven't done
750. * VHOST_SET_VRING_KICK and haven't done the driver-specific
751. * SET_ENDPOINT/RUNNING then we can skip the sync, since there will
752. * not be any work queued for scsi and net.
  753. */
  754. mutex_lock(&vq->mutex);
  755. if (!vhost_vq_get_backend(vq) && !vq->kick) {
  756. mutex_unlock(&vq->mutex);
  757. old_worker->attachment_cnt--;
  758. mutex_unlock(&old_worker->mutex);
  759. /*
760. * vsock can queue work at any time after VHOST_VSOCK_SET_GUEST_CID.
  761. * Warn if it adds support for multiple workers but forgets to
  762. * handle the early queueing case.
  763. */
  764. WARN_ON(!old_worker->attachment_cnt &&
  765. !llist_empty(&old_worker->work_list));
  766. return;
  767. }
  768. mutex_unlock(&vq->mutex);
  769. /* Make sure new vq queue/flush/poll calls see the new worker */
  770. synchronize_rcu();
  771. /* Make sure whatever was queued gets run */
  772. __vhost_worker_flush(old_worker);
  773. old_worker->attachment_cnt--;
  774. mutex_unlock(&old_worker->mutex);
  775. }
  776. /* Caller must have device mutex */
  777. static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
  778. struct vhost_vring_worker *info)
  779. {
  780. unsigned long index = info->worker_id;
  781. struct vhost_dev *dev = vq->dev;
  782. struct vhost_worker *worker;
  783. if (!dev->use_worker)
  784. return -EINVAL;
  785. worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
  786. if (!worker || worker->id != info->worker_id)
  787. return -ENODEV;
  788. __vhost_vq_attach_worker(vq, worker);
  789. return 0;
  790. }
  791. /* Caller must have device mutex */
  792. static int vhost_new_worker(struct vhost_dev *dev,
  793. struct vhost_worker_state *info)
  794. {
  795. struct vhost_worker *worker;
  796. worker = vhost_worker_create(dev);
  797. if (!worker)
  798. return -ENOMEM;
  799. info->worker_id = worker->id;
  800. return 0;
  801. }
  802. /* Caller must have device mutex */
  803. static int vhost_free_worker(struct vhost_dev *dev,
  804. struct vhost_worker_state *info)
  805. {
  806. unsigned long index = info->worker_id;
  807. struct vhost_worker *worker;
  808. worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
  809. if (!worker || worker->id != info->worker_id)
  810. return -ENODEV;
  811. mutex_lock(&worker->mutex);
  812. if (worker->attachment_cnt || worker->killed) {
  813. mutex_unlock(&worker->mutex);
  814. return -EBUSY;
  815. }
  816. /*
  817. * A flush might have raced and snuck in before attachment_cnt was set
818. * to zero. Make sure any queued flush work is drained before
819. * freeing.
  820. */
  821. __vhost_worker_flush(worker);
  822. mutex_unlock(&worker->mutex);
  823. vhost_worker_destroy(dev, worker);
  824. return 0;
  825. }
  826. static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
  827. struct vhost_virtqueue **vq, u32 *id)
  828. {
  829. u32 __user *idxp = argp;
  830. u32 idx;
  831. long r;
  832. r = get_user(idx, idxp);
  833. if (r < 0)
  834. return r;
  835. if (idx >= dev->nvqs)
  836. return -ENOBUFS;
  837. idx = array_index_nospec(idx, dev->nvqs);
  838. *vq = dev->vqs[idx];
  839. *id = idx;
  840. return 0;
  841. }
  842. /* Caller must have device mutex */
  843. long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
  844. void __user *argp)
  845. {
  846. struct vhost_vring_worker ring_worker;
  847. struct vhost_worker_state state;
  848. struct vhost_worker *worker;
  849. struct vhost_virtqueue *vq;
  850. long ret;
  851. u32 idx;
  852. if (!dev->use_worker)
  853. return -EINVAL;
  854. if (!vhost_dev_has_owner(dev))
  855. return -EINVAL;
  856. ret = vhost_dev_check_owner(dev);
  857. if (ret)
  858. return ret;
  859. switch (ioctl) {
  860. /* dev worker ioctls */
  861. case VHOST_NEW_WORKER:
  862. /*
  863. * vhost_tasks will account for worker threads under the parent's
  864. * NPROC value but kthreads do not. To avoid userspace overflowing
865. * the system with worker threads, fork_owner must be true.
  866. */
  867. if (!dev->fork_owner)
  868. return -EFAULT;
  869. ret = vhost_new_worker(dev, &state);
  870. if (!ret && copy_to_user(argp, &state, sizeof(state)))
  871. ret = -EFAULT;
  872. return ret;
  873. case VHOST_FREE_WORKER:
  874. if (copy_from_user(&state, argp, sizeof(state)))
  875. return -EFAULT;
  876. return vhost_free_worker(dev, &state);
  877. /* vring worker ioctls */
  878. case VHOST_ATTACH_VRING_WORKER:
  879. case VHOST_GET_VRING_WORKER:
  880. break;
  881. default:
  882. return -ENOIOCTLCMD;
  883. }
  884. ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
  885. if (ret)
  886. return ret;
  887. switch (ioctl) {
  888. case VHOST_ATTACH_VRING_WORKER:
  889. if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
  890. ret = -EFAULT;
  891. break;
  892. }
  893. ret = vhost_vq_attach_worker(vq, &ring_worker);
  894. break;
  895. case VHOST_GET_VRING_WORKER:
  896. worker = rcu_dereference_check(vq->worker,
  897. lockdep_is_held(&dev->mutex));
  898. if (!worker) {
  899. ret = -EINVAL;
  900. break;
  901. }
  902. ring_worker.index = idx;
  903. ring_worker.worker_id = worker->id;
  904. if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
  905. ret = -EFAULT;
  906. break;
  907. default:
  908. ret = -ENOIOCTLCMD;
  909. break;
  910. }
  911. return ret;
  912. }
  913. EXPORT_SYMBOL_GPL(vhost_worker_ioctl);
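/* Expected userspace flow for the worker ioctls (a sketch): VHOST_NEW_WORKER
 * returns a worker_id in vhost_worker_state, VHOST_ATTACH_VRING_WORKER binds
 * that worker to a vring via vhost_vring_worker, and VHOST_FREE_WORKER frees
 * it once no vring is attached. */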
  914. /* Caller should have device mutex */
  915. long vhost_dev_set_owner(struct vhost_dev *dev)
  916. {
  917. struct vhost_worker *worker;
  918. int err, i;
  919. /* Is there an owner already? */
  920. if (vhost_dev_has_owner(dev)) {
  921. err = -EBUSY;
  922. goto err_mm;
  923. }
  924. vhost_attach_mm(dev);
  925. err = vhost_dev_alloc_iovecs(dev);
  926. if (err)
  927. goto err_iovecs;
  928. if (dev->use_worker) {
  929. /*
  930. * This should be done last, because vsock can queue work
  931. * before VHOST_SET_OWNER so it simplifies the failure path
  932. * below since we don't have to worry about vsock queueing
  933. * while we free the worker.
  934. */
  935. worker = vhost_worker_create(dev);
  936. if (!worker) {
  937. err = -ENOMEM;
  938. goto err_worker;
  939. }
  940. for (i = 0; i < dev->nvqs; i++)
  941. __vhost_vq_attach_worker(dev->vqs[i], worker);
  942. }
  943. return 0;
  944. err_worker:
  945. vhost_dev_free_iovecs(dev);
  946. err_iovecs:
  947. vhost_detach_mm(dev);
  948. err_mm:
  949. return err;
  950. }
  951. EXPORT_SYMBOL_GPL(vhost_dev_set_owner);
  952. static struct vhost_iotlb *iotlb_alloc(void)
  953. {
  954. return vhost_iotlb_alloc(max_iotlb_entries,
  955. VHOST_IOTLB_FLAG_RETIRE);
  956. }
  957. struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
  958. {
  959. return iotlb_alloc();
  960. }
  961. EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);
  962. /* Caller should have device mutex */
  963. void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
  964. {
  965. int i;
  966. vhost_dev_cleanup(dev);
  967. dev->fork_owner = fork_from_owner_default;
  968. dev->umem = umem;
  969. /* We don't need VQ locks below since vhost_dev_cleanup makes sure
  970. * VQs aren't running.
  971. */
  972. for (i = 0; i < dev->nvqs; ++i)
  973. dev->vqs[i]->umem = umem;
  974. }
  975. EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);
  976. void vhost_dev_stop(struct vhost_dev *dev)
  977. {
  978. int i;
  979. for (i = 0; i < dev->nvqs; ++i) {
  980. if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
  981. vhost_poll_stop(&dev->vqs[i]->poll);
  982. }
  983. vhost_dev_flush(dev);
  984. }
  985. EXPORT_SYMBOL_GPL(vhost_dev_stop);
  986. void vhost_clear_msg(struct vhost_dev *dev)
  987. {
  988. struct vhost_msg_node *node, *n;
  989. spin_lock(&dev->iotlb_lock);
  990. list_for_each_entry_safe(node, n, &dev->read_list, node) {
  991. list_del(&node->node);
  992. kfree(node);
  993. }
  994. list_for_each_entry_safe(node, n, &dev->pending_list, node) {
  995. list_del(&node->node);
  996. kfree(node);
  997. }
  998. spin_unlock(&dev->iotlb_lock);
  999. }
  1000. EXPORT_SYMBOL_GPL(vhost_clear_msg);
  1001. void vhost_dev_cleanup(struct vhost_dev *dev)
  1002. {
  1003. int i;
  1004. for (i = 0; i < dev->nvqs; ++i) {
  1005. if (dev->vqs[i]->error_ctx)
  1006. eventfd_ctx_put(dev->vqs[i]->error_ctx);
  1007. if (dev->vqs[i]->kick)
  1008. fput(dev->vqs[i]->kick);
  1009. if (dev->vqs[i]->call_ctx.ctx)
  1010. eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
  1011. vhost_vq_reset(dev, dev->vqs[i]);
  1012. }
  1013. vhost_dev_free_iovecs(dev);
  1014. if (dev->log_ctx)
  1015. eventfd_ctx_put(dev->log_ctx);
  1016. dev->log_ctx = NULL;
  1017. /* No one will access memory at this point */
  1018. vhost_iotlb_free(dev->umem);
  1019. dev->umem = NULL;
  1020. vhost_iotlb_free(dev->iotlb);
  1021. dev->iotlb = NULL;
  1022. vhost_clear_msg(dev);
  1023. wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
  1024. vhost_workers_free(dev);
  1025. vhost_detach_mm(dev);
  1026. }
  1027. EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
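/* The dirty log is a userspace bitmap with one bit per VHOST_PAGE_SIZE page:
 * addr / VHOST_PAGE_SIZE / 8 is the byte holding the first bit for @addr,
 * and the length passed to access_ok() rounds @sz up to whole bitmap bytes. */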
  1028. static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
  1029. {
  1030. u64 a = addr / VHOST_PAGE_SIZE / 8;
  1031. /* Make sure 64 bit math will not overflow. */
  1032. if (a > ULONG_MAX - (unsigned long)log_base ||
  1033. a + (unsigned long)log_base > ULONG_MAX)
  1034. return false;
  1035. return access_ok(log_base + a,
  1036. (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
  1037. }
  1038. /* Make sure 64 bit math will not overflow. */
  1039. static bool vhost_overflow(u64 uaddr, u64 size)
  1040. {
  1041. if (uaddr > ULONG_MAX || size > ULONG_MAX)
  1042. return true;
  1043. if (!size)
  1044. return false;
  1045. return uaddr > ULONG_MAX - size + 1;
  1046. }
  1047. /* Caller should have vq mutex and device mutex. */
  1048. static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
  1049. int log_all)
  1050. {
  1051. struct vhost_iotlb_map *map;
  1052. if (!umem)
  1053. return false;
  1054. list_for_each_entry(map, &umem->list, link) {
  1055. unsigned long a = map->addr;
  1056. if (vhost_overflow(map->addr, map->size))
  1057. return false;
  1058. if (!access_ok((void __user *)a, map->size))
  1059. return false;
  1060. else if (log_all && !log_access_ok(log_base,
  1061. map->start,
  1062. map->size))
  1063. return false;
  1064. }
  1065. return true;
  1066. }
  1067. static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
  1068. u64 addr, unsigned int size,
  1069. int type)
  1070. {
  1071. const struct vhost_iotlb_map *map = vq->meta_iotlb[type];
  1072. if (!map)
  1073. return NULL;
  1074. return (void __user *)(uintptr_t)(map->addr + addr - map->start);
  1075. }
  1076. /* Can we switch to this memory table? */
  1077. /* Caller should have device mutex but not vq mutex */
  1078. static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
  1079. int log_all)
  1080. {
  1081. int i;
  1082. for (i = 0; i < d->nvqs; ++i) {
  1083. bool ok;
  1084. bool log;
  1085. mutex_lock(&d->vqs[i]->mutex);
  1086. log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
1087. /* If the ring is inactive, we will check when it is enabled. */
  1088. if (d->vqs[i]->private_data)
  1089. ok = vq_memory_access_ok(d->vqs[i]->log_base,
  1090. umem, log);
  1091. else
  1092. ok = true;
  1093. mutex_unlock(&d->vqs[i]->mutex);
  1094. if (!ok)
  1095. return false;
  1096. }
  1097. return true;
  1098. }
  1099. static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
  1100. struct iovec iov[], int iov_size, int access);
  1101. static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
  1102. const void *from, unsigned size)
  1103. {
  1104. int ret;
  1105. if (!vq->iotlb)
  1106. return __copy_to_user(to, from, size);
  1107. else {
1108. /* This function should be called after iotlb
1109. * prefetch, which means we're sure that all of the vq
1110. * memory can be accessed through the iotlb. So -EAGAIN should
1111. * not happen in this case.
  1112. */
  1113. struct iov_iter t;
  1114. void __user *uaddr = vhost_vq_meta_fetch(vq,
  1115. (u64)(uintptr_t)to, size,
  1116. VHOST_ADDR_USED);
  1117. if (uaddr)
  1118. return __copy_to_user(uaddr, from, size);
  1119. ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
  1120. ARRAY_SIZE(vq->iotlb_iov),
  1121. VHOST_ACCESS_WO);
  1122. if (ret < 0)
  1123. goto out;
  1124. iov_iter_init(&t, ITER_DEST, vq->iotlb_iov, ret, size);
  1125. ret = copy_to_iter(from, size, &t);
  1126. if (ret == size)
  1127. ret = 0;
  1128. }
  1129. out:
  1130. return ret;
  1131. }
  1132. static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
  1133. void __user *from, unsigned size)
  1134. {
  1135. int ret;
  1136. if (!vq->iotlb)
  1137. return __copy_from_user(to, from, size);
  1138. else {
1139. /* This function should be called after iotlb
1140. * prefetch, which means we're sure that the vq
1141. * memory can be accessed through the iotlb. So -EAGAIN should
1142. * not happen in this case.
  1143. */
  1144. void __user *uaddr = vhost_vq_meta_fetch(vq,
  1145. (u64)(uintptr_t)from, size,
  1146. VHOST_ADDR_DESC);
  1147. struct iov_iter f;
  1148. if (uaddr)
  1149. return __copy_from_user(to, uaddr, size);
  1150. ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
  1151. ARRAY_SIZE(vq->iotlb_iov),
  1152. VHOST_ACCESS_RO);
  1153. if (ret < 0) {
  1154. vq_err(vq, "IOTLB translation failure: uaddr "
  1155. "%p size 0x%llx\n", from,
  1156. (unsigned long long) size);
  1157. goto out;
  1158. }
  1159. iov_iter_init(&f, ITER_SOURCE, vq->iotlb_iov, ret, size);
  1160. ret = copy_from_iter(to, size, &f);
  1161. if (ret == size)
  1162. ret = 0;
  1163. }
  1164. out:
  1165. return ret;
  1166. }
  1167. static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
  1168. void __user *addr, unsigned int size,
  1169. int type)
  1170. {
  1171. int ret;
  1172. ret = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
  1173. ARRAY_SIZE(vq->iotlb_iov),
  1174. VHOST_ACCESS_RO);
  1175. if (ret < 0) {
  1176. vq_err(vq, "IOTLB translation failure: uaddr "
  1177. "%p size 0x%llx\n", addr,
  1178. (unsigned long long) size);
  1179. return NULL;
  1180. }
  1181. if (ret != 1 || vq->iotlb_iov[0].iov_len != size) {
  1182. vq_err(vq, "Non atomic userspace memory access: uaddr "
  1183. "%p size 0x%llx\n", addr,
  1184. (unsigned long long) size);
  1185. return NULL;
  1186. }
  1187. return vq->iotlb_iov[0].iov_base;
  1188. }
1189. /* This function should be called after iotlb
1190. * prefetch, which means we're sure that the vq
1191. * memory can be accessed through the iotlb. So -EAGAIN should
1192. * not happen in this case.
  1193. */
  1194. static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
  1195. void __user *addr, unsigned int size,
  1196. int type)
  1197. {
  1198. void __user *uaddr = vhost_vq_meta_fetch(vq,
  1199. (u64)(uintptr_t)addr, size, type);
  1200. if (uaddr)
  1201. return uaddr;
  1202. return __vhost_get_user_slow(vq, addr, size, type);
  1203. }
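/* vhost_put_user()/vhost_get_user() below access the ring directly with
 * __put_user()/__get_user() when no IOTLB is in use, and otherwise go
 * through the cached metadata translation set up by prefetch. */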
  1204. #define vhost_put_user(vq, x, ptr) \
  1205. ({ \
  1206. int ret; \
  1207. if (!vq->iotlb) { \
  1208. ret = __put_user(x, ptr); \
  1209. } else { \
  1210. __typeof__(ptr) to = \
  1211. (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
  1212. sizeof(*ptr), VHOST_ADDR_USED); \
  1213. if (to != NULL) \
  1214. ret = __put_user(x, to); \
  1215. else \
  1216. ret = -EFAULT; \
  1217. } \
  1218. ret; \
  1219. })
  1220. static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
  1221. {
  1222. return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
  1223. vhost_avail_event(vq));
  1224. }
  1225. static inline int vhost_put_used(struct vhost_virtqueue *vq,
  1226. struct vring_used_elem *head, int idx,
  1227. int count)
  1228. {
  1229. return vhost_copy_to_user(vq, vq->used->ring + idx, head,
  1230. count * sizeof(*head));
  1231. }
  1232. static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
  1233. {
  1234. return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
  1235. &vq->used->flags);
  1236. }
  1237. static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
  1238. {
  1239. return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
  1240. &vq->used->idx);
  1241. }
  1242. #define vhost_get_user(vq, x, ptr, type) \
  1243. ({ \
  1244. int ret; \
  1245. if (!vq->iotlb) { \
  1246. ret = __get_user(x, ptr); \
  1247. } else { \
  1248. __typeof__(ptr) from = \
  1249. (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
  1250. sizeof(*ptr), \
  1251. type); \
  1252. if (from != NULL) \
  1253. ret = __get_user(x, from); \
  1254. else \
  1255. ret = -EFAULT; \
  1256. } \
  1257. ret; \
  1258. })
  1259. #define vhost_get_avail(vq, x, ptr) \
  1260. vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)
  1261. #define vhost_get_used(vq, x, ptr) \
  1262. vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
  1263. static void vhost_dev_lock_vqs(struct vhost_dev *d)
  1264. {
  1265. int i = 0;
  1266. for (i = 0; i < d->nvqs; ++i)
  1267. mutex_lock_nested(&d->vqs[i]->mutex, i);
  1268. }
  1269. static void vhost_dev_unlock_vqs(struct vhost_dev *d)
  1270. {
  1271. int i = 0;
  1272. for (i = 0; i < d->nvqs; ++i)
  1273. mutex_unlock(&d->vqs[i]->mutex);
  1274. }
  1275. static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq)
  1276. {
  1277. __virtio16 idx;
  1278. int r;
  1279. r = vhost_get_avail(vq, idx, &vq->avail->idx);
  1280. if (unlikely(r < 0)) {
  1281. vq_err(vq, "Failed to access available index at %p (%d)\n",
  1282. &vq->avail->idx, r);
  1283. return r;
  1284. }
1285. /* Check it isn't doing very strange things with available indexes */
  1286. vq->avail_idx = vhost16_to_cpu(vq, idx);
  1287. if (unlikely((u16)(vq->avail_idx - vq->last_avail_idx) > vq->num)) {
  1288. vq_err(vq, "Invalid available index change from %u to %u",
  1289. vq->last_avail_idx, vq->avail_idx);
  1290. return -EINVAL;
  1291. }
  1292. /* We're done if there is nothing new */
  1293. if (vq->avail_idx == vq->last_avail_idx)
  1294. return 0;
  1295. /*
  1296. * We updated vq->avail_idx so we need a memory barrier between
  1297. * the index read above and the caller reading avail ring entries.
  1298. */
  1299. smp_rmb();
  1300. return 1;
  1301. }
  1302. static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
  1303. __virtio16 *head, int idx)
  1304. {
  1305. return vhost_get_avail(vq, *head,
  1306. &vq->avail->ring[idx & (vq->num - 1)]);
  1307. }
  1308. static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
  1309. __virtio16 *flags)
  1310. {
  1311. return vhost_get_avail(vq, *flags, &vq->avail->flags);
  1312. }
  1313. static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
  1314. __virtio16 *event)
  1315. {
  1316. return vhost_get_avail(vq, *event, vhost_used_event(vq));
  1317. }
  1318. static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
  1319. __virtio16 *idx)
  1320. {
  1321. return vhost_get_used(vq, *idx, &vq->used->idx);
  1322. }
  1323. static inline int vhost_get_desc(struct vhost_virtqueue *vq,
  1324. struct vring_desc *desc, int idx)
  1325. {
  1326. return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
  1327. }
  1328. static void vhost_iotlb_notify_vq(struct vhost_dev *d,
  1329. struct vhost_iotlb_msg *msg)
  1330. {
  1331. struct vhost_msg_node *node, *n;
  1332. spin_lock(&d->iotlb_lock);
  1333. list_for_each_entry_safe(node, n, &d->pending_list, node) {
  1334. struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
  1335. if (msg->iova <= vq_msg->iova &&
  1336. msg->iova + msg->size - 1 >= vq_msg->iova &&
  1337. vq_msg->type == VHOST_IOTLB_MISS) {
  1338. vhost_poll_queue(&node->vq->poll);
  1339. list_del(&node->node);
  1340. kfree(node);
  1341. }
  1342. }
  1343. spin_unlock(&d->iotlb_lock);
  1344. }
  1345. static bool umem_access_ok(u64 uaddr, u64 size, int access)
  1346. {
  1347. unsigned long a = uaddr;
  1348. /* Make sure 64 bit math will not overflow. */
  1349. if (vhost_overflow(uaddr, size))
  1350. return false;
  1351. if ((access & VHOST_ACCESS_RO) &&
  1352. !access_ok((void __user *)a, size))
  1353. return false;
  1354. if ((access & VHOST_ACCESS_WO) &&
  1355. !access_ok((void __user *)a, size))
  1356. return false;
  1357. return true;
  1358. }
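/* Default handler for IOTLB messages written by userspace. Only address
 * space id 0 is supported here; backends that support multiple ASIDs are
 * expected to install their own ->msg_handler. Updates and invalidations
 * reset the per-vq metadata cache before touching the IOTLB.
 */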
  1359. static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
  1360. struct vhost_iotlb_msg *msg)
  1361. {
  1362. int ret = 0;
  1363. if (asid != 0)
  1364. return -EINVAL;
  1365. mutex_lock(&dev->mutex);
  1366. vhost_dev_lock_vqs(dev);
  1367. switch (msg->type) {
  1368. case VHOST_IOTLB_UPDATE:
  1369. if (!dev->iotlb) {
  1370. ret = -EFAULT;
  1371. break;
  1372. }
  1373. if (!umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
  1374. ret = -EFAULT;
  1375. break;
  1376. }
  1377. vhost_vq_meta_reset(dev);
  1378. if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
  1379. msg->iova + msg->size - 1,
  1380. msg->uaddr, msg->perm)) {
  1381. ret = -ENOMEM;
  1382. break;
  1383. }
  1384. vhost_iotlb_notify_vq(dev, msg);
  1385. break;
  1386. case VHOST_IOTLB_INVALIDATE:
  1387. if (!dev->iotlb) {
  1388. ret = -EFAULT;
  1389. break;
  1390. }
  1391. vhost_vq_meta_reset(dev);
  1392. vhost_iotlb_del_range(dev->iotlb, msg->iova,
  1393. msg->iova + msg->size - 1);
  1394. break;
  1395. default:
  1396. ret = -EINVAL;
  1397. break;
  1398. }
  1399. vhost_dev_unlock_vqs(dev);
  1400. mutex_unlock(&dev->mutex);
  1401. return ret;
  1402. }
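/* Parse an IOTLB message written to the chardev. The stream starts with the
 * message type; V2 messages carry an ASID when VHOST_BACKEND_F_IOTLB_ASID
 * was negotiated, while V1 messages may have padding after the type. On
 * success the number of bytes consumed for the message is returned.
 */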
  1403. ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
  1404. struct iov_iter *from)
  1405. {
  1406. struct vhost_iotlb_msg msg;
  1407. size_t offset;
  1408. int type, ret;
  1409. u32 asid = 0;
  1410. ret = copy_from_iter(&type, sizeof(type), from);
  1411. if (ret != sizeof(type)) {
  1412. ret = -EINVAL;
  1413. goto done;
  1414. }
  1415. switch (type) {
  1416. case VHOST_IOTLB_MSG:
		/* There may be a hole after type for the V1 message type,
		 * so skip it here.
		 */
  1420. offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
  1421. break;
  1422. case VHOST_IOTLB_MSG_V2:
  1423. if (vhost_backend_has_feature(dev->vqs[0],
  1424. VHOST_BACKEND_F_IOTLB_ASID)) {
  1425. ret = copy_from_iter(&asid, sizeof(asid), from);
  1426. if (ret != sizeof(asid)) {
  1427. ret = -EINVAL;
  1428. goto done;
  1429. }
  1430. offset = 0;
  1431. } else
  1432. offset = sizeof(__u32);
  1433. break;
  1434. default:
  1435. ret = -EINVAL;
  1436. goto done;
  1437. }
  1438. iov_iter_advance(from, offset);
  1439. ret = copy_from_iter(&msg, sizeof(msg), from);
  1440. if (ret != sizeof(msg)) {
  1441. ret = -EINVAL;
  1442. goto done;
  1443. }
  1444. if (msg.type == VHOST_IOTLB_UPDATE && msg.size == 0) {
  1445. ret = -EINVAL;
  1446. goto done;
  1447. }
  1448. if (dev->msg_handler)
  1449. ret = dev->msg_handler(dev, asid, &msg);
  1450. else
  1451. ret = vhost_process_iotlb_msg(dev, asid, &msg);
  1452. if (ret) {
  1453. ret = -EFAULT;
  1454. goto done;
  1455. }
  1456. ret = (type == VHOST_IOTLB_MSG) ? sizeof(struct vhost_msg) :
  1457. sizeof(struct vhost_msg_v2);
  1458. done:
  1459. return ret;
  1460. }
  1461. EXPORT_SYMBOL(vhost_chr_write_iter);
  1462. __poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
  1463. poll_table *wait)
  1464. {
  1465. __poll_t mask = 0;
  1466. poll_wait(file, &dev->wait, wait);
  1467. if (!list_empty(&dev->read_list))
  1468. mask |= EPOLLIN | EPOLLRDNORM;
  1469. return mask;
  1470. }
  1471. EXPORT_SYMBOL(vhost_chr_poll);
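/* Read the next message (typically an IOTLB miss) queued for userspace,
 * blocking unless @noblock is set. A MISS message is moved to pending_list
 * after it has been copied out, so the vq can be woken once the reply
 * arrives.
 */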
  1472. ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
  1473. int noblock)
  1474. {
  1475. DEFINE_WAIT(wait);
  1476. struct vhost_msg_node *node;
  1477. ssize_t ret = 0;
  1478. unsigned size = sizeof(struct vhost_msg);
  1479. if (iov_iter_count(to) < size)
  1480. return 0;
  1481. while (1) {
  1482. if (!noblock)
  1483. prepare_to_wait(&dev->wait, &wait,
  1484. TASK_INTERRUPTIBLE);
  1485. node = vhost_dequeue_msg(dev, &dev->read_list);
  1486. if (node)
  1487. break;
  1488. if (noblock) {
  1489. ret = -EAGAIN;
  1490. break;
  1491. }
  1492. if (signal_pending(current)) {
  1493. ret = -ERESTARTSYS;
  1494. break;
  1495. }
  1496. if (!dev->iotlb) {
  1497. ret = -EBADFD;
  1498. break;
  1499. }
  1500. schedule();
  1501. }
  1502. if (!noblock)
  1503. finish_wait(&dev->wait, &wait);
  1504. if (node) {
  1505. struct vhost_iotlb_msg *msg;
  1506. void *start = &node->msg;
  1507. switch (node->msg.type) {
  1508. case VHOST_IOTLB_MSG:
  1509. size = sizeof(node->msg);
  1510. msg = &node->msg.iotlb;
  1511. break;
  1512. case VHOST_IOTLB_MSG_V2:
  1513. size = sizeof(node->msg_v2);
  1514. msg = &node->msg_v2.iotlb;
  1515. break;
  1516. default:
  1517. BUG();
  1518. break;
  1519. }
  1520. ret = copy_to_iter(start, size, to);
  1521. if (ret != size || msg->type != VHOST_IOTLB_MISS) {
  1522. kfree(node);
  1523. return ret;
  1524. }
  1525. vhost_enqueue_msg(dev, &dev->pending_list, node);
  1526. }
  1527. return ret;
  1528. }
  1529. EXPORT_SYMBOL_GPL(vhost_chr_read_iter);
  1530. static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
  1531. {
  1532. struct vhost_dev *dev = vq->dev;
  1533. struct vhost_msg_node *node;
  1534. struct vhost_iotlb_msg *msg;
  1535. bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
  1536. node = vhost_new_msg(vq, v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG);
  1537. if (!node)
  1538. return -ENOMEM;
  1539. if (v2) {
  1540. node->msg_v2.type = VHOST_IOTLB_MSG_V2;
  1541. msg = &node->msg_v2.iotlb;
  1542. } else {
  1543. msg = &node->msg.iotlb;
  1544. }
  1545. msg->type = VHOST_IOTLB_MISS;
  1546. msg->iova = iova;
  1547. msg->perm = access;
  1548. vhost_enqueue_msg(dev, &dev->read_list, node);
  1549. return 0;
  1550. }
  1551. static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
  1552. vring_desc_t __user *desc,
  1553. vring_avail_t __user *avail,
  1554. vring_used_t __user *used)
  1555. {
  1556. /* If an IOTLB device is present, the vring addresses are
  1557. * GIOVAs. Access validation occurs at prefetch time. */
  1558. if (vq->iotlb)
  1559. return true;
  1560. return access_ok(desc, vhost_get_desc_size(vq, num)) &&
  1561. access_ok(avail, vhost_get_avail_size(vq, num)) &&
  1562. access_ok(used, vhost_get_used_size(vq, num));
  1563. }
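/* Cache the translation covering one of the ring's metadata areas
 * (descriptor table, avail or used ring) so the fast path can skip the
 * interval-tree lookup, provided the mapping grants the required access.
 */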
  1564. static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
  1565. const struct vhost_iotlb_map *map,
  1566. int type)
  1567. {
  1568. int access = (type == VHOST_ADDR_USED) ?
  1569. VHOST_ACCESS_WO : VHOST_ACCESS_RO;
  1570. if (likely(map->perm & access))
  1571. vq->meta_iotlb[type] = map;
  1572. }
  1573. static bool iotlb_access_ok(struct vhost_virtqueue *vq,
  1574. int access, u64 addr, u64 len, int type)
  1575. {
  1576. const struct vhost_iotlb_map *map;
  1577. struct vhost_iotlb *umem = vq->iotlb;
  1578. u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
  1579. if (vhost_vq_meta_fetch(vq, addr, len, type))
  1580. return true;
  1581. while (len > s) {
  1582. map = vhost_iotlb_itree_first(umem, addr, last);
  1583. if (map == NULL || map->start > addr) {
  1584. vhost_iotlb_miss(vq, addr, access);
  1585. return false;
  1586. } else if (!(map->perm & access)) {
			/* Report the possible access violation by
			 * requesting another translation from userspace.
			 */
  1590. return false;
  1591. }
  1592. size = map->size - addr + map->start;
  1593. if (orig_addr == addr && size >= len)
  1594. vhost_vq_meta_update(vq, map, type);
  1595. s += size;
  1596. addr += size;
  1597. }
  1598. return true;
  1599. }
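/* Pre-translate the ring metadata so later accesses hit the meta_iotlb
 * cache. Returns nonzero on success and 0 on failure, queuing an IOTLB
 * miss for any range that is not yet mapped.
 */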
  1600. int vq_meta_prefetch(struct vhost_virtqueue *vq)
  1601. {
  1602. unsigned int num = vq->num;
  1603. if (!vq->iotlb)
  1604. return 1;
  1605. return iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
  1606. vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
  1607. iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
  1608. vhost_get_avail_size(vq, num),
  1609. VHOST_ADDR_AVAIL) &&
  1610. iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
  1611. vhost_get_used_size(vq, num), VHOST_ADDR_USED);
  1612. }
  1613. EXPORT_SYMBOL_GPL(vq_meta_prefetch);
  1614. /* Can we log writes? */
  1615. /* Caller should have device mutex but not vq mutex */
  1616. bool vhost_log_access_ok(struct vhost_dev *dev)
  1617. {
  1618. return memory_access_ok(dev, dev->umem, 1);
  1619. }
  1620. EXPORT_SYMBOL_GPL(vhost_log_access_ok);
  1621. static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,
  1622. void __user *log_base,
  1623. bool log_used,
  1624. u64 log_addr)
  1625. {
  1626. /* If an IOTLB device is present, log_addr is a GIOVA that
  1627. * will never be logged by log_used(). */
  1628. if (vq->iotlb)
  1629. return true;
  1630. return !log_used || log_access_ok(log_base, log_addr,
  1631. vhost_get_used_size(vq, vq->num));
  1632. }
  1633. /* Verify access for write logging. */
  1634. /* Caller should have vq mutex and device mutex */
  1635. static bool vq_log_access_ok(struct vhost_virtqueue *vq,
  1636. void __user *log_base)
  1637. {
  1638. return vq_memory_access_ok(log_base, vq->umem,
  1639. vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
  1640. vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
  1641. }
  1642. /* Can we start vq? */
  1643. /* Caller should have vq mutex and device mutex */
  1644. bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
  1645. {
  1646. if (!vq_log_access_ok(vq, vq->log_base))
  1647. return false;
  1648. return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
  1649. }
  1650. EXPORT_SYMBOL_GPL(vhost_vq_access_ok);
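/* VHOST_SET_MEM_TABLE: copy the region table from userspace, build a new
 * vhost_iotlb with one RW mapping per region, validate that the new layout
 * is accessible, and swap it in under each virtqueue's mutex.
 */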
  1651. static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
  1652. {
  1653. struct vhost_memory mem, *newmem;
  1654. struct vhost_memory_region *region;
  1655. struct vhost_iotlb *newumem, *oldumem;
  1656. unsigned long size = offsetof(struct vhost_memory, regions);
  1657. int i;
  1658. if (copy_from_user(&mem, m, size))
  1659. return -EFAULT;
  1660. if (mem.padding)
  1661. return -EOPNOTSUPP;
  1662. if (mem.nregions > max_mem_regions)
  1663. return -E2BIG;
  1664. newmem = kvzalloc(struct_size(newmem, regions, mem.nregions),
  1665. GFP_KERNEL);
  1666. if (!newmem)
  1667. return -ENOMEM;
  1668. memcpy(newmem, &mem, size);
  1669. if (copy_from_user(newmem->regions, m->regions,
  1670. flex_array_size(newmem, regions, mem.nregions))) {
  1671. kvfree(newmem);
  1672. return -EFAULT;
  1673. }
  1674. newumem = iotlb_alloc();
  1675. if (!newumem) {
  1676. kvfree(newmem);
  1677. return -ENOMEM;
  1678. }
  1679. for (region = newmem->regions;
  1680. region < newmem->regions + mem.nregions;
  1681. region++) {
  1682. if (vhost_iotlb_add_range(newumem,
  1683. region->guest_phys_addr,
  1684. region->guest_phys_addr +
  1685. region->memory_size - 1,
  1686. region->userspace_addr,
  1687. VHOST_MAP_RW))
  1688. goto err;
  1689. }
  1690. if (!memory_access_ok(d, newumem, 0))
  1691. goto err;
  1692. oldumem = d->umem;
  1693. d->umem = newumem;
  1694. /* All memory accesses are done under some VQ mutex. */
  1695. for (i = 0; i < d->nvqs; ++i) {
  1696. mutex_lock(&d->vqs[i]->mutex);
  1697. d->vqs[i]->umem = newumem;
  1698. mutex_unlock(&d->vqs[i]->mutex);
  1699. }
  1700. kvfree(newmem);
  1701. vhost_iotlb_free(oldumem);
  1702. return 0;
  1703. err:
  1704. vhost_iotlb_free(newumem);
  1705. kvfree(newmem);
  1706. return -EFAULT;
  1707. }
  1708. static long vhost_vring_set_num(struct vhost_dev *d,
  1709. struct vhost_virtqueue *vq,
  1710. void __user *argp)
  1711. {
  1712. struct vhost_vring_state s;
  1713. /* Resizing ring with an active backend?
  1714. * You don't want to do that. */
  1715. if (vq->private_data)
  1716. return -EBUSY;
  1717. if (copy_from_user(&s, argp, sizeof s))
  1718. return -EFAULT;
  1719. if (!s.num || s.num > 0xffff || (s.num & (s.num - 1)))
  1720. return -EINVAL;
  1721. vq->num = s.num;
  1722. return 0;
  1723. }
  1724. static long vhost_vring_set_addr(struct vhost_dev *d,
  1725. struct vhost_virtqueue *vq,
  1726. void __user *argp)
  1727. {
  1728. struct vhost_vring_addr a;
  1729. if (copy_from_user(&a, argp, sizeof a))
  1730. return -EFAULT;
  1731. if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
  1732. return -EOPNOTSUPP;
  1733. /* For 32bit, verify that the top 32bits of the user
  1734. data are set to zero. */
  1735. if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr ||
  1736. (u64)(unsigned long)a.used_user_addr != a.used_user_addr ||
  1737. (u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
  1738. return -EFAULT;
  1739. /* Make sure it's safe to cast pointers to vring types. */
  1740. BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
  1741. BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
  1742. if ((a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1)) ||
  1743. (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1)) ||
  1744. (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1)))
  1745. return -EINVAL;
	/* We only verify access here if a backend is configured.
	 * If it is not, we don't, as the size might not have been set up yet.
	 * We will verify when the backend is configured. */
  1749. if (vq->private_data) {
  1750. if (!vq_access_ok(vq, vq->num,
  1751. (void __user *)(unsigned long)a.desc_user_addr,
  1752. (void __user *)(unsigned long)a.avail_user_addr,
  1753. (void __user *)(unsigned long)a.used_user_addr))
  1754. return -EINVAL;
  1755. /* Also validate log access for used ring if enabled. */
  1756. if (!vq_log_used_access_ok(vq, vq->log_base,
  1757. a.flags & (0x1 << VHOST_VRING_F_LOG),
  1758. a.log_guest_addr))
  1759. return -EINVAL;
  1760. }
  1761. vq->log_used = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));
  1762. vq->desc = (void __user *)(unsigned long)a.desc_user_addr;
  1763. vq->avail = (void __user *)(unsigned long)a.avail_user_addr;
  1764. vq->log_addr = a.log_guest_addr;
  1765. vq->used = (void __user *)(unsigned long)a.used_user_addr;
  1766. return 0;
  1767. }
  1768. static long vhost_vring_set_num_addr(struct vhost_dev *d,
  1769. struct vhost_virtqueue *vq,
  1770. unsigned int ioctl,
  1771. void __user *argp)
  1772. {
  1773. long r;
  1774. mutex_lock(&vq->mutex);
  1775. switch (ioctl) {
  1776. case VHOST_SET_VRING_NUM:
  1777. r = vhost_vring_set_num(d, vq, argp);
  1778. break;
  1779. case VHOST_SET_VRING_ADDR:
  1780. r = vhost_vring_set_addr(d, vq, argp);
  1781. break;
  1782. default:
  1783. BUG();
  1784. }
  1785. mutex_unlock(&vq->mutex);
  1786. return r;
  1787. }
  1788. long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
  1789. {
  1790. struct file *eventfp, *filep = NULL;
  1791. bool pollstart = false, pollstop = false;
  1792. struct eventfd_ctx *ctx = NULL;
  1793. struct vhost_virtqueue *vq;
  1794. struct vhost_vring_state s;
  1795. struct vhost_vring_file f;
  1796. u32 idx;
  1797. long r;
  1798. r = vhost_get_vq_from_user(d, argp, &vq, &idx);
  1799. if (r < 0)
  1800. return r;
  1801. if (ioctl == VHOST_SET_VRING_NUM ||
  1802. ioctl == VHOST_SET_VRING_ADDR) {
  1803. return vhost_vring_set_num_addr(d, vq, ioctl, argp);
  1804. }
  1805. mutex_lock(&vq->mutex);
  1806. switch (ioctl) {
  1807. case VHOST_SET_VRING_BASE:
  1808. /* Moving base with an active backend?
  1809. * You don't want to do that. */
  1810. if (vq->private_data) {
  1811. r = -EBUSY;
  1812. break;
  1813. }
  1814. if (copy_from_user(&s, argp, sizeof s)) {
  1815. r = -EFAULT;
  1816. break;
  1817. }
  1818. if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
  1819. vq->last_avail_idx = s.num & 0xffff;
  1820. vq->last_used_idx = (s.num >> 16) & 0xffff;
  1821. } else {
  1822. if (s.num > 0xffff) {
  1823. r = -EINVAL;
  1824. break;
  1825. }
  1826. vq->last_avail_idx = s.num;
  1827. }
  1828. /* Forget the cached index value. */
  1829. vq->avail_idx = vq->last_avail_idx;
  1830. break;
  1831. case VHOST_GET_VRING_BASE:
  1832. s.index = idx;
  1833. if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
  1834. s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16);
  1835. else
  1836. s.num = vq->last_avail_idx;
  1837. if (copy_to_user(argp, &s, sizeof s))
  1838. r = -EFAULT;
  1839. break;
  1840. case VHOST_SET_VRING_KICK:
  1841. if (copy_from_user(&f, argp, sizeof f)) {
  1842. r = -EFAULT;
  1843. break;
  1844. }
  1845. eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
  1846. if (IS_ERR(eventfp)) {
  1847. r = PTR_ERR(eventfp);
  1848. break;
  1849. }
  1850. if (eventfp != vq->kick) {
  1851. pollstop = (filep = vq->kick) != NULL;
  1852. pollstart = (vq->kick = eventfp) != NULL;
  1853. } else
  1854. filep = eventfp;
  1855. break;
  1856. case VHOST_SET_VRING_CALL:
  1857. if (copy_from_user(&f, argp, sizeof f)) {
  1858. r = -EFAULT;
  1859. break;
  1860. }
  1861. ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
  1862. if (IS_ERR(ctx)) {
  1863. r = PTR_ERR(ctx);
  1864. break;
  1865. }
  1866. swap(ctx, vq->call_ctx.ctx);
  1867. break;
  1868. case VHOST_SET_VRING_ERR:
  1869. if (copy_from_user(&f, argp, sizeof f)) {
  1870. r = -EFAULT;
  1871. break;
  1872. }
  1873. ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
  1874. if (IS_ERR(ctx)) {
  1875. r = PTR_ERR(ctx);
  1876. break;
  1877. }
  1878. swap(ctx, vq->error_ctx);
  1879. break;
  1880. case VHOST_SET_VRING_ENDIAN:
  1881. r = vhost_set_vring_endian(vq, argp);
  1882. break;
  1883. case VHOST_GET_VRING_ENDIAN:
  1884. r = vhost_get_vring_endian(vq, idx, argp);
  1885. break;
  1886. case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
  1887. if (copy_from_user(&s, argp, sizeof(s))) {
  1888. r = -EFAULT;
  1889. break;
  1890. }
  1891. vq->busyloop_timeout = s.num;
  1892. break;
  1893. case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
  1894. s.index = idx;
  1895. s.num = vq->busyloop_timeout;
  1896. if (copy_to_user(argp, &s, sizeof(s)))
  1897. r = -EFAULT;
  1898. break;
  1899. default:
  1900. r = -ENOIOCTLCMD;
  1901. }
  1902. if (pollstop && vq->handle_kick)
  1903. vhost_poll_stop(&vq->poll);
  1904. if (!IS_ERR_OR_NULL(ctx))
  1905. eventfd_ctx_put(ctx);
  1906. if (filep)
  1907. fput(filep);
  1908. if (pollstart && vq->handle_kick)
  1909. r = vhost_poll_start(&vq->poll, vq->kick);
  1910. mutex_unlock(&vq->mutex);
  1911. if (pollstop && vq->handle_kick)
  1912. vhost_dev_flush(vq->poll.dev);
  1913. return r;
  1914. }
  1915. EXPORT_SYMBOL_GPL(vhost_vring_ioctl);
  1916. int vhost_init_device_iotlb(struct vhost_dev *d)
  1917. {
  1918. struct vhost_iotlb *niotlb, *oiotlb;
  1919. int i;
  1920. niotlb = iotlb_alloc();
  1921. if (!niotlb)
  1922. return -ENOMEM;
  1923. oiotlb = d->iotlb;
  1924. d->iotlb = niotlb;
  1925. for (i = 0; i < d->nvqs; ++i) {
  1926. struct vhost_virtqueue *vq = d->vqs[i];
  1927. mutex_lock(&vq->mutex);
  1928. vq->iotlb = niotlb;
  1929. __vhost_vq_meta_reset(vq);
  1930. mutex_unlock(&vq->mutex);
  1931. }
  1932. vhost_iotlb_free(oiotlb);
  1933. return 0;
  1934. }
  1935. EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
  1936. /* Caller must have device mutex */
  1937. long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
  1938. {
  1939. struct eventfd_ctx *ctx;
  1940. u64 p;
  1941. long r;
  1942. int i, fd;
  1943. /* If you are not the owner, you can become one */
  1944. if (ioctl == VHOST_SET_OWNER) {
  1945. r = vhost_dev_set_owner(d);
  1946. goto done;
  1947. }
  1948. #ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
  1949. if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
  1950. /* Only allow modification before owner is set */
  1951. if (vhost_dev_has_owner(d)) {
  1952. r = -EBUSY;
  1953. goto done;
  1954. }
  1955. u8 fork_owner_val;
  1956. if (get_user(fork_owner_val, (u8 __user *)argp)) {
  1957. r = -EFAULT;
  1958. goto done;
  1959. }
  1960. if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
  1961. fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
  1962. r = -EINVAL;
  1963. goto done;
  1964. }
  1965. d->fork_owner = !!fork_owner_val;
  1966. r = 0;
  1967. goto done;
  1968. }
  1969. if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
  1970. u8 fork_owner_val = d->fork_owner;
  1971. if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
  1972. fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
  1973. r = -EINVAL;
  1974. goto done;
  1975. }
  1976. if (put_user(fork_owner_val, (u8 __user *)argp)) {
  1977. r = -EFAULT;
  1978. goto done;
  1979. }
  1980. r = 0;
  1981. goto done;
  1982. }
  1983. #endif
  1984. /* You must be the owner to do anything else */
  1985. r = vhost_dev_check_owner(d);
  1986. if (r)
  1987. goto done;
  1988. switch (ioctl) {
  1989. case VHOST_SET_MEM_TABLE:
  1990. r = vhost_set_memory(d, argp);
  1991. break;
  1992. case VHOST_SET_LOG_BASE:
  1993. if (copy_from_user(&p, argp, sizeof p)) {
  1994. r = -EFAULT;
  1995. break;
  1996. }
  1997. if ((u64)(unsigned long)p != p) {
  1998. r = -EFAULT;
  1999. break;
  2000. }
  2001. for (i = 0; i < d->nvqs; ++i) {
  2002. struct vhost_virtqueue *vq;
  2003. void __user *base = (void __user *)(unsigned long)p;
  2004. vq = d->vqs[i];
  2005. mutex_lock(&vq->mutex);
  2006. /* If ring is inactive, will check when it's enabled. */
  2007. if (vq->private_data && !vq_log_access_ok(vq, base))
  2008. r = -EFAULT;
  2009. else
  2010. vq->log_base = base;
  2011. mutex_unlock(&vq->mutex);
  2012. }
  2013. break;
  2014. case VHOST_SET_LOG_FD:
  2015. r = get_user(fd, (int __user *)argp);
  2016. if (r < 0)
  2017. break;
  2018. ctx = fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(fd);
  2019. if (IS_ERR(ctx)) {
  2020. r = PTR_ERR(ctx);
  2021. break;
  2022. }
  2023. swap(ctx, d->log_ctx);
  2024. for (i = 0; i < d->nvqs; ++i) {
  2025. mutex_lock(&d->vqs[i]->mutex);
  2026. d->vqs[i]->log_ctx = d->log_ctx;
  2027. mutex_unlock(&d->vqs[i]->mutex);
  2028. }
  2029. if (ctx)
  2030. eventfd_ctx_put(ctx);
  2031. break;
  2032. default:
  2033. r = -ENOIOCTLCMD;
  2034. break;
  2035. }
  2036. done:
  2037. return r;
  2038. }
  2039. EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
  2040. /* TODO: This is really inefficient. We need something like get_user()
  2041. * (instruction directly accesses the data, with an exception table entry
  2042. * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
  2043. */
  2044. static int set_bit_to_user(int nr, void __user *addr)
  2045. {
  2046. unsigned long log = (unsigned long)addr;
  2047. struct page *page;
  2048. void *base;
  2049. int bit = nr + (log % PAGE_SIZE) * 8;
  2050. int r;
  2051. r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page);
  2052. if (r < 0)
  2053. return r;
  2054. BUG_ON(r != 1);
  2055. base = kmap_atomic(page);
  2056. set_bit(bit, base);
  2057. kunmap_atomic(base);
  2058. unpin_user_pages_dirty_lock(&page, 1, true);
  2059. return 0;
  2060. }
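/* Mark pages as dirty in the userspace log bitmap: one bit per
 * VHOST_PAGE_SIZE page, starting at @log_base, covering the guest range
 * [write_address, write_address + write_length).
 */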
  2061. static int log_write(void __user *log_base,
  2062. u64 write_address, u64 write_length)
  2063. {
  2064. u64 write_page = write_address / VHOST_PAGE_SIZE;
  2065. int r;
  2066. if (!write_length)
  2067. return 0;
  2068. write_length += write_address % VHOST_PAGE_SIZE;
  2069. for (;;) {
  2070. u64 base = (u64)(unsigned long)log_base;
  2071. u64 log = base + write_page / 8;
  2072. int bit = write_page % 8;
  2073. if ((u64)(unsigned long)log != log)
  2074. return -EFAULT;
  2075. r = set_bit_to_user(bit, (void __user *)(unsigned long)log);
  2076. if (r < 0)
  2077. return r;
  2078. if (write_length <= VHOST_PAGE_SIZE)
  2079. break;
  2080. write_length -= VHOST_PAGE_SIZE;
  2081. write_page += 1;
  2082. }
  2083. return r;
  2084. }
  2085. static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
  2086. {
  2087. struct vhost_iotlb *umem = vq->umem;
  2088. struct vhost_iotlb_map *u;
  2089. u64 start, end, l, min;
  2090. int r;
  2091. bool hit = false;
  2092. while (len) {
  2093. min = len;
		/* More than one GPA can be mapped into a single HVA, so
		 * iterate over all possible umems here to be safe.
		 */
  2097. list_for_each_entry(u, &umem->list, link) {
  2098. if (u->addr > hva - 1 + len ||
  2099. u->addr - 1 + u->size < hva)
  2100. continue;
  2101. start = max(u->addr, hva);
  2102. end = min(u->addr - 1 + u->size, hva - 1 + len);
  2103. l = end - start + 1;
  2104. r = log_write(vq->log_base,
  2105. u->start + start - u->addr,
  2106. l);
  2107. if (r < 0)
  2108. return r;
  2109. hit = true;
  2110. min = min(l, min);
  2111. }
  2112. if (!hit)
  2113. return -EFAULT;
  2114. len -= min;
  2115. hva += min;
  2116. }
  2117. return 0;
  2118. }
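/* Log a write to the used ring. Without an IOTLB the used-ring offset maps
 * directly onto log_addr; with an IOTLB the GIOVA must first be translated
 * back to host addresses and logged via the HVA->GPA mappings.
 */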
  2119. static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
  2120. {
  2121. struct iovec *iov = vq->log_iov;
  2122. int i, ret;
  2123. if (!vq->iotlb)
  2124. return log_write(vq->log_base, vq->log_addr + used_offset, len);
  2125. ret = translate_desc(vq, (uintptr_t)vq->used + used_offset,
  2126. len, iov, 64, VHOST_ACCESS_WO);
  2127. if (ret < 0)
  2128. return ret;
  2129. for (i = 0; i < ret; i++) {
  2130. ret = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
  2131. iov[i].iov_len);
  2132. if (ret)
  2133. return ret;
  2134. }
  2135. return 0;
  2136. }
  2137. int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
  2138. unsigned int log_num, u64 len, struct iovec *iov, int count)
  2139. {
  2140. int i, r;
  2141. /* Make sure data written is seen before log. */
  2142. smp_wmb();
  2143. if (vq->iotlb) {
  2144. for (i = 0; i < count; i++) {
  2145. r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
  2146. iov[i].iov_len);
  2147. if (r < 0)
  2148. return r;
  2149. }
  2150. return 0;
  2151. }
  2152. for (i = 0; i < log_num; ++i) {
  2153. u64 l = min(log[i].len, len);
  2154. r = log_write(vq->log_base, log[i].addr, l);
  2155. if (r < 0)
  2156. return r;
  2157. len -= l;
  2158. if (!len) {
  2159. if (vq->log_ctx)
  2160. eventfd_signal(vq->log_ctx);
  2161. return 0;
  2162. }
  2163. }
  2164. /* Length written exceeds what we have stored. This is a bug. */
  2165. BUG();
  2166. return 0;
  2167. }
  2168. EXPORT_SYMBOL_GPL(vhost_log_write);
  2169. static int vhost_update_used_flags(struct vhost_virtqueue *vq)
  2170. {
  2171. void __user *used;
  2172. if (vhost_put_used_flags(vq))
  2173. return -EFAULT;
  2174. if (unlikely(vq->log_used)) {
  2175. /* Make sure the flag is seen before log. */
  2176. smp_wmb();
  2177. /* Log used flag write. */
  2178. used = &vq->used->flags;
  2179. log_used(vq, (used - (void __user *)vq->used),
  2180. sizeof vq->used->flags);
  2181. if (vq->log_ctx)
  2182. eventfd_signal(vq->log_ctx);
  2183. }
  2184. return 0;
  2185. }
  2186. static int vhost_update_avail_event(struct vhost_virtqueue *vq)
  2187. {
  2188. if (vhost_put_avail_event(vq))
  2189. return -EFAULT;
  2190. if (unlikely(vq->log_used)) {
  2191. void __user *used;
  2192. /* Make sure the event is seen before log. */
  2193. smp_wmb();
  2194. /* Log avail event write */
  2195. used = vhost_avail_event(vq);
  2196. log_used(vq, (used - (void __user *)vq->used),
  2197. sizeof *vhost_avail_event(vq));
  2198. if (vq->log_ctx)
  2199. eventfd_signal(vq->log_ctx);
  2200. }
  2201. return 0;
  2202. }
  2203. int vhost_vq_init_access(struct vhost_virtqueue *vq)
  2204. {
  2205. __virtio16 last_used_idx;
  2206. int r;
  2207. bool is_le = vq->is_le;
  2208. if (!vq->private_data)
  2209. return 0;
  2210. vhost_init_is_le(vq);
  2211. r = vhost_update_used_flags(vq);
  2212. if (r)
  2213. goto err;
  2214. vq->signalled_used_valid = false;
  2215. if (!vq->iotlb &&
  2216. !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
  2217. r = -EFAULT;
  2218. goto err;
  2219. }
  2220. r = vhost_get_used_idx(vq, &last_used_idx);
  2221. if (r) {
  2222. vq_err(vq, "Can't access used idx at %p\n",
  2223. &vq->used->idx);
  2224. goto err;
  2225. }
  2226. vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx);
  2227. return 0;
  2228. err:
  2229. vq->is_le = is_le;
  2230. return r;
  2231. }
  2232. EXPORT_SYMBOL_GPL(vhost_vq_init_access);
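/* Translate a guest address range into host-userspace iovecs using the
 * IOTLB (or the memory table when no IOTLB is attached). Returns the number
 * of iovecs filled, -ENOBUFS if @iov_size is too small, or -EAGAIN after
 * queuing an IOTLB miss for an unmapped range.
 */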
  2233. static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
  2234. struct iovec iov[], int iov_size, int access)
  2235. {
  2236. const struct vhost_iotlb_map *map;
  2237. struct vhost_dev *dev = vq->dev;
  2238. struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
  2239. struct iovec *_iov;
  2240. u64 s = 0, last = addr + len - 1;
  2241. int ret = 0;
  2242. while ((u64)len > s) {
  2243. u64 size;
  2244. if (unlikely(ret >= iov_size)) {
  2245. ret = -ENOBUFS;
  2246. break;
  2247. }
  2248. map = vhost_iotlb_itree_first(umem, addr, last);
  2249. if (map == NULL || map->start > addr) {
  2250. if (umem != dev->iotlb) {
  2251. ret = -EFAULT;
  2252. break;
  2253. }
  2254. ret = -EAGAIN;
  2255. break;
  2256. } else if (!(map->perm & access)) {
  2257. ret = -EPERM;
  2258. break;
  2259. }
  2260. _iov = iov + ret;
  2261. size = map->size - addr + map->start;
  2262. _iov->iov_len = min((u64)len - s, size);
  2263. _iov->iov_base = (void __user *)(unsigned long)
  2264. (map->addr + addr - map->start);
  2265. s += size;
  2266. addr += size;
  2267. ++ret;
  2268. }
  2269. if (ret == -EAGAIN)
  2270. vhost_iotlb_miss(vq, addr, access);
  2271. return ret;
  2272. }
  2273. /* Each buffer in the virtqueues is actually a chain of descriptors. This
  2274. * function returns the next descriptor in the chain,
  2275. * or -1U if we're at the end. */
  2276. static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
  2277. {
  2278. unsigned int next;
  2279. /* If this descriptor says it doesn't chain, we're done. */
  2280. if (!(desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT)))
  2281. return -1U;
	/* Check they're not leading us off the end of the descriptors. */
  2283. next = vhost16_to_cpu(vq, READ_ONCE(desc->next));
  2284. return next;
  2285. }
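/* Walk an indirect descriptor table. The table itself is translated and
 * read from guest memory; each entry is then translated into iovecs exactly
 * like a direct descriptor, with write descriptors counted in *in_num and
 * read descriptors in *out_num.
 */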
  2286. static int get_indirect(struct vhost_virtqueue *vq,
  2287. struct iovec iov[], unsigned int iov_size,
  2288. unsigned int *out_num, unsigned int *in_num,
  2289. struct vhost_log *log, unsigned int *log_num,
  2290. struct vring_desc *indirect)
  2291. {
  2292. struct vring_desc desc;
  2293. unsigned int i = 0, count, found = 0;
  2294. u32 len = vhost32_to_cpu(vq, indirect->len);
  2295. struct iov_iter from;
  2296. int ret, access;
  2297. /* Sanity check */
  2298. if (unlikely(len % sizeof desc)) {
  2299. vq_err(vq, "Invalid length in indirect descriptor: "
  2300. "len 0x%llx not multiple of 0x%zx\n",
  2301. (unsigned long long)len,
  2302. sizeof desc);
  2303. return -EINVAL;
  2304. }
  2305. ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
  2306. UIO_MAXIOV, VHOST_ACCESS_RO);
  2307. if (unlikely(ret < 0)) {
  2308. if (ret != -EAGAIN)
  2309. vq_err(vq, "Translation failure %d in indirect.\n", ret);
  2310. return ret;
  2311. }
  2312. iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len);
  2313. count = len / sizeof desc;
  2314. /* Buffers are chained via a 16 bit next field, so
  2315. * we can have at most 2^16 of these. */
  2316. if (unlikely(count > USHRT_MAX + 1)) {
  2317. vq_err(vq, "Indirect buffer length too big: %d\n",
  2318. indirect->len);
  2319. return -E2BIG;
  2320. }
  2321. do {
  2322. unsigned iov_count = *in_num + *out_num;
  2323. if (unlikely(++found > count)) {
  2324. vq_err(vq, "Loop detected: last one at %u "
  2325. "indirect size %u\n",
  2326. i, count);
  2327. return -EINVAL;
  2328. }
  2329. if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
  2330. vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
  2331. i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
  2332. return -EINVAL;
  2333. }
  2334. if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
  2335. vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
  2336. i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
  2337. return -EINVAL;
  2338. }
  2339. if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
  2340. access = VHOST_ACCESS_WO;
  2341. else
  2342. access = VHOST_ACCESS_RO;
  2343. ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
  2344. vhost32_to_cpu(vq, desc.len), iov + iov_count,
  2345. iov_size - iov_count, access);
  2346. if (unlikely(ret < 0)) {
  2347. if (ret != -EAGAIN)
  2348. vq_err(vq, "Translation failure %d indirect idx %d\n",
  2349. ret, i);
  2350. return ret;
  2351. }
  2352. /* If this is an input descriptor, increment that count. */
  2353. if (access == VHOST_ACCESS_WO) {
  2354. *in_num += ret;
  2355. if (unlikely(log && ret)) {
  2356. log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
  2357. log[*log_num].len = vhost32_to_cpu(vq, desc.len);
  2358. ++*log_num;
  2359. }
  2360. } else {
  2361. /* If it's an output descriptor, they're all supposed
  2362. * to come before any input descriptors. */
  2363. if (unlikely(*in_num)) {
  2364. vq_err(vq, "Indirect descriptor "
  2365. "has out after in: idx %d\n", i);
  2366. return -EINVAL;
  2367. }
  2368. *out_num += ret;
  2369. }
  2370. } while ((i = next_desc(vq, &desc)) != -1);
  2371. return 0;
  2372. }
/* This looks in the virtqueue for the first available buffer, and converts
 * it to an iovec for convenient access. Since descriptors consist of some
 * number of output descriptors followed by some number of input descriptors,
 * it's actually two iovecs, but we pack them into one and note how many of
 * each there were.
 *
 * This function returns the descriptor number found, or vq->num (which is
 * never a valid descriptor number) if none was found. A negative code is
 * returned on error. */
  2381. int vhost_get_vq_desc(struct vhost_virtqueue *vq,
  2382. struct iovec iov[], unsigned int iov_size,
  2383. unsigned int *out_num, unsigned int *in_num,
  2384. struct vhost_log *log, unsigned int *log_num)
  2385. {
  2386. struct vring_desc desc;
  2387. unsigned int i, head, found = 0;
  2388. u16 last_avail_idx = vq->last_avail_idx;
  2389. __virtio16 ring_head;
  2390. int ret, access;
  2391. if (vq->avail_idx == vq->last_avail_idx) {
  2392. ret = vhost_get_avail_idx(vq);
  2393. if (unlikely(ret < 0))
  2394. return ret;
  2395. if (!ret)
  2396. return vq->num;
  2397. }
  2398. /* Grab the next descriptor number they're advertising, and increment
  2399. * the index we've seen. */
  2400. if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
  2401. vq_err(vq, "Failed to read head: idx %d address %p\n",
  2402. last_avail_idx,
  2403. &vq->avail->ring[last_avail_idx % vq->num]);
  2404. return -EFAULT;
  2405. }
  2406. head = vhost16_to_cpu(vq, ring_head);
  2407. /* If their number is silly, that's an error. */
  2408. if (unlikely(head >= vq->num)) {
  2409. vq_err(vq, "Guest says index %u > %u is available",
  2410. head, vq->num);
  2411. return -EINVAL;
  2412. }
	/* When we start there are neither input nor output descriptors. */
  2414. *out_num = *in_num = 0;
  2415. if (unlikely(log))
  2416. *log_num = 0;
  2417. i = head;
  2418. do {
  2419. unsigned iov_count = *in_num + *out_num;
  2420. if (unlikely(i >= vq->num)) {
  2421. vq_err(vq, "Desc index is %u > %u, head = %u",
  2422. i, vq->num, head);
  2423. return -EINVAL;
  2424. }
  2425. if (unlikely(++found > vq->num)) {
  2426. vq_err(vq, "Loop detected: last one at %u "
  2427. "vq size %u head %u\n",
  2428. i, vq->num, head);
  2429. return -EINVAL;
  2430. }
  2431. ret = vhost_get_desc(vq, &desc, i);
  2432. if (unlikely(ret)) {
  2433. vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
  2434. i, vq->desc + i);
  2435. return -EFAULT;
  2436. }
  2437. if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
  2438. ret = get_indirect(vq, iov, iov_size,
  2439. out_num, in_num,
  2440. log, log_num, &desc);
  2441. if (unlikely(ret < 0)) {
  2442. if (ret != -EAGAIN)
  2443. vq_err(vq, "Failure detected "
  2444. "in indirect descriptor at idx %d\n", i);
  2445. return ret;
  2446. }
  2447. continue;
  2448. }
  2449. if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
  2450. access = VHOST_ACCESS_WO;
  2451. else
  2452. access = VHOST_ACCESS_RO;
  2453. ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
  2454. vhost32_to_cpu(vq, desc.len), iov + iov_count,
  2455. iov_size - iov_count, access);
  2456. if (unlikely(ret < 0)) {
  2457. if (ret != -EAGAIN)
  2458. vq_err(vq, "Translation failure %d descriptor idx %d\n",
  2459. ret, i);
  2460. return ret;
  2461. }
  2462. if (access == VHOST_ACCESS_WO) {
  2463. /* If this is an input descriptor,
  2464. * increment that count. */
  2465. *in_num += ret;
  2466. if (unlikely(log && ret)) {
  2467. log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
  2468. log[*log_num].len = vhost32_to_cpu(vq, desc.len);
  2469. ++*log_num;
  2470. }
  2471. } else {
  2472. /* If it's an output descriptor, they're all supposed
  2473. * to come before any input descriptors. */
  2474. if (unlikely(*in_num)) {
  2475. vq_err(vq, "Descriptor has out after in: "
  2476. "idx %d\n", i);
  2477. return -EINVAL;
  2478. }
  2479. *out_num += ret;
  2480. }
  2481. } while ((i = next_desc(vq, &desc)) != -1);
  2482. /* On success, increment avail index. */
  2483. vq->last_avail_idx++;
  2484. /* Assume notifications from guest are disabled at this point,
  2485. * if they aren't we would need to update avail_event index. */
  2486. BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
  2487. return head;
  2488. }
  2489. EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
  2490. /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
  2491. void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
  2492. {
  2493. vq->last_avail_idx -= n;
  2494. }
  2495. EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
  2496. /* After we've used one of their buffers, we tell them about it. We'll then
  2497. * want to notify the guest, using eventfd. */
  2498. int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
  2499. {
  2500. struct vring_used_elem heads = {
  2501. cpu_to_vhost32(vq, head),
  2502. cpu_to_vhost32(vq, len)
  2503. };
  2504. return vhost_add_used_n(vq, &heads, 1);
  2505. }
  2506. EXPORT_SYMBOL_GPL(vhost_add_used);
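/* Write @count used elements at the current used index (the range must not
 * wrap within this call) and log the write if dirty logging is enabled.
 * The caller is responsible for publishing the new used index afterwards.
 */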
  2507. static int __vhost_add_used_n(struct vhost_virtqueue *vq,
  2508. struct vring_used_elem *heads,
  2509. unsigned count)
  2510. {
  2511. vring_used_elem_t __user *used;
  2512. u16 old, new;
  2513. int start;
  2514. start = vq->last_used_idx & (vq->num - 1);
  2515. used = vq->used->ring + start;
  2516. if (vhost_put_used(vq, heads, start, count)) {
  2517. vq_err(vq, "Failed to write used");
  2518. return -EFAULT;
  2519. }
  2520. if (unlikely(vq->log_used)) {
  2521. /* Make sure data is seen before log. */
  2522. smp_wmb();
  2523. /* Log used ring entry write. */
  2524. log_used(vq, ((void __user *)used - (void __user *)vq->used),
  2525. count * sizeof *used);
  2526. }
  2527. old = vq->last_used_idx;
  2528. new = (vq->last_used_idx += count);
	/* If the driver never bothers to signal in a very long while,
	 * the used index might wrap around. If that happens, invalidate
	 * the signalled_used index we stored. TODO: make sure the driver
	 * signals at least once in 2^16 and remove this. */
  2533. if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
  2534. vq->signalled_used_valid = false;
  2535. return 0;
  2536. }
  2537. /* After we've used one of their buffers, we tell them about it. We'll then
  2538. * want to notify the guest, using eventfd. */
  2539. int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
  2540. unsigned count)
  2541. {
  2542. int start, n, r;
  2543. start = vq->last_used_idx & (vq->num - 1);
  2544. n = vq->num - start;
  2545. if (n < count) {
  2546. r = __vhost_add_used_n(vq, heads, n);
  2547. if (r < 0)
  2548. return r;
  2549. heads += n;
  2550. count -= n;
  2551. }
  2552. r = __vhost_add_used_n(vq, heads, count);
  2553. if (r < 0)
  2554. return r;
  2555. /* Make sure buffer is written before we update index. */
  2556. smp_wmb();
  2557. if (vhost_put_used_idx(vq)) {
  2558. vq_err(vq, "Failed to increment used idx");
  2559. return -EFAULT;
  2560. }
  2561. if (unlikely(vq->log_used)) {
  2562. /* Make sure used idx is seen before log. */
  2563. smp_wmb();
  2564. /* Log used index update. */
  2565. log_used(vq, offsetof(struct vring_used, idx),
  2566. sizeof vq->used->idx);
  2567. if (vq->log_ctx)
  2568. eventfd_signal(vq->log_ctx);
  2569. }
  2570. return r;
  2571. }
  2572. EXPORT_SYMBOL_GPL(vhost_add_used_n);
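/* Decide whether the guest needs to be signalled: with EVENT_IDX this
 * compares the guest's used_event against the range of used entries added
 * since the last signal; otherwise the guest's NO_INTERRUPT flag is
 * honoured.
 */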
  2573. static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2574. {
  2575. __u16 old, new;
  2576. __virtio16 event;
  2577. bool v;
  2578. /* Flush out used index updates. This is paired
  2579. * with the barrier that the Guest executes when enabling
  2580. * interrupts. */
  2581. smp_mb();
  2582. if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
  2583. unlikely(vq->avail_idx == vq->last_avail_idx))
  2584. return true;
  2585. if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  2586. __virtio16 flags;
  2587. if (vhost_get_avail_flags(vq, &flags)) {
  2588. vq_err(vq, "Failed to get flags");
  2589. return true;
  2590. }
  2591. return !(flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
  2592. }
  2593. old = vq->signalled_used;
  2594. v = vq->signalled_used_valid;
  2595. new = vq->signalled_used = vq->last_used_idx;
  2596. vq->signalled_used_valid = true;
  2597. if (unlikely(!v))
  2598. return true;
  2599. if (vhost_get_used_event(vq, &event)) {
  2600. vq_err(vq, "Failed to get used event idx");
  2601. return true;
  2602. }
  2603. return vring_need_event(vhost16_to_cpu(vq, event), new, old);
  2604. }
  2605. /* This actually signals the guest, using eventfd. */
  2606. void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2607. {
	/* Signal the Guest to tell it we used something up. */
  2609. if (vq->call_ctx.ctx && vhost_notify(dev, vq))
  2610. eventfd_signal(vq->call_ctx.ctx);
  2611. }
  2612. EXPORT_SYMBOL_GPL(vhost_signal);
  2613. /* And here's the combo meal deal. Supersize me! */
  2614. void vhost_add_used_and_signal(struct vhost_dev *dev,
  2615. struct vhost_virtqueue *vq,
  2616. unsigned int head, int len)
  2617. {
  2618. vhost_add_used(vq, head, len);
  2619. vhost_signal(dev, vq);
  2620. }
  2621. EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
  2622. /* multi-buffer version of vhost_add_used_and_signal */
  2623. void vhost_add_used_and_signal_n(struct vhost_dev *dev,
  2624. struct vhost_virtqueue *vq,
  2625. struct vring_used_elem *heads, unsigned count)
  2626. {
  2627. vhost_add_used_n(vq, heads, count);
  2628. vhost_signal(dev, vq);
  2629. }
  2630. EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
/* Return true if we're sure that the available ring is empty */
  2632. bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2633. {
  2634. int r;
  2635. if (vq->avail_idx != vq->last_avail_idx)
  2636. return false;
  2637. r = vhost_get_avail_idx(vq);
  2638. /* Note: we treat error as non-empty here */
  2639. return r == 0;
  2640. }
  2641. EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
  2642. /* OK, now we need to know about added descriptors. */
  2643. bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2644. {
  2645. int r;
  2646. if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
  2647. return false;
  2648. vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
  2649. if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  2650. r = vhost_update_used_flags(vq);
  2651. if (r) {
  2652. vq_err(vq, "Failed to enable notification at %p: %d\n",
  2653. &vq->used->flags, r);
  2654. return false;
  2655. }
  2656. } else {
  2657. r = vhost_update_avail_event(vq);
  2658. if (r) {
  2659. vq_err(vq, "Failed to update avail event index at %p: %d\n",
  2660. vhost_avail_event(vq), r);
  2661. return false;
  2662. }
  2663. }
  2664. /* They could have slipped one in as we were doing that: make
  2665. * sure it's written, then check again. */
  2666. smp_mb();
  2667. r = vhost_get_avail_idx(vq);
  2668. /* Note: we treat error as empty here */
  2669. if (unlikely(r < 0))
  2670. return false;
  2671. return r;
  2672. }
  2673. EXPORT_SYMBOL_GPL(vhost_enable_notify);
  2674. /* We don't need to be notified again. */
  2675. void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
  2676. {
  2677. int r;
  2678. if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
  2679. return;
  2680. vq->used_flags |= VRING_USED_F_NO_NOTIFY;
  2681. if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
  2682. r = vhost_update_used_flags(vq);
  2683. if (r)
  2684. vq_err(vq, "Failed to disable notification at %p: %d\n",
  2685. &vq->used->flags, r);
  2686. }
  2687. }
  2688. EXPORT_SYMBOL_GPL(vhost_disable_notify);
  2689. /* Create a new message. */
  2690. struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
  2691. {
  2692. /* Make sure all padding within the structure is initialized. */
  2693. struct vhost_msg_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
  2694. if (!node)
  2695. return NULL;
  2696. node->vq = vq;
  2697. node->msg.type = type;
  2698. return node;
  2699. }
  2700. EXPORT_SYMBOL_GPL(vhost_new_msg);
  2701. void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
  2702. struct vhost_msg_node *node)
  2703. {
  2704. spin_lock(&dev->iotlb_lock);
  2705. list_add_tail(&node->node, head);
  2706. spin_unlock(&dev->iotlb_lock);
  2707. wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
  2708. }
  2709. EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
  2710. struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
  2711. struct list_head *head)
  2712. {
  2713. struct vhost_msg_node *node = NULL;
  2714. spin_lock(&dev->iotlb_lock);
  2715. if (!list_empty(head)) {
  2716. node = list_first_entry(head, struct vhost_msg_node,
  2717. node);
  2718. list_del(&node->node);
  2719. }
  2720. spin_unlock(&dev->iotlb_lock);
  2721. return node;
  2722. }
  2723. EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
  2724. void vhost_set_backend_features(struct vhost_dev *dev, u64 features)
  2725. {
  2726. struct vhost_virtqueue *vq;
  2727. int i;
  2728. mutex_lock(&dev->mutex);
  2729. for (i = 0; i < dev->nvqs; ++i) {
  2730. vq = dev->vqs[i];
  2731. mutex_lock(&vq->mutex);
  2732. vq->acked_backend_features = features;
  2733. mutex_unlock(&vq->mutex);
  2734. }
  2735. mutex_unlock(&dev->mutex);
  2736. }
  2737. EXPORT_SYMBOL_GPL(vhost_set_backend_features);
  2738. static int __init vhost_init(void)
  2739. {
  2740. return 0;
  2741. }
  2742. static void __exit vhost_exit(void)
  2743. {
  2744. }
  2745. module_init(vhost_init);
  2746. module_exit(vhost_exit);
  2747. MODULE_VERSION("0.0.1");
  2748. MODULE_LICENSE("GPL v2");
  2749. MODULE_AUTHOR("Michael S. Tsirkin");
  2750. MODULE_DESCRIPTION("Host kernel accelerator for virtio");