poll.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/hashtable.h>
#include <linux/io_uring.h>

#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "alloc_cache.h"
#include "refs.h"
#include "napi.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
#include "cancel.h"
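
/* state for a poll update/removal request: which poll entry to find and what to change */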
struct io_poll_update {
	struct file *file;
	u64 old_user_data;
	u64 new_user_data;
	__poll_t events;
	bool update_events;
	bool update_user_data;
};

struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
	int nr_entries;
	int error;
	bool owning;
	/* output value, set only if arm poll returns >0 */
	__poll_t result_mask;
};

#define IO_POLL_CANCEL_FLAG	BIT(31)
#define IO_POLL_RETRY_FLAG	BIT(30)
#define IO_POLL_REF_MASK	GENMASK(29, 0)

/*
 * We usually have 1-2 refs taken, 128 is more than enough and we want to
 * maximise the margin between this amount and the moment when it overflows.
 */
#define IO_POLL_REF_BIAS	128

#define IO_WQE_F_DOUBLE		1

static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key);
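
/*
 * A wait queue entry's ->private points at the owning io_kiocb; the low
 * IO_WQE_F_DOUBLE bit marks entries that were queued via the second
 * (double) poll entry.
 */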
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE);
}

static inline bool wqe_is_double(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return priv & IO_WQE_F_DOUBLE;
}

static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
{
	int v;

	/*
	 * poll_refs are already elevated and we don't have much hope for
	 * grabbing the ownership. Instead of incrementing, set a retry flag
	 * to notify the loop that there might have been some change.
	 */
	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
	if (v & IO_POLL_REF_MASK)
		return false;
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

/*
 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We
 * can bump it and acquire ownership. Modifying a request without owning it is
 * disallowed; that prevents races when enqueueing task_work and between
 * arming poll and wakeups.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
		return io_poll_get_ownership_slowpath(req);
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

static void io_poll_mark_cancelled(struct io_kiocb *req)
{
	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}
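
/*
 * For IORING_OP_POLL_ADD the primary io_poll lives in the command data and
 * the second (double) entry in ->async_data; for internal async polling the
 * entries live in req->apoll.
 */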
static struct io_poll *io_poll_get_double(struct io_kiocb *req)
{
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
	if (req->opcode == IORING_OP_POLL_ADD)
		return req->async_data;
	return req->apoll->double_poll;
}

static struct io_poll *io_poll_get_single(struct io_kiocb *req)
{
	if (req->opcode == IORING_OP_POLL_ADD)
		return io_kiocb_to_cmd(req, struct io_poll);
	return &req->apoll->poll;
}
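
/* hash the request by its CQE user_data so cancellation can find it later */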
static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
	struct io_hash_bucket *hb = &table->hbs[index];

	spin_lock(&hb->lock);
	hlist_add_head(&req->hash_node, &hb->list);
	spin_unlock(&hb->lock);
}

static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
	spinlock_t *lock = &table->hbs[index].lock;

	spin_lock(lock);
	hash_del(&req->hash_node);
	spin_unlock(lock);
}

static void io_poll_req_insert_locked(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table_locked;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);

	lockdep_assert_held(&req->ctx->uring_lock);

	hlist_add_head(&req->hash_node, &table->hbs[index].list);
}

static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (req->flags & REQ_F_HASH_LOCKED) {
		/*
		 * ->cancel_table_locked is protected by ->uring_lock in
		 * contrast to per bucket spinlocks. Likely, tctx_task_work()
		 * already grabbed the mutex for us, but there is a chance it
		 * failed.
		 */
		io_tw_lock(ctx, ts);
		hash_del(&req->hash_node);
		req->flags &= ~REQ_F_HASH_LOCKED;
	} else {
		io_poll_req_delete(req, ctx);
	}
}

static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
{
	poll->head = NULL;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
}

static inline void io_poll_remove_entry(struct io_poll *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}

static void io_poll_remove_entries(struct io_kiocb *req)
{
	/*
	 * Nothing to do if neither of those flags are set. Avoid dipping
	 * into the poll/apoll/double cachelines if we can.
	 */
	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
		return;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us. However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free. If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	if (req->flags & REQ_F_SINGLE_POLL)
		io_poll_remove_entry(io_poll_get_single(req));
	if (req->flags & REQ_F_DOUBLE_POLL)
		io_poll_remove_entry(io_poll_get_double(req));
	rcu_read_unlock();
}
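
/* return codes from io_poll_check_events(), consumed by io_poll_task_func() */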
enum {
	IOU_POLL_DONE = 0,
	IOU_POLL_NO_ACTION = 1,
	IOU_POLL_REMOVE_POLL_USE_RES = 2,
	IOU_POLL_REISSUE = 3,
	IOU_POLL_REQUEUE = 4,
};
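
/* stash the wake-up mask in cqe.res and punt processing to task_work */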
static void __io_poll_execute(struct io_kiocb *req, int mask)
{
	unsigned flags = 0;

	io_req_set_res(req, mask, 0);
	req->io_task_work.func = io_poll_task_func;

	trace_io_uring_task_add(req, mask);

	if (!(req->flags & REQ_F_POLL_NO_LAZY))
		flags = IOU_F_TWQ_LAZY_WAKE;
	__io_req_task_work_add(req, flags);
}

static inline void io_poll_execute(struct io_kiocb *req, int res)
{
	if (io_poll_get_ownership(req))
		__io_poll_execute(req, res);
}
/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action
 * is required, meaning either a spurious wakeup or a multishot CQE that has
 * already been served. IOU_POLL_DONE when it's done with the request, with
 * the mask stored in req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to
 * remove the multishot poll and that the result is stored in req->cqe.
 */
static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
{
	int v;

	if (unlikely(io_should_terminate_tw(req->ctx)))
		return -ECANCELED;

	do {
		v = atomic_read(&req->poll_refs);

		if (unlikely(v != 1)) {
			/* tw should be the owner and so have some refs */
			if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
				return IOU_POLL_NO_ACTION;
			if (v & IO_POLL_CANCEL_FLAG)
				return -ECANCELED;
			/*
			 * cqe.res contains only events of the first wake up
			 * and all others are to be lost. Redo vfs_poll() to get
			 * up to date state.
			 */
			if ((v & IO_POLL_REF_MASK) != 1)
				req->cqe.res = 0;

			if (v & IO_POLL_RETRY_FLAG) {
				req->cqe.res = 0;
				/*
				 * We won't find new events that came in between
				 * vfs_poll and the ref put unless we clear the
				 * flag in advance.
				 */
				atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
				v &= ~IO_POLL_RETRY_FLAG;
			}
		}

		/* the mask was stashed in __io_poll_execute */
		if (!req->cqe.res) {
			struct poll_table_struct pt = { ._key = req->apoll_events };

			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
			/*
			 * We got woken with a mask, but someone else got to
			 * it first. The above vfs_poll() doesn't add us back
			 * to the waitqueue, so if we get nothing back, we
			 * should be safe and attempt a reissue.
			 */
			if (unlikely(!req->cqe.res)) {
				/* Multishot armed need not reissue */
				if (!(req->apoll_events & EPOLLONESHOT))
					continue;
				return IOU_POLL_REISSUE;
			}
		}
		if (req->apoll_events & EPOLLONESHOT)
			return IOU_POLL_DONE;

		/* multishot, just fill a CQE and proceed */
		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
			__poll_t mask = mangle_poll(req->cqe.res &
						    req->apoll_events);

			if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) {
				io_req_set_res(req, mask, 0);
				return IOU_POLL_REMOVE_POLL_USE_RES;
			}
		} else {
			int ret = io_poll_issue(req, ts);

			if (ret == IOU_STOP_MULTISHOT)
				return IOU_POLL_REMOVE_POLL_USE_RES;
			else if (ret == IOU_REQUEUE)
				return IOU_POLL_REQUEUE;
			if (ret < 0)
				return ret;
		}

		/* force the next iteration to vfs_poll() */
		req->cqe.res = 0;

		/*
		 * Release all references, retry if someone tried to restart
		 * task_work while we were executing it.
		 */
		v &= IO_POLL_REF_MASK;
	} while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK);

	io_napi_add(req);
	return IOU_POLL_NO_ACTION;
}
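
/* task_work handler: act on io_poll_check_events() and complete, reissue or requeue */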
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
{
	int ret;

	ret = io_poll_check_events(req, ts);
	if (ret == IOU_POLL_NO_ACTION) {
		io_kbuf_recycle(req, 0);
		return;
	} else if (ret == IOU_POLL_REQUEUE) {
		io_kbuf_recycle(req, 0);
		__io_poll_execute(req, 0);
		return;
	}
	io_poll_remove_entries(req);
	io_poll_tw_hash_eject(req, ts);

	if (req->opcode == IORING_OP_POLL_ADD) {
		if (ret == IOU_POLL_DONE) {
			struct io_poll *poll;

			poll = io_kiocb_to_cmd(req, struct io_poll);
			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
		} else if (ret == IOU_POLL_REISSUE) {
			io_req_task_submit(req, ts);
			return;
		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
			req->cqe.res = ret;
			req_set_fail(req);
		}

		io_req_set_res(req, req->cqe.res, 0);
		io_req_task_complete(req, ts);
	} else {
		io_tw_lock(req->ctx, ts);

		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
			io_req_task_complete(req, ts);
		else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
			io_req_task_submit(req, ts);
		else
			io_req_defer_failed(req, ret);
	}
}

static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0);
}

#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)
static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
{
	io_poll_mark_cancelled(req);
	/* we have to kick tw in case it's not already */
	io_poll_execute(req, 0);

	/*
	 * If the waitqueue is being freed early but someone already holds
	 * ownership over it, we have to tear down the request as best we
	 * can. That means immediately removing the request from its
	 * waitqueue and preventing all further accesses to the waitqueue
	 * via the request.
	 */
	list_del_init(&poll->wait.entry);

	/*
	 * Careful: this *must* be the last step, since as soon
	 * as req->head is NULL'ed out, the request can be
	 * completed and freed, since aio_poll_complete_work()
	 * will no longer need to take the waitqueue lock.
	 */
	smp_store_release(&poll->head, NULL);
	return 1;
}
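
/* waitqueue wake-up callback: filter events, grab ownership and queue task_work */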
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wqe_to_req(wait);
	struct io_poll *poll = container_of(wait, struct io_poll, wait);
	__poll_t mask = key_to_poll(key);

	if (unlikely(mask & POLLFREE))
		return io_pollfree_wake(req, poll);

	/* for instances that support it check for an event match first */
	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
		return 0;

	if (io_poll_get_ownership(req)) {
		/*
		 * If we trigger a multishot poll off our own wakeup path,
		 * disable multishot as there is a circular dependency between
		 * CQ posting and triggering the event.
		 */
		if (mask & EPOLL_URING_WAKE)
			poll->events |= EPOLLONESHOT;

		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
				req->flags &= ~REQ_F_SINGLE_POLL;
		}
		__io_poll_execute(req, mask);
	}
	return 1;
}
/* fails only when the poll is already being completed by the first entry */
static bool io_poll_double_prepare(struct io_kiocb *req)
{
	struct wait_queue_head *head;
	struct io_poll *poll = io_poll_get_single(req);

	/* head is RCU protected, see io_poll_remove_entries() comments */
	rcu_read_lock();
	head = smp_load_acquire(&poll->head);
	/*
	 * poll arm might not hold ownership and so race for req->flags with
	 * io_poll_wake(). There is only one poll entry queued, serialise with
	 * it by taking its head lock. As we're still arming, the tw handler
	 * is not going to be run, so there are no races with it.
	 */
	if (head) {
		spin_lock_irq(&head->lock);
		req->flags |= REQ_F_DOUBLE_POLL;
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
		spin_unlock_irq(&head->lock);
	}
	rcu_read_unlock();
	return !!head;
}
static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll **poll_ptr)
{
	struct io_kiocb *req = pt->req;
	unsigned long wqe_private = (unsigned long) req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
	 * (e.g. one for read, one for write). Setup a separate io_poll
	 * if this happens.
	 */
	if (unlikely(pt->nr_entries)) {
		struct io_poll *first = poll;

		/* double add on the same waitqueue head, ignore */
		if (first->head == head)
			return;
		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			if ((*poll_ptr)->head == head)
				return;
			pt->error = -EINVAL;
			return;
		}

		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}

		/* mark as double wq entry */
		wqe_private |= IO_WQE_F_DOUBLE;
		io_init_poll_iocb(poll, first->events);
		if (!io_poll_double_prepare(req)) {
			/* the request is completing, just back off */
			kfree(poll);
			return;
		}
		*poll_ptr = poll;
	} else {
		/* fine to modify, there is no poll queued to race with us */
		req->flags |= REQ_F_SINGLE_POLL;
	}

	pt->nr_entries++;
	poll->head = head;
	poll->wait.private = (void *) wqe_private;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll);

	__io_queue_proc(poll, pt, head,
			(struct io_poll **) &pt->req->async_data);
}

static bool io_poll_can_finish_inline(struct io_kiocb *req,
				      struct io_poll_table *pt)
{
	return pt->owning || io_poll_get_ownership(req);
}

static void io_poll_add_hash(struct io_kiocb *req)
{
	if (req->flags & REQ_F_HASH_LOCKED)
		io_poll_req_insert_locked(req);
	else
		io_poll_req_insert(req);
}
/*
 * Returns 0 when it's handed over for polling. The caller owns the request if
 * it returns non-zero, but otherwise should not touch it. Negative values
 * contain an error code. When the result is >0, the polling has completed
 * inline and ipt.result_mask is set to the mask.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
				 struct io_poll *poll,
				 struct io_poll_table *ipt, __poll_t mask,
				 unsigned issue_flags)
{
	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask);
	poll->file = req->file;
	req->apoll_events = poll->events;

	ipt->pt._key = mask;
	ipt->req = req;
	ipt->error = 0;
	ipt->nr_entries = 0;
	/*
	 * Polling is either completed here or via task_work, so if we're in the
	 * task context we're naturally serialised with tw by merit of running
	 * the same task. When it's io-wq, take the ownership to prevent tw
	 * from running. However, when we're in the task context, skip taking
	 * it as an optimisation.
	 *
	 * Note: even though the request won't be completed/freed, without
	 * ownership we still can race with io_poll_wake().
	 * io_poll_can_finish_inline() tries to deal with that.
	 */
	ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
	atomic_set(&req->poll_refs, (int)ipt->owning);

	/* io-wq doesn't hold uring_lock */
	if (issue_flags & IO_URING_F_UNLOCKED)
		req->flags &= ~REQ_F_HASH_LOCKED;

	/*
	 * Exclusive waits may only wake a limited amount of entries
	 * rather than all of them, this may interfere with lazy
	 * wake if someone does wait(events > 1). Ensure we don't do
	 * lazy wake for those, as we need to process each one as they
	 * come in.
	 */
	if (poll->events & EPOLLEXCLUSIVE)
		req->flags |= REQ_F_POLL_NO_LAZY;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	if (unlikely(ipt->error || !ipt->nr_entries)) {
		io_poll_remove_entries(req);

		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_mark_cancelled(req);
			return 0;
		} else if (mask && (poll->events & EPOLLET)) {
			ipt->result_mask = mask;
			return 1;
		}
		return ipt->error ?: -EINVAL;
	}

	if (mask &&
	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_add_hash(req);
			return 0;
		}
		io_poll_remove_entries(req);
		ipt->result_mask = mask;
		/* no one else has access to the req, forget about the ref */
		return 1;
	}

	io_poll_add_hash(req);

	if (mask && (poll->events & EPOLLET) &&
	    io_poll_can_finish_inline(req, ipt)) {
		__io_poll_execute(req, mask);
		return 0;
	}
	io_napi_add(req);

	if (ipt->owning) {
		/*
		 * Try to release ownership. If we see a change of state, e.g.
		 * poll was woken up, queue up a tw, it'll deal with it.
		 */
		if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
			__io_poll_execute(req, 0);
	}
	return 0;
}
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
				struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
/*
 * We can't reliably detect loops where a poll trigger fires repeatedly and the
 * subsequent issue keeps failing. Rather than failing these immediately, allow
 * a certain number of retries before giving up. Given that this condition
 * should _rarely_ trigger even once, we should be fine with a larger value.
 */
#define APOLL_MAX_RETRY		128
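
/*
 * Get an async_poll entry: reuse one from a prior arm, pull from the ctx
 * cache, or fall back to kmalloc(). Fails once the retry budget is exhausted.
 */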
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct async_poll *apoll;

	if (req->flags & REQ_F_POLLED) {
		apoll = req->apoll;
		kfree(apoll->double_poll);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		apoll = io_alloc_cache_get(&ctx->apoll_cache);
		if (!apoll)
			goto alloc_apoll;
		apoll->poll.retries = APOLL_MAX_RETRY;
	} else {
alloc_apoll:
		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
		if (unlikely(!apoll))
			return NULL;
		apoll->poll.retries = APOLL_MAX_RETRY;
	}
	apoll->double_poll = NULL;
	req->apoll = apoll;
	if (unlikely(!--apoll->poll.retries))
		return NULL;
	return apoll;
}
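
/*
 * Arm internal poll for a request that can't make progress right now, so it
 * gets retried once the file signals readiness. Returns IO_APOLL_OK when
 * armed, IO_APOLL_READY if the request can be retried immediately, or
 * IO_APOLL_ABORTED if arming wasn't possible.
 */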
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct async_poll *apoll;
	struct io_poll_table ipt;
	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
	int ret;

	/*
	 * apoll requests already grab the mutex to complete in the tw handler,
	 * so removal from the mutex-backed hash is free, use it by default.
	 */
	req->flags |= REQ_F_HASH_LOCKED;

	if (!def->pollin && !def->pollout)
		return IO_APOLL_ABORTED;
	if (!io_file_can_poll(req))
		return IO_APOLL_ABORTED;
	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
		mask |= EPOLLONESHOT;

	if (def->pollin) {
		mask |= EPOLLIN | EPOLLRDNORM;

		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
		if (req->flags & REQ_F_CLEAR_POLLIN)
			mask &= ~EPOLLIN;
	} else {
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (def->poll_exclusive)
		mask |= EPOLLEXCLUSIVE;

	apoll = io_req_alloc_apoll(req, issue_flags);
	if (!apoll)
		return IO_APOLL_ABORTED;
	req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL);
	req->flags |= REQ_F_POLLED;
	ipt.pt._qproc = io_async_queue_proc;

	io_kbuf_recycle(req, issue_flags);

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
	if (ret)
		return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
	trace_io_uring_poll_arm(req, mask, apoll->poll.events);
	return IO_APOLL_OK;
}
static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
					    struct io_hash_table *table,
					    bool cancel_all)
{
	unsigned nr_buckets = 1U << table->hash_bits;
	struct hlist_node *tmp;
	struct io_kiocb *req;
	bool found = false;
	int i;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &table->hbs[i];

		spin_lock(&hb->lock);
		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
			if (io_match_task_safe(req, tsk, cancel_all)) {
				hlist_del_init(&req->hash_node);
				io_poll_cancel_req(req);
				found = true;
			}
		}
		spin_unlock(&hb->lock);
	}
	return found;
}

/*
 * Returns true if we found and killed one or more poll requests
 */
__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
			       bool cancel_all)
	__must_hold(&ctx->uring_lock)
{
	bool ret;

	ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
	ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
	return ret;
}
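
/*
 * Look up a hashed poll request by user_data. On a match the bucket lock is
 * left held and returned via *out_bucket; the caller must drop it.
 */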
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
				     struct io_cancel_data *cd,
				     struct io_hash_table *table,
				     struct io_hash_bucket **out_bucket)
{
	struct io_kiocb *req;
	u32 index = hash_long(cd->data, table->hash_bits);
	struct io_hash_bucket *hb = &table->hbs[index];

	*out_bucket = NULL;

	spin_lock(&hb->lock);
	hlist_for_each_entry(req, &hb->list, hash_node) {
		if (cd->data != req->cqe.user_data)
			continue;
		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
			continue;
		if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
			if (io_cancel_match_sequence(req, cd->seq))
				continue;
		}
		*out_bucket = hb;
		return req;
	}
	spin_unlock(&hb->lock);
	return NULL;
}

static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
					  struct io_cancel_data *cd,
					  struct io_hash_table *table,
					  struct io_hash_bucket **out_bucket)
{
	unsigned nr_buckets = 1U << table->hash_bits;
	struct io_kiocb *req;
	int i;

	*out_bucket = NULL;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &table->hbs[i];

		spin_lock(&hb->lock);
		hlist_for_each_entry(req, &hb->list, hash_node) {
			if (io_cancel_req_match(req, cd)) {
				*out_bucket = hb;
				return req;
			}
		}
		spin_unlock(&hb->lock);
	}
	return NULL;
}
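
/* take ownership of a found poll request and detach it from its waitqueues and the hash */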
static int io_poll_disarm(struct io_kiocb *req)
{
	if (!req)
		return -ENOENT;
	if (!io_poll_get_ownership(req))
		return -EALREADY;
	io_poll_remove_entries(req);
	hash_del(&req->hash_node);
	return 0;
}
static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
			    struct io_hash_table *table)
{
	struct io_hash_bucket *bucket;
	struct io_kiocb *req;

	if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
			 IORING_ASYNC_CANCEL_ANY))
		req = io_poll_file_find(ctx, cd, table, &bucket);
	else
		req = io_poll_find(ctx, false, cd, table, &bucket);

	if (req)
		io_poll_cancel_req(req);
	if (bucket)
		spin_unlock(&bucket->lock);
	return req ? 0 : -ENOENT;
}

int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
		   unsigned issue_flags)
{
	int ret;

	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
	if (ret != -ENOENT)
		return ret;

	io_ring_submit_lock(ctx, issue_flags);
	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
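
/* translate the SQE's poll32_events into an internal __poll_t mask, applying oneshot/edge defaults */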
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;

	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;
	if (!(flags & IORING_POLL_ADD_LEVEL))
		events |= EPOLLET;
	return demangle_poll(events) |
		(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
}
int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update);
	u32 flags;

	if (sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
		      IORING_POLL_ADD_MULTI))
		return -EINVAL;
	/* meaningless without update */
	if (flags == IORING_POLL_ADD_MULTI)
		return -EINVAL;

	upd->old_user_data = READ_ONCE(sqe->addr);
	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;

	upd->new_user_data = READ_ONCE(sqe->off);
	if (!upd->update_user_data && upd->new_user_data)
		return -EINVAL;
	if (upd->update_events)
		upd->events = io_poll_parse_events(sqe, flags);
	else if (sqe->poll32_events)
		return -EINVAL;

	return 0;
}

int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	u32 flags;

	if (sqe->buf_index || sqe->off || sqe->addr)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~IORING_POLL_ADD_MULTI)
		return -EINVAL;
	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
		return -EINVAL;

	poll->events = io_poll_parse_events(sqe, flags);
	return 0;
}

int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	struct io_poll_table ipt;
	int ret;

	ipt.pt._qproc = io_poll_queue_proc;

	/*
	 * If sqpoll or single issuer, there is no contention for ->uring_lock
	 * and we'll end up holding it in tw handlers anyway.
	 */
	if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
		req->flags |= REQ_F_HASH_LOCKED;

	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
	if (ret > 0) {
		io_req_set_res(req, ipt.result_mask, 0);
		return IOU_OK;
	}
	return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}
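
/*
 * Poll update/removal: find the original poll request by user_data, then
 * either cancel it or update its events/user_data and re-arm it.
 */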
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
	struct io_hash_bucket *bucket;
	struct io_kiocb *preq;
	int ret2, ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
	ret2 = io_poll_disarm(preq);
	if (bucket)
		spin_unlock(&bucket->lock);
	if (!ret2)
		goto found;
	if (ret2 != -ENOENT) {
		ret = ret2;
		goto out;
	}

	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
	ret2 = io_poll_disarm(preq);
	if (bucket)
		spin_unlock(&bucket->lock);
	if (ret2) {
		ret = ret2;
		goto out;
	}

found:
	if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
		ret = -EFAULT;
		goto out;
	}

	if (poll_update->update_events || poll_update->update_user_data) {
		/* only replace the event mask bits, keep the behavior flags */
		if (poll_update->update_events) {
			struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll);

			poll->events &= ~0xffff;
			poll->events |= poll_update->events & 0xffff;
			poll->events |= IO_POLL_UNMASK;
		}
		if (poll_update->update_user_data)
			preq->cqe.user_data = poll_update->new_user_data;

		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
		/* successfully updated, don't complete poll request */
		if (!ret2 || ret2 == -EIOCBQUEUED)
			goto out;
	}

	req_set_fail(preq);
	io_req_set_res(preq, -ECANCELED, 0);
	preq->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(preq);
out:
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0) {
		req_set_fail(req);
		return ret;
	}
	/* complete update request, we're done with it */
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}