svc_rdma_rw.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2016-2018 Oracle. All rights reserved.
 *
 * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
 */

#include <rdma/rw.h>

#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include <linux/sunrpc/debug.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc);
static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);

/* Each R/W context contains state for one chain of RDMA Read or
 * Write Work Requests.
 *
 * Each WR chain handles a single contiguous server-side buffer,
 * because scatterlist entries after the first have to start on
 * page alignment. xdr_buf iovecs cannot guarantee alignment.
 *
 * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
 * from a client may contain a unique R_key, so each WR chain moves
 * up to one segment at a time.
 *
 * The scatterlist makes this data structure over 4KB in size. To
 * make it less likely to fail, and to handle the allocation for
 * smaller I/O requests without disabling bottom-halves, these
 * contexts are created on demand, but cached and reused until the
 * controlling svcxprt_rdma is destroyed.
 */
struct svc_rdma_rw_ctxt {
	struct list_head rw_list;
	struct rdma_rw_ctx rw_ctx;
	int rw_nents;
	struct sg_table rw_sg_table;
	struct scatterlist rw_first_sgl[0];
};

static inline struct svc_rdma_rw_ctxt *
svc_rdma_next_ctxt(struct list_head *list)
{
	return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
					rw_list);
}

static struct svc_rdma_rw_ctxt *
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
{
	struct svc_rdma_rw_ctxt *ctxt;

	spin_lock(&rdma->sc_rw_ctxt_lock);

	ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
	if (ctxt) {
		list_del(&ctxt->rw_list);
		spin_unlock(&rdma->sc_rw_ctxt_lock);
	} else {
		spin_unlock(&rdma->sc_rw_ctxt_lock);
		ctxt = kmalloc(sizeof(*ctxt) +
			       SG_CHUNK_SIZE * sizeof(struct scatterlist),
			       GFP_KERNEL);
		if (!ctxt)
			goto out;
		INIT_LIST_HEAD(&ctxt->rw_list);
	}

	ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
	if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
				   ctxt->rw_sg_table.sgl)) {
		kfree(ctxt);
		ctxt = NULL;
	}
out:
	return ctxt;
}

static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
				 struct svc_rdma_rw_ctxt *ctxt)
{
	sg_free_table_chained(&ctxt->rw_sg_table, true);

	spin_lock(&rdma->sc_rw_ctxt_lock);
	list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
	spin_unlock(&rdma->sc_rw_ctxt_lock);
}
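
/* Illustrative note (not part of the original source): a typical R/W
 * context lifecycle, as used by the code in this file, is roughly:
 *
 *	ctxt = svc_rdma_get_rw_ctxt(rdma, sges);  // pop from cache or kmalloc
 *	...fill ctxt->rw_sg_table.sgl, set ctxt->rw_nents...
 *	rdma_rw_ctx_init(&ctxt->rw_ctx, ...);     // DMA-map the SGL
 *	...post the WR chain...
 *	rdma_rw_ctx_destroy(&ctxt->rw_ctx, ...);  // on completion or teardown
 *	svc_rdma_put_rw_ctxt(rdma, ctxt);         // back onto sc_rw_ctxts
 */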

/**
 * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
 * @rdma: transport about to be destroyed
 *
 */
void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_rw_ctxt *ctxt;

	while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
		list_del(&ctxt->rw_list);
		kfree(ctxt);
	}
}

/* A chunk context tracks all I/O for moving one Read or Write
 * chunk. This is a set of rdma_rw's that handle data movement
 * for all segments of one chunk.
 *
 * These are small, acquired with a single allocator call, and
 * no more than one is needed per chunk. They are allocated on
 * demand, and not cached.
 */
struct svc_rdma_chunk_ctxt {
	struct ib_cqe cc_cqe;
	struct svcxprt_rdma *cc_rdma;
	struct list_head cc_rwctxts;
	int cc_sqecount;
};

static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
			     struct svc_rdma_chunk_ctxt *cc)
{
	cc->cc_rdma = rdma;
	svc_xprt_get(&rdma->sc_xprt);

	INIT_LIST_HEAD(&cc->cc_rwctxts);
	cc->cc_sqecount = 0;
}

static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
				enum dma_data_direction dir)
{
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_rw_ctxt *ctxt;

	while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
		list_del(&ctxt->rw_list);

		rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
				    rdma->sc_port_num, ctxt->rw_sg_table.sgl,
				    ctxt->rw_nents, dir);
		svc_rdma_put_rw_ctxt(rdma, ctxt);
	}
	svc_xprt_put(&rdma->sc_xprt);
}

/* State for sending a Write or Reply chunk.
 * - Tracks progress of writing one chunk over all its segments
 * - Stores arguments for the SGL constructor functions
 */
struct svc_rdma_write_info {
	/* write state of this chunk */
	unsigned int wi_seg_off;
	unsigned int wi_seg_no;
	unsigned int wi_nsegs;
	__be32 *wi_segs;

	/* SGL constructor arguments */
	struct xdr_buf *wi_xdr;
	unsigned char *wi_base;
	unsigned int wi_next_off;

	struct svc_rdma_chunk_ctxt wi_cc;
};

static struct svc_rdma_write_info *
svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
{
	struct svc_rdma_write_info *info;

	info = kmalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return info;

	info->wi_seg_off = 0;
	info->wi_seg_no = 0;
	info->wi_nsegs = be32_to_cpup(++chunk);
	info->wi_segs = ++chunk;
	svc_rdma_cc_init(rdma, &info->wi_cc);
	info->wi_cc.cc_cqe.done = svc_rdma_write_done;
	return info;
}
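
/* Illustrative note (not part of the original source): @chunk above points
 * at the XDR word that precedes the chunk's segment count, so the two
 * pre-increments step to the count and then to the first segment. On the
 * wire a Write or Reply chunk looks roughly like this (see RFC 8166):
 *
 *	<chunk present>            - 1 word, where @chunk points
 *	<segment count>            - 1 word, saved in wi_nsegs
 *	<handle, length, offset>   - 4 words per segment, starting at wi_segs
 *	...
 */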

static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
{
	svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE);
	kfree(info);
}

/**
 * svc_rdma_write_done - Write chunk completion
 * @cq: controlling Completion Queue
 * @wc: Work Completion
 *
 * Pages under I/O are freed by a subsequent Send completion.
 */
static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct svc_rdma_chunk_ctxt *cc =
			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_write_info *info =
			container_of(cc, struct svc_rdma_write_info, wi_cc);

	trace_svcrdma_wc_write(wc);

	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
			       ib_wc_status_msg(wc->status),
			       wc->status, wc->vendor_err);
	}

	svc_rdma_write_info_free(info);
}

/* State for pulling a Read chunk.
 */
struct svc_rdma_read_info {
	struct svc_rdma_recv_ctxt *ri_readctxt;
	unsigned int ri_position;
	unsigned int ri_pageno;
	unsigned int ri_pageoff;
	unsigned int ri_chunklen;

	struct svc_rdma_chunk_ctxt ri_cc;
};

static struct svc_rdma_read_info *
svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
{
	struct svc_rdma_read_info *info;

	info = kmalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return info;

	svc_rdma_cc_init(rdma, &info->ri_cc);
	info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done;
	return info;
}

static void svc_rdma_read_info_free(struct svc_rdma_read_info *info)
{
	svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE);
	kfree(info);
}

/**
 * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx
 * @cq: controlling Completion Queue
 * @wc: Work Completion
 *
 */
static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ib_cqe *cqe = wc->wr_cqe;
	struct svc_rdma_chunk_ctxt *cc =
			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_read_info *info =
			container_of(cc, struct svc_rdma_read_info, ri_cc);

	trace_svcrdma_wc_read(wc);

	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
		if (wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("svcrdma: read ctx: %s (%u/0x%x)\n",
			       ib_wc_status_msg(wc->status),
			       wc->status, wc->vendor_err);
		svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
	} else {
		spin_lock(&rdma->sc_rq_dto_lock);
		list_add_tail(&info->ri_readctxt->rc_list,
			      &rdma->sc_read_complete_q);
		spin_unlock(&rdma->sc_rq_dto_lock);

		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
		svc_xprt_enqueue(&rdma->sc_xprt);
	}

	svc_rdma_read_info_free(info);
}

/* This function sleeps when the transport's Send Queue is congested.
 *
 * Assumptions:
 * - If ib_post_send() succeeds, only one completion is expected,
 *   even if one or more WRs are flushed. This is true when posting
 *   an rdma_rw_ctx or when posting a single signaled WR.
 */
static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
{
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_xprt *xprt = &rdma->sc_xprt;
	struct ib_send_wr *first_wr;
	const struct ib_send_wr *bad_wr;
	struct list_head *tmp;
	struct ib_cqe *cqe;
	int ret;

	if (cc->cc_sqecount > rdma->sc_sq_depth)
		return -EINVAL;

	first_wr = NULL;
	cqe = &cc->cc_cqe;
	list_for_each(tmp, &cc->cc_rwctxts) {
		struct svc_rdma_rw_ctxt *ctxt;

		ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
		first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
					   rdma->sc_port_num, cqe, first_wr);
		cqe = NULL;
	}

	do {
		if (atomic_sub_return(cc->cc_sqecount,
				      &rdma->sc_sq_avail) > 0) {
			ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
			if (ret)
				break;
			return 0;
		}

		trace_svcrdma_sq_full(rdma);
		atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
		wait_event(rdma->sc_send_wait,
			   atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
		trace_svcrdma_sq_retry(rdma);
	} while (1);

	trace_svcrdma_sq_post_err(rdma, ret);
	set_bit(XPT_CLOSE, &xprt->xpt_flags);

	/* If even one was posted, there will be a completion. */
	if (bad_wr != first_wr)
		return 0;

	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
	wake_up(&rdma->sc_send_wait);
	return -ENOTCONN;
}
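
/* Illustrative note (not part of the original source): sc_sq_avail acts
 * as a credit counter for Send Queue entries. Posting reserves
 * cc_sqecount credits up front (atomic_sub_return); the completion
 * handlers give them back and wake waiters. For example, with
 * sc_sq_avail == 4 and cc_sqecount == 6, the subtraction yields -2, so
 * the credits are returned and the caller sleeps on sc_send_wait until
 * more than six entries are available, then retries.
 */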

/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
 */
static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
			       unsigned int len,
			       struct svc_rdma_rw_ctxt *ctxt)
{
	struct scatterlist *sg = ctxt->rw_sg_table.sgl;

	sg_set_buf(&sg[0], info->wi_base, len);
	info->wi_base += len;

	ctxt->rw_nents = 1;
}

/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
 */
static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
				    unsigned int remaining,
				    struct svc_rdma_rw_ctxt *ctxt)
{
	unsigned int sge_no, sge_bytes, page_off, page_no;
	struct xdr_buf *xdr = info->wi_xdr;
	struct scatterlist *sg;
	struct page **page;

	page_off = info->wi_next_off + xdr->page_base;
	page_no = page_off >> PAGE_SHIFT;
	page_off = offset_in_page(page_off);
	page = xdr->pages + page_no;
	info->wi_next_off += remaining;
	sg = ctxt->rw_sg_table.sgl;
	sge_no = 0;
	do {
		sge_bytes = min_t(unsigned int, remaining,
				  PAGE_SIZE - page_off);
		sg_set_page(sg, *page, sge_bytes, page_off);

		remaining -= sge_bytes;
		sg = sg_next(sg);
		page_off = 0;
		sge_no++;
		page++;
	} while (remaining);

	ctxt->rw_nents = sge_no;
}
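
/* Illustrative note (not part of the original source): assuming 4KB
 * pages, a 6000-byte portion that starts at byte 3000 of the first
 * pagelist page maps to three scatterlist entries:
 *
 *	sg[0]: 1096 bytes at page offset 3000
 *	sg[1]: 4096 bytes at page offset 0
 *	sg[2]:  808 bytes at page offset 0
 *
 * so ctxt->rw_nents is set to 3.
 */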

/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
 * an RPC Reply.
 */
static int
svc_rdma_build_writes(struct svc_rdma_write_info *info,
		      void (*constructor)(struct svc_rdma_write_info *info,
					  unsigned int len,
					  struct svc_rdma_rw_ctxt *ctxt),
		      unsigned int remaining)
{
	struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
	struct svcxprt_rdma *rdma = cc->cc_rdma;
	struct svc_rdma_rw_ctxt *ctxt;
	__be32 *seg;
	int ret;

	seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
	do {
		unsigned int write_len;
		u32 seg_length, seg_handle;
		u64 seg_offset;

		if (info->wi_seg_no >= info->wi_nsegs)
			goto out_overflow;

		seg_handle = be32_to_cpup(seg);
		seg_length = be32_to_cpup(seg + 1);
		xdr_decode_hyper(seg + 2, &seg_offset);
		seg_offset += info->wi_seg_off;

		write_len = min(remaining, seg_length - info->wi_seg_off);
		ctxt = svc_rdma_get_rw_ctxt(rdma,
					    (write_len >> PAGE_SHIFT) + 2);
		if (!ctxt)
			goto out_noctx;

		constructor(info, write_len, ctxt);
		ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
				       rdma->sc_port_num, ctxt->rw_sg_table.sgl,
				       ctxt->rw_nents, 0, seg_offset,
				       seg_handle, DMA_TO_DEVICE);
		if (ret < 0)
			goto out_initerr;

		trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset);
		list_add(&ctxt->rw_list, &cc->cc_rwctxts);
		cc->cc_sqecount += ret;

		if (write_len == seg_length - info->wi_seg_off) {
			seg += 4;
			info->wi_seg_no++;
			info->wi_seg_off = 0;
		} else {
			info->wi_seg_off += write_len;
		}

		remaining -= write_len;
	} while (remaining);

	return 0;

out_overflow:
	dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
		info->wi_nsegs);
	return -E2BIG;

out_noctx:
	dprintk("svcrdma: no R/W ctxs available\n");
	return -ENOMEM;

out_initerr:
	svc_rdma_put_rw_ctxt(rdma, ctxt);
	trace_svcrdma_dma_map_rwctx(rdma, ret);
	return -EIO;
}
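
/* Illustrative note (not part of the original source): the sge count
 * passed to svc_rdma_get_rw_ctxt() above, (write_len >> PAGE_SHIFT) + 2,
 * is a worst-case estimate for an unaligned buffer. For instance, with
 * 4KB pages, an 8192-byte write_len that starts mid-page can touch three
 * pages (three SGEs), which fits within (8192 >> 12) + 2 = 4.
 */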

/* Send one of an xdr_buf's kvecs by itself. To send a Reply
 * chunk, the whole RPC Reply is written back to the client.
 * This function writes either the head or tail of the xdr_buf
 * containing the Reply.
 */
static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
				  struct kvec *vec)
{
	info->wi_base = vec->iov_base;
	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
				     vec->iov_len);
}

/* Send an xdr_buf's page list by itself. A Write chunk is
 * just the page list. A Reply chunk is the head, page list,
 * and tail. This function is shared between the two types
 * of chunk.
 */
static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
				      struct xdr_buf *xdr)
{
	info->wi_xdr = xdr;
	info->wi_next_off = 0;
	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
				     xdr->page_len);
}

/**
 * svc_rdma_send_write_chunk - Write all segments in a Write chunk
 * @rdma: controlling RDMA transport
 * @wr_ch: Write chunk provided by client
 * @xdr: xdr_buf containing the data payload
 *
 * Returns a non-negative number of bytes the chunk consumed, or
 *	%-E2BIG if the payload was larger than the Write chunk,
 *	%-EINVAL if client provided too many segments,
 *	%-ENOMEM if rdma_rw context pool was exhausted,
 *	%-ENOTCONN if posting failed (connection is lost),
 *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
 */
int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
			      struct xdr_buf *xdr)
{
	struct svc_rdma_write_info *info;
	int ret;

	if (!xdr->page_len)
		return 0;

	info = svc_rdma_write_info_alloc(rdma, wr_ch);
	if (!info)
		return -ENOMEM;

	ret = svc_rdma_send_xdr_pagelist(info, xdr);
	if (ret < 0)
		goto out_err;

	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
	if (ret < 0)
		goto out_err;

	trace_svcrdma_encode_write(xdr->page_len);
	return xdr->page_len;

out_err:
	svc_rdma_write_info_free(info);
	return ret;
}
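
/* Illustrative note (not part of the original source): for a reply whose
 * payload sits in the xdr_buf page list, e.g. xdr->page_len == 12288,
 * this pushes three 4KB pages into the client's Write chunk and returns
 * 12288, which the send path can then report as the number of bytes
 * written when it echoes the Write list in the Reply header.
 */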

/**
 * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
 * @rdma: controlling RDMA transport
 * @rp_ch: Reply chunk provided by client
 * @writelist: true if client provided a Write list
 * @xdr: xdr_buf containing an RPC Reply
 *
 * Returns a non-negative number of bytes the chunk consumed, or
 *	%-E2BIG if the payload was larger than the Reply chunk,
 *	%-EINVAL if client provided too many segments,
 *	%-ENOMEM if rdma_rw context pool was exhausted,
 *	%-ENOTCONN if posting failed (connection is lost),
 *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
 */
int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
			      bool writelist, struct xdr_buf *xdr)
{
	struct svc_rdma_write_info *info;
	int consumed, ret;

	info = svc_rdma_write_info_alloc(rdma, rp_ch);
	if (!info)
		return -ENOMEM;

	ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
	if (ret < 0)
		goto out_err;
	consumed = xdr->head[0].iov_len;

	/* Send the page list in the Reply chunk only if the
	 * client did not provide Write chunks.
	 */
	if (!writelist && xdr->page_len) {
		ret = svc_rdma_send_xdr_pagelist(info, xdr);
		if (ret < 0)
			goto out_err;
		consumed += xdr->page_len;
	}

	if (xdr->tail[0].iov_len) {
		ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
		if (ret < 0)
			goto out_err;
		consumed += xdr->tail[0].iov_len;
	}

	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
	if (ret < 0)
		goto out_err;

	trace_svcrdma_encode_reply(consumed);
	return consumed;

out_err:
	svc_rdma_write_info_free(info);
	return ret;
}
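
/* Illustrative note (not part of the original source): with a 120-byte
 * head, an 8192-byte page list, and an 8-byte tail, the Reply chunk
 * consumes 120 + 8192 + 8 = 8320 bytes when the client provided no
 * Write list, but only 120 + 8 = 128 bytes when the page list already
 * went out via a Write chunk.
 */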

static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
				       struct svc_rqst *rqstp,
				       u32 rkey, u32 len, u64 offset)
{
	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
	struct svc_rdma_rw_ctxt *ctxt;
	unsigned int sge_no, seg_len;
	struct scatterlist *sg;
	int ret;

	sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
	ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
	if (!ctxt)
		goto out_noctx;
	ctxt->rw_nents = sge_no;

	sg = ctxt->rw_sg_table.sgl;
	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
		seg_len = min_t(unsigned int, len,
				PAGE_SIZE - info->ri_pageoff);

		head->rc_arg.pages[info->ri_pageno] =
			rqstp->rq_pages[info->ri_pageno];
		if (!info->ri_pageoff)
			head->rc_page_count++;

		sg_set_page(sg, rqstp->rq_pages[info->ri_pageno],
			    seg_len, info->ri_pageoff);
		sg = sg_next(sg);

		info->ri_pageoff += seg_len;
		if (info->ri_pageoff == PAGE_SIZE) {
			info->ri_pageno++;
			info->ri_pageoff = 0;
		}
		len -= seg_len;

		/* Safety check */
		if (len &&
		    &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end)
			goto out_overrun;
	}

	ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp,
			       cc->cc_rdma->sc_port_num,
			       ctxt->rw_sg_table.sgl, ctxt->rw_nents,
			       0, offset, rkey, DMA_FROM_DEVICE);
	if (ret < 0)
		goto out_initerr;

	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
	cc->cc_sqecount += ret;
	return 0;

out_noctx:
	dprintk("svcrdma: no R/W ctxs available\n");
	return -ENOMEM;

out_overrun:
	dprintk("svcrdma: request overruns rq_pages\n");
	return -EINVAL;

out_initerr:
	trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret);
	svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt);
	return -EIO;
}

/* Walk the segments in the Read chunk starting at @p and construct
 * RDMA Read operations to pull the chunk to the server.
 */
static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
				     struct svc_rdma_read_info *info,
				     __be32 *p)
{
	int ret;

	ret = -EINVAL;
	info->ri_chunklen = 0;
	while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
		u32 rs_handle, rs_length;
		u64 rs_offset;

		rs_handle = be32_to_cpup(p++);
		rs_length = be32_to_cpup(p++);
		p = xdr_decode_hyper(p, &rs_offset);

		ret = svc_rdma_build_read_segment(info, rqstp,
						  rs_handle, rs_length,
						  rs_offset);
		if (ret < 0)
			break;

		trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset);
		info->ri_chunklen += rs_length;
	}

	return ret;
}
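
/* Illustrative note (not part of the original source): each Read list
 * entry consumed by the loop above is laid out roughly as (see RFC 8166):
 *
 *	<entry present> <position> <handle> <length> <offset (8 bytes)>
 *
 * The walk stops at the list terminator (xdr_zero) or at the first
 * entry whose Position differs from info->ri_position.
 */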

/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
 * data lands in the page list of head->rc_arg.pages.
 *
 * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
 * Therefore, XDR round-up of the Read chunk and trailing
 * inline content must both be added at the end of the pagelist.
 */
static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
					    struct svc_rdma_read_info *info,
					    __be32 *p)
{
	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
	int ret;

	ret = svc_rdma_build_read_chunk(rqstp, info, p);
	if (ret < 0)
		goto out;

	trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position);

	head->rc_hdr_count = 0;

	/* Split the Receive buffer between the head and tail
	 * buffers at Read chunk's position. XDR roundup of the
	 * chunk is not included in either the pagelist or in
	 * the tail.
	 */
	head->rc_arg.tail[0].iov_base =
		head->rc_arg.head[0].iov_base + info->ri_position;
	head->rc_arg.tail[0].iov_len =
		head->rc_arg.head[0].iov_len - info->ri_position;
	head->rc_arg.head[0].iov_len = info->ri_position;

	/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
	 *
	 * If the client already rounded up the chunk length, the
	 * length does not change. Otherwise, the length of the page
	 * list is increased to include XDR round-up.
	 *
	 * Currently these chunks always start at page offset 0,
	 * thus the rounded-up length never crosses a page boundary.
	 */
	info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;

	head->rc_arg.page_len = info->ri_chunklen;
	head->rc_arg.len += info->ri_chunklen;
	head->rc_arg.buflen += info->ri_chunklen;

out:
	return ret;
}
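
/* Illustrative note (not part of the original source): suppose the Read
 * chunk carried 2043 bytes at Position 40 of a 100-byte inline message.
 * The inline buffer is split into a 40-byte head and a 60-byte tail,
 * and the chunk length is rounded up to the XDR boundary:
 * XDR_QUADLEN(2043) << 2 == 2044, which becomes rc_arg.page_len.
 */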

/* Construct RDMA Reads to pull over a Position Zero Read chunk.
 * The start of the data lands in the first page just after
 * the Transport header, and the rest lands in the page list of
 * head->rc_arg.pages.
 *
 * Assumptions:
 *	- A PZRC has an XDR-aligned length (no implicit round-up).
 *	- There can be no trailing inline content (IOW, we assume
 *	  a PZRC is never sent in an RDMA_MSG message, though it's
 *	  allowed by spec).
 */
static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
					struct svc_rdma_read_info *info,
					__be32 *p)
{
	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
	int ret;

	ret = svc_rdma_build_read_chunk(rqstp, info, p);
	if (ret < 0)
		goto out;

	trace_svcrdma_encode_pzr(info->ri_chunklen);

	head->rc_arg.len += info->ri_chunklen;
	head->rc_arg.buflen += info->ri_chunklen;

	head->rc_hdr_count = 1;
	head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
	head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
					     info->ri_chunklen);

	head->rc_arg.page_len = info->ri_chunklen -
				head->rc_arg.head[0].iov_len;

out:
	return ret;
}
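
/* Illustrative note (not part of the original source): assuming 4KB
 * pages, a 70000-byte Position Zero Read chunk ends up with a 4096-byte
 * rc_arg.head[0] (the first page) and rc_arg.page_len == 65904 spread
 * over the remaining pages.
 */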

/* Pages under I/O have been copied to head->rc_pages. Ensure they
 * are not released by svc_xprt_release() until the I/O is complete.
 *
 * This has to be done after all Read WRs are constructed to properly
 * handle a page that is part of I/O on behalf of two different RDMA
 * segments.
 *
 * Do this only if I/O has been posted. Otherwise, we do indeed want
 * svc_xprt_release() to clean things up properly.
 */
static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
				   const unsigned int start,
				   const unsigned int num_pages)
{
	unsigned int i;

	for (i = start; i < num_pages + start; i++)
		rqstp->rq_pages[i] = NULL;
}

/**
 * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
 * @rdma: controlling RDMA transport
 * @rqstp: set of pages to use as Read sink buffers
 * @head: pages under I/O collect here
 * @p: pointer to start of Read chunk
 *
 * Returns:
 *	%0 if all needed RDMA Reads were posted successfully,
 *	%-EINVAL if client provided too many segments,
 *	%-ENOMEM if rdma_rw context pool was exhausted,
 *	%-ENOTCONN if posting failed (connection is lost),
 *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
 *
 * Assumptions:
 * - All Read segments in @p have the same Position value.
 */
int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
			     struct svc_rdma_recv_ctxt *head, __be32 *p)
{
	struct svc_rdma_read_info *info;
	int ret;

	/* The request (with page list) is constructed in
	 * head->rc_arg. Pages involved with RDMA Read I/O are
	 * transferred there.
	 */
	head->rc_arg.head[0] = rqstp->rq_arg.head[0];
	head->rc_arg.tail[0] = rqstp->rq_arg.tail[0];
	head->rc_arg.pages = head->rc_pages;
	head->rc_arg.page_base = 0;
	head->rc_arg.page_len = 0;
	head->rc_arg.len = rqstp->rq_arg.len;
	head->rc_arg.buflen = rqstp->rq_arg.buflen;

	info = svc_rdma_read_info_alloc(rdma);
	if (!info)
		return -ENOMEM;
	info->ri_readctxt = head;
	info->ri_pageno = 0;
	info->ri_pageoff = 0;

	info->ri_position = be32_to_cpup(p + 1);
	if (info->ri_position)
		ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
	else
		ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
	if (ret < 0)
		goto out_err;

	ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
	if (ret < 0)
		goto out_err;

	svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
	return 0;

out_err:
	svc_rdma_read_info_free(info);
	return ret;
}