// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Work Requests exploiting Infiniband API
 *
 * Work requests (WR) of type ib_post_send or ib_post_recv are submitted to
 * the RC send queue (SQ) or the RC receive queue (RQ), respectively
 * (reliably connected queues), and become work queue entries (WQEs).
 * While an SQ WR/WQE is pending, we track it until transmission completion.
 * Through the send or receive completion queue (CQ), we get completion
 * queue entries (CQEs) [aka work completions (WCs)].
 * Since the CQ callback is called from IRQ context, we split the work by
 * using bottom halves implemented as tasklets.
 *
 * SMC uses this to exchange LLC (link layer control)
 * and CDC (connection data control) messages.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
 */
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <linux/wait.h>
#include <rdma/ib_verbs.h>
#include <asm/div64.h>

#include "smc.h"
#include "smc_wr.h"

#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */

#define SMC_WR_RX_HASH_BITS 4
static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);

struct smc_wr_tx_pend {	/* control data for a pending send request */
	u64			wr_id;		/* work request id sent */
	smc_wr_tx_handler	handler;
	enum ib_wc_status	wc_status;	/* CQE status */
	struct smc_link		*link;
	u32			idx;
	struct smc_wr_tx_pend_priv priv;
	u8			compl_requested;
};

/******************************** send queue *********************************/

/*------------------------------- completion --------------------------------*/

/* returns true if at least one tx work request is pending on the given link */
static inline bool smc_wr_is_tx_pend(struct smc_link *link)
{
	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
}

/* wait till all pending tx work requests on the given link are completed */
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
}

static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
{
	u32 i;

	for (i = 0; i < link->wr_tx_cnt; i++) {
		if (link->wr_tx_pends[i].wr_id == wr_id)
			return i;
	}
	return link->wr_tx_cnt;
}

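/* Process one send completion: handle IB_WC_REG_MR completions for the
 * memory registration wait in smc_wr_reg_send(), otherwise look up the
 * pending tx slot (regular array slot or the SMC-Rv2 slot), record the
 * completion status, free the slot, and call the registered tx handler.
 */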
static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;

	link = wc->qp->qp_context;

	if (wc->opcode == IB_WC_REG_MR) {
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return;
	}

	if (wc->status) {
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	if (pnd_snd.handler)
		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	wake_up(&link->wr_tx_wait);
}

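/* Bottom half (tasklet) for the send CQ: drain up to SMC_WR_MAX_POLL_CQE
 * completions per ib_poll_cq() call, re-arm the CQ on the first pass, and
 * poll once more afterwards to catch completions that raced with the re-arm.
 */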
static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int i = 0, rc;
	int polled = 0;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_send,
					 IB_CQ_NEXT_COMP |
					 IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		for (i = 0; i < rc; i++)
			smc_wr_tx_process_cqe(&wc[i]);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

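/* send CQ notification callback; runs in IRQ context and only schedules
 * the send tasklet
 */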
void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->send_tasklet);
}

/*---------------------------- request submission ---------------------------*/

static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
	*idx = link->wr_tx_cnt;
	if (!smc_link_sendable(link))
		return -ENOLINK;
	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
		if (!test_and_set_bit(*idx, link->wr_tx_mask))
			return 0;
	}
	*idx = link->wr_tx_cnt;
	return -EBUSY;
}

/**
 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
 *			and sets info for pending transmit tracking
 * @link:		Pointer to smc_link used to later send the message.
 * @handler:		Send completion handler function pointer.
 * @wr_buf:		Out value returns pointer to message buffer.
 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
 * @wr_pend_priv:	Out value returns pointer serving as handler context.
 *
 * Return: 0 on success, or -errno on error.
 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_sendable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}

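/* Typical caller sequence for the slot API above (a simplified sketch, not a
 * verbatim copy of any caller; "handler" stands for the caller's completion
 * callback):
 *
 *	struct smc_wr_tx_pend_priv *pend;
 *	struct smc_wr_buf *wr_buf;
 *	int rc;
 *
 *	rc = smc_wr_tx_get_free_slot(link, handler, &wr_buf, NULL, &pend);
 *	if (rc)
 *		return rc;
 *	// ... assemble the message in *wr_buf ...
 *	rc = smc_wr_tx_send(link, pend);
 *	// or smc_wr_tx_put_slot(link, pend) to abandon the slot
 */

/* reserve the link group's single large SMC-Rv2 send buffer, used for
 * messages larger than SMC_WR_TX_SIZE
 */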
int smc_wr_tx_get_v2_slot(struct smc_link *link,
			  smc_wr_tx_handler handler,
			  struct smc_wr_v2_buf **wr_buf,
			  struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_wr_tx_pend *wr_pend;
	struct ib_send_wr *wr_ib;
	u64 wr_id;

	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
		return -EBUSY;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = link->wr_tx_v2_pend;
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = link->wr_tx_cnt;
	wr_ib = link->wr_tx_v2_ib;
	wr_ib->wr_id = wr_id;
	*wr_buf = link->lgr->wr_tx_buf_v2;
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}

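/* Return a reserved tx slot without transmitting it, e.g. when
 * ib_post_send() failed. Returns 1 if a regular or v2 slot was freed,
 * 0 otherwise.
 */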
int smc_wr_tx_put_slot(struct smc_link *link,
		       struct smc_wr_tx_pend_priv *wr_pend_priv)
{
	struct smc_wr_tx_pend *pend;

	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
	if (pend->idx < link->wr_tx_cnt) {
		u32 idx = pend->idx;

		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[idx], 0,
		       sizeof(link->wr_tx_pends[idx]));
		memset(&link->wr_tx_bufs[idx], 0,
		       sizeof(link->wr_tx_bufs[idx]));
		test_and_clear_bit(idx, link->wr_tx_mask);
		wake_up(&link->wr_tx_wait);
		return 1;
	} else if (link->lgr->smc_version == SMC_V2 &&
		   pend->idx == link->wr_tx_cnt) {
		/* Large v2 buffer */
		memset(&link->wr_tx_v2_pend, 0,
		       sizeof(link->wr_tx_v2_pend));
		memset(&link->lgr->wr_tx_buf_v2, 0,
		       sizeof(link->lgr->wr_tx_buf_v2));
		return 1;
	}
	return 0;
}

/* Send prepared WR slot via ib_post_send.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
{
	struct smc_wr_tx_pend *pend;
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

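/* Post the prepared large SMC-Rv2 send WR via ib_post_send; only the first
 * len bytes of the link group's v2 send buffer are transmitted.
 */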
int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
		      int len)
{
	int rc;

	link->wr_tx_v2_ib->sg_list[0].length = len;
	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
	if (rc) {
		smc_wr_tx_put_slot(link, priv);
		smcr_link_down_cond_sched(link);
	}
	return rc;
}

/* Send prepared WR slot via ib_post_send and wait for send completion
 * notification.
 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
 */
int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
			unsigned long timeout)
{
	struct smc_wr_tx_pend *pend;
	u32 pnd_idx;
	int rc;

	pend = container_of(priv, struct smc_wr_tx_pend, priv);
	pend->compl_requested = 1;
	pnd_idx = pend->idx;
	init_completion(&link->wr_tx_compl[pnd_idx]);
	rc = smc_wr_tx_send(link, priv);
	if (rc)
		return rc;
	/* wait for completion by smc_wr_tx_process_cqe() */
	rc = wait_for_completion_interruptible_timeout(
		&link->wr_tx_compl[pnd_idx], timeout);
	if (rc <= 0)
		rc = -ENODATA;
	if (rc > 0)
		rc = 0;
	return rc;
}

/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	link->wr_reg_state = POSTED;
	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
	link->wr_reg.mr = mr;
	link->wr_reg.key = mr->rkey;
	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
	if (rc)
		return rc;

	percpu_ref_get(&link->wr_reg_refs);
	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
					      (link->wr_reg_state != POSTED),
					      SMC_WR_REG_MR_WAIT_TIME);
	percpu_ref_put(&link->wr_reg_refs);
	if (!rc) {
		/* timeout - terminate link */
		smcr_link_down_cond_sched(link);
		return -EPIPE;
	}
	if (rc == -ERESTARTSYS)
		return -EINTR;
	switch (link->wr_reg_state) {
	case CONFIRMED:
		rc = 0;
		break;
	case FAILED:
		rc = -EIO;
		break;
	case POSTED:
		rc = -EPIPE;
		break;
	}
	return rc;
}

/****************************** receive queue ********************************/

int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
{
	struct smc_wr_rx_handler *h_iter;
	int rc = 0;

	spin_lock(&smc_wr_rx_hash_lock);
	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
		if (h_iter->type == handler->type) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}
	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
out_unlock:
	spin_unlock(&smc_wr_rx_hash_lock);
	return rc;
}

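/* Registration sketch (simplified): the CDC and LLC layers register their
 * receive handlers once, keyed by the message type (struct smc_wr_rx_hdr
 * type field). "MY_MSG_TYPE" and "my_rx_handler_fn" are placeholders, not
 * names from this file:
 *
 *	static struct smc_wr_rx_handler my_handler = {
 *		.handler	= my_rx_handler_fn,
 *		.type		= MY_MSG_TYPE,
 *	};
 *
 *	rc = smc_wr_rx_register_handler(&my_handler);
 */
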
/* Demultiplex a received work request based on the message type to its handler.
 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
 * and not being modified any more afterwards so we don't need to lock it.
 */
static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
{
	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
	struct smc_wr_rx_handler *handler;
	struct smc_wr_rx_hdr *wr_rx;
	u64 temp_wr_id;
	u32 index;

	if (wc->byte_len < sizeof(*wr_rx))
		return; /* short message */
	temp_wr_id = wc->wr_id;
	index = do_div(temp_wr_id, link->wr_rx_cnt);
	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
		if (handler->type == wr_rx->type)
			handler->handler(wc, wr_rx);
	}
}

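/* Process a batch of receive completions: on success, demultiplex the
 * message to its registered handler and repost the receive WR; on fatal
 * errors, schedule link-down handling.
 */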
static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
{
	struct smc_link *link;
	int i;

	for (i = 0; i < num; i++) {
		link = wc[i].qp->qp_context;
		link->wr_rx_id_compl = wc[i].wr_id;
		if (wc[i].status == IB_WC_SUCCESS) {
			link->wr_rx_tstamp = jiffies;
			smc_wr_rx_demultiplex(&wc[i]);
			smc_wr_rx_post(link); /* refill WR RX */
		} else {
			/* handle status errors */
			switch (wc[i].status) {
			case IB_WC_RETRY_EXC_ERR:
			case IB_WC_RNR_RETRY_EXC_ERR:
			case IB_WC_WR_FLUSH_ERR:
				smcr_link_down_cond_sched(link);
				if (link->wr_rx_id_compl == link->wr_rx_id)
					wake_up(&link->wr_rx_empty_wait);
				break;
			default:
				smc_wr_rx_post(link); /* refill WR RX */
				break;
			}
		}
	}
}

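/* Bottom half (tasklet) for the recv CQ; mirrors smc_wr_tx_tasklet_fn():
 * re-arm the CQ on the first poll pass, then poll again to close the race
 * with the re-arm.
 */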
static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int polled = 0;
	int rc;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_recv,
					 IB_CQ_SOLICITED_MASK
					 | IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		smc_wr_rx_process_cqes(&wc[0], rc);
	} while (rc > 0);
	if (polled == 1)
		goto again;
}

void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
{
	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;

	tasklet_schedule(&dev->recv_tasklet);
}

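/* post an initial receive WR for every receive buffer slot of the link */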
int smc_wr_rx_post_init(struct smc_link *link)
{
	u32 i;
	int rc = 0;

	for (i = 0; i < link->wr_rx_cnt; i++)
		rc = smc_wr_rx_post(link);
	return rc;
}

/***************************** init, exit, misc ******************************/

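/* query and cache the QP attributes of a link, and derive the usable
 * numbers of send and receive work requests from the QP capabilities
 */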
void smc_wr_remember_qp_attr(struct smc_link *lnk)
{
	struct ib_qp_attr *attr = &lnk->qp_attr;
	struct ib_qp_init_attr init_attr;

	memset(attr, 0, sizeof(*attr));
	memset(&init_attr, 0, sizeof(init_attr));
	ib_query_qp(lnk->roce_qp, attr,
		    IB_QP_STATE |
		    IB_QP_CUR_STATE |
		    IB_QP_PKEY_INDEX |
		    IB_QP_PORT |
		    IB_QP_QKEY |
		    IB_QP_AV |
		    IB_QP_PATH_MTU |
		    IB_QP_TIMEOUT |
		    IB_QP_RETRY_CNT |
		    IB_QP_RNR_RETRY |
		    IB_QP_RQ_PSN |
		    IB_QP_ALT_PATH |
		    IB_QP_MIN_RNR_TIMER |
		    IB_QP_SQ_PSN |
		    IB_QP_PATH_MIG_STATE |
		    IB_QP_CAP |
		    IB_QP_DEST_QPN,
		    &init_attr);

	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
			       lnk->qp_attr.cap.max_send_wr);
	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
			       lnk->qp_attr.cap.max_recv_wr);
}

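/* Initialize the scatter-gather elements and work requests of a link:
 * regular sends, the two RDMA writes per send slot, the optional SMC-Rv2
 * send, the receives (with v2 spillover sge), and memory registration.
 */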
static void smc_wr_init_sge(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
	u32 i;

	for (i = 0; i < lnk->wr_tx_cnt; i++) {
		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_ibs[i].next = NULL;
		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
		lnk->wr_tx_ibs[i].num_sge = 1;
		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
		lnk->wr_tx_ibs[i].send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		if (send_inline)
			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
	}

	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_v2_ib->next = NULL;
		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
		lnk->wr_tx_v2_ib->num_sge = 1;
		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
		lnk->wr_tx_v2_ib->send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
	}

	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
	 * Each ib_recv_wr gets 2 sges; the second one is a spillover buffer,
	 * and the same spillover buffer is shared by all receive WRs. When a
	 * larger message arrives, the content of the first small sge is
	 * copied to the beginning of the larger spillover buffer, allowing
	 * easy data mapping.
	 */
	for (i = 0; i < lnk->wr_rx_cnt; i++) {
		int x = i * sges_per_buf;

		lnk->wr_rx_sges[x].addr =
			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
		if (lnk->lgr->smc_version == SMC_V2) {
			lnk->wr_rx_sges[x + 1].addr =
				lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].length =
				SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].lkey =
				lnk->roce_pd->local_dma_lkey;
		}
		lnk->wr_rx_ibs[i].next = NULL;
		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
	}

	lnk->wr_reg.wr.next = NULL;
	lnk->wr_reg.wr.num_sge = 0;
	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
}

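/* Tear down the WR state of a link: wake up and wait for all pending work,
 * drop the tx and reg reference counters, and unmap the DMA buffers.
 */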
void smc_wr_free_link(struct smc_link *lnk)
{
	struct ib_device *ibdev;

	if (!lnk->smcibdev)
		return;

	ibdev = lnk->smcibdev->ibdev;

	smc_wr_drain_cq(lnk);
	smc_wr_wakeup_reg_wait(lnk);
	smc_wr_wakeup_tx_wait(lnk);
	smc_wr_tx_wait_no_pending_sends(lnk);

	percpu_ref_kill(&lnk->wr_reg_refs);
	wait_for_completion(&lnk->reg_ref_comp);
	percpu_ref_exit(&lnk->wr_reg_refs);
	percpu_ref_kill(&lnk->wr_tx_refs);
	wait_for_completion(&lnk->tx_ref_comp);
	percpu_ref_exit(&lnk->wr_tx_refs);

	if (lnk->wr_rx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_dma_addr = 0;
	}
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
				    DMA_TO_DEVICE);
		lnk->wr_tx_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
}

void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return;

	kfree(lgr->wr_rx_buf_v2);
	lgr->wr_rx_buf_v2 = NULL;
	kfree(lgr->wr_tx_buf_v2);
	lgr->wr_tx_buf_v2 = NULL;
}

void smc_wr_free_link_mem(struct smc_link *lnk)
{
	kfree(lnk->wr_tx_v2_ib);
	lnk->wr_tx_v2_ib = NULL;
	kfree(lnk->wr_tx_v2_sge);
	lnk->wr_tx_v2_sge = NULL;
	kfree(lnk->wr_tx_v2_pend);
	lnk->wr_tx_v2_pend = NULL;
	kfree(lnk->wr_tx_compl);
	lnk->wr_tx_compl = NULL;
	kfree(lnk->wr_tx_pends);
	lnk->wr_tx_pends = NULL;
	bitmap_free(lnk->wr_tx_mask);
	lnk->wr_tx_mask = NULL;
	kfree(lnk->wr_tx_sges);
	lnk->wr_tx_sges = NULL;
	kfree(lnk->wr_tx_rdma_sges);
	lnk->wr_tx_rdma_sges = NULL;
	kfree(lnk->wr_rx_sges);
	lnk->wr_rx_sges = NULL;
	kfree(lnk->wr_tx_rdmas);
	lnk->wr_tx_rdmas = NULL;
	kfree(lnk->wr_rx_ibs);
	lnk->wr_rx_ibs = NULL;
	kfree(lnk->wr_tx_ibs);
	lnk->wr_tx_ibs = NULL;
	kfree(lnk->wr_tx_bufs);
	lnk->wr_tx_bufs = NULL;
	kfree(lnk->wr_rx_bufs);
	lnk->wr_rx_bufs = NULL;
}

int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
{
	if (lgr->smc_version < SMC_V2)
		return 0;

	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_rx_buf_v2)
		return -ENOMEM;
	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
	if (!lgr->wr_tx_buf_v2) {
		kfree(lgr->wr_rx_buf_v2);
		return -ENOMEM;
	}
	return 0;
}

int smc_wr_alloc_link_mem(struct smc_link *link)
{
	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;

	/* allocate link related memory */
	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
	if (!link->wr_tx_bufs)
		goto no_mem;
	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
				   GFP_KERNEL);
	if (!link->wr_rx_bufs)
		goto no_mem_wr_tx_bufs;
	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_tx_ibs)
		goto no_mem_wr_rx_bufs;
	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
				  sizeof(link->wr_rx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_rx_ibs)
		goto no_mem_wr_tx_ibs;
	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_rdmas[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_rdmas)
		goto no_mem_wr_rx_ibs;
	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
					sizeof(link->wr_tx_rdma_sges[0]),
					GFP_KERNEL);
	if (!link->wr_tx_rdma_sges)
		goto no_mem_wr_tx_rdmas;
	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
				   GFP_KERNEL);
	if (!link->wr_tx_sges)
		goto no_mem_wr_tx_rdma_sges;
	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
				   GFP_KERNEL);
	if (!link->wr_rx_sges)
		goto no_mem_wr_tx_sges;
	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
	if (!link->wr_tx_mask)
		goto no_mem_wr_rx_sges;
	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_pends[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_pends)
		goto no_mem_wr_tx_mask;
	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_compl[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_compl)
		goto no_mem_wr_tx_pends;

	if (link->lgr->smc_version == SMC_V2) {
		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
					    GFP_KERNEL);
		if (!link->wr_tx_v2_ib)
			goto no_mem_tx_compl;
		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
					     GFP_KERNEL);
		if (!link->wr_tx_v2_sge)
			goto no_mem_v2_ib;
		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
					      GFP_KERNEL);
		if (!link->wr_tx_v2_pend)
			goto no_mem_v2_sge;
	}
	return 0;

no_mem_v2_sge:
	kfree(link->wr_tx_v2_sge);
no_mem_v2_ib:
	kfree(link->wr_tx_v2_ib);
no_mem_tx_compl:
	kfree(link->wr_tx_compl);
no_mem_wr_tx_pends:
	kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
	kfree(link->wr_tx_mask);
no_mem_wr_rx_sges:
	kfree(link->wr_rx_sges);
no_mem_wr_tx_sges:
	kfree(link->wr_tx_sges);
no_mem_wr_tx_rdma_sges:
	kfree(link->wr_tx_rdma_sges);
no_mem_wr_tx_rdmas:
	kfree(link->wr_tx_rdmas);
no_mem_wr_rx_ibs:
	kfree(link->wr_rx_ibs);
no_mem_wr_tx_ibs:
	kfree(link->wr_tx_ibs);
no_mem_wr_rx_bufs:
	kfree(link->wr_rx_bufs);
no_mem_wr_tx_bufs:
	kfree(link->wr_tx_bufs);
no_mem:
	return -ENOMEM;
}

void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}

void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}

static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
{
	struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);

	complete(&lnk->tx_ref_comp);
}

static void smcr_wr_reg_refs_free(struct percpu_ref *ref)
{
	struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs);

	complete(&lnk->reg_ref_comp);
}

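/* Prepare the WR infrastructure of a new link: map the rx/tx buffers for
 * DMA, set up the sges and work requests, and initialize the wait queues,
 * completions and percpu reference counters.
 */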
int smc_wr_create_link(struct smc_link *lnk)
{
	struct ib_device *ibdev = lnk->smcibdev->ibdev;
	int rc = 0;

	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
	lnk->wr_rx_id = 0;
	lnk->wr_rx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
		DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
		lnk->wr_rx_dma_addr = 0;
		rc = -EIO;
		goto out;
	}
	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
			lnk->wr_rx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_TO_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
			lnk->wr_tx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
	}
	lnk->wr_tx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
		DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
		rc = -EIO;
		goto dma_unmap;
	}
	smc_wr_init_sge(lnk);
	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
	init_waitqueue_head(&lnk->wr_tx_wait);
	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
	if (rc)
		goto dma_unmap;
	init_completion(&lnk->tx_ref_comp);
	init_waitqueue_head(&lnk->wr_reg_wait);
	rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL);
	if (rc)
		goto cancel_ref;
	init_completion(&lnk->reg_ref_comp);
	init_waitqueue_head(&lnk->wr_rx_empty_wait);
	return rc;

cancel_ref:
	percpu_ref_exit(&lnk->wr_tx_refs);
dma_unmap:
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
			    DMA_FROM_DEVICE);
	lnk->wr_rx_dma_addr = 0;
out:
	return rc;
}