rdma.c

/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;
	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;
	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;
	struct nvmet_req req;
	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;
	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct ib_qp *qp;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;
	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;
	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;
	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;
	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
	int inline_data_size;
	int inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

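/*
 * Grab a pre-allocated rsp from the queue's free list. If the list is
 * empty, fall back to a dynamic allocation and flag it via rsp->allocated
 * so that nvmet_rdma_put_rsp() frees it instead of returning it to the
 * free list.
 */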
static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
				struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		int ret;

		rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
		if (unlikely(ret)) {
			kfree(rsp);
			return NULL;
		}

		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (unlikely(rsp->allocated)) {
		nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}
	return 0;

out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

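/*
 * Set up one RDMA RECV command buffer: a DMA-mapped SGE for the 64-byte
 * NVMe command capsule plus, for I/O queues, the pre-mapped inline data
 * pages in c->sge[1..].
 */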
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

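/* Re-arm a command buffer on the SRQ or the per-queue RQ for the next RECV. */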
static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

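/*
 * Retry commands that were parked because the send queue ran out of work
 * request slots. Stop as soon as one of them still does not fit, so that
 * ordering is preserved.
 */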
static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}

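/*
 * Tear down a completed request: return the send queue slots it consumed,
 * release its RDMA R/W context and data SGL, kick any commands waiting for
 * send queue space, and put the rsp back on the free list.
 */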
static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		sgl_free(rsp->req.sg);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * we didn't set up the controller yet in case
		 * of an admin connect error, so just disconnect and
		 * clean up the queue
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

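/*
 * Transport ->queue_response hook: post the NVMe completion as an RDMA SEND
 * (with remote invalidation if the host asked for it), chaining any RDMA
 * WRITE work requests for Data-Out ahead of it.
 */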
static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

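/*
 * Completion handler for the RDMA READ that pulled host data for a write
 * command; release the R/W context and, on success, hand the request to
 * the core for execution.
 */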
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd))
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

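/*
 * Map a keyed SGL descriptor: allocate a local SGL for the transfer and set
 * up an rdma_rw context against the host's remote address/rkey. When the
 * host allows it, remember the rkey so the final SEND can carry a remote
 * invalidate.
 */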
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 len = get_unaligned_le24(sgl->length);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	/* no data command? */
	if (!len)
		return 0;

	rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
	if (!rsp->req.sg)
		return NVME_SC_INTERNAL;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		return NVME_SC_INTERNAL;
	rsp->req.transfer_len += len;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

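/*
 * Try to start a command. Returns false (without executing) if the send
 * queue does not currently have room for the completion plus any RDMA
 * READ/WRITE work requests; the caller then parks the command on
 * rsp_wr_wait_list.
 */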
static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

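/*
 * RECV completion: a new command capsule arrived. Pair it with a rsp and
 * dispatch it, or queue it on rsp_wait_list while the queue is still
 * CONNECTING.
 */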
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

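/*
 * Optionally (use_srq=Y) create one shared receive queue per device so all
 * queues on that device share a single pool of pre-posted command buffers.
 * Falls back to per-queue receive queues if the device has no SRQ support.
 */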
static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */
	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

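/*
 * Look up (or lazily create) the per-IB-device state for an incoming
 * connection, clamping the port's inline_data_size to what the device's
 * SGE limits can actually support.
 */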
static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

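/*
 * Allocate the completion queue and RC queue pair backing an nvmet queue,
 * then pre-post the RECV buffers when no shared receive queue is in use.
 */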
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}
	queue->qp = queue->cm_id->qp;

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	ib_drain_qp(queue->qp);
	if (queue->cm_id)
		rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(queue->qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);
	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);

	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

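/*
 * Build a new nvmet_rdma_queue for an incoming CONNECT request: parse the
 * NVMe/RDMA private data, allocate rsps (and cmds when not using an SRQ),
 * and create the CQ/QP. On failure the connection is rejected with an
 * NVMe/RDMA CM status code.
 */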
static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

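/*
 * RDMA CM CONNECT_REQUEST handler: take a reference on the device, build
 * the queue, accept the connection, and track the queue in the global list.
 */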
static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		/*
		 * Don't destroy the cm_id in free path, as we implicitly
		 * destroy the cm_id here with non-zero ret code.
		 */
		queue->cm_id = NULL;
		goto free_queue;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

free_queue:
	nvmet_rdma_free_queue(queue);
put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. we have registered
		 * an ib_client to handle queues removal
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}

static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

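/*
 * Transport ->add_port hook: validate the address family and inline data
 * size, then bind an rdma_cm listener to the configured traddr/trsvcid.
 */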
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner = THIS_MODULE,
	.type = NVMF_TRTYPE_RDMA,
	.msdbd = 1,
	.has_keyed_sgls = 1,
	.add_port = nvmet_rdma_add_port,
	.remove_port = nvmet_rdma_remove_port,
	.queue_response = nvmet_rdma_queue_response,
	.delete_ctrl = nvmet_rdma_delete_ctrl,
	.disc_traddr = nvmet_rdma_disc_port_addr,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name = "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */