smc_ib.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * IB infrastructure:
 * Establish SMC-R as an Infiniband Client to be notified about added and
 * removed IB devices of type RDMA.
 * Determine device and port characteristics for these IB devices.
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <linux/scatterlist.h>
#include <linux/wait.h>
#include <linux/mutex.h>
#include <linux/inetdevice.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
#include "smc_netlink.h"

#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */

#define SMC_QP_MIN_RNR_TIMER		5
#define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
#define SMC_QP_RETRY_CNT		7 /* 7: infinite */
#define SMC_QP_RNR_RETRY		7 /* 7: infinite */

struct smc_ib_devices smc_ib_devices = {	/* smc-registered ib devices */
	.mutex = __MUTEX_INITIALIZER(smc_ib_devices.mutex),
	.list = LIST_HEAD_INIT(smc_ib_devices.list),
};

u8 local_systemid[SMC_SYSTEMID_LEN];	/* unique system identifier */

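/* transition a link's QP to the INIT state: set pkey index, port number
 * and RDMA access rights
 */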
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.pkey_index = 0;
	qp_attr.port_num = lnk->ibport;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
				| IB_ACCESS_REMOTE_WRITE;
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_PKEY_INDEX |
			    IB_QP_ACCESS_FLAGS | IB_QP_PORT);
}

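/* transition the QP from INIT to RTR (ready to receive): set path MTU,
 * address vector, destination QP number and starting receive PSN
 */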
static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
{
	enum ib_qp_attr_mask qp_attr_mask =
		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
	struct ib_qp_attr qp_attr;
	u8 hop_lim = 1;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTR;
	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		hop_lim = IPV6_DEFAULT_HOPLIMIT;
	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
		       sizeof(lnk->lgr->nexthop_mac));
	else
		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
		       sizeof(lnk->peer_mac));
	qp_attr.dest_qp_num = lnk->peer_qpn;
	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
					 * requests
					 */
	qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;

	return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
}

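/* transition the QP from RTR to RTS (ready to send): set timeouts, retry
 * counts and the starting send PSN
 */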
int smc_ib_modify_qp_rts(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_RTS;
	qp_attr.timeout = SMC_QP_TIMEOUT;	/* local ack timeout */
	qp_attr.retry_cnt = SMC_QP_RETRY_CNT;	/* retry count */
	qp_attr.rnr_retry = SMC_QP_RNR_RETRY;	/* RNR retries, 7=infinite */
	qp_attr.sq_psn = lnk->psn_initial;	/* starting send packet seq # */
	qp_attr.max_rd_atomic = 1;	/* # of outstanding RDMA reads and
					 * atomic ops allowed
					 */
	return ib_modify_qp(lnk->roce_qp, &qp_attr,
			    IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
			    IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
			    IB_QP_MAX_QP_RD_ATOMIC);
}

int smc_ib_modify_qp_error(struct smc_link *lnk)
{
	struct ib_qp_attr qp_attr;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_state = IB_QPS_ERR;
	return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
}

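/* bring a link's QP into operation: INIT -> RTR, arm the receive CQ and
 * post the initial receive buffers; the server side also moves the QP
 * on to RTS
 */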
int smc_ib_ready_link(struct smc_link *lnk)
{
	struct smc_link_group *lgr = smc_get_lgr(lnk);
	int rc = 0;

	rc = smc_ib_modify_qp_init(lnk);
	if (rc)
		goto out;

	rc = smc_ib_modify_qp_rtr(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);
	rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
			      IB_CQ_SOLICITED_MASK);
	if (rc)
		goto out;
	rc = smc_wr_rx_post_init(lnk);
	if (rc)
		goto out;
	smc_wr_remember_qp_attr(lnk);

	if (lgr->role == SMC_SERV) {
		rc = smc_ib_modify_qp_rts(lnk);
		if (rc)
			goto out;
		smc_wr_remember_qp_attr(lnk);
	}
out:
	return rc;
}

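/* read the MAC address belonging to GID index 0 of an IB port */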
static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	int rc;

	attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
	if (IS_ERR(attr))
		return -ENODEV;

	rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
	rdma_put_gid_attr(attr);
	return rc;
}

/* Create an identifier unique for this instance of SMC-R.
 * The MAC-address of the first active registered IB device
 * plus a random 2-byte number is used to create this identifier.
 * This name is delivered to the peer during connection initialization.
 */
static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
						u8 ibport)
{
	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
	       sizeof(smcibdev->mac[ibport - 1]));
}

bool smc_ib_is_valid_local_systemid(void)
{
	return !is_zero_ether_addr(&local_systemid[2]);
}

static void smc_ib_init_local_systemid(void)
{
	get_random_bytes(&local_systemid[0], 2);
}

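/* check whether a port is ACTIVE, based on the cached port attributes */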
bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
{
	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}

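/* find the IPv4 route from saddr to daddr and return the next hop MAC
 * address and whether the route goes via a gateway
 */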
int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr,
		      u8 nexthop_mac[], u8 *uses_gateway)
{
	struct neighbour *neigh = NULL;
	struct rtable *rt = NULL;
	struct flowi4 fl4 = {
		.saddr = saddr,
		.daddr = daddr
	};

	if (daddr == cpu_to_be32(INADDR_NONE))
		goto out;
	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		goto out;
	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
		goto out_rt;
	neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr);
	if (!neigh)
		goto out_rt;
	memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
	*uses_gateway = rt->rt_uses_gateway;
	neigh_release(neigh);
	ip_rt_put(rt);
	return 0;

out_rt:
	ip_rt_put(rt);
out:
	return -ENOENT;
}

static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
				    const struct ib_gid_attr *attr,
				    u8 gid[], u8 *sgid_index,
				    struct smc_init_info_smcrv2 *smcrv2)
{
	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
		struct in_device *in_dev = __in_dev_get_rcu(ndev);
		struct net *net = dev_net(ndev);
		const struct in_ifaddr *ifa;
		bool subnet_match = false;

		if (!in_dev)
			goto out;
		in_dev_for_each_ifa_rcu(ifa, in_dev) {
			if (!inet_ifa_match(smcrv2->saddr, ifa))
				continue;
			subnet_match = true;
			break;
		}
		if (!subnet_match)
			goto out;
		if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr,
						       smcrv2->daddr,
						       smcrv2->nexthop_mac,
						       &smcrv2->uses_gateway))
			goto out;

		if (gid)
			memcpy(gid, &attr->gid, SMC_GID_SIZE);
		if (sgid_index)
			*sgid_index = attr->index;
		return 0;
	}
out:
	return -ENODEV;
}

/* determine the gid for an ib-device port and vlan id */
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
			 struct smc_init_info_smcrv2 *smcrv2)
{
	const struct ib_gid_attr *attr;
	const struct net_device *ndev;
	int i;

	for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		ndev = rdma_read_gid_attr_ndev_rcu(attr);
		if (!IS_ERR(ndev) &&
		    ((!vlan_id && !is_vlan_dev(ndev)) ||
		     (vlan_id && is_vlan_dev(ndev) &&
		      vlan_dev_vlan_id(ndev) == vlan_id))) {
			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
						      sgid_index, smcrv2)) {
				rcu_read_unlock();
				rdma_put_gid_attr(attr);
				return 0;
			}
		}
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return -ENODEV;
}

/* check if gid is still defined on smcibdev */
static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
				  struct smc_ib_device *smcibdev, u8 ibport)
{
	const struct ib_gid_attr *attr;
	bool rc = false;
	int i;

	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
		if (IS_ERR(attr))
			continue;

		rcu_read_lock();
		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
		       & IPV6_ADDR_LINKLOCAL)))
			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
				rc = true;
		rcu_read_unlock();
		rdma_put_gid_attr(attr);
	}
	return rc;
}

/* check all links if the gid is still defined on smcibdev */
static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
{
	struct smc_link_group *lgr;
	int i;

	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
			    SMC_MAX_PNETID_LEN))
			continue; /* lgr is not affected */
		if (list_empty(&lgr->list))
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
						   lgr->smc_version == SMC_V2,
						   smcibdev, ibport))
				smcr_port_err(smcibdev, ibport);
		}
	}
	spin_unlock_bh(&smc_lgr_list.lock);
}

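/* refresh the cached port attributes and MAC address of an IB port and,
 * if not yet done, derive the local system identifier from it
 */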
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
	int rc;

	memset(&smcibdev->pattr[ibport - 1], 0,
	       sizeof(smcibdev->pattr[ibport - 1]));
	rc = ib_query_port(smcibdev->ibdev, ibport,
			   &smcibdev->pattr[ibport - 1]);
	if (rc)
		goto out;
	/* the SMC protocol requires specification of the RoCE MAC address */
	rc = smc_ib_fill_mac(smcibdev, ibport);
	if (rc)
		goto out;
	if (!smc_ib_is_valid_local_systemid() &&
	    smc_ib_port_active(smcibdev, ibport))
		/* create unique system identifier */
		smc_ib_define_local_systemid(smcibdev, ibport);
out:
	return rc;
}

/* process context wrapper for might_sleep smc_ib_remember_port_attr */
static void smc_ib_port_event_work(struct work_struct *work)
{
	struct smc_ib_device *smcibdev = container_of(
		work, struct smc_ib_device, port_event_work);
	u8 port_idx;

	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
		clear_bit(port_idx, &smcibdev->port_event_mask);
		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
			set_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_err(smcibdev, port_idx + 1);
		} else {
			clear_bit(port_idx, smcibdev->ports_going_away);
			smcr_port_add(smcibdev, port_idx + 1);
			smc_ib_gid_check(smcibdev, port_idx + 1);
		}
	}
}

/* can be called in IRQ context */
static void smc_ib_global_event_handler(struct ib_event_handler *handler,
					struct ib_event *ibevent)
{
	struct smc_ib_device *smcibdev;
	bool schedule = false;
	u8 port_idx;

	smcibdev = container_of(handler, struct smc_ib_device, event_handler);

	switch (ibevent->event) {
	case IB_EVENT_DEVICE_FATAL:
		/* terminate all ports on device */
		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
			set_bit(port_idx, &smcibdev->port_event_mask);
			if (!test_and_set_bit(port_idx,
					      smcibdev->ports_going_away))
				schedule = true;
		}
		if (schedule)
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ACTIVE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_PORT_ERR:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	case IB_EVENT_GID_CHANGE:
		port_idx = ibevent->element.port_num - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}

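/* allocate a protection domain for a link on its IB device */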
int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	int rc;

	lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
	if (IS_ERR(lnk->roce_pd))
		lnk->roce_pd = NULL;
	return rc;
}

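/* check whether the IB device carries a link of a link group without a
 * redundant path (SINGLE or locally asymmetric), i.e. losing the device
 * would be critical
 */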
static bool smcr_diag_is_dev_critical(struct smc_lgr_list *smc_lgr,
				      struct smc_ib_device *smcibdev)
{
	struct smc_link_group *lgr;
	bool rc = false;
	int i;

	spin_lock_bh(&smc_lgr->lock);
	list_for_each_entry(lgr, &smc_lgr->list, list) {
		if (lgr->is_smcd)
			continue;
		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
			    lgr->lnk[i].smcibdev != smcibdev)
				continue;
			if (lgr->type == SMC_LGR_SINGLE ||
			    lgr->type == SMC_LGR_ASYMMETRIC_LOCAL) {
				rc = true;
				goto out;
			}
		}
	}
out:
	spin_unlock_bh(&smc_lgr->lock);
	return rc;
}

static int smc_nl_handle_dev_port(struct sk_buff *skb,
				  struct ib_device *ibdev,
				  struct smc_ib_device *smcibdev,
				  int port)
{
	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
	struct nlattr *port_attrs;
	unsigned char port_state;
	int lnk_count = 0;

	port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT + port);
	if (!port_attrs)
		goto errout;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR,
		       smcibdev->pnetid_by_user[port]))
		goto errattr;
	memcpy(smc_pnet, &smcibdev->pnetid[port], SMC_MAX_PNETID_LEN);
	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
	if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet))
		goto errattr;
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_NETDEV,
			smcibdev->ndev_ifidx[port]))
		goto errattr;
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_VALID, 1))
		goto errattr;
	port_state = smc_ib_port_active(smcibdev, port + 1);
	if (nla_put_u8(skb, SMC_NLA_DEV_PORT_STATE, port_state))
		goto errattr;
	lnk_count = atomic_read(&smcibdev->lnk_cnt_by_port[port]);
	if (nla_put_u32(skb, SMC_NLA_DEV_PORT_LNK_CNT, lnk_count))
		goto errattr;
	nla_nest_end(skb, port_attrs);
	return 0;
errattr:
	nla_nest_cancel(skb, port_attrs);
errout:
	return -EMSGSIZE;
}

static bool smc_nl_handle_pci_values(const struct smc_pci_dev *smc_pci_dev,
				     struct sk_buff *skb)
{
	if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev->pci_fid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev->pci_pchid))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev->pci_vendor))
		return false;
	if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev->pci_device))
		return false;
	if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev->pci_id))
		return false;
	return true;
}

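/* fill one netlink message with the attributes of a single SMC-R IB device:
 * criticality, PCI values, device name and per-port information
 */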
static int smc_nl_handle_smcr_dev(struct smc_ib_device *smcibdev,
				  struct sk_buff *skb,
				  struct netlink_callback *cb)
{
	char smc_ibname[IB_DEVICE_NAME_MAX];
	struct smc_pci_dev smc_pci_dev;
	struct pci_dev *pci_dev;
	unsigned char is_crit;
	struct nlattr *attrs;
	void *nlh;
	int i;

	nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &smc_gen_nl_family, NLM_F_MULTI,
			  SMC_NETLINK_GET_DEV_SMCR);
	if (!nlh)
		goto errmsg;
	attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCR);
	if (!attrs)
		goto errout;
	is_crit = smcr_diag_is_dev_critical(&smc_lgr_list, smcibdev);
	if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, is_crit))
		goto errattr;
	if (smcibdev->ibdev->dev.parent) {
		memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
		pci_dev = to_pci_dev(smcibdev->ibdev->dev.parent);
		smc_set_pci_values(pci_dev, &smc_pci_dev);
		if (!smc_nl_handle_pci_values(&smc_pci_dev, skb))
			goto errattr;
	}
	snprintf(smc_ibname, sizeof(smc_ibname), "%s", smcibdev->ibdev->name);
	if (nla_put_string(skb, SMC_NLA_DEV_IB_NAME, smc_ibname))
		goto errattr;
	for (i = 1; i <= SMC_MAX_PORTS; i++) {
		if (!rdma_is_port_valid(smcibdev->ibdev, i))
			continue;
		if (smc_nl_handle_dev_port(skb, smcibdev->ibdev,
					   smcibdev, i - 1))
			goto errattr;
	}
	nla_nest_end(skb, attrs);
	genlmsg_end(skb, nlh);
	return 0;

errattr:
	nla_nest_cancel(skb, attrs);
errout:
	genlmsg_cancel(skb, nlh);
errmsg:
	return -EMSGSIZE;
}

static void smc_nl_prep_smcr_dev(struct smc_ib_devices *dev_list,
				 struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
	struct smc_ib_device *smcibdev;
	int snum = cb_ctx->pos[0];
	int num = 0;

	mutex_lock(&dev_list->mutex);
	list_for_each_entry(smcibdev, &dev_list->list, list) {
		if (num < snum)
			goto next;
		if (smc_nl_handle_smcr_dev(smcibdev, skb, cb))
			goto errout;
next:
		num++;
	}
errout:
	mutex_unlock(&dev_list->mutex);
	cb_ctx->pos[0] = num;
}

int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
{
	smc_nl_prep_smcr_dev(&smc_ib_devices, skb, cb);
	return skb->len;
}

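/* QP event handler: treat fatal QP errors like a port error and hand the
 * cleanup to the port event worker
 */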
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
	struct smc_link *lnk = (struct smc_link *)priv;
	struct smc_ib_device *smcibdev = lnk->smcibdev;
	u8 port_idx;

	switch (ibevent->event) {
	case IB_EVENT_QP_FATAL:
	case IB_EVENT_QP_ACCESS_ERR:
		port_idx = ibevent->element.qp->port - 1;
		if (port_idx >= SMC_MAX_PORTS)
			break;
		set_bit(port_idx, &smcibdev->port_event_mask);
		if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
			schedule_work(&smcibdev->port_event_work);
		break;
	default:
		break;
	}
}

void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}

/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = sges_per_buf,
			.max_inline_data = 0,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}

void smc_ib_put_memory_region(struct ib_mr *mr)
{
	ib_dereg_mr(mr);
}

static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx)
{
	unsigned int offset = 0;
	int sg_num;

	/* map the largest prefix of a dma mapped SG list */
	sg_num = ib_map_mr_sg(buf_slot->mr[link_idx],
			      buf_slot->sgt[link_idx].sgl,
			      buf_slot->sgt[link_idx].orig_nents,
			      &offset, PAGE_SIZE);
	return sg_num;
}

/* Allocate a memory region and map the dma mapped SG list of buf_slot */
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
			     struct smc_buf_desc *buf_slot, u8 link_idx)
{
	if (buf_slot->mr[link_idx])
		return 0; /* already done */

	buf_slot->mr[link_idx] =
		ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order);
	if (IS_ERR(buf_slot->mr[link_idx])) {
		int rc;

		rc = PTR_ERR(buf_slot->mr[link_idx]);
		buf_slot->mr[link_idx] = NULL;
		return rc;
	}

	if (smc_ib_map_mr_sg(buf_slot, link_idx) !=
	    buf_slot->sgt[link_idx].orig_nents)
		return -EINVAL;

	return 0;
}

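/* check whether any DMA address of the buffer's SG list requires explicit
 * cache syncing with the device
 */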
bool smc_ib_is_sg_need_sync(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot)
{
	struct scatterlist *sg;
	unsigned int i;
	bool ret = false;

	if (!lnk->smcibdev->ibdev->dma_device)
		return ret;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		if (dma_need_sync(lnk->smcibdev->ibdev->dma_device,
				  sg_dma_address(sg))) {
			ret = true;
			goto out;
		}
	}

out:
	return ret;
}

/* synchronize buffer usage for cpu access */
void smc_ib_sync_sg_for_cpu(struct smc_link *lnk,
			    struct smc_buf_desc *buf_slot,
			    enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev,
					   sg_dma_address(sg),
					   sg_dma_len(sg),
					   data_direction);
	}
}

/* synchronize buffer usage for device access */
void smc_ib_sync_sg_for_device(struct smc_link *lnk,
			       struct smc_buf_desc *buf_slot,
			       enum dma_data_direction data_direction)
{
	struct scatterlist *sg;
	unsigned int i;

	if (!(buf_slot->is_dma_need_sync & (1U << lnk->link_idx)))
		return;

	/* for now there is just one DMA address */
	for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg,
		    buf_slot->sgt[lnk->link_idx].nents, i) {
		if (!sg_dma_len(sg))
			break;
		ib_dma_sync_single_for_device(lnk->smcibdev->ibdev,
					      sg_dma_address(sg),
					      sg_dma_len(sg),
					      data_direction);
	}
}

/* Map a new TX or RX buffer SG-table to DMA */
int smc_ib_buf_map_sg(struct smc_link *lnk,
		      struct smc_buf_desc *buf_slot,
		      enum dma_data_direction data_direction)
{
	int mapped_nents;

	mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev,
				     buf_slot->sgt[lnk->link_idx].sgl,
				     buf_slot->sgt[lnk->link_idx].orig_nents,
				     data_direction);
	if (!mapped_nents)
		return -ENOMEM;

	return mapped_nents;
}

void smc_ib_buf_unmap_sg(struct smc_link *lnk,
			 struct smc_buf_desc *buf_slot,
			 enum dma_data_direction data_direction)
{
	if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address)
		return; /* already unmapped */

	ib_dma_unmap_sg(lnk->smcibdev->ibdev,
			buf_slot->sgt[lnk->link_idx].sgl,
			buf_slot->sgt[lnk->link_idx].orig_nents,
			data_direction);
	buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}

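/* set up the send and receive completion queues of an IB device and
 * register the device with the work request layer; done once per device
 */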
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
{
	struct ib_cq_init_attr cqattr =	{
		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
	int cqe_size_order, smc_order;
	long rc;

	mutex_lock(&smcibdev->mutex);
	rc = 0;
	if (smcibdev->initialized)
		goto out;
	/* the calculated number of cq entries fits to mlx5 cq allocation */
	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
	smc_order = MAX_PAGE_ORDER - cqe_size_order;
	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
					      smc_wr_tx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
	if (IS_ERR(smcibdev->roce_cq_send)) {
		smcibdev->roce_cq_send = NULL;
		goto out;
	}
	smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
					      smc_wr_rx_cq_handler, NULL,
					      smcibdev, &cqattr);
	rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
	if (IS_ERR(smcibdev->roce_cq_recv)) {
		smcibdev->roce_cq_recv = NULL;
		goto err;
	}
	smc_wr_add_dev(smcibdev);
	smcibdev->initialized = 1;
	goto out;

err:
	ib_destroy_cq(smcibdev->roce_cq_send);
out:
	mutex_unlock(&smcibdev->mutex);
	return rc;
}

static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
{
	mutex_lock(&smcibdev->mutex);
	if (!smcibdev->initialized)
		goto out;
	smcibdev->initialized = 0;
	ib_destroy_cq(smcibdev->roce_cq_recv);
	ib_destroy_cq(smcibdev->roce_cq_send);
	smc_wr_remove_dev(smcibdev);
out:
	mutex_unlock(&smcibdev->mutex);
}

static struct ib_client smc_ib_client;

static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
{
	struct ib_device *ibdev = smcibdev->ibdev;
	struct net_device *ndev;

	ndev = ib_device_get_netdev(ibdev, port + 1);
	if (ndev) {
		smcibdev->ndev_ifidx[port] = ndev->ifindex;
		dev_put(ndev);
	}
}

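/* update the cached netdev ifindex of all IB ports that map to the given
 * net_device when it is registered or unregistered
 */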
void smc_ib_ndev_change(struct net_device *ndev, unsigned long event)
{
	struct smc_ib_device *smcibdev;
	struct ib_device *libdev;
	struct net_device *lndev;
	u8 port_cnt;
	int i;

	mutex_lock(&smc_ib_devices.mutex);
	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
		port_cnt = smcibdev->ibdev->phys_port_cnt;
		for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) {
			libdev = smcibdev->ibdev;
			lndev = ib_device_get_netdev(libdev, i + 1);
			dev_put(lndev);
			if (lndev != ndev)
				continue;
			if (event == NETDEV_REGISTER)
				smcibdev->ndev_ifidx[i] = ndev->ifindex;
			if (event == NETDEV_UNREGISTER)
				smcibdev->ndev_ifidx[i] = 0;
		}
	}
	mutex_unlock(&smc_ib_devices.mutex);
}

/* callback function for ib_register_client() */
static int smc_ib_add_dev(struct ib_device *ibdev)
{
	struct smc_ib_device *smcibdev;
	u8 port_cnt;
	int i;

	if (ibdev->node_type != RDMA_NODE_IB_CA)
		return -EOPNOTSUPP;

	smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
	if (!smcibdev)
		return -ENOMEM;

	smcibdev->ibdev = ibdev;
	INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
	atomic_set(&smcibdev->lnk_cnt, 0);
	init_waitqueue_head(&smcibdev->lnks_deleted);
	mutex_init(&smcibdev->mutex);
	mutex_lock(&smc_ib_devices.mutex);
	list_add_tail(&smcibdev->list, &smc_ib_devices.list);
	mutex_unlock(&smc_ib_devices.mutex);
	ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
	INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
			      smc_ib_global_event_handler);
	ib_register_event_handler(&smcibdev->event_handler);

	/* trigger reading of the port attributes */
	port_cnt = smcibdev->ibdev->phys_port_cnt;
	pr_warn_ratelimited("smc: adding ib device %s with port count %d\n",
			    smcibdev->ibdev->name, port_cnt);
	for (i = 0;
	     i < min_t(size_t, port_cnt, SMC_MAX_PORTS);
	     i++) {
		set_bit(i, &smcibdev->port_event_mask);
		/* determine pnetids of the port */
		if (smc_pnetid_by_dev_port(ibdev->dev.parent, i,
					   smcibdev->pnetid[i]))
			smc_pnetid_by_table_ib(smcibdev, i + 1);
		smc_copy_netdev_ifindex(smcibdev, i);
		pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
				    "%.16s%s\n",
				    smcibdev->ibdev->name, i + 1,
				    smcibdev->pnetid[i],
				    smcibdev->pnetid_by_user[i] ?
					" (user defined)" :
					"");
	}
	schedule_work(&smcibdev->port_event_work);
	return 0;
}

/* callback function for ib_unregister_client() */
static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
{
	struct smc_ib_device *smcibdev = client_data;

	mutex_lock(&smc_ib_devices.mutex);
	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
	mutex_unlock(&smc_ib_devices.mutex);
	pr_warn_ratelimited("smc: removing ib device %s\n",
			    smcibdev->ibdev->name);
	smc_smcr_terminate_all(smcibdev);
	smc_ib_cleanup_per_ibdev(smcibdev);
	ib_unregister_event_handler(&smcibdev->event_handler);
	cancel_work_sync(&smcibdev->port_event_work);
	kfree(smcibdev);
}

static struct ib_client smc_ib_client = {
	.name	= "smc_ib",
	.add	= smc_ib_add_dev,
	.remove = smc_ib_remove_dev,
};

int __init smc_ib_register_client(void)
{
	smc_ib_init_local_systemid();
	return ib_register_client(&smc_ib_client);
}

void smc_ib_unregister_client(void)
{
	ib_unregister_client(&smc_ib_client);
}