smbdirect.c

  1. /*
  2. * Copyright (C) 2017, Microsoft Corporation.
  3. *
  4. * Author(s): Long Li <longli@microsoft.com>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
  14. * the GNU General Public License for more details.
  15. */
  16. #include <linux/module.h>
  17. #include <linux/highmem.h>
  18. #include "smbdirect.h"
  19. #include "cifs_debug.h"
  20. #include "cifsproto.h"
  21. #include "smb2proto.h"
  22. static struct smbd_response *get_empty_queue_buffer(
  23. struct smbd_connection *info);
  24. static struct smbd_response *get_receive_buffer(
  25. struct smbd_connection *info);
  26. static void put_receive_buffer(
  27. struct smbd_connection *info,
  28. struct smbd_response *response);
  29. static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
  30. static void destroy_receive_buffers(struct smbd_connection *info);
  31. static void put_empty_packet(
  32. struct smbd_connection *info, struct smbd_response *response);
  33. static void enqueue_reassembly(
  34. struct smbd_connection *info,
  35. struct smbd_response *response, int data_length);
  36. static struct smbd_response *_get_first_reassembly(
  37. struct smbd_connection *info);
  38. static int smbd_post_recv(
  39. struct smbd_connection *info,
  40. struct smbd_response *response);
  41. static int smbd_post_send_empty(struct smbd_connection *info);
  42. static int smbd_post_send_data(
  43. struct smbd_connection *info,
  44. struct kvec *iov, int n_vec, int remaining_data_length);
  45. static int smbd_post_send_page(struct smbd_connection *info,
  46. struct page *page, unsigned long offset,
  47. size_t size, int remaining_data_length);
  48. static void destroy_mr_list(struct smbd_connection *info);
  49. static int allocate_mr_list(struct smbd_connection *info);
  50. /* SMBD version number */
  51. #define SMBD_V1 0x0100
  52. /* Port numbers for SMBD transport */
  53. #define SMB_PORT 445
  54. #define SMBD_PORT 5445
  55. /* Address lookup and resolve timeout in ms */
  56. #define RDMA_RESOLVE_TIMEOUT 5000
  57. /* SMBD negotiation timeout in seconds */
  58. #define SMBD_NEGOTIATE_TIMEOUT 120
  59. /* SMBD minimum receive size and fragmented size as defined in [MS-SMBD] */
  60. #define SMBD_MIN_RECEIVE_SIZE 128
  61. #define SMBD_MIN_FRAGMENTED_SIZE 131072
  62. /*
  63. * Default maximum number of RDMA read/write outstanding on this connection
  64. * This value may be decreased during QP creation, based on hardware limits
  65. */
  66. #define SMBD_CM_RESPONDER_RESOURCES 32
  67. /* Maximum number of retries on data transfer operations */
  68. #define SMBD_CM_RETRY 6
  69. /* No need to retry on Receiver Not Ready since SMBD manages credits */
  70. #define SMBD_CM_RNR_RETRY 0
  71. /*
  72. * User configurable initial values per SMBD transport connection
  73. * as defined in [MS-SMBD] 3.1.1.1
  74. * Those may change after a SMBD negotiation
  75. */
  76. /* The local peer's maximum number of credits to grant to the peer */
  77. int smbd_receive_credit_max = 255;
  78. /* The remote peer's credit request of local peer */
  79. int smbd_send_credit_target = 255;
  80. /* The maximum single message size that can be sent to the remote peer */
  81. int smbd_max_send_size = 1364;
  82. /* The maximum fragmented upper-layer payload receive size supported */
  83. int smbd_max_fragmented_recv_size = 1024 * 1024;
  84. /* The maximum single-message size which can be received */
  85. int smbd_max_receive_size = 8192;
  86. /* The timeout (in seconds) before sending a keepalive message on an idle connection */
  87. int smbd_keep_alive_interval = 120;
  88. /*
  89. * User configurable initial values for RDMA transport
  90. * The actual values used may be lower and are limited to hardware capabilities
  91. */
  92. /* Default maximum number of SGEs in a RDMA write/read */
  93. int smbd_max_frmr_depth = 2048;
  94. /* If the payload is smaller than this many bytes, use RDMA send/recv instead of read/write */
  95. int rdma_readwrite_threshold = 4096;
  96. /* Transport logging functions
  97. * Logging is defined in classes. They can be OR'ed together to select which
  98. * classes are logged via the module parameter smbd_logging_class
  99. * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
  100. * log_rdma_event()
  101. */
  102. #define LOG_OUTGOING 0x1
  103. #define LOG_INCOMING 0x2
  104. #define LOG_READ 0x4
  105. #define LOG_WRITE 0x8
  106. #define LOG_RDMA_SEND 0x10
  107. #define LOG_RDMA_RECV 0x20
  108. #define LOG_KEEP_ALIVE 0x40
  109. #define LOG_RDMA_EVENT 0x80
  110. #define LOG_RDMA_MR 0x100
  111. static unsigned int smbd_logging_class;
  112. module_param(smbd_logging_class, uint, 0644);
  113. MODULE_PARM_DESC(smbd_logging_class,
  114. "Logging class for SMBD transport 0x0 to 0x100");
  115. #define ERR 0x0
  116. #define INFO 0x1
  117. static unsigned int smbd_logging_level = ERR;
  118. module_param(smbd_logging_level, uint, 0644);
  119. MODULE_PARM_DESC(smbd_logging_level,
  120. "Logging level for SMBD transport, 0 (default): error, 1: info");
  121. #define log_rdma(level, class, fmt, args...) \
  122. do { \
  123. if (level <= smbd_logging_level || class & smbd_logging_class) \
  124. cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
  125. } while (0)
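/*
 * A message is emitted when its level is at or below smbd_logging_level,
 * or when its class bit is set in smbd_logging_class. For example, setting
 * cifs.smbd_logging_class=0x30 (LOG_RDMA_SEND | LOG_RDMA_RECV) emits all
 * log_rdma_send() and log_rdma_recv() messages regardless of the logging
 * level.
 */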
  126. #define log_outgoing(level, fmt, args...) \
  127. log_rdma(level, LOG_OUTGOING, fmt, ##args)
  128. #define log_incoming(level, fmt, args...) \
  129. log_rdma(level, LOG_INCOMING, fmt, ##args)
  130. #define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args)
  131. #define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args)
  132. #define log_rdma_send(level, fmt, args...) \
  133. log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
  134. #define log_rdma_recv(level, fmt, args...) \
  135. log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
  136. #define log_keep_alive(level, fmt, args...) \
  137. log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
  138. #define log_rdma_event(level, fmt, args...) \
  139. log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
  140. #define log_rdma_mr(level, fmt, args...) \
  141. log_rdma(level, LOG_RDMA_MR, fmt, ##args)
  142. /*
  143. * Destroy the transport and related RDMA and memory resources
  144. * Need to go through all the pending counters and make sure no one is using
  145. * the transport while it is being destroyed
  146. */
  147. static void smbd_destroy_rdma_work(struct work_struct *work)
  148. {
  149. struct smbd_response *response;
  150. struct smbd_connection *info =
  151. container_of(work, struct smbd_connection, destroy_work);
  152. unsigned long flags;
  153. log_rdma_event(INFO, "destroying qp\n");
  154. ib_drain_qp(info->id->qp);
  155. rdma_destroy_qp(info->id);
  156. /* Unblock all I/O waiting on the send queue */
  157. wake_up_interruptible_all(&info->wait_send_queue);
  158. log_rdma_event(INFO, "cancelling idle timer\n");
  159. cancel_delayed_work_sync(&info->idle_timer_work);
  160. log_rdma_event(INFO, "cancelling send immediate work\n");
  161. cancel_delayed_work_sync(&info->send_immediate_work);
  162. log_rdma_event(INFO, "wait for all send to finish\n");
  163. wait_event(info->wait_smbd_send_pending,
  164. info->smbd_send_pending == 0);
  165. log_rdma_event(INFO, "wait for all recv to finish\n");
  166. wake_up_interruptible(&info->wait_reassembly_queue);
  167. wait_event(info->wait_smbd_recv_pending,
  168. info->smbd_recv_pending == 0);
  169. log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
  170. wait_event(info->wait_send_pending,
  171. atomic_read(&info->send_pending) == 0);
  172. wait_event(info->wait_send_payload_pending,
  173. atomic_read(&info->send_payload_pending) == 0);
  174. log_rdma_event(INFO, "freeing mr list\n");
  175. wake_up_interruptible_all(&info->wait_mr);
  176. wait_event(info->wait_for_mr_cleanup,
  177. atomic_read(&info->mr_used_count) == 0);
  178. destroy_mr_list(info);
  179. /* It's not possible for the upper layer to get to the reassembly queue */
  180. log_rdma_event(INFO, "drain the reassembly queue\n");
  181. do {
  182. spin_lock_irqsave(&info->reassembly_queue_lock, flags);
  183. response = _get_first_reassembly(info);
  184. if (response) {
  185. list_del(&response->list);
  186. spin_unlock_irqrestore(
  187. &info->reassembly_queue_lock, flags);
  188. put_receive_buffer(info, response);
  189. } else
  190. spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
  191. } while (response);
  192. info->reassembly_data_length = 0;
  193. log_rdma_event(INFO, "free receive buffers\n");
  194. wait_event(info->wait_receive_queues,
  195. info->count_receive_queue + info->count_empty_packet_queue
  196. == info->receive_credit_max);
  197. destroy_receive_buffers(info);
  198. ib_free_cq(info->send_cq);
  199. ib_free_cq(info->recv_cq);
  200. ib_dealloc_pd(info->pd);
  201. rdma_destroy_id(info->id);
  202. /* free mempools */
  203. mempool_destroy(info->request_mempool);
  204. kmem_cache_destroy(info->request_cache);
  205. mempool_destroy(info->response_mempool);
  206. kmem_cache_destroy(info->response_cache);
  207. info->transport_status = SMBD_DESTROYED;
  208. wake_up_all(&info->wait_destroy);
  209. }
  210. static int smbd_process_disconnected(struct smbd_connection *info)
  211. {
  212. schedule_work(&info->destroy_work);
  213. return 0;
  214. }
  215. static void smbd_disconnect_rdma_work(struct work_struct *work)
  216. {
  217. struct smbd_connection *info =
  218. container_of(work, struct smbd_connection, disconnect_work);
  219. if (info->transport_status == SMBD_CONNECTED) {
  220. info->transport_status = SMBD_DISCONNECTING;
  221. rdma_disconnect(info->id);
  222. }
  223. }
  224. static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
  225. {
  226. queue_work(info->workqueue, &info->disconnect_work);
  227. }
  228. /* Upcall from RDMA CM */
  229. static int smbd_conn_upcall(
  230. struct rdma_cm_id *id, struct rdma_cm_event *event)
  231. {
  232. struct smbd_connection *info = id->context;
  233. log_rdma_event(INFO, "event=%d status=%d\n",
  234. event->event, event->status);
  235. switch (event->event) {
  236. case RDMA_CM_EVENT_ADDR_RESOLVED:
  237. case RDMA_CM_EVENT_ROUTE_RESOLVED:
  238. info->ri_rc = 0;
  239. complete(&info->ri_done);
  240. break;
  241. case RDMA_CM_EVENT_ADDR_ERROR:
  242. info->ri_rc = -EHOSTUNREACH;
  243. complete(&info->ri_done);
  244. break;
  245. case RDMA_CM_EVENT_ROUTE_ERROR:
  246. info->ri_rc = -ENETUNREACH;
  247. complete(&info->ri_done);
  248. break;
  249. case RDMA_CM_EVENT_ESTABLISHED:
  250. log_rdma_event(INFO, "connected event=%d\n", event->event);
  251. info->transport_status = SMBD_CONNECTED;
  252. wake_up_interruptible(&info->conn_wait);
  253. break;
  254. case RDMA_CM_EVENT_CONNECT_ERROR:
  255. case RDMA_CM_EVENT_UNREACHABLE:
  256. case RDMA_CM_EVENT_REJECTED:
  257. log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
  258. info->transport_status = SMBD_DISCONNECTED;
  259. wake_up_interruptible(&info->conn_wait);
  260. break;
  261. case RDMA_CM_EVENT_DEVICE_REMOVAL:
  262. case RDMA_CM_EVENT_DISCONNECTED:
  263. /* This happens when we fail the negotiation */
  264. if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
  265. info->transport_status = SMBD_DISCONNECTED;
  266. wake_up(&info->conn_wait);
  267. break;
  268. }
  269. info->transport_status = SMBD_DISCONNECTED;
  270. smbd_process_disconnected(info);
  271. break;
  272. default:
  273. break;
  274. }
  275. return 0;
  276. }
  277. /* Upcall from RDMA QP */
  278. static void
  279. smbd_qp_async_error_upcall(struct ib_event *event, void *context)
  280. {
  281. struct smbd_connection *info = context;
  282. log_rdma_event(ERR, "%s on device %s info %p\n",
  283. ib_event_msg(event->event), event->device->name, info);
  284. switch (event->event) {
  285. case IB_EVENT_CQ_ERR:
  286. case IB_EVENT_QP_FATAL:
  287. smbd_disconnect_rdma_connection(info);
  288. default:
  289. break;
  290. }
  291. }
  292. static inline void *smbd_request_payload(struct smbd_request *request)
  293. {
  294. return (void *)request->packet;
  295. }
  296. static inline void *smbd_response_payload(struct smbd_response *response)
  297. {
  298. return (void *)response->packet;
  299. }
  300. /* Called when a RDMA send is done */
  301. static void send_done(struct ib_cq *cq, struct ib_wc *wc)
  302. {
  303. int i;
  304. struct smbd_request *request =
  305. container_of(wc->wr_cqe, struct smbd_request, cqe);
  306. log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n",
  307. request, wc->status);
  308. if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
  309. log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
  310. wc->status, wc->opcode);
  311. smbd_disconnect_rdma_connection(request->info);
  312. }
  313. for (i = 0; i < request->num_sge; i++)
  314. ib_dma_unmap_single(request->info->id->device,
  315. request->sge[i].addr,
  316. request->sge[i].length,
  317. DMA_TO_DEVICE);
  318. if (request->has_payload) {
  319. if (atomic_dec_and_test(&request->info->send_payload_pending))
  320. wake_up(&request->info->wait_send_payload_pending);
  321. } else {
  322. if (atomic_dec_and_test(&request->info->send_pending))
  323. wake_up(&request->info->wait_send_pending);
  324. }
  325. mempool_free(request, request->info->request_mempool);
  326. }
  327. static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
  328. {
  329. log_rdma_event(INFO, "resp message min_version %u max_version %u "
  330. "negotiated_version %u credits_requested %u "
  331. "credits_granted %u status %u max_readwrite_size %u "
  332. "preferred_send_size %u max_receive_size %u "
  333. "max_fragmented_size %u\n",
  334. resp->min_version, resp->max_version, resp->negotiated_version,
  335. resp->credits_requested, resp->credits_granted, resp->status,
  336. resp->max_readwrite_size, resp->preferred_send_size,
  337. resp->max_receive_size, resp->max_fragmented_size);
  338. }
  339. /*
  340. * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
  341. * response, packet_length: the negotiation response message
  342. * return value: true if negotiation is a success, false if failed
  343. */
  344. static bool process_negotiation_response(
  345. struct smbd_response *response, int packet_length)
  346. {
  347. struct smbd_connection *info = response->info;
  348. struct smbd_negotiate_resp *packet = smbd_response_payload(response);
  349. if (packet_length < sizeof(struct smbd_negotiate_resp)) {
  350. log_rdma_event(ERR,
  351. "error: packet_length=%d\n", packet_length);
  352. return false;
  353. }
  354. if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
  355. log_rdma_event(ERR, "error: negotiated_version=%x\n",
  356. le16_to_cpu(packet->negotiated_version));
  357. return false;
  358. }
  359. info->protocol = le16_to_cpu(packet->negotiated_version);
  360. if (packet->credits_requested == 0) {
  361. log_rdma_event(ERR, "error: credits_requested==0\n");
  362. return false;
  363. }
  364. info->receive_credit_target = le16_to_cpu(packet->credits_requested);
  365. if (packet->credits_granted == 0) {
  366. log_rdma_event(ERR, "error: credits_granted==0\n");
  367. return false;
  368. }
  369. atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
  370. atomic_set(&info->receive_credits, 0);
  371. if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
  372. log_rdma_event(ERR, "error: preferred_send_size=%d\n",
  373. le32_to_cpu(packet->preferred_send_size));
  374. return false;
  375. }
  376. info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
  377. if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
  378. log_rdma_event(ERR, "error: max_receive_size=%d\n",
  379. le32_to_cpu(packet->max_receive_size));
  380. return false;
  381. }
  382. info->max_send_size = min_t(int, info->max_send_size,
  383. le32_to_cpu(packet->max_receive_size));
  384. if (le32_to_cpu(packet->max_fragmented_size) <
  385. SMBD_MIN_FRAGMENTED_SIZE) {
  386. log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
  387. le32_to_cpu(packet->max_fragmented_size));
  388. return false;
  389. }
  390. info->max_fragmented_send_size =
  391. le32_to_cpu(packet->max_fragmented_size);
  392. info->rdma_readwrite_threshold =
  393. rdma_readwrite_threshold > info->max_fragmented_send_size ?
  394. info->max_fragmented_send_size :
  395. rdma_readwrite_threshold;
  396. info->max_readwrite_size = min_t(u32,
  397. le32_to_cpu(packet->max_readwrite_size),
  398. info->max_frmr_depth * PAGE_SIZE);
  399. info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
  400. return true;
  401. }
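/*
 * The negotiated limits above interlock: max_receive_size is lowered to the
 * peer's preferred_send_size, max_send_size is capped by the peer's
 * max_receive_size, and max_readwrite_size is capped by both the peer's
 * advertised value and max_frmr_depth * PAGE_SIZE. max_frmr_depth is then
 * recomputed from the final max_readwrite_size, so one fast registration of
 * max_frmr_depth pages can cover a full-sized RDMA read/write.
 */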
  402. /*
  403. * Check and schedule to send an immediate packet
  404. * This is used to extend credits to the remote peer to keep the transport busy
  405. */
  406. static void check_and_send_immediate(struct smbd_connection *info)
  407. {
  408. if (info->transport_status != SMBD_CONNECTED)
  409. return;
  410. info->send_immediate = true;
  411. /*
  412. * Promptly send a packet if our peer is running low on receive
  413. * credits
  414. */
  415. if (atomic_read(&info->receive_credits) <
  416. info->receive_credit_target - 1)
  417. queue_delayed_work(
  418. info->workqueue, &info->send_immediate_work, 0);
  419. }
  420. static void smbd_post_send_credits(struct work_struct *work)
  421. {
  422. int ret = 0;
  423. int use_receive_queue = 1;
  424. int rc;
  425. struct smbd_response *response;
  426. struct smbd_connection *info =
  427. container_of(work, struct smbd_connection,
  428. post_send_credits_work);
  429. if (info->transport_status != SMBD_CONNECTED) {
  430. wake_up(&info->wait_receive_queues);
  431. return;
  432. }
  433. if (info->receive_credit_target >
  434. atomic_read(&info->receive_credits)) {
  435. while (true) {
  436. if (use_receive_queue)
  437. response = get_receive_buffer(info);
  438. else
  439. response = get_empty_queue_buffer(info);
  440. if (!response) {
  441. /* now switch to the empty packet queue */
  442. if (use_receive_queue) {
  443. use_receive_queue = 0;
  444. continue;
  445. } else
  446. break;
  447. }
  448. response->type = SMBD_TRANSFER_DATA;
  449. response->first_segment = false;
  450. rc = smbd_post_recv(info, response);
  451. if (rc) {
  452. log_rdma_recv(ERR,
  453. "post_recv failed rc=%d\n", rc);
  454. put_receive_buffer(info, response);
  455. break;
  456. }
  457. ret++;
  458. }
  459. }
  460. spin_lock(&info->lock_new_credits_offered);
  461. info->new_credits_offered += ret;
  462. spin_unlock(&info->lock_new_credits_offered);
  463. atomic_add(ret, &info->receive_credits);
  464. /* Check if we can post new receive and grant credits to peer */
  465. check_and_send_immediate(info);
  466. }
  467. static void smbd_recv_done_work(struct work_struct *work)
  468. {
  469. struct smbd_connection *info =
  470. container_of(work, struct smbd_connection, recv_done_work);
  471. /*
  472. * We may have new send credits granted from remote peer
  473. * If any sender is blocked on lack of credits, unblock it
  474. */
  475. if (atomic_read(&info->send_credits))
  476. wake_up_interruptible(&info->wait_send_queue);
  477. /*
  478. * Check if we need to send something to remote peer to
  479. * grant more credits or respond to KEEP_ALIVE packet
  480. */
  481. check_and_send_immediate(info);
  482. }
  483. /* Called from softirq, when recv is done */
  484. static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
  485. {
  486. struct smbd_data_transfer *data_transfer;
  487. struct smbd_response *response =
  488. container_of(wc->wr_cqe, struct smbd_response, cqe);
  489. struct smbd_connection *info = response->info;
  490. int data_length = 0;
  491. log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d "
  492. "byte_len=%d pkey_index=%x\n",
  493. response, response->type, wc->status, wc->opcode,
  494. wc->byte_len, wc->pkey_index);
  495. if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
  496. log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
  497. wc->status, wc->opcode);
  498. smbd_disconnect_rdma_connection(info);
  499. goto error;
  500. }
  501. ib_dma_sync_single_for_cpu(
  502. wc->qp->device,
  503. response->sge.addr,
  504. response->sge.length,
  505. DMA_FROM_DEVICE);
  506. switch (response->type) {
  507. /* SMBD negotiation response */
  508. case SMBD_NEGOTIATE_RESP:
  509. dump_smbd_negotiate_resp(smbd_response_payload(response));
  510. info->full_packet_received = true;
  511. info->negotiate_done =
  512. process_negotiation_response(response, wc->byte_len);
  513. complete(&info->negotiate_completion);
  514. break;
  515. /* SMBD data transfer packet */
  516. case SMBD_TRANSFER_DATA:
  517. data_transfer = smbd_response_payload(response);
  518. data_length = le32_to_cpu(data_transfer->data_length);
  519. /*
  520. * If this is a packet with a data payload, place the data in the
  521. * reassembly queue and wake up the reading thread
  522. */
  523. if (data_length) {
  524. if (info->full_packet_received)
  525. response->first_segment = true;
  526. if (le32_to_cpu(data_transfer->remaining_data_length))
  527. info->full_packet_received = false;
  528. else
  529. info->full_packet_received = true;
  530. enqueue_reassembly(
  531. info,
  532. response,
  533. data_length);
  534. } else
  535. put_empty_packet(info, response);
  536. if (data_length)
  537. wake_up_interruptible(&info->wait_reassembly_queue);
  538. atomic_dec(&info->receive_credits);
  539. info->receive_credit_target =
  540. le16_to_cpu(data_transfer->credits_requested);
  541. atomic_add(le16_to_cpu(data_transfer->credits_granted),
  542. &info->send_credits);
  543. log_incoming(INFO, "data flags %d data_offset %d "
  544. "data_length %d remaining_data_length %d\n",
  545. le16_to_cpu(data_transfer->flags),
  546. le32_to_cpu(data_transfer->data_offset),
  547. le32_to_cpu(data_transfer->data_length),
  548. le32_to_cpu(data_transfer->remaining_data_length));
  549. /* Send a KEEP_ALIVE response right away if requested */
  550. info->keep_alive_requested = KEEP_ALIVE_NONE;
  551. if (le16_to_cpu(data_transfer->flags) &
  552. SMB_DIRECT_RESPONSE_REQUESTED) {
  553. info->keep_alive_requested = KEEP_ALIVE_PENDING;
  554. }
  555. queue_work(info->workqueue, &info->recv_done_work);
  556. return;
  557. default:
  558. log_rdma_recv(ERR,
  559. "unexpected response type=%d\n", response->type);
  560. }
  561. error:
  562. put_receive_buffer(info, response);
  563. }
  564. static struct rdma_cm_id *smbd_create_id(
  565. struct smbd_connection *info,
  566. struct sockaddr *dstaddr, int port)
  567. {
  568. struct rdma_cm_id *id;
  569. int rc;
  570. __be16 *sport;
  571. id = rdma_create_id(&init_net, smbd_conn_upcall, info,
  572. RDMA_PS_TCP, IB_QPT_RC);
  573. if (IS_ERR(id)) {
  574. rc = PTR_ERR(id);
  575. log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
  576. return id;
  577. }
  578. if (dstaddr->sa_family == AF_INET6)
  579. sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
  580. else
  581. sport = &((struct sockaddr_in *)dstaddr)->sin_port;
  582. *sport = htons(port);
  583. init_completion(&info->ri_done);
  584. info->ri_rc = -ETIMEDOUT;
  585. rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
  586. RDMA_RESOLVE_TIMEOUT);
  587. if (rc) {
  588. log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
  589. goto out;
  590. }
  591. wait_for_completion_interruptible_timeout(
  592. &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
  593. rc = info->ri_rc;
  594. if (rc) {
  595. log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
  596. goto out;
  597. }
  598. info->ri_rc = -ETIMEDOUT;
  599. rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
  600. if (rc) {
  601. log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
  602. goto out;
  603. }
  604. wait_for_completion_interruptible_timeout(
  605. &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
  606. rc = info->ri_rc;
  607. if (rc) {
  608. log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
  609. goto out;
  610. }
  611. return id;
  612. out:
  613. rdma_destroy_id(id);
  614. return ERR_PTR(rc);
  615. }
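/*
 * Note that info->ri_rc is preset to -ETIMEDOUT before each resolution step;
 * if the interruptible wait above returns without the CM callback having
 * completed ri_done, rc still reports a timeout instead of a stale success
 * value from the previous step.
 */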
  616. /*
  617. * Test if FRWR (Fast Registration Work Requests) is supported on the device
  618. * This implementation requires FRWR for RDMA read/write
  619. * return value: true if it is supported
  620. */
  621. static bool frwr_is_supported(struct ib_device_attr *attrs)
  622. {
  623. if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
  624. return false;
  625. if (attrs->max_fast_reg_page_list_len == 0)
  626. return false;
  627. return true;
  628. }
  629. static int smbd_ia_open(
  630. struct smbd_connection *info,
  631. struct sockaddr *dstaddr, int port)
  632. {
  633. int rc;
  634. info->id = smbd_create_id(info, dstaddr, port);
  635. if (IS_ERR(info->id)) {
  636. rc = PTR_ERR(info->id);
  637. goto out1;
  638. }
  639. if (!frwr_is_supported(&info->id->device->attrs)) {
  640. log_rdma_event(ERR,
  641. "Fast Registration Work Requests "
  642. "(FRWR) is not supported\n");
  643. log_rdma_event(ERR,
  644. "Device capability flags = %llx "
  645. "max_fast_reg_page_list_len = %u\n",
  646. info->id->device->attrs.device_cap_flags,
  647. info->id->device->attrs.max_fast_reg_page_list_len);
  648. rc = -EPROTONOSUPPORT;
  649. goto out2;
  650. }
  651. info->max_frmr_depth = min_t(int,
  652. smbd_max_frmr_depth,
  653. info->id->device->attrs.max_fast_reg_page_list_len);
  654. info->mr_type = IB_MR_TYPE_MEM_REG;
  655. if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)
  656. info->mr_type = IB_MR_TYPE_SG_GAPS;
  657. info->pd = ib_alloc_pd(info->id->device, 0);
  658. if (IS_ERR(info->pd)) {
  659. rc = PTR_ERR(info->pd);
  660. log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
  661. goto out2;
  662. }
  663. return 0;
  664. out2:
  665. rdma_destroy_id(info->id);
  666. info->id = NULL;
  667. out1:
  668. return rc;
  669. }
  670. /*
  671. * Send a negotiation request message to the peer
  672. * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
  673. * After negotiation, the transport is connected and ready for
  674. * carrying upper layer SMB payload
  675. */
  676. static int smbd_post_send_negotiate_req(struct smbd_connection *info)
  677. {
  678. struct ib_send_wr send_wr;
  679. int rc = -ENOMEM;
  680. struct smbd_request *request;
  681. struct smbd_negotiate_req *packet;
  682. request = mempool_alloc(info->request_mempool, GFP_KERNEL);
  683. if (!request)
  684. return rc;
  685. request->info = info;
  686. packet = smbd_request_payload(request);
  687. packet->min_version = cpu_to_le16(SMBD_V1);
  688. packet->max_version = cpu_to_le16(SMBD_V1);
  689. packet->reserved = 0;
  690. packet->credits_requested = cpu_to_le16(info->send_credit_target);
  691. packet->preferred_send_size = cpu_to_le32(info->max_send_size);
  692. packet->max_receive_size = cpu_to_le32(info->max_receive_size);
  693. packet->max_fragmented_size =
  694. cpu_to_le32(info->max_fragmented_recv_size);
  695. request->num_sge = 1;
  696. request->sge[0].addr = ib_dma_map_single(
  697. info->id->device, (void *)packet,
  698. sizeof(*packet), DMA_TO_DEVICE);
  699. if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
  700. rc = -EIO;
  701. goto dma_mapping_failed;
  702. }
  703. request->sge[0].length = sizeof(*packet);
  704. request->sge[0].lkey = info->pd->local_dma_lkey;
  705. ib_dma_sync_single_for_device(
  706. info->id->device, request->sge[0].addr,
  707. request->sge[0].length, DMA_TO_DEVICE);
  708. request->cqe.done = send_done;
  709. send_wr.next = NULL;
  710. send_wr.wr_cqe = &request->cqe;
  711. send_wr.sg_list = request->sge;
  712. send_wr.num_sge = request->num_sge;
  713. send_wr.opcode = IB_WR_SEND;
  714. send_wr.send_flags = IB_SEND_SIGNALED;
  715. log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n",
  716. request->sge[0].addr,
  717. request->sge[0].length, request->sge[0].lkey);
  718. request->has_payload = false;
  719. atomic_inc(&info->send_pending);
  720. rc = ib_post_send(info->id->qp, &send_wr, NULL);
  721. if (!rc)
  722. return 0;
  723. /* if we reach here, post send failed */
  724. log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
  725. atomic_dec(&info->send_pending);
  726. ib_dma_unmap_single(info->id->device, request->sge[0].addr,
  727. request->sge[0].length, DMA_TO_DEVICE);
  728. smbd_disconnect_rdma_connection(info);
  729. dma_mapping_failed:
  730. mempool_free(request, info->request_mempool);
  731. return rc;
  732. }
  733. /*
  734. * Extend the credits to remote peer
  735. * This implements [MS-SMBD] 3.1.5.9
  736. * The idea is that we should extend credits to the remote peer as quickly as
  737. * allowed, to maintain data flow. We allocate as many receive
  738. * buffers as possible, and extend the receive credits to the remote peer
  739. * return value: the new credits being granted.
  740. */
  741. static int manage_credits_prior_sending(struct smbd_connection *info)
  742. {
  743. int new_credits;
  744. spin_lock(&info->lock_new_credits_offered);
  745. new_credits = info->new_credits_offered;
  746. info->new_credits_offered = 0;
  747. spin_unlock(&info->lock_new_credits_offered);
  748. return new_credits;
  749. }
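/*
 * new_credits_offered is accumulated in smbd_post_send_credits() as receive
 * buffers are successfully posted, and is drained here under
 * lock_new_credits_offered so each credit is granted to the peer exactly
 * once, via the credits_granted field of the next outgoing packet header.
 */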
  750. /*
  751. * Check if we need to send a KEEP_ALIVE message
  752. * The idle connection timer triggers a KEEP_ALIVE message when it expires
  753. * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have the peer send
  754. * back a response.
  755. * return value:
  756. * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
  757. * 0: otherwise
  758. */
  759. static int manage_keep_alive_before_sending(struct smbd_connection *info)
  760. {
  761. if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
  762. info->keep_alive_requested = KEEP_ALIVE_SENT;
  763. return 1;
  764. }
  765. return 0;
  766. }
  767. /*
  768. * Build and prepare the SMBD packet header
  769. * This function waits for available send credits and builds an SMBD packet
  770. * header. The caller may then optionally append a payload to the packet after
  771. * the header
  772. * input values
  773. * size: the size of the payload
  774. * remaining_data_length: remaining data to send if this is part of a
  775. * fragmented packet
  776. * output values
  777. * request_out: the request allocated from this function
  778. * return values: 0 on success, otherwise actual error code returned
  779. */
  780. static int smbd_create_header(struct smbd_connection *info,
  781. int size, int remaining_data_length,
  782. struct smbd_request **request_out)
  783. {
  784. struct smbd_request *request;
  785. struct smbd_data_transfer *packet;
  786. int header_length;
  787. int rc;
  788. /* Wait for send credits. A SMBD packet needs one credit */
  789. rc = wait_event_interruptible(info->wait_send_queue,
  790. atomic_read(&info->send_credits) > 0 ||
  791. info->transport_status != SMBD_CONNECTED);
  792. if (rc)
  793. return rc;
  794. if (info->transport_status != SMBD_CONNECTED) {
  795. log_outgoing(ERR, "disconnected not sending\n");
  796. return -ENOENT;
  797. }
  798. atomic_dec(&info->send_credits);
  799. request = mempool_alloc(info->request_mempool, GFP_KERNEL);
  800. if (!request) {
  801. rc = -ENOMEM;
  802. goto err;
  803. }
  804. request->info = info;
  805. /* Fill in the packet header */
  806. packet = smbd_request_payload(request);
  807. packet->credits_requested = cpu_to_le16(info->send_credit_target);
  808. packet->credits_granted =
  809. cpu_to_le16(manage_credits_prior_sending(info));
  810. info->send_immediate = false;
  811. packet->flags = 0;
  812. if (manage_keep_alive_before_sending(info))
  813. packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
  814. packet->reserved = 0;
  815. if (!size)
  816. packet->data_offset = 0;
  817. else
  818. packet->data_offset = cpu_to_le32(24);
  819. packet->data_length = cpu_to_le32(size);
  820. packet->remaining_data_length = cpu_to_le32(remaining_data_length);
  821. packet->padding = 0;
  822. log_outgoing(INFO, "credits_requested=%d credits_granted=%d "
  823. "data_offset=%d data_length=%d remaining_data_length=%d\n",
  824. le16_to_cpu(packet->credits_requested),
  825. le16_to_cpu(packet->credits_granted),
  826. le32_to_cpu(packet->data_offset),
  827. le32_to_cpu(packet->data_length),
  828. le32_to_cpu(packet->remaining_data_length));
  829. /* Map the packet to DMA */
  830. header_length = sizeof(struct smbd_data_transfer);
  831. /* If this is a packet without payload, don't send padding */
  832. if (!size)
  833. header_length = offsetof(struct smbd_data_transfer, padding);
  834. request->num_sge = 1;
  835. request->sge[0].addr = ib_dma_map_single(info->id->device,
  836. (void *)packet,
  837. header_length,
  838. DMA_BIDIRECTIONAL);
  839. if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
  840. mempool_free(request, info->request_mempool);
  841. rc = -EIO;
  842. goto err;
  843. }
  844. request->sge[0].length = header_length;
  845. request->sge[0].lkey = info->pd->local_dma_lkey;
  846. *request_out = request;
  847. return 0;
  848. err:
  849. atomic_inc(&info->send_credits);
  850. return rc;
  851. }
  852. static void smbd_destroy_header(struct smbd_connection *info,
  853. struct smbd_request *request)
  854. {
  855. ib_dma_unmap_single(info->id->device,
  856. request->sge[0].addr,
  857. request->sge[0].length,
  858. DMA_TO_DEVICE);
  859. mempool_free(request, info->request_mempool);
  860. atomic_inc(&info->send_credits);
  861. }
  862. /* Post the send request */
  863. static int smbd_post_send(struct smbd_connection *info,
  864. struct smbd_request *request, bool has_payload)
  865. {
  866. struct ib_send_wr send_wr;
  867. int rc, i;
  868. for (i = 0; i < request->num_sge; i++) {
  869. log_rdma_send(INFO,
  870. "rdma_request sge[%d] addr=%llu length=%u\n",
  871. i, request->sge[i].addr, request->sge[i].length);
  872. ib_dma_sync_single_for_device(
  873. info->id->device,
  874. request->sge[i].addr,
  875. request->sge[i].length,
  876. DMA_TO_DEVICE);
  877. }
  878. request->cqe.done = send_done;
  879. send_wr.next = NULL;
  880. send_wr.wr_cqe = &request->cqe;
  881. send_wr.sg_list = request->sge;
  882. send_wr.num_sge = request->num_sge;
  883. send_wr.opcode = IB_WR_SEND;
  884. send_wr.send_flags = IB_SEND_SIGNALED;
  885. if (has_payload) {
  886. request->has_payload = true;
  887. atomic_inc(&info->send_payload_pending);
  888. } else {
  889. request->has_payload = false;
  890. atomic_inc(&info->send_pending);
  891. }
  892. rc = ib_post_send(info->id->qp, &send_wr, NULL);
  893. if (rc) {
  894. log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
  895. if (has_payload) {
  896. if (atomic_dec_and_test(&info->send_payload_pending))
  897. wake_up(&info->wait_send_payload_pending);
  898. } else {
  899. if (atomic_dec_and_test(&info->send_pending))
  900. wake_up(&info->wait_send_pending);
  901. }
  902. smbd_disconnect_rdma_connection(info);
  903. } else
  904. /* Reset timer for idle connection after packet is sent */
  905. mod_delayed_work(info->workqueue, &info->idle_timer_work,
  906. info->keep_alive_interval*HZ);
  907. return rc;
  908. }
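/*
 * Sends are tracked with two counters: send_payload_pending for packets
 * carrying upper layer payload and send_pending for header-only packets.
 * send_done() decrements the matching counter and wakes the corresponding
 * wait queue, which smbd_destroy_rdma_work() waits on during teardown.
 */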
  909. static int smbd_post_send_sgl(struct smbd_connection *info,
  910. struct scatterlist *sgl, int data_length, int remaining_data_length)
  911. {
  912. int num_sgs;
  913. int i, rc;
  914. struct smbd_request *request;
  915. struct scatterlist *sg;
  916. rc = smbd_create_header(
  917. info, data_length, remaining_data_length, &request);
  918. if (rc)
  919. return rc;
  920. num_sgs = sgl ? sg_nents(sgl) : 0;
  921. for_each_sg(sgl, sg, num_sgs, i) {
  922. request->sge[i+1].addr =
  923. ib_dma_map_page(info->id->device, sg_page(sg),
  924. sg->offset, sg->length, DMA_BIDIRECTIONAL);
  925. if (ib_dma_mapping_error(
  926. info->id->device, request->sge[i+1].addr)) {
  927. rc = -EIO;
  928. request->sge[i+1].addr = 0;
  929. goto dma_mapping_failure;
  930. }
  931. request->sge[i+1].length = sg->length;
  932. request->sge[i+1].lkey = info->pd->local_dma_lkey;
  933. request->num_sge++;
  934. }
  935. rc = smbd_post_send(info, request, data_length);
  936. if (!rc)
  937. return 0;
  938. dma_mapping_failure:
  939. for (i = 1; i < request->num_sge; i++)
  940. if (request->sge[i].addr)
  941. ib_dma_unmap_single(info->id->device,
  942. request->sge[i].addr,
  943. request->sge[i].length,
  944. DMA_TO_DEVICE);
  945. smbd_destroy_header(info, request);
  946. return rc;
  947. }
  948. /*
  949. * Send a page
  950. * page: the page to send
  951. * offset: offset in the page to send
  952. * size: length in the page to send
  953. * remaining_data_length: remaining data to send in this payload
  954. */
  955. static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
  956. unsigned long offset, size_t size, int remaining_data_length)
  957. {
  958. struct scatterlist sgl;
  959. sg_init_table(&sgl, 1);
  960. sg_set_page(&sgl, page, size, offset);
  961. return smbd_post_send_sgl(info, &sgl, size, remaining_data_length);
  962. }
  963. /*
  964. * Send an empty message
  965. * An empty message is used to extend credits to the peer, or for keepalive,
  966. * while there is no upper layer payload to send at the time
  967. */
  968. static int smbd_post_send_empty(struct smbd_connection *info)
  969. {
  970. info->count_send_empty++;
  971. return smbd_post_send_sgl(info, NULL, 0, 0);
  972. }
  973. /*
  974. * Send a data buffer
  975. * iov: the iov array describing the data buffers
  976. * n_vec: number of entries in the iov array
  977. * remaining_data_length: remaining data to send following this packet
  978. * in segmented SMBD packet
  979. */
  980. static int smbd_post_send_data(
  981. struct smbd_connection *info, struct kvec *iov, int n_vec,
  982. int remaining_data_length)
  983. {
  984. int i;
  985. u32 data_length = 0;
  986. struct scatterlist sgl[SMBDIRECT_MAX_SGE];
  987. if (n_vec > SMBDIRECT_MAX_SGE) {
  988. cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
  989. return -EINVAL;
  990. }
  991. sg_init_table(sgl, n_vec);
  992. for (i = 0; i < n_vec; i++) {
  993. data_length += iov[i].iov_len;
  994. sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len);
  995. }
  996. return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length);
  997. }
  998. /*
  999. * Post a receive request to the transport
  1000. * The remote peer can only send data when a receive request is posted
  1001. * The interaction is controlled by the send/receive credit system
  1002. */
  1003. static int smbd_post_recv(
  1004. struct smbd_connection *info, struct smbd_response *response)
  1005. {
  1006. struct ib_recv_wr recv_wr;
  1007. int rc = -EIO;
  1008. response->sge.addr = ib_dma_map_single(
  1009. info->id->device, response->packet,
  1010. info->max_receive_size, DMA_FROM_DEVICE);
  1011. if (ib_dma_mapping_error(info->id->device, response->sge.addr))
  1012. return rc;
  1013. response->sge.length = info->max_receive_size;
  1014. response->sge.lkey = info->pd->local_dma_lkey;
  1015. response->cqe.done = recv_done;
  1016. recv_wr.wr_cqe = &response->cqe;
  1017. recv_wr.next = NULL;
  1018. recv_wr.sg_list = &response->sge;
  1019. recv_wr.num_sge = 1;
  1020. rc = ib_post_recv(info->id->qp, &recv_wr, NULL);
  1021. if (rc) {
  1022. ib_dma_unmap_single(info->id->device, response->sge.addr,
  1023. response->sge.length, DMA_FROM_DEVICE);
  1024. smbd_disconnect_rdma_connection(info);
  1025. log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
  1026. }
  1027. return rc;
  1028. }
  1029. /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
  1030. static int smbd_negotiate(struct smbd_connection *info)
  1031. {
  1032. int rc;
  1033. struct smbd_response *response = get_receive_buffer(info);
  1034. response->type = SMBD_NEGOTIATE_RESP;
  1035. rc = smbd_post_recv(info, response);
  1036. log_rdma_event(INFO,
  1037. "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x "
  1038. "iov.lkey=%x\n",
  1039. rc, response->sge.addr,
  1040. response->sge.length, response->sge.lkey);
  1041. if (rc)
  1042. return rc;
  1043. init_completion(&info->negotiate_completion);
  1044. info->negotiate_done = false;
  1045. rc = smbd_post_send_negotiate_req(info);
  1046. if (rc)
  1047. return rc;
  1048. rc = wait_for_completion_interruptible_timeout(
  1049. &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
  1050. log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
  1051. if (info->negotiate_done)
  1052. return 0;
  1053. if (rc == 0)
  1054. rc = -ETIMEDOUT;
  1055. else if (rc == -ERESTARTSYS)
  1056. rc = -EINTR;
  1057. else
  1058. rc = -ENOTCONN;
  1059. return rc;
  1060. }
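/*
 * wait_for_completion_interruptible_timeout() returns 0 on timeout,
 * -ERESTARTSYS when interrupted by a signal, and a positive value on
 * completion; the mapping above turns the first two into -ETIMEDOUT and
 * -EINTR, and treats completion without negotiate_done set as -ENOTCONN.
 */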
  1061. static void put_empty_packet(
  1062. struct smbd_connection *info, struct smbd_response *response)
  1063. {
  1064. spin_lock(&info->empty_packet_queue_lock);
  1065. list_add_tail(&response->list, &info->empty_packet_queue);
  1066. info->count_empty_packet_queue++;
  1067. spin_unlock(&info->empty_packet_queue_lock);
  1068. queue_work(info->workqueue, &info->post_send_credits_work);
  1069. }
  1070. /*
  1071. * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
  1072. * This is a queue for reassembling upper layer payloads and presenting them to
  1073. * the upper layer. All incoming payloads go to the reassembly queue, regardless
  1074. * of whether reassembly is required. The upper layer code reads from the queue
  1075. * for all incoming payloads.
  1076. * Put a received packet to the reassembly queue
  1077. * response: the packet received
  1078. * data_length: the size of payload in this packet
  1079. */
  1080. static void enqueue_reassembly(
  1081. struct smbd_connection *info,
  1082. struct smbd_response *response,
  1083. int data_length)
  1084. {
  1085. spin_lock(&info->reassembly_queue_lock);
  1086. list_add_tail(&response->list, &info->reassembly_queue);
  1087. info->reassembly_queue_length++;
  1088. /*
  1089. * Make sure reassembly_data_length is updated after list and
  1090. * reassembly_queue_length are updated. On the dequeue side
  1091. * reassembly_data_length is checked without a lock to determine
  1092. * if reassembly_queue_length and list is up to date
  1093. * if reassembly_queue_length and the list are up to date
  1094. virt_wmb();
  1095. info->reassembly_data_length += data_length;
  1096. spin_unlock(&info->reassembly_queue_lock);
  1097. info->count_reassembly_queue++;
  1098. info->count_enqueue_reassembly_queue++;
  1099. }
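/*
 * The virt_wmb() above orders the queue and reassembly_queue_length updates
 * before the reassembly_data_length update; the dequeue side is expected to
 * pair this with a read barrier before acting on a lockless read of
 * reassembly_data_length.
 */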
  1100. /*
  1101. * Get the first entry at the front of reassembly queue
  1102. * Caller is responsible for locking
  1103. * return value: the first entry if any, NULL if queue is empty
  1104. */
  1105. static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
  1106. {
  1107. struct smbd_response *ret = NULL;
  1108. if (!list_empty(&info->reassembly_queue)) {
  1109. ret = list_first_entry(
  1110. &info->reassembly_queue,
  1111. struct smbd_response, list);
  1112. }
  1113. return ret;
  1114. }
  1115. static struct smbd_response *get_empty_queue_buffer(
  1116. struct smbd_connection *info)
  1117. {
  1118. struct smbd_response *ret = NULL;
  1119. unsigned long flags;
  1120. spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
  1121. if (!list_empty(&info->empty_packet_queue)) {
  1122. ret = list_first_entry(
  1123. &info->empty_packet_queue,
  1124. struct smbd_response, list);
  1125. list_del(&ret->list);
  1126. info->count_empty_packet_queue--;
  1127. }
  1128. spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);
  1129. return ret;
  1130. }
  1131. /*
  1132. * Get a receive buffer
  1133. * For each remote send, we need to post a receive. The receive buffers are
  1134. * pre-allocated.
  1135. * return value: the receive buffer, NULL if none is available
  1136. */
  1137. static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
  1138. {
  1139. struct smbd_response *ret = NULL;
  1140. unsigned long flags;
  1141. spin_lock_irqsave(&info->receive_queue_lock, flags);
  1142. if (!list_empty(&info->receive_queue)) {
  1143. ret = list_first_entry(
  1144. &info->receive_queue,
  1145. struct smbd_response, list);
  1146. list_del(&ret->list);
  1147. info->count_receive_queue--;
  1148. info->count_get_receive_buffer++;
  1149. }
  1150. spin_unlock_irqrestore(&info->receive_queue_lock, flags);
  1151. return ret;
  1152. }
  1153. /*
  1154. * Return a receive buffer
  1155. * When a receive buffer is returned, we can post a new receive and extend
  1156. * more receive credits to the remote peer. This is done immediately after a
  1157. * receive buffer is returned.
  1158. */
  1159. static void put_receive_buffer(
  1160. struct smbd_connection *info, struct smbd_response *response)
  1161. {
  1162. unsigned long flags;
  1163. ib_dma_unmap_single(info->id->device, response->sge.addr,
  1164. response->sge.length, DMA_FROM_DEVICE);
  1165. spin_lock_irqsave(&info->receive_queue_lock, flags);
  1166. list_add_tail(&response->list, &info->receive_queue);
  1167. info->count_receive_queue++;
  1168. info->count_put_receive_buffer++;
  1169. spin_unlock_irqrestore(&info->receive_queue_lock, flags);
  1170. queue_work(info->workqueue, &info->post_send_credits_work);
  1171. }
  1172. /* Preallocate all receive buffers on transport establishment */
  1173. static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
  1174. {
  1175. int i;
  1176. struct smbd_response *response;
  1177. INIT_LIST_HEAD(&info->reassembly_queue);
  1178. spin_lock_init(&info->reassembly_queue_lock);
  1179. info->reassembly_data_length = 0;
  1180. info->reassembly_queue_length = 0;
  1181. INIT_LIST_HEAD(&info->receive_queue);
  1182. spin_lock_init(&info->receive_queue_lock);
  1183. info->count_receive_queue = 0;
  1184. INIT_LIST_HEAD(&info->empty_packet_queue);
  1185. spin_lock_init(&info->empty_packet_queue_lock);
  1186. info->count_empty_packet_queue = 0;
  1187. init_waitqueue_head(&info->wait_receive_queues);
  1188. for (i = 0; i < num_buf; i++) {
  1189. response = mempool_alloc(info->response_mempool, GFP_KERNEL);
  1190. if (!response)
  1191. goto allocate_failed;
  1192. response->info = info;
  1193. list_add_tail(&response->list, &info->receive_queue);
  1194. info->count_receive_queue++;
  1195. }
  1196. return 0;
  1197. allocate_failed:
  1198. while (!list_empty(&info->receive_queue)) {
  1199. response = list_first_entry(
  1200. &info->receive_queue,
  1201. struct smbd_response, list);
  1202. list_del(&response->list);
  1203. info->count_receive_queue--;
  1204. mempool_free(response, info->response_mempool);
  1205. }
  1206. return -ENOMEM;
  1207. }
static void destroy_receive_buffers(struct smbd_connection *info)
{
	struct smbd_response *response;

	while ((response = get_receive_buffer(info)))
		mempool_free(response, info->response_mempool);

	while ((response = get_empty_queue_buffer(info)))
		mempool_free(response, info->response_mempool);
}
/*
 * Check and send an immediate or keep-alive packet
 * The conditions to send those packets are defined in [MS-SMBD] 3.1.1.1
 * Connection.KeepaliveRequested and Connection.SendImmediate
 * The idea is to extend credits to the server as soon as they become available
 */
static void send_immediate_work(struct work_struct *work)
{
	struct smbd_connection *info = container_of(
					work, struct smbd_connection,
					send_immediate_work.work);

	if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
	    info->send_immediate) {
		log_keep_alive(INFO, "send an empty message\n");
		smbd_post_send_empty(info);
	}
}
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
static void idle_connection_timer(struct work_struct *work)
{
	struct smbd_connection *info = container_of(
					work, struct smbd_connection,
					idle_timer_work.work);

	if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
		log_keep_alive(ERR,
			"error status info->keep_alive_requested=%d\n",
			info->keep_alive_requested);
		smbd_disconnect_rdma_connection(info);
		return;
	}

	log_keep_alive(INFO, "about to send an empty idle message\n");
	smbd_post_send_empty(info);

	/* Setup the next idle timeout work */
	queue_delayed_work(info->workqueue, &info->idle_timer_work,
			info->keep_alive_interval*HZ);
}
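/*
 * Keep-alive timing sketch (assumption-laden illustration, not code from this
 * file): with keep_alive_interval in seconds, the idle timer above re-arms
 * itself every keep_alive_interval*HZ jiffies, e.g.
 *
 *	info->keep_alive_interval = 120;
 *	// the next idle probe fires roughly 120 seconds later:
 *	queue_delayed_work(info->workqueue, &info->idle_timer_work,
 *			   info->keep_alive_interval * HZ);
 *
 * If keep_alive_requested has not returned to KEEP_ALIVE_NONE by the time the
 * timer fires again, the peer is considered unresponsive and the connection
 * is torn down via smbd_disconnect_rdma_connection().
 */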
/* Destroy this SMBD connection, called from upper layer */
void smbd_destroy(struct smbd_connection *info)
{
	log_rdma_event(INFO, "destroying rdma session\n");

	/* Kick off the disconnection process */
	smbd_disconnect_rdma_connection(info);

	log_rdma_event(INFO, "wait for transport being destroyed\n");
	wait_event(info->wait_destroy,
		info->transport_status == SMBD_DESTROYED);

	destroy_workqueue(info->workqueue);
	log_rdma_event(INFO, "rdma session destroyed\n");
	kfree(info);
}
/*
 * Reconnect this SMBD connection, called from upper layer
 * return value: 0 on success, or actual error code
 */
int smbd_reconnect(struct TCP_Server_Info *server)
{
	log_rdma_event(INFO, "reconnecting rdma session\n");

	if (!server->smbd_conn) {
		log_rdma_event(INFO, "rdma session already destroyed\n");
		goto create_conn;
	}

	/*
	 * This is possible if the transport is disconnected and we haven't
	 * received a notification from RDMA, but the upper layer has detected
	 * a timeout
	 */
	if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
		log_rdma_event(INFO, "disconnecting transport\n");
		smbd_disconnect_rdma_connection(server->smbd_conn);
	}

	/* wait until the transport is destroyed */
	if (!wait_event_timeout(server->smbd_conn->wait_destroy,
		server->smbd_conn->transport_status == SMBD_DESTROYED, 5*HZ))
		return -EAGAIN;

	destroy_workqueue(server->smbd_conn->workqueue);
	kfree(server->smbd_conn);

create_conn:
	log_rdma_event(INFO, "creating rdma session\n");
	server->smbd_conn = smbd_get_connection(
		server, (struct sockaddr *) &server->dstaddr);
	if (server->smbd_conn)
		cifs_dbg(VFS, "RDMA transport re-established\n");

	return server->smbd_conn ? 0 : -ENOENT;
}
static void destroy_caches_and_workqueue(struct smbd_connection *info)
{
	destroy_receive_buffers(info);
	destroy_workqueue(info->workqueue);
	mempool_destroy(info->response_mempool);
	kmem_cache_destroy(info->response_cache);
	mempool_destroy(info->request_mempool);
	kmem_cache_destroy(info->request_cache);
}
#define MAX_NAME_LEN	80
static int allocate_caches_and_workqueue(struct smbd_connection *info)
{
	char name[MAX_NAME_LEN];
	int rc;

	snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
	info->request_cache =
		kmem_cache_create(
			name,
			sizeof(struct smbd_request) +
				sizeof(struct smbd_data_transfer),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!info->request_cache)
		return -ENOMEM;

	info->request_mempool =
		mempool_create(info->send_credit_target, mempool_alloc_slab,
			mempool_free_slab, info->request_cache);
	if (!info->request_mempool)
		goto out1;

	snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
	info->response_cache =
		kmem_cache_create(
			name,
			sizeof(struct smbd_response) +
				info->max_receive_size,
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (!info->response_cache)
		goto out2;

	info->response_mempool =
		mempool_create(info->receive_credit_max, mempool_alloc_slab,
			mempool_free_slab, info->response_cache);
	if (!info->response_mempool)
		goto out3;

	snprintf(name, MAX_NAME_LEN, "smbd_%p", info);
	info->workqueue = create_workqueue(name);
	if (!info->workqueue)
		goto out4;

	rc = allocate_receive_buffers(info, info->receive_credit_max);
	if (rc) {
		log_rdma_event(ERR, "failed to allocate receive buffers\n");
		goto out5;
	}

	return 0;

out5:
	destroy_workqueue(info->workqueue);
out4:
	mempool_destroy(info->response_mempool);
out3:
	kmem_cache_destroy(info->response_cache);
out2:
	mempool_destroy(info->request_mempool);
out1:
	kmem_cache_destroy(info->request_cache);
	return -ENOMEM;
}
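/*
 * Sizing sketch (illustration only; the numbers below are assumptions, not
 * values taken from this file): each request cache object must hold the send
 * work request plus the largest packet header, and each response object must
 * hold the packet header plus a full receive buffer:
 *
 *	request object  = sizeof(struct smbd_request) +
 *			  sizeof(struct smbd_data_transfer)
 *	response object = sizeof(struct smbd_response) + info->max_receive_size
 *
 * With, say, max_receive_size = 8192 and receive_credit_max = 255, the
 * response mempool guarantees 255 objects of a bit over 8 KB each, matching
 * the number of receives that can be outstanding at once.
 */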
/* Create a SMBD connection, called by upper layer */
static struct smbd_connection *_smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
{
	int rc;
	struct smbd_connection *info;
	struct rdma_conn_param conn_param;
	struct ib_qp_init_attr qp_attr;
	struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
	struct ib_port_immutable port_immutable;
	u32 ird_ord_hdr[2];

	info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
	if (!info)
		return NULL;

	info->transport_status = SMBD_CONNECTING;
	rc = smbd_ia_open(info, dstaddr, port);
	if (rc) {
		log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
		goto create_id_failed;
	}

	if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
	    smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
		log_rdma_event(ERR,
			"consider lowering send_credit_target = %d. "
			"Possible CQE overrun, device "
			"reporting max_cqe %d max_qp_wr %d\n",
			smbd_send_credit_target,
			info->id->device->attrs.max_cqe,
			info->id->device->attrs.max_qp_wr);
		goto config_failed;
	}

	if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
	    smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
		log_rdma_event(ERR,
			"consider lowering receive_credit_max = %d. "
			"Possible CQE overrun, device "
			"reporting max_cqe %d max_qp_wr %d\n",
			smbd_receive_credit_max,
			info->id->device->attrs.max_cqe,
			info->id->device->attrs.max_qp_wr);
		goto config_failed;
	}

	info->receive_credit_max = smbd_receive_credit_max;
	info->send_credit_target = smbd_send_credit_target;
	info->max_send_size = smbd_max_send_size;
	info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
	info->max_receive_size = smbd_max_receive_size;
	info->keep_alive_interval = smbd_keep_alive_interval;

	if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SGE) {
		log_rdma_event(ERR,
			"warning: device max_send_sge = %d too small\n",
			info->id->device->attrs.max_send_sge);
		log_rdma_event(ERR, "Queue Pair creation may fail\n");
	}
	if (info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_SGE) {
		log_rdma_event(ERR,
			"warning: device max_recv_sge = %d too small\n",
			info->id->device->attrs.max_recv_sge);
		log_rdma_event(ERR, "Queue Pair creation may fail\n");
	}

	info->send_cq = NULL;
	info->recv_cq = NULL;
	info->send_cq = ib_alloc_cq(info->id->device, info,
			info->send_credit_target, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(info->send_cq)) {
		info->send_cq = NULL;
		goto alloc_cq_failed;
	}

	info->recv_cq = ib_alloc_cq(info->id->device, info,
			info->receive_credit_max, 0, IB_POLL_SOFTIRQ);
	if (IS_ERR(info->recv_cq)) {
		info->recv_cq = NULL;
		goto alloc_cq_failed;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.event_handler = smbd_qp_async_error_upcall;
	qp_attr.qp_context = info;
	qp_attr.cap.max_send_wr = info->send_credit_target;
	qp_attr.cap.max_recv_wr = info->receive_credit_max;
	qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE;
	qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE;
	qp_attr.cap.max_inline_data = 0;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = info->send_cq;
	qp_attr.recv_cq = info->recv_cq;
	qp_attr.port_num = ~0;

	rc = rdma_create_qp(info->id, info->pd, &qp_attr);
	if (rc) {
		log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
		goto create_qp_failed;
	}
	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.initiator_depth = 0;

	conn_param.responder_resources =
		info->id->device->attrs.max_qp_rd_atom
			< SMBD_CM_RESPONDER_RESOURCES ?
		info->id->device->attrs.max_qp_rd_atom :
		SMBD_CM_RESPONDER_RESOURCES;
	info->responder_resources = conn_param.responder_resources;
	log_rdma_mr(INFO, "responder_resources=%d\n",
		info->responder_resources);

	/* Need to send IRD/ORD in private data for iWARP */
	info->id->device->get_port_immutable(
		info->id->device, info->id->port_num, &port_immutable);
	if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
		ird_ord_hdr[0] = info->responder_resources;
		ird_ord_hdr[1] = 1;
		conn_param.private_data = ird_ord_hdr;
		conn_param.private_data_len = sizeof(ird_ord_hdr);
	} else {
		conn_param.private_data = NULL;
		conn_param.private_data_len = 0;
	}

	conn_param.retry_count = SMBD_CM_RETRY;
	conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
	conn_param.flow_control = 0;
	init_waitqueue_head(&info->wait_destroy);

	log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
		&addr_in->sin_addr, port);

	init_waitqueue_head(&info->conn_wait);
	rc = rdma_connect(info->id, &conn_param);
	if (rc) {
		log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
		goto rdma_connect_failed;
	}

	wait_event_interruptible(
		info->conn_wait, info->transport_status != SMBD_CONNECTING);

	if (info->transport_status != SMBD_CONNECTED) {
		log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
		goto rdma_connect_failed;
	}

	log_rdma_event(INFO, "rdma_connect connected\n");

	rc = allocate_caches_and_workqueue(info);
	if (rc) {
		log_rdma_event(ERR, "cache allocation failed\n");
		goto allocate_cache_failed;
	}

	init_waitqueue_head(&info->wait_send_queue);
	init_waitqueue_head(&info->wait_reassembly_queue);

	INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
	INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work);
	queue_delayed_work(info->workqueue, &info->idle_timer_work,
		info->keep_alive_interval*HZ);
	init_waitqueue_head(&info->wait_smbd_send_pending);
	info->smbd_send_pending = 0;
	init_waitqueue_head(&info->wait_smbd_recv_pending);
	info->smbd_recv_pending = 0;

	init_waitqueue_head(&info->wait_send_pending);
	atomic_set(&info->send_pending, 0);
	init_waitqueue_head(&info->wait_send_payload_pending);
	atomic_set(&info->send_payload_pending, 0);

	INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
	INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work);
	INIT_WORK(&info->recv_done_work, smbd_recv_done_work);
	INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
	info->new_credits_offered = 0;
	spin_lock_init(&info->lock_new_credits_offered);

	rc = smbd_negotiate(info);
	if (rc) {
		log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
		goto negotiation_failed;
	}

	rc = allocate_mr_list(info);
	if (rc) {
		log_rdma_mr(ERR, "memory registration allocation failed\n");
		goto allocate_mr_failed;
	}

	return info;

allocate_mr_failed:
	/* At this point, need to do a full transport shutdown */
	smbd_destroy(info);
	return NULL;

negotiation_failed:
	cancel_delayed_work_sync(&info->idle_timer_work);
	destroy_caches_and_workqueue(info);
	info->transport_status = SMBD_NEGOTIATE_FAILED;
	init_waitqueue_head(&info->conn_wait);
	rdma_disconnect(info->id);
	wait_event(info->conn_wait,
		info->transport_status == SMBD_DISCONNECTED);

allocate_cache_failed:
rdma_connect_failed:
	rdma_destroy_qp(info->id);

create_qp_failed:
alloc_cq_failed:
	if (info->send_cq)
		ib_free_cq(info->send_cq);
	if (info->recv_cq)
		ib_free_cq(info->recv_cq);

config_failed:
	ib_dealloc_pd(info->pd);
	rdma_destroy_id(info->id);

create_id_failed:
	kfree(info);
	return NULL;
}
struct smbd_connection *smbd_get_connection(
	struct TCP_Server_Info *server, struct sockaddr *dstaddr)
{
	struct smbd_connection *ret;
	int port = SMBD_PORT;

try_again:
	ret = _smbd_get_connection(server, dstaddr, port);

	/* Try SMB_PORT if SMBD_PORT doesn't work */
	if (!ret && port == SMBD_PORT) {
		port = SMB_PORT;
		goto try_again;
	}
	return ret;
}
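/*
 * Port fallback sketch: SMBD_PORT and SMB_PORT are defined elsewhere in cifs
 * (conventionally 5445 and 445; the macros are authoritative, the concrete
 * numbers here are stated only for illustration). The loop above behaves like
 *
 *	conn = _smbd_get_connection(server, dstaddr, SMBD_PORT);
 *	if (!conn)
 *		conn = _smbd_get_connection(server, dstaddr, SMB_PORT);
 *
 * so a server that offers SMB Direct on the standard SMB port rather than the
 * dedicated SMB Direct port can still be reached.
 */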
/*
 * Receive data from receive reassembly queue
 * All the incoming data packets are placed in reassembly queue
 * buf: the buffer to read data into
 * size: the length of data to read
 * return value: actual data read
 * Note: this implementation copies the data from reassembly queue to receive
 * buffers used by upper layer. This is not the optimal code path. A better way
 * to do it is to not have upper layer allocate its receive buffers but rather
 * borrow the buffer from reassembly queue, and return it after data is
 * consumed. But this will require more changes to upper layer code, and also
 * need to consider packet boundaries while they are still being reassembled.
 */
static int smbd_recv_buf(struct smbd_connection *info, char *buf,
		unsigned int size)
{
	struct smbd_response *response;
	struct smbd_data_transfer *data_transfer;
	int to_copy, to_read, data_read, offset;
	u32 data_length, remaining_data_length, data_offset;
	int rc;

again:
	if (info->transport_status != SMBD_CONNECTED) {
		log_read(ERR, "disconnected\n");
		return -ENODEV;
	}

	/*
	 * No need to hold the reassembly queue lock all the time as we are
	 * the only one reading from the front of the queue. The transport
	 * may add more entries to the back of the queue at the same time
	 */
	log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
		info->reassembly_data_length);
	if (info->reassembly_data_length >= size) {
		int queue_length;
		int queue_removed = 0;

		/*
		 * Need to make sure reassembly_data_length is read before
		 * reading reassembly_queue_length and calling
		 * _get_first_reassembly. This call is lock free
		 * as we never read at the end of the queue, which is being
		 * updated in SOFTIRQ as more data is received
		 */
		virt_rmb();
		queue_length = info->reassembly_queue_length;
		data_read = 0;
		to_read = size;
		offset = info->first_entry_offset;
		while (data_read < size) {
			response = _get_first_reassembly(info);
			data_transfer = smbd_response_payload(response);
			data_length = le32_to_cpu(data_transfer->data_length);
			remaining_data_length =
				le32_to_cpu(
					data_transfer->remaining_data_length);
			data_offset = le32_to_cpu(data_transfer->data_offset);

			/*
			 * The upper layer expects RFC1002 length at the
			 * beginning of the payload. Return it to indicate
			 * the total length of the packet. This minimizes the
			 * change to upper layer packet processing logic. This
			 * will eventually be removed when an intermediate
			 * transport layer is added
			 */
			if (response->first_segment && size == 4) {
				unsigned int rfc1002_len =
					data_length + remaining_data_length;
				*((__be32 *)buf) = cpu_to_be32(rfc1002_len);
				data_read = 4;
				response->first_segment = false;
				log_read(INFO, "returning rfc1002 length %d\n",
					rfc1002_len);
				goto read_rfc1002_done;
			}

			to_copy = min_t(int, data_length - offset, to_read);
			memcpy(
				buf + data_read,
				(char *)data_transfer + data_offset + offset,
				to_copy);

			/* move on to the next buffer? */
			if (to_copy == data_length - offset) {
				queue_length--;
				/*
				 * No need to lock if we are not at the
				 * end of the queue
				 */
				if (queue_length)
					list_del(&response->list);
				else {
					spin_lock_irq(
						&info->reassembly_queue_lock);
					list_del(&response->list);
					spin_unlock_irq(
						&info->reassembly_queue_lock);
				}
				queue_removed++;
				info->count_reassembly_queue--;
				info->count_dequeue_reassembly_queue++;
				put_receive_buffer(info, response);
				offset = 0;
				log_read(INFO, "put_receive_buffer offset=0\n");
			} else
				offset += to_copy;

			to_read -= to_copy;
			data_read += to_copy;

			log_read(INFO, "_get_first_reassembly memcpy %d bytes "
				"data_transfer_length-offset=%d after that "
				"to_read=%d data_read=%d offset=%d\n",
				to_copy, data_length - offset,
				to_read, data_read, offset);
		}

		spin_lock_irq(&info->reassembly_queue_lock);
		info->reassembly_data_length -= data_read;
		info->reassembly_queue_length -= queue_removed;
		spin_unlock_irq(&info->reassembly_queue_lock);

		info->first_entry_offset = offset;
		log_read(INFO, "returning to thread data_read=%d "
			"reassembly_data_length=%d first_entry_offset=%d\n",
			data_read, info->reassembly_data_length,
			info->first_entry_offset);
read_rfc1002_done:
		return data_read;
	}

	log_read(INFO, "wait_event on more data\n");
	rc = wait_event_interruptible(
		info->wait_reassembly_queue,
		info->reassembly_data_length >= size ||
			info->transport_status != SMBD_CONNECTED);
	/* Don't return any data if interrupted */
	if (rc)
		return -ENODEV;

	goto again;
}
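/*
 * Worked example for the RFC1002 path above (numbers are hypothetical): the
 * upper layer first asks for 4 bytes. If the first segment of a message
 * carries data_length = 1024 and remaining_data_length = 3072, then
 *
 *	rfc1002_len = 1024 + 3072 = 4096
 *
 * is returned as a big-endian 32-bit value, and the caller subsequently reads
 * 4096 bytes of payload, which smbd_recv_buf() copies out of as many queued
 * segments as needed, taking the reassembly queue lock only when it removes
 * the last queued entry.
 */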
/*
 * Receive a page from receive reassembly queue
 * page: the page to read data into
 * to_read: the length of data to read
 * return value: actual data read
 */
static int smbd_recv_page(struct smbd_connection *info,
		struct page *page, unsigned int page_offset,
		unsigned int to_read)
{
	int ret;
	char *to_address;
	void *page_address;

	/* make sure we have the page ready for read */
	ret = wait_event_interruptible(
		info->wait_reassembly_queue,
		info->reassembly_data_length >= to_read ||
			info->transport_status != SMBD_CONNECTED);
	if (ret)
		return ret;

	/* now we can read from reassembly queue and not sleep */
	page_address = kmap_atomic(page);
	to_address = (char *) page_address + page_offset;

	log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
		page, to_address, to_read);

	ret = smbd_recv_buf(info, to_address, to_read);
	kunmap_atomic(page_address);

	return ret;
}
/*
 * Receive data from transport
 * msg: a msghdr pointing to the buffer, can be ITER_KVEC or ITER_BVEC
 * return: total bytes read, or 0. SMB Direct will not do partial read.
 */
int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
	char *buf;
	struct page *page;
	unsigned int to_read, page_offset;
	int rc;

	info->smbd_recv_pending++;

	switch (msg->msg_iter.type) {
	case READ | ITER_KVEC:
		buf = msg->msg_iter.kvec->iov_base;
		to_read = msg->msg_iter.kvec->iov_len;
		rc = smbd_recv_buf(info, buf, to_read);
		break;

	case READ | ITER_BVEC:
		page = msg->msg_iter.bvec->bv_page;
		page_offset = msg->msg_iter.bvec->bv_offset;
		to_read = msg->msg_iter.bvec->bv_len;
		rc = smbd_recv_page(info, page, page_offset, to_read);
		break;

	default:
		/* It's a bug in upper layer to get there */
		cifs_dbg(VFS, "CIFS: invalid msg type %d\n",
			msg->msg_iter.type);
		rc = -EINVAL;
	}

	info->smbd_recv_pending--;
	wake_up(&info->wait_smbd_recv_pending);

	/* SMBDirect will read it all or nothing */
	if (rc > 0)
		msg->msg_iter.count = 0;
	return rc;
}
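/*
 * Caller-side sketch (not code from this file; the exact iov_iter setup
 * helper and its signature depend on the caller's kernel version and are an
 * assumption here): reading a 4-byte RFC1002 header through this entry point
 * looks roughly like
 *
 *	struct kvec iov = { .iov_base = buf, .iov_len = 4 };
 *	struct msghdr msg = { 0 };
 *
 *	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, 4);
 *	rc = smbd_recv(server->smbd_conn, &msg);
 *
 * The transport either fills the whole buffer or returns an error; partial
 * reads are never returned.
 */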
/*
 * Send data to transport
 * Each rqst is transported as a SMBDirect payload
 * rqst: the data to write
 * return value: 0 if successfully written, otherwise error code
 */
int smbd_send(struct TCP_Server_Info *server,
	int num_rqst, struct smb_rqst *rqst_array)
{
	struct smbd_connection *info = server->smbd_conn;
	struct kvec vec;
	int nvecs;
	int size;
	unsigned int buflen, remaining_data_length;
	int start, i, j;
	int max_iov_size =
		info->max_send_size - sizeof(struct smbd_data_transfer);
	struct kvec *iov;
	int rc;
	struct smb_rqst *rqst;
	int rqst_idx;

	info->smbd_send_pending++;
	if (info->transport_status != SMBD_CONNECTED) {
		rc = -ENODEV;
		goto done;
	}

	/*
	 * Add in the page array if there is one. The caller needs to set
	 * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and
	 * ends at page boundary
	 */
	remaining_data_length = 0;
	for (i = 0; i < num_rqst; i++)
		remaining_data_length += smb_rqst_len(server, &rqst_array[i]);

	if (remaining_data_length + sizeof(struct smbd_data_transfer) >
		info->max_fragmented_send_size) {
		log_write(ERR, "payload size %d > max size %d\n",
			remaining_data_length, info->max_fragmented_send_size);
		rc = -EINVAL;
		goto done;
	}

	rqst_idx = 0;

next_rqst:
	rqst = &rqst_array[rqst_idx];
	iov = rqst->rq_iov;

	cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
		rqst_idx, smb_rqst_len(server, rqst));
	for (i = 0; i < rqst->rq_nvec; i++)
		dump_smb(iov[i].iov_base, iov[i].iov_len);

	log_write(INFO, "rqst_idx=%d nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
		"rq_tailsz=%d buflen=%lu\n",
		rqst_idx, rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
		rqst->rq_tailsz, smb_rqst_len(server, rqst));

	start = i = 0;
	buflen = 0;
	while (true) {
		buflen += iov[i].iov_len;
		if (buflen > max_iov_size) {
			if (i > start) {
				remaining_data_length -=
					(buflen-iov[i].iov_len);
				log_write(INFO, "sending iov[] from start=%d "
					"i=%d nvecs=%d "
					"remaining_data_length=%d\n",
					start, i, i-start,
					remaining_data_length);
				rc = smbd_post_send_data(
					info, &iov[start], i-start,
					remaining_data_length);
				if (rc)
					goto done;
			} else {
				/* iov[start] is too big, break it */
				nvecs = (buflen+max_iov_size-1)/max_iov_size;
				log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
					" break to %d vectors\n",
					start, iov[start].iov_base,
					buflen, nvecs);
				for (j = 0; j < nvecs; j++) {
					vec.iov_base =
						(char *)iov[start].iov_base +
						j*max_iov_size;
					vec.iov_len = max_iov_size;
					if (j == nvecs-1)
						vec.iov_len =
							buflen -
							max_iov_size*(nvecs-1);
					remaining_data_length -= vec.iov_len;
					log_write(INFO,
						"sending vec j=%d iov_base=%p"
						" iov_len=%zu "
						"remaining_data_length=%d\n",
						j, vec.iov_base, vec.iov_len,
						remaining_data_length);
					rc = smbd_post_send_data(
						info, &vec, 1,
						remaining_data_length);
					if (rc)
						goto done;
				}
				i++;
				if (i == rqst->rq_nvec)
					break;
			}
			start = i;
			buflen = 0;
		} else {
			i++;
			if (i == rqst->rq_nvec) {
				/* send out all remaining vecs */
				remaining_data_length -= buflen;
				log_write(INFO,
					"sending iov[] from start=%d i=%d "
					"nvecs=%d remaining_data_length=%d\n",
					start, i, i-start,
					remaining_data_length);
				rc = smbd_post_send_data(info, &iov[start],
					i-start, remaining_data_length);
				if (rc)
					goto done;
				break;
			}
		}
		log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
	}

	/* now sending pages if there are any */
	for (i = 0; i < rqst->rq_npages; i++) {
		unsigned int offset;

		rqst_page_get_length(rqst, i, &buflen, &offset);
		nvecs = (buflen + max_iov_size - 1) / max_iov_size;
		log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
			buflen, nvecs);
		for (j = 0; j < nvecs; j++) {
			size = max_iov_size;
			if (j == nvecs-1)
				size = buflen - j*max_iov_size;
			remaining_data_length -= size;
			log_write(INFO, "sending pages i=%d offset=%d size=%d"
				" remaining_data_length=%d\n",
				i, j*max_iov_size+offset, size,
				remaining_data_length);
			rc = smbd_post_send_page(
				info, rqst->rq_pages[i],
				j*max_iov_size + offset,
				size, remaining_data_length);
			if (rc)
				goto done;
		}
	}

	rqst_idx++;
	if (rqst_idx < num_rqst)
		goto next_rqst;

done:
	/*
	 * As an optimization, we don't wait for individual I/O to finish
	 * before sending the next one.
	 * Send them all and wait for the pending send count to reach 0;
	 * that means all the I/Os have gone out and we are good to return.
	 */
	wait_event(info->wait_send_payload_pending,
		atomic_read(&info->send_payload_pending) == 0);

	info->smbd_send_pending--;
	wake_up(&info->wait_smbd_send_pending);

	return rc;
}
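/*
 * Fragmentation arithmetic for smbd_send() (hypothetical numbers, for
 * illustration only): with max_send_size = 1364 and
 * sizeof(struct smbd_data_transfer) = 24,
 *
 *	max_iov_size = 1364 - 24 = 1340
 *
 * so a single 4096-byte iovec is split into
 *
 *	nvecs = (4096 + 1340 - 1) / 1340 = 4
 *
 * sends of 1340, 1340, 1340 and 76 bytes, each posted with the running
 * remaining_data_length so the peer can size its reassembly buffer.
 */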
static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbd_mr *mr;
	struct ib_cqe *cqe;

	if (wc->status) {
		log_rdma_mr(ERR, "status=%d\n", wc->status);
		cqe = wc->wr_cqe;
		mr = container_of(cqe, struct smbd_mr, cqe);
		smbd_disconnect_rdma_connection(mr->conn);
	}
}
/*
 * The work queue function that recovers MRs
 * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
 * again. Both calls are slow, so finish them in a workqueue. This will not
 * block the I/O path.
 * There is one workqueue that recovers MRs, there is no need to lock as the
 * I/O requests calling smbd_register_mr will never update the links in the
 * mr_list.
 */
static void smbd_mr_recovery_work(struct work_struct *work)
{
	struct smbd_connection *info =
		container_of(work, struct smbd_connection, mr_recovery_work);
	struct smbd_mr *smbdirect_mr;
	int rc;

	list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
		if (smbdirect_mr->state == MR_INVALIDATED ||
			smbdirect_mr->state == MR_ERROR) {

			/* recover this MR entry */
			rc = ib_dereg_mr(smbdirect_mr->mr);
			if (rc) {
				log_rdma_mr(ERR,
					"ib_dereg_mr failed rc=%x\n",
					rc);
				smbd_disconnect_rdma_connection(info);
				continue;
			}

			smbdirect_mr->mr = ib_alloc_mr(
				info->pd, info->mr_type,
				info->max_frmr_depth);
			if (IS_ERR(smbdirect_mr->mr)) {
				log_rdma_mr(ERR,
					"ib_alloc_mr failed mr_type=%x "
					"max_frmr_depth=%x\n",
					info->mr_type,
					info->max_frmr_depth);
				smbd_disconnect_rdma_connection(info);
				continue;
			}

			if (smbdirect_mr->state == MR_INVALIDATED)
				ib_dma_unmap_sg(
					info->id->device, smbdirect_mr->sgl,
					smbdirect_mr->sgl_count,
					smbdirect_mr->dir);

			smbdirect_mr->state = MR_READY;

			/* smbdirect_mr->state is updated by this function
			 * and is read and updated by I/O issuing CPUs trying
			 * to get a MR, the call to atomic_inc_return
			 * implies a memory barrier and guarantees this
			 * value is updated before waking up any calls to
			 * get_mr() from the I/O issuing CPUs
			 */
			if (atomic_inc_return(&info->mr_ready_count) == 1)
				wake_up_interruptible(&info->wait_mr);
		}
	}
}
static void destroy_mr_list(struct smbd_connection *info)
{
	struct smbd_mr *mr, *tmp;

	cancel_work_sync(&info->mr_recovery_work);
	list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
		if (mr->state == MR_INVALIDATED)
			ib_dma_unmap_sg(info->id->device, mr->sgl,
				mr->sgl_count, mr->dir);
		ib_dereg_mr(mr->mr);
		kfree(mr->sgl);
		kfree(mr);
	}
}
/*
 * Allocate MRs used for RDMA read/write
 * The number of MRs will not exceed hardware capability in responder_resources
 * All MRs are kept in mr_list. The MR can be recovered after it's used
 * Recovery is done in smbd_mr_recovery_work. The content of list entry changes
 * as MRs are used and recovered for I/O, but the list links will not change
 */
static int allocate_mr_list(struct smbd_connection *info)
{
	int i;
	struct smbd_mr *smbdirect_mr, *tmp;

	INIT_LIST_HEAD(&info->mr_list);
	init_waitqueue_head(&info->wait_mr);
	spin_lock_init(&info->mr_list_lock);
	atomic_set(&info->mr_ready_count, 0);
	atomic_set(&info->mr_used_count, 0);
	init_waitqueue_head(&info->wait_for_mr_cleanup);
	/* Allocate more MRs (2x) than hardware responder_resources */
	for (i = 0; i < info->responder_resources * 2; i++) {
		smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
		if (!smbdirect_mr)
			goto out;
		smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
					info->max_frmr_depth);
		if (IS_ERR(smbdirect_mr->mr)) {
			log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x "
				"max_frmr_depth=%x\n",
				info->mr_type, info->max_frmr_depth);
			goto out;
		}
		smbdirect_mr->sgl = kcalloc(
					info->max_frmr_depth,
					sizeof(struct scatterlist),
					GFP_KERNEL);
		if (!smbdirect_mr->sgl) {
			log_rdma_mr(ERR, "failed to allocate sgl\n");
			ib_dereg_mr(smbdirect_mr->mr);
			goto out;
		}
		smbdirect_mr->state = MR_READY;
		smbdirect_mr->conn = info;

		list_add_tail(&smbdirect_mr->list, &info->mr_list);
		atomic_inc(&info->mr_ready_count);
	}
	INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
	return 0;

out:
	kfree(smbdirect_mr);

	list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
		ib_dereg_mr(smbdirect_mr->mr);
		kfree(smbdirect_mr->sgl);
		kfree(smbdirect_mr);
	}
	return -ENOMEM;
}
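/*
 * MR pool sizing sketch (the figure 16 below is an assumption for
 * illustration only): if the device reports max_qp_rd_atom = 16, then
 * responder_resources is capped at min(16, SMBD_CM_RESPONDER_RESOURCES) in
 * _smbd_get_connection() and the loop above allocates twice that many MRs,
 * e.g. 32, so that while some of them are in flight others can sit in
 * smbd_mr_recovery_work() being deregistered and reallocated.
 */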
/*
 * Get a MR from mr_list. This function waits until there is at least one
 * MR available in the list. It may access the list while the
 * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
 * as they never modify the same places. However, there may be several CPUs
 * issuing I/O trying to get MR at the same time, mr_list_lock is used to
 * protect this situation.
 */
static struct smbd_mr *get_mr(struct smbd_connection *info)
{
	struct smbd_mr *ret;
	int rc;
again:
	rc = wait_event_interruptible(info->wait_mr,
		atomic_read(&info->mr_ready_count) ||
		info->transport_status != SMBD_CONNECTED);
	if (rc) {
		log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
		return NULL;
	}

	if (info->transport_status != SMBD_CONNECTED) {
		log_rdma_mr(ERR, "info->transport_status=%x\n",
			info->transport_status);
		return NULL;
	}

	spin_lock(&info->mr_list_lock);
	list_for_each_entry(ret, &info->mr_list, list) {
		if (ret->state == MR_READY) {
			ret->state = MR_REGISTERED;
			spin_unlock(&info->mr_list_lock);
			atomic_dec(&info->mr_ready_count);
			atomic_inc(&info->mr_used_count);
			return ret;
		}
	}

	spin_unlock(&info->mr_list_lock);
	/*
	 * It is possible that we could fail to get MR because other processes
	 * may try to acquire a MR at the same time. If this is the case, retry it.
	 */
	goto again;
}
/*
 * Register memory for RDMA read/write
 * pages[]: the list of pages to register memory with
 * num_pages: the number of pages to register
 * tailsz: if non-zero, the bytes to register in the last page
 * writing: true if this is a RDMA write (SMB read), false for RDMA read
 * need_invalidate: true if this MR needs to be locally invalidated after I/O
 * return value: the MR registered, NULL if failed.
 */
struct smbd_mr *smbd_register_mr(
	struct smbd_connection *info, struct page *pages[], int num_pages,
	int offset, int tailsz, bool writing, bool need_invalidate)
{
	struct smbd_mr *smbdirect_mr;
	int rc, i;
	enum dma_data_direction dir;
	struct ib_reg_wr *reg_wr;

	if (num_pages > info->max_frmr_depth) {
		log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
			num_pages, info->max_frmr_depth);
		return NULL;
	}

	smbdirect_mr = get_mr(info);
	if (!smbdirect_mr) {
		log_rdma_mr(ERR, "get_mr returning NULL\n");
		return NULL;
	}
	smbdirect_mr->need_invalidate = need_invalidate;
	smbdirect_mr->sgl_count = num_pages;
	sg_init_table(smbdirect_mr->sgl, num_pages);

	log_rdma_mr(INFO, "num_pages=0x%x offset=0x%x tailsz=0x%x\n",
			num_pages, offset, tailsz);

	if (num_pages == 1) {
		sg_set_page(&smbdirect_mr->sgl[0], pages[0], tailsz, offset);
		goto skip_multiple_pages;
	}

	/* We have at least two pages to register */
	sg_set_page(
		&smbdirect_mr->sgl[0], pages[0], PAGE_SIZE - offset, offset);
	i = 1;
	while (i < num_pages - 1) {
		sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0);
		i++;
	}
	sg_set_page(&smbdirect_mr->sgl[i], pages[i],
		tailsz ? tailsz : PAGE_SIZE, 0);

skip_multiple_pages:
	dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	smbdirect_mr->dir = dir;
	rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir);
	if (!rc) {
		log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
			num_pages, dir, rc);
		goto dma_map_error;
	}

	rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages,
		NULL, PAGE_SIZE);
	if (rc != num_pages) {
		log_rdma_mr(ERR,
			"ib_map_mr_sg failed rc = %d num_pages = %x\n",
			rc, num_pages);
		goto map_mr_error;
	}

	ib_update_fast_reg_key(smbdirect_mr->mr,
		ib_inc_rkey(smbdirect_mr->mr->rkey));
	reg_wr = &smbdirect_mr->wr;
	reg_wr->wr.opcode = IB_WR_REG_MR;
	smbdirect_mr->cqe.done = register_mr_done;
	reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
	reg_wr->wr.num_sge = 0;
	reg_wr->wr.send_flags = IB_SEND_SIGNALED;
	reg_wr->mr = smbdirect_mr->mr;
	reg_wr->key = smbdirect_mr->mr->rkey;
	reg_wr->access = writing ?
			IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
			IB_ACCESS_REMOTE_READ;

	/*
	 * There is no need to wait for completion on ib_post_send
	 * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
	 * on the next ib_post_send when we actually send I/O to remote peer
	 */
	rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL);
	if (!rc)
		return smbdirect_mr;

	log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
		rc, reg_wr->key);

	/* If all failed, attempt to recover this MR by setting it MR_ERROR */
map_mr_error:
	ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl,
		smbdirect_mr->sgl_count, smbdirect_mr->dir);

dma_map_error:
	smbdirect_mr->state = MR_ERROR;
	if (atomic_dec_and_test(&info->mr_used_count))
		wake_up(&info->wait_for_mr_cleanup);

	smbd_disconnect_rdma_connection(info);

	return NULL;
}
static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbd_mr *smbdirect_mr;
	struct ib_cqe *cqe;

	cqe = wc->wr_cqe;
	smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
	smbdirect_mr->state = MR_INVALIDATED;
	if (wc->status != IB_WC_SUCCESS) {
		log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
		smbdirect_mr->state = MR_ERROR;
	}
	complete(&smbdirect_mr->invalidate_done);
}
/*
 * Deregister a MR after I/O is done
 * This function may wait if remote invalidation is not used
 * and we have to locally invalidate the buffer to prevent data from being
 * modified by the remote peer after the upper layer consumes it
 */
int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
{
	struct ib_send_wr *wr;
	struct smbd_connection *info = smbdirect_mr->conn;
	int rc = 0;

	if (smbdirect_mr->need_invalidate) {
		/* Need to finish local invalidation before returning */
		wr = &smbdirect_mr->inv_wr;
		wr->opcode = IB_WR_LOCAL_INV;
		smbdirect_mr->cqe.done = local_inv_done;
		wr->wr_cqe = &smbdirect_mr->cqe;
		wr->num_sge = 0;
		wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
		wr->send_flags = IB_SEND_SIGNALED;

		init_completion(&smbdirect_mr->invalidate_done);
		rc = ib_post_send(info->id->qp, wr, NULL);
		if (rc) {
			log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
			smbd_disconnect_rdma_connection(info);
			goto done;
		}
		wait_for_completion(&smbdirect_mr->invalidate_done);
		smbdirect_mr->need_invalidate = false;
	} else
		/*
		 * For remote invalidation, just set it to MR_INVALIDATED
		 * and defer to mr_recovery_work to recover the MR for next use
		 */
		smbdirect_mr->state = MR_INVALIDATED;

	/*
	 * Schedule the work to do MR recovery for future I/Os
	 * MR recovery is slow and we don't want it to block the current I/O
	 */
	queue_work(info->workqueue, &info->mr_recovery_work);

done:
	if (atomic_dec_and_test(&info->mr_used_count))
		wake_up(&info->wait_for_mr_cleanup);

	return rc;
}
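/*
 * End-to-end RDMA read/write sketch (illustrative only; error handling is
 * omitted and the buffer-descriptor marshalling is assumed to be done by the
 * SMB2 callers of these two functions):
 *
 *	mr = smbd_register_mr(info, pages, npages, 0, tailsz,
 *			      true, need_invalidate);	// writing == true
 *	// advertise mr->mr->rkey and the registered length to the server in
 *	// the SMB2 READ/WRITE request; the server then performs RDMA
 *	// directly into or out of the registered pages
 *	...
 *	rc = smbd_deregister_mr(mr);	// local or remote invalidation
 *
 * smbd_deregister_mr() must run before the pages are reused so the peer can
 * no longer touch them.
 */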