kcmsock.c 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Kernel Connection Multiplexor
  4. *
  5. * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
  6. */
  7. #include <linux/bpf.h>
  8. #include <linux/errno.h>
  9. #include <linux/errqueue.h>
  10. #include <linux/file.h>
  11. #include <linux/filter.h>
  12. #include <linux/in.h>
  13. #include <linux/kernel.h>
  14. #include <linux/module.h>
  15. #include <linux/net.h>
  16. #include <linux/netdevice.h>
  17. #include <linux/poll.h>
  18. #include <linux/rculist.h>
  19. #include <linux/skbuff.h>
  20. #include <linux/socket.h>
  21. #include <linux/uaccess.h>
  22. #include <linux/workqueue.h>
  23. #include <linux/syscalls.h>
  24. #include <linux/sched/signal.h>
  25. #include <net/kcm.h>
  26. #include <net/netns/generic.h>
  27. #include <net/sock.h>
  28. #include <uapi/linux/kcm.h>
  29. #include <trace/events/sock.h>
  30. unsigned int kcm_net_id;
  31. static struct kmem_cache *kcm_psockp __read_mostly;
  32. static struct kmem_cache *kcm_muxp __read_mostly;
  33. static struct workqueue_struct *kcm_wq;
  /* View a generic sock as the KCM socket it is cast to. */
  static inline struct kcm_sock *kcm_sk(const struct sock *sk)
  {
      struct kcm_sock *kcm = (struct kcm_sock *)sk;

      return kcm;
  }
  38. static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
  39. {
  40. return (struct kcm_tx_msg *)skb->cb;
  41. }
  42. static void report_csk_error(struct sock *csk, int err)
  43. {
  44. csk->sk_err = EPIPE;
  45. sk_error_report(csk);
  46. }
  47. static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
  48. bool wakeup_kcm)
  49. {
  50. struct sock *csk = psock->sk;
  51. struct kcm_mux *mux = psock->mux;
  52. /* Unrecoverable error in transmit */
  53. spin_lock_bh(&mux->lock);
  54. if (psock->tx_stopped) {
  55. spin_unlock_bh(&mux->lock);
  56. return;
  57. }
  58. psock->tx_stopped = 1;
  59. KCM_STATS_INCR(psock->stats.tx_aborts);
  60. if (!psock->tx_kcm) {
  61. /* Take off psocks_avail list */
  62. list_del(&psock->psock_avail_list);
  63. } else if (wakeup_kcm) {
  64. /* In this case psock is being aborted while outside of
  65. * write_msgs and psock is reserved. Schedule tx_work
  66. * to handle the failure there. Need to commit tx_stopped
  67. * before queuing work.
  68. */
  69. smp_mb();
  70. queue_work(kcm_wq, &psock->tx_kcm->tx_work);
  71. }
  72. spin_unlock_bh(&mux->lock);
  73. /* Report error on lower socket */
  74. report_csk_error(csk, err);
  75. }
  76. /* RX mux lock held. */
  77. static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
  78. struct kcm_psock *psock)
  79. {
  80. STRP_STATS_ADD(mux->stats.rx_bytes,
  81. psock->strp.stats.bytes -
  82. psock->saved_rx_bytes);
  83. mux->stats.rx_msgs +=
  84. psock->strp.stats.msgs - psock->saved_rx_msgs;
  85. psock->saved_rx_msgs = psock->strp.stats.msgs;
  86. psock->saved_rx_bytes = psock->strp.stats.bytes;
  87. }
  88. static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
  89. struct kcm_psock *psock)
  90. {
  91. KCM_STATS_ADD(mux->stats.tx_bytes,
  92. psock->stats.tx_bytes - psock->saved_tx_bytes);
  93. mux->stats.tx_msgs +=
  94. psock->stats.tx_msgs - psock->saved_tx_msgs;
  95. psock->saved_tx_msgs = psock->stats.tx_msgs;
  96. psock->saved_tx_bytes = psock->stats.tx_bytes;
  97. }
  98. static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
  99. /* KCM is ready to receive messages on its queue-- either the KCM is new or
  100. * has become unblocked after being blocked on full socket buffer. Queue any
  101. * pending ready messages on a psock. RX mux lock held.
  102. */
  103. static void kcm_rcv_ready(struct kcm_sock *kcm)
  104. {
  105. struct kcm_mux *mux = kcm->mux;
  106. struct kcm_psock *psock;
  107. struct sk_buff *skb;
  108. if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
  109. return;
  110. while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
  111. if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
  112. /* Assuming buffer limit has been reached */
  113. skb_queue_head(&mux->rx_hold_queue, skb);
  114. WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
  115. return;
  116. }
  117. }
  118. while (!list_empty(&mux->psocks_ready)) {
  119. psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
  120. psock_ready_list);
  121. if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
  122. /* Assuming buffer limit has been reached */
  123. WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
  124. return;
  125. }
  126. /* Consumed the ready message on the psock. Schedule rx_work to
  127. * get more messages.
  128. */
  129. list_del(&psock->psock_ready_list);
  130. psock->ready_rx_msg = NULL;
  131. /* Commit clearing of ready_rx_msg for queuing work */
  132. smp_mb();
  133. strp_unpause(&psock->strp);
  134. strp_check_rcv(&psock->strp);
  135. }
  136. /* Buffer limit is okay now, add to ready list */
  137. list_add_tail(&kcm->wait_rx_list,
  138. &kcm->mux->kcm_rx_waiters);
  139. /* paired with lockless reads in kcm_rfree() */
  140. WRITE_ONCE(kcm->rx_wait, true);
  141. }
  142. static void kcm_rfree(struct sk_buff *skb)
  143. {
  144. struct sock *sk = skb->sk;
  145. struct kcm_sock *kcm = kcm_sk(sk);
  146. struct kcm_mux *mux = kcm->mux;
  147. unsigned int len = skb->truesize;
  148. sk_mem_uncharge(sk, len);
  149. atomic_sub(len, &sk->sk_rmem_alloc);
  150. /* For reading rx_wait and rx_psock without holding lock */
  151. smp_mb__after_atomic();
  152. if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) &&
  153. sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
  154. spin_lock_bh(&mux->rx_lock);
  155. kcm_rcv_ready(kcm);
  156. spin_unlock_bh(&mux->rx_lock);
  157. }
  158. }
  159. static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  160. {
  161. struct sk_buff_head *list = &sk->sk_receive_queue;
  162. if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
  163. return -ENOMEM;
  164. if (!sk_rmem_schedule(sk, skb, skb->truesize))
  165. return -ENOBUFS;
  166. skb->dev = NULL;
  167. skb_orphan(skb);
  168. skb->sk = sk;
  169. skb->destructor = kcm_rfree;
  170. atomic_add(skb->truesize, &sk->sk_rmem_alloc);
  171. sk_mem_charge(sk, skb->truesize);
  172. skb_queue_tail(list, skb);
  173. if (!sock_flag(sk, SOCK_DEAD))
  174. sk->sk_data_ready(sk);
  175. return 0;
  176. }
  177. /* Requeue received messages for a kcm socket to other kcm sockets. This is
  178. * called with a kcm socket is receive disabled.
  179. * RX mux lock held.
  180. */
  181. static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
  182. {
  183. struct sk_buff *skb;
  184. struct kcm_sock *kcm;
  185. while ((skb = skb_dequeue(head))) {
  186. /* Reset destructor to avoid calling kcm_rcv_ready */
  187. skb->destructor = sock_rfree;
  188. skb_orphan(skb);
  189. try_again:
  190. if (list_empty(&mux->kcm_rx_waiters)) {
  191. skb_queue_tail(&mux->rx_hold_queue, skb);
  192. continue;
  193. }
  194. kcm = list_first_entry(&mux->kcm_rx_waiters,
  195. struct kcm_sock, wait_rx_list);
  196. if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
  197. /* Should mean socket buffer full */
  198. list_del(&kcm->wait_rx_list);
  199. /* paired with lockless reads in kcm_rfree() */
  200. WRITE_ONCE(kcm->rx_wait, false);
  201. /* Commit rx_wait to read in kcm_free */
  202. smp_wmb();
  203. goto try_again;
  204. }
  205. }
  206. }
  207. /* Lower sock lock held */
  208. static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
  209. struct sk_buff *head)
  210. {
  211. struct kcm_mux *mux = psock->mux;
  212. struct kcm_sock *kcm;
  213. WARN_ON(psock->ready_rx_msg);
  214. if (psock->rx_kcm)
  215. return psock->rx_kcm;
  216. spin_lock_bh(&mux->rx_lock);
  217. if (psock->rx_kcm) {
  218. spin_unlock_bh(&mux->rx_lock);
  219. return psock->rx_kcm;
  220. }
  221. kcm_update_rx_mux_stats(mux, psock);
  222. if (list_empty(&mux->kcm_rx_waiters)) {
  223. psock->ready_rx_msg = head;
  224. strp_pause(&psock->strp);
  225. list_add_tail(&psock->psock_ready_list,
  226. &mux->psocks_ready);
  227. spin_unlock_bh(&mux->rx_lock);
  228. return NULL;
  229. }
  230. kcm = list_first_entry(&mux->kcm_rx_waiters,
  231. struct kcm_sock, wait_rx_list);
  232. list_del(&kcm->wait_rx_list);
  233. /* paired with lockless reads in kcm_rfree() */
  234. WRITE_ONCE(kcm->rx_wait, false);
  235. psock->rx_kcm = kcm;
  236. /* paired with lockless reads in kcm_rfree() */
  237. WRITE_ONCE(kcm->rx_psock, psock);
  238. spin_unlock_bh(&mux->rx_lock);
  239. return kcm;
  240. }
  241. static void kcm_done(struct kcm_sock *kcm);
  242. static void kcm_done_work(struct work_struct *w)
  243. {
  244. kcm_done(container_of(w, struct kcm_sock, done_work));
  245. }
  246. /* Lower sock held */
  247. static void unreserve_rx_kcm(struct kcm_psock *psock,
  248. bool rcv_ready)
  249. {
  250. struct kcm_sock *kcm = psock->rx_kcm;
  251. struct kcm_mux *mux = psock->mux;
  252. if (!kcm)
  253. return;
  254. spin_lock_bh(&mux->rx_lock);
  255. psock->rx_kcm = NULL;
  256. /* paired with lockless reads in kcm_rfree() */
  257. WRITE_ONCE(kcm->rx_psock, NULL);
  258. /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
  259. * kcm_rfree
  260. */
  261. smp_mb();
  262. if (unlikely(kcm->done)) {
  263. spin_unlock_bh(&mux->rx_lock);
  264. /* Need to run kcm_done in a task since we need to qcquire
  265. * callback locks which may already be held here.
  266. */
  267. INIT_WORK(&kcm->done_work, kcm_done_work);
  268. schedule_work(&kcm->done_work);
  269. return;
  270. }
  271. if (unlikely(kcm->rx_disabled)) {
  272. requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
  273. } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
  274. /* Check for degenerative race with rx_wait that all
  275. * data was dequeued (accounted for in kcm_rfree).
  276. */
  277. kcm_rcv_ready(kcm);
  278. }
  279. spin_unlock_bh(&mux->rx_lock);
  280. }
  281. /* Lower sock lock held */
  282. static void psock_data_ready(struct sock *sk)
  283. {
  284. struct kcm_psock *psock;
  285. trace_sk_data_ready(sk);
  286. read_lock_bh(&sk->sk_callback_lock);
  287. psock = (struct kcm_psock *)sk->sk_user_data;
  288. if (likely(psock))
  289. strp_data_ready(&psock->strp);
  290. read_unlock_bh(&sk->sk_callback_lock);
  291. }
  292. /* Called with lower sock held */
  293. static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb)
  294. {
  295. struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
  296. struct kcm_sock *kcm;
  297. try_queue:
  298. kcm = reserve_rx_kcm(psock, skb);
  299. if (!kcm) {
  300. /* Unable to reserve a KCM, message is held in psock and strp
  301. * is paused.
  302. */
  303. return;
  304. }
  305. if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
  306. /* Should mean socket buffer full */
  307. unreserve_rx_kcm(psock, false);
  308. goto try_queue;
  309. }
  310. }
  311. static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
  312. {
  313. struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
  314. struct bpf_prog *prog = psock->bpf_prog;
  315. int res;
  316. res = bpf_prog_run_pin_on_cpu(prog, skb);
  317. return res;
  318. }
  319. static int kcm_read_sock_done(struct strparser *strp, int err)
  320. {
  321. struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
  322. unreserve_rx_kcm(psock, true);
  323. return err;
  324. }
  325. static void psock_state_change(struct sock *sk)
  326. {
  327. /* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here
  328. * since application will normally not poll with EPOLLIN
  329. * on the TCP sockets.
  330. */
  331. report_csk_error(sk, EPIPE);
  332. }
  333. static void psock_write_space(struct sock *sk)
  334. {
  335. struct kcm_psock *psock;
  336. struct kcm_mux *mux;
  337. struct kcm_sock *kcm;
  338. read_lock_bh(&sk->sk_callback_lock);
  339. psock = (struct kcm_psock *)sk->sk_user_data;
  340. if (unlikely(!psock))
  341. goto out;
  342. mux = psock->mux;
  343. spin_lock_bh(&mux->lock);
  344. /* Check if the socket is reserved so someone is waiting for sending. */
  345. kcm = psock->tx_kcm;
  346. if (kcm)
  347. queue_work(kcm_wq, &kcm->tx_work);
  348. spin_unlock_bh(&mux->lock);
  349. out:
  350. read_unlock_bh(&sk->sk_callback_lock);
  351. }
  352. static void unreserve_psock(struct kcm_sock *kcm);
  353. /* kcm sock is locked. */
  354. static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
  355. {
  356. struct kcm_mux *mux = kcm->mux;
  357. struct kcm_psock *psock;
  358. psock = kcm->tx_psock;
  359. smp_rmb(); /* Must read tx_psock before tx_wait */
  360. if (psock) {
  361. WARN_ON(kcm->tx_wait);
  362. if (unlikely(psock->tx_stopped))
  363. unreserve_psock(kcm);
  364. else
  365. return kcm->tx_psock;
  366. }
  367. spin_lock_bh(&mux->lock);
  368. /* Check again under lock to see if psock was reserved for this
  369. * psock via psock_unreserve.
  370. */
  371. psock = kcm->tx_psock;
  372. if (unlikely(psock)) {
  373. WARN_ON(kcm->tx_wait);
  374. spin_unlock_bh(&mux->lock);
  375. return kcm->tx_psock;
  376. }
  377. if (!list_empty(&mux->psocks_avail)) {
  378. psock = list_first_entry(&mux->psocks_avail,
  379. struct kcm_psock,
  380. psock_avail_list);
  381. list_del(&psock->psock_avail_list);
  382. if (kcm->tx_wait) {
  383. list_del(&kcm->wait_psock_list);
  384. kcm->tx_wait = false;
  385. }
  386. kcm->tx_psock = psock;
  387. psock->tx_kcm = kcm;
  388. KCM_STATS_INCR(psock->stats.reserved);
  389. } else if (!kcm->tx_wait) {
  390. list_add_tail(&kcm->wait_psock_list,
  391. &mux->kcm_tx_waiters);
  392. kcm->tx_wait = true;
  393. }
  394. spin_unlock_bh(&mux->lock);
  395. return psock;
  396. }
  397. /* mux lock held */
  398. static void psock_now_avail(struct kcm_psock *psock)
  399. {
  400. struct kcm_mux *mux = psock->mux;
  401. struct kcm_sock *kcm;
  402. if (list_empty(&mux->kcm_tx_waiters)) {
  403. list_add_tail(&psock->psock_avail_list,
  404. &mux->psocks_avail);
  405. } else {
  406. kcm = list_first_entry(&mux->kcm_tx_waiters,
  407. struct kcm_sock,
  408. wait_psock_list);
  409. list_del(&kcm->wait_psock_list);
  410. kcm->tx_wait = false;
  411. psock->tx_kcm = kcm;
  412. /* Commit before changing tx_psock since that is read in
  413. * reserve_psock before queuing work.
  414. */
  415. smp_mb();
  416. kcm->tx_psock = psock;
  417. KCM_STATS_INCR(psock->stats.reserved);
  418. queue_work(kcm_wq, &kcm->tx_work);
  419. }
  420. }
  421. /* kcm sock is locked. */
  422. static void unreserve_psock(struct kcm_sock *kcm)
  423. {
  424. struct kcm_psock *psock;
  425. struct kcm_mux *mux = kcm->mux;
  426. spin_lock_bh(&mux->lock);
  427. psock = kcm->tx_psock;
  428. if (WARN_ON(!psock)) {
  429. spin_unlock_bh(&mux->lock);
  430. return;
  431. }
  432. smp_rmb(); /* Read tx_psock before tx_wait */
  433. kcm_update_tx_mux_stats(mux, psock);
  434. WARN_ON(kcm->tx_wait);
  435. kcm->tx_psock = NULL;
  436. psock->tx_kcm = NULL;
  437. KCM_STATS_INCR(psock->stats.unreserved);
  438. if (unlikely(psock->tx_stopped)) {
  439. if (psock->done) {
  440. /* Deferred free */
  441. list_del(&psock->psock_list);
  442. mux->psocks_cnt--;
  443. sock_put(psock->sk);
  444. fput(psock->sk->sk_socket->file);
  445. kmem_cache_free(kcm_psockp, psock);
  446. }
  447. /* Don't put back on available list */
  448. spin_unlock_bh(&mux->lock);
  449. return;
  450. }
  451. psock_now_avail(psock);
  452. spin_unlock_bh(&mux->lock);
  453. }
  454. static void kcm_report_tx_retry(struct kcm_sock *kcm)
  455. {
  456. struct kcm_mux *mux = kcm->mux;
  457. spin_lock_bh(&mux->lock);
  458. KCM_STATS_INCR(mux->stats.tx_retries);
  459. spin_unlock_bh(&mux->lock);
  460. }
  461. /* Write any messages ready on the kcm socket. Called with kcm sock lock
  462. * held. Return bytes actually sent or error.
  463. */
  464. static int kcm_write_msgs(struct kcm_sock *kcm)
  465. {
  466. unsigned int total_sent = 0;
  467. struct sock *sk = &kcm->sk;
  468. struct kcm_psock *psock;
  469. struct sk_buff *head;
  470. int ret = 0;
  471. kcm->tx_wait_more = false;
  472. psock = kcm->tx_psock;
  473. if (unlikely(psock && psock->tx_stopped)) {
  474. /* A reserved psock was aborted asynchronously. Unreserve
  475. * it and we'll retry the message.
  476. */
  477. unreserve_psock(kcm);
  478. kcm_report_tx_retry(kcm);
  479. if (skb_queue_empty(&sk->sk_write_queue))
  480. return 0;
  481. kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
  482. }
  483. retry:
  484. while ((head = skb_peek(&sk->sk_write_queue))) {
  485. struct msghdr msg = {
  486. .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
  487. };
  488. struct kcm_tx_msg *txm = kcm_tx_msg(head);
  489. struct sk_buff *skb;
  490. unsigned int msize;
  491. int i;
  492. if (!txm->started_tx) {
  493. psock = reserve_psock(kcm);
  494. if (!psock)
  495. goto out;
  496. skb = head;
  497. txm->frag_offset = 0;
  498. txm->sent = 0;
  499. txm->started_tx = true;
  500. } else {
  501. if (WARN_ON(!psock)) {
  502. ret = -EINVAL;
  503. goto out;
  504. }
  505. skb = txm->frag_skb;
  506. }
  507. if (WARN_ON(!skb_shinfo(skb)->nr_frags) ||
  508. WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
  509. ret = -EINVAL;
  510. goto out;
  511. }
  512. msize = 0;
  513. for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  514. msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);
  515. iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
  516. (const struct bio_vec *)skb_shinfo(skb)->frags,
  517. skb_shinfo(skb)->nr_frags, msize);
  518. iov_iter_advance(&msg.msg_iter, txm->frag_offset);
  519. do {
  520. ret = sock_sendmsg(psock->sk->sk_socket, &msg);
  521. if (ret <= 0) {
  522. if (ret == -EAGAIN) {
  523. /* Save state to try again when there's
  524. * write space on the socket
  525. */
  526. txm->frag_skb = skb;
  527. ret = 0;
  528. goto out;
  529. }
  530. /* Hard failure in sending message, abort this
  531. * psock since it has lost framing
  532. * synchronization and retry sending the
  533. * message from the beginning.
  534. */
  535. kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
  536. true);
  537. unreserve_psock(kcm);
  538. psock = NULL;
  539. txm->started_tx = false;
  540. kcm_report_tx_retry(kcm);
  541. ret = 0;
  542. goto retry;
  543. }
  544. txm->sent += ret;
  545. txm->frag_offset += ret;
  546. KCM_STATS_ADD(psock->stats.tx_bytes, ret);
  547. } while (msg.msg_iter.count > 0);
  548. if (skb == head) {
  549. if (skb_has_frag_list(skb)) {
  550. txm->frag_skb = skb_shinfo(skb)->frag_list;
  551. txm->frag_offset = 0;
  552. continue;
  553. }
  554. } else if (skb->next) {
  555. txm->frag_skb = skb->next;
  556. txm->frag_offset = 0;
  557. continue;
  558. }
  559. /* Successfully sent the whole packet, account for it. */
  560. sk->sk_wmem_queued -= txm->sent;
  561. total_sent += txm->sent;
  562. skb_dequeue(&sk->sk_write_queue);
  563. kfree_skb(head);
  564. KCM_STATS_INCR(psock->stats.tx_msgs);
  565. }
  566. out:
  567. if (!head) {
  568. /* Done with all queued messages. */
  569. WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
  570. if (psock)
  571. unreserve_psock(kcm);
  572. }
  573. /* Check if write space is available */
  574. sk->sk_write_space(sk);
  575. return total_sent ? : ret;
  576. }
  577. static void kcm_tx_work(struct work_struct *w)
  578. {
  579. struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
  580. struct sock *sk = &kcm->sk;
  581. int err;
  582. lock_sock(sk);
  583. /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
  584. * aborts
  585. */
  586. err = kcm_write_msgs(kcm);
  587. if (err < 0) {
  588. /* Hard failure in write, report error on KCM socket */
  589. pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
  590. report_csk_error(&kcm->sk, -err);
  591. goto out;
  592. }
  593. /* Primarily for SOCK_SEQPACKET sockets */
  594. if (likely(sk->sk_socket) &&
  595. test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  596. clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  597. sk->sk_write_space(sk);
  598. }
  599. out:
  600. release_sock(sk);
  601. }
  602. static void kcm_push(struct kcm_sock *kcm)
  603. {
  604. if (kcm->tx_wait_more)
  605. kcm_write_msgs(kcm);
  606. }
  /* sendmsg handler for a KCM socket.
   *
   * Copies (or, with MSG_SPLICE_PAGES, splices) the data of @msg into an skb
   * chain: a head skb, further skbs linked through the head's frag_list and
   * then through ->next. A message that is complete ("eor") is queued on
   * sk_write_queue and normally pushed with kcm_write_msgs(); an incomplete
   * one is remembered in kcm->seq_skb and continued by the next call.
   *
   * A message is complete for SOCK_DGRAM unless MSG_MORE is set, and for
   * other socket types only when MSG_EOR is set.
   *
   * Returns the number of bytes taken from @msg, or a negative error from
   * sk_stream_error(). For SOCK_SEQPACKET, an error after some bytes were
   * taken is reported as a partial success (byte count returned).
   *
   * Takes kcm->tx_mutex and then the socket lock for the whole call.
   */
  607. static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
  608. {
  609. struct sock *sk = sock->sk;
  610. struct kcm_sock *kcm = kcm_sk(sk);
  611. struct sk_buff *skb = NULL, *head = NULL;
  612. size_t copy, copied = 0;
  613. long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
  /* eor: does this call complete the message? (see function comment) */
  614. int eor = (sock->type == SOCK_DGRAM) ?
  615. !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
  616. int err = -EPIPE;
  617. mutex_lock(&kcm->tx_mutex);
  618. lock_sock(sk);
  619. /* Per tcp_sendmsg this should be in poll */
  620. sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
  /* A pending socket error fails the send; err is still -EPIPE here */
  621. if (sk->sk_err)
  622. goto out_error;
  623. if (kcm->seq_skb) {
  624. /* Previously opened message */
  /* Resume appending at the skb where the previous call stopped */
  625. head = kcm->seq_skb;
  626. skb = kcm_tx_msg(head)->last_skb;
  627. goto start;
  628. }
  629. /* Call the sk_stream functions to manage the sndbuf mem. */
  630. if (!sk_stream_memory_free(sk)) {
  631. kcm_push(kcm);
  632. set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  633. err = sk_stream_wait_memory(sk, &timeo);
  634. if (err)
  635. goto out_error;
  636. }
  637. if (msg_data_left(msg)) {
  638. /* New message, alloc head skb */
  639. head = alloc_skb(0, sk->sk_allocation);
  /* Allocation failed: flush deferred messages, wait for memory, retry */
  640. while (!head) {
  641. kcm_push(kcm);
  642. err = sk_stream_wait_memory(sk, &timeo);
  643. if (err)
  644. goto out_error;
  645. head = alloc_skb(0, sk->sk_allocation);
  646. }
  647. skb = head;
  648. /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
  649. * csum_and_copy_from_iter from skb_do_copy_data_nocache.
  650. */
  651. skb->ip_summed = CHECKSUM_UNNECESSARY;
  652. }
  653. start:
  /* Each iteration appends one chunk of @msg to the current skb, or
   * starts a new skb when the current one has no free frag slot.
   */
  654. while (msg_data_left(msg)) {
  655. bool merge = true;
  656. int i = skb_shinfo(skb)->nr_frags;
  657. struct page_frag *pfrag = sk_page_frag(sk);
  658. if (!sk_page_frag_refill(sk, pfrag))
  659. goto wait_for_memory;
  660. if (!skb_can_coalesce(skb, i, pfrag->page,
  661. pfrag->offset)) {
  662. if (i == MAX_SKB_FRAGS) {
  /* All frag slots used: chain a new skb. The first extra skb
   * hangs off the head's frag_list, later ones off ->next.
   */
  663. struct sk_buff *tskb;
  664. tskb = alloc_skb(0, sk->sk_allocation);
  665. if (!tskb)
  666. goto wait_for_memory;
  667. if (head == skb)
  668. skb_shinfo(head)->frag_list = tskb;
  669. else
  670. skb->next = tskb;
  671. skb = tskb;
  672. skb->ip_summed = CHECKSUM_UNNECESSARY;
  673. continue;
  674. }
  /* Cannot extend the last frag; the data will go into a new frag */
  675. merge = false;
  676. }
  677. if (msg->msg_flags & MSG_SPLICE_PAGES) {
  /* Splice path: attach the caller's pages instead of copying */
  678. copy = msg_data_left(msg);
  679. if (!sk_wmem_schedule(sk, copy))
  680. goto wait_for_memory;
  681. err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
  682. sk->sk_allocation);
  683. if (err < 0) {
  684. if (err == -EMSGSIZE)
  685. goto wait_for_memory;
  686. goto out_error;
  687. }
  /* A non-negative return is the number of bytes spliced */
  688. copy = err;
  689. skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
  690. sk_wmem_queued_add(sk, copy);
  691. sk_mem_charge(sk, copy);
  692. if (head != skb)
  693. head->truesize += copy;
  694. } else {
  /* Copy path: copy into the socket page frag, bounded by the
   * space left in that frag.
   */
  695. copy = min_t(int, msg_data_left(msg),
  696. pfrag->size - pfrag->offset);
  697. if (!sk_wmem_schedule(sk, copy))
  698. goto wait_for_memory;
  699. err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
  700. pfrag->page,
  701. pfrag->offset,
  702. copy);
  703. if (err)
  704. goto out_error;
  705. /* Update the skb. */
  706. if (merge) {
  707. skb_frag_size_add(
  708. &skb_shinfo(skb)->frags[i - 1], copy);
  709. } else {
  710. skb_fill_page_desc(skb, i, pfrag->page,
  711. pfrag->offset, copy);
  712. get_page(pfrag->page);
  713. }
  714. pfrag->offset += copy;
  715. }
  716. copied += copy;
  /* Keep the head's totals in step when data went into a chained skb.
   * NOTE(review): presumably the helpers above already update the
   * lengths of the skb they fill -- confirm.
   */
  717. if (head != skb) {
  718. head->len += copy;
  719. head->data_len += copy;
  720. }
  721. continue;
  722. wait_for_memory:
  /* Flush deferred messages, then wait for send memory and loop again */
  723. kcm_push(kcm);
  724. err = sk_stream_wait_memory(sk, &timeo);
  725. if (err)
  726. goto out_error;
  727. }
  728. if (eor) {
  /* Sampled before queuing: was the write queue empty? */
  729. bool not_busy = skb_queue_empty(&sk->sk_write_queue);
  730. if (head) {
  731. /* Message complete, queue it on send buffer */
  732. __skb_queue_tail(&sk->sk_write_queue, head);
  733. kcm->seq_skb = NULL;
  734. KCM_STATS_INCR(kcm->stats.tx_msgs);
  735. }
  /* MSG_BATCH defers the write: tx_wait_more makes a later
   * kcm_push() send it. Otherwise write now if a batch is pending
   * or the queue was empty before this message.
   */
  736. if (msg->msg_flags & MSG_BATCH) {
  737. kcm->tx_wait_more = true;
  738. } else if (kcm->tx_wait_more || not_busy) {
  739. err = kcm_write_msgs(kcm);
  740. if (err < 0) {
  741. /* We got a hard error in write_msgs but have
  742. * already queued this message. Report an error
  743. * in the socket, but don't affect return value
  744. * from sendmsg
  745. */
  746. pr_warn("KCM: Hard failure on kcm_write_msgs\n");
  747. report_csk_error(&kcm->sk, -err);
  748. }
  749. }
  750. } else {
  751. /* Message not complete, save state */
  /* Also entered by goto from out_error for SOCK_SEQPACKET when some
   * bytes were already taken.
   */
  752. partial_message:
  753. if (head) {
  754. kcm->seq_skb = head;
  755. kcm_tx_msg(head)->last_skb = skb;
  756. }
  757. }
  758. KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
  759. release_sock(sk);
  760. mutex_unlock(&kcm->tx_mutex);
  761. return copied;
  762. out_error:
  763. kcm_push(kcm);
  764. if (sock->type == SOCK_SEQPACKET) {
  765. /* Wrote some bytes before encountering an
  766. * error, return partial success.
  767. */
  768. if (copied)
  769. goto partial_message;
  /* Nothing taken: free a head allocated by this call, but keep a
   * previously saved partial message.
   */
  770. if (head != kcm->seq_skb)
  771. kfree_skb(head);
  772. } else {
  /* Other socket types: discard the message, saved or new */
  773. kfree_skb(head);
  774. kcm->seq_skb = NULL;
  775. }
  776. err = sk_stream_error(sk, msg->msg_flags, err);
  777. /* make sure we wake any epoll edge trigger waiter */
  778. if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
  779. sk->sk_write_space(sk);
  780. release_sock(sk);
  781. mutex_unlock(&kcm->tx_mutex);
  782. return err;
  783. }
  784. static void kcm_splice_eof(struct socket *sock)
  785. {
  786. struct sock *sk = sock->sk;
  787. struct kcm_sock *kcm = kcm_sk(sk);
  788. if (skb_queue_empty_lockless(&sk->sk_write_queue))
  789. return;
  790. lock_sock(sk);
  791. kcm_write_msgs(kcm);
  792. release_sock(sk);
  793. }
  794. static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
  795. size_t len, int flags)
  796. {
  797. struct sock *sk = sock->sk;
  798. struct kcm_sock *kcm = kcm_sk(sk);
  799. int err = 0;
  800. struct strp_msg *stm;
  801. int copied = 0;
  802. struct sk_buff *skb;
  803. skb = skb_recv_datagram(sk, flags, &err);
  804. if (!skb)
  805. goto out;
  806. /* Okay, have a message on the receive queue */
  807. stm = strp_msg(skb);
  808. if (len > stm->full_len)
  809. len = stm->full_len;
  810. err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
  811. if (err < 0)
  812. goto out;
  813. copied = len;
  814. if (likely(!(flags & MSG_PEEK))) {
  815. KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
  816. if (copied < stm->full_len) {
  817. if (sock->type == SOCK_DGRAM) {
  818. /* Truncated message */
  819. msg->msg_flags |= MSG_TRUNC;
  820. goto msg_finished;
  821. }
  822. stm->offset += copied;
  823. stm->full_len -= copied;
  824. } else {
  825. msg_finished:
  826. /* Finished with message */
  827. msg->msg_flags |= MSG_EOR;
  828. KCM_STATS_INCR(kcm->stats.rx_msgs);
  829. }
  830. }
  831. out:
  832. skb_free_datagram(sk, skb);
  833. return copied ? : err;
  834. }
  835. static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
  836. struct pipe_inode_info *pipe, size_t len,
  837. unsigned int flags)
  838. {
  839. struct sock *sk = sock->sk;
  840. struct kcm_sock *kcm = kcm_sk(sk);
  841. struct strp_msg *stm;
  842. int err = 0;
  843. ssize_t copied;
  844. struct sk_buff *skb;
  845. /* Only support splice for SOCKSEQPACKET */
  846. skb = skb_recv_datagram(sk, flags, &err);
  847. if (!skb)
  848. goto err_out;
  849. /* Okay, have a message on the receive queue */
  850. stm = strp_msg(skb);
  851. if (len > stm->full_len)
  852. len = stm->full_len;
  853. copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags);
  854. if (copied < 0) {
  855. err = copied;
  856. goto err_out;
  857. }
  858. KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
  859. stm->offset += copied;
  860. stm->full_len -= copied;
  861. /* We have no way to return MSG_EOR. If all the bytes have been
  862. * read we still leave the message in the receive socket buffer.
  863. * A subsequent recvmsg needs to be done to return MSG_EOR and
  864. * finish reading the message.
  865. */
  866. skb_free_datagram(sk, skb);
  867. return copied;
  868. err_out:
  869. skb_free_datagram(sk, skb);
  870. return err;
  871. }
  872. /* kcm sock lock held */
  873. static void kcm_recv_disable(struct kcm_sock *kcm)
  874. {
  875. struct kcm_mux *mux = kcm->mux;
  876. if (kcm->rx_disabled)
  877. return;
  878. spin_lock_bh(&mux->rx_lock);
  879. kcm->rx_disabled = 1;
  880. /* If a psock is reserved we'll do cleanup in unreserve */
  881. if (!kcm->rx_psock) {
  882. if (kcm->rx_wait) {
  883. list_del(&kcm->wait_rx_list);
  884. /* paired with lockless reads in kcm_rfree() */
  885. WRITE_ONCE(kcm->rx_wait, false);
  886. }
  887. requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
  888. }
  889. spin_unlock_bh(&mux->rx_lock);
  890. }
  891. /* kcm sock lock held */
  892. static void kcm_recv_enable(struct kcm_sock *kcm)
  893. {
  894. struct kcm_mux *mux = kcm->mux;
  895. if (!kcm->rx_disabled)
  896. return;
  897. spin_lock_bh(&mux->rx_lock);
  898. kcm->rx_disabled = 0;
  899. kcm_rcv_ready(kcm);
  900. spin_unlock_bh(&mux->rx_lock);
  901. }
  902. static int kcm_setsockopt(struct socket *sock, int level, int optname,
  903. sockptr_t optval, unsigned int optlen)
  904. {
  905. struct kcm_sock *kcm = kcm_sk(sock->sk);
  906. int val, valbool;
  907. int err = 0;
  908. if (level != SOL_KCM)
  909. return -ENOPROTOOPT;
  910. if (optlen < sizeof(int))
  911. return -EINVAL;
  912. if (copy_from_sockptr(&val, optval, sizeof(int)))
  913. return -EFAULT;
  914. valbool = val ? 1 : 0;
  915. switch (optname) {
  916. case KCM_RECV_DISABLE:
  917. lock_sock(&kcm->sk);
  918. if (valbool)
  919. kcm_recv_disable(kcm);
  920. else
  921. kcm_recv_enable(kcm);
  922. release_sock(&kcm->sk);
  923. break;
  924. default:
  925. err = -ENOPROTOOPT;
  926. }
  927. return err;
  928. }
  929. static int kcm_getsockopt(struct socket *sock, int level, int optname,
  930. char __user *optval, int __user *optlen)
  931. {
  932. struct kcm_sock *kcm = kcm_sk(sock->sk);
  933. int val, len;
  934. if (level != SOL_KCM)
  935. return -ENOPROTOOPT;
  936. if (get_user(len, optlen))
  937. return -EFAULT;
  938. if (len < 0)
  939. return -EINVAL;
  940. len = min_t(unsigned int, len, sizeof(int));
  941. switch (optname) {
  942. case KCM_RECV_DISABLE:
  943. val = kcm->rx_disabled;
  944. break;
  945. default:
  946. return -ENOPROTOOPT;
  947. }
  948. if (put_user(len, optlen))
  949. return -EFAULT;
  950. if (copy_to_user(optval, &val, len))
  951. return -EFAULT;
  952. return 0;
  953. }
  954. static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
  955. {
  956. struct kcm_sock *tkcm;
  957. struct list_head *head;
  958. int index = 0;
  959. /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
  960. * we set sk_state, otherwise epoll_wait always returns right away with
  961. * EPOLLHUP
  962. */
  963. kcm->sk.sk_state = TCP_ESTABLISHED;
  964. /* Add to mux's kcm sockets list */
  965. kcm->mux = mux;
  966. spin_lock_bh(&mux->lock);
  967. head = &mux->kcm_socks;
  968. list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
  969. if (tkcm->index != index)
  970. break;
  971. head = &tkcm->kcm_sock_list;
  972. index++;
  973. }
  974. list_add(&kcm->kcm_sock_list, head);
  975. kcm->index = index;
  976. mux->kcm_socks_cnt++;
  977. spin_unlock_bh(&mux->lock);
  978. INIT_WORK(&kcm->tx_work, kcm_tx_work);
  979. mutex_init(&kcm->tx_mutex);
  980. spin_lock_bh(&mux->rx_lock);
  981. kcm_rcv_ready(kcm);
  982. spin_unlock_bh(&mux->rx_lock);
  983. }
  984. static int kcm_attach(struct socket *sock, struct socket *csock,
  985. struct bpf_prog *prog)
  986. {
  987. struct kcm_sock *kcm = kcm_sk(sock->sk);
  988. struct kcm_mux *mux = kcm->mux;
  989. struct sock *csk;
  990. struct kcm_psock *psock = NULL, *tpsock;
  991. struct list_head *head;
  992. int index = 0;
  993. static const struct strp_callbacks cb = {
  994. .rcv_msg = kcm_rcv_strparser,
  995. .parse_msg = kcm_parse_func_strparser,
  996. .read_sock_done = kcm_read_sock_done,
  997. };
  998. int err = 0;
  999. csk = csock->sk;
  1000. if (!csk)
  1001. return -EINVAL;
  1002. lock_sock(csk);
  1003. /* Only allow TCP sockets to be attached for now */
  1004. if ((csk->sk_family != AF_INET && csk->sk_family != AF_INET6) ||
  1005. csk->sk_protocol != IPPROTO_TCP) {
  1006. err = -EOPNOTSUPP;
  1007. goto out;
  1008. }
  1009. /* Don't allow listeners or closed sockets */
  1010. if (csk->sk_state == TCP_LISTEN || csk->sk_state == TCP_CLOSE) {
  1011. err = -EOPNOTSUPP;
  1012. goto out;
  1013. }
  1014. psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
  1015. if (!psock) {
  1016. err = -ENOMEM;
  1017. goto out;
  1018. }
  1019. psock->mux = mux;
  1020. psock->sk = csk;
  1021. psock->bpf_prog = prog;
  1022. write_lock_bh(&csk->sk_callback_lock);
  1023. /* Check if sk_user_data is already by KCM or someone else.
  1024. * Must be done under lock to prevent race conditions.
  1025. */
  1026. if (csk->sk_user_data) {
  1027. write_unlock_bh(&csk->sk_callback_lock);
  1028. kmem_cache_free(kcm_psockp, psock);
  1029. err = -EALREADY;
  1030. goto out;
  1031. }
  1032. err = strp_init(&psock->strp, csk, &cb);
  1033. if (err) {
  1034. write_unlock_bh(&csk->sk_callback_lock);
  1035. kmem_cache_free(kcm_psockp, psock);
  1036. goto out;
  1037. }
  1038. psock->save_data_ready = csk->sk_data_ready;
  1039. psock->save_write_space = csk->sk_write_space;
  1040. psock->save_state_change = csk->sk_state_change;
  1041. csk->sk_user_data = psock;
  1042. csk->sk_data_ready = psock_data_ready;
  1043. csk->sk_write_space = psock_write_space;
  1044. csk->sk_state_change = psock_state_change;
  1045. write_unlock_bh(&csk->sk_callback_lock);
  1046. sock_hold(csk);
  1047. /* Finished initialization, now add the psock to the MUX. */
  1048. spin_lock_bh(&mux->lock);
  1049. head = &mux->psocks;
  1050. list_for_each_entry(tpsock, &mux->psocks, psock_list) {
  1051. if (tpsock->index != index)
  1052. break;
  1053. head = &tpsock->psock_list;
  1054. index++;
  1055. }
  1056. list_add(&psock->psock_list, head);
  1057. psock->index = index;
  1058. KCM_STATS_INCR(mux->stats.psock_attach);
  1059. mux->psocks_cnt++;
  1060. psock_now_avail(psock);
  1061. spin_unlock_bh(&mux->lock);
  1062. /* Schedule RX work in case there are already bytes queued */
  1063. strp_check_rcv(&psock->strp);
  1064. out:
  1065. release_sock(csk);
  1066. return err;
  1067. }
  1068. static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
  1069. {
  1070. struct socket *csock;
  1071. struct bpf_prog *prog;
  1072. int err;
  1073. csock = sockfd_lookup(info->fd, &err);
  1074. if (!csock)
  1075. return -ENOENT;
  1076. prog = bpf_prog_get_type(info->bpf_fd, BPF_PROG_TYPE_SOCKET_FILTER);
  1077. if (IS_ERR(prog)) {
  1078. err = PTR_ERR(prog);
  1079. goto out;
  1080. }
  1081. err = kcm_attach(sock, csock, prog);
  1082. if (err) {
  1083. bpf_prog_put(prog);
  1084. goto out;
  1085. }
  1086. /* Keep reference on file also */
  1087. return 0;
  1088. out:
  1089. sockfd_put(csock);
  1090. return err;
  1091. }
  1092. static void kcm_unattach(struct kcm_psock *psock)
  1093. {
  1094. struct sock *csk = psock->sk;
  1095. struct kcm_mux *mux = psock->mux;
  1096. lock_sock(csk);
  1097. /* Stop getting callbacks from TCP socket. After this there should
  1098. * be no way to reserve a kcm for this psock.
  1099. */
  1100. write_lock_bh(&csk->sk_callback_lock);
  1101. csk->sk_user_data = NULL;
  1102. csk->sk_data_ready = psock->save_data_ready;
  1103. csk->sk_write_space = psock->save_write_space;
  1104. csk->sk_state_change = psock->save_state_change;
  1105. strp_stop(&psock->strp);
  1106. if (WARN_ON(psock->rx_kcm)) {
  1107. write_unlock_bh(&csk->sk_callback_lock);
  1108. release_sock(csk);
  1109. return;
  1110. }
  1111. spin_lock_bh(&mux->rx_lock);
  1112. /* Stop receiver activities. After this point psock should not be
  1113. * able to get onto ready list either through callbacks or work.
  1114. */
  1115. if (psock->ready_rx_msg) {
  1116. list_del(&psock->psock_ready_list);
  1117. kfree_skb(psock->ready_rx_msg);
  1118. psock->ready_rx_msg = NULL;
  1119. KCM_STATS_INCR(mux->stats.rx_ready_drops);
  1120. }
  1121. spin_unlock_bh(&mux->rx_lock);
  1122. write_unlock_bh(&csk->sk_callback_lock);
  1123. /* Call strp_done without sock lock */
  1124. release_sock(csk);
  1125. strp_done(&psock->strp);
  1126. lock_sock(csk);
  1127. bpf_prog_put(psock->bpf_prog);
  1128. spin_lock_bh(&mux->lock);
  1129. aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
  1130. save_strp_stats(&psock->strp, &mux->aggregate_strp_stats);
  1131. KCM_STATS_INCR(mux->stats.psock_unattach);
  1132. if (psock->tx_kcm) {
  1133. /* psock was reserved. Just mark it finished and we will clean
  1134. * up in the kcm paths, we need kcm lock which can not be
  1135. * acquired here.
  1136. */
  1137. KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
  1138. spin_unlock_bh(&mux->lock);
  1139. /* We are unattaching a socket that is reserved. Abort the
  1140. * socket since we may be out of sync in sending on it. We need
  1141. * to do this without the mux lock.
  1142. */
  1143. kcm_abort_tx_psock(psock, EPIPE, false);
  1144. spin_lock_bh(&mux->lock);
  1145. if (!psock->tx_kcm) {
  1146. /* psock now unreserved in window mux was unlocked */
  1147. goto no_reserved;
  1148. }
  1149. psock->done = 1;
  1150. /* Commit done before queuing work to process it */
  1151. smp_mb();
  1152. /* Queue tx work to make sure psock->done is handled */
  1153. queue_work(kcm_wq, &psock->tx_kcm->tx_work);
  1154. spin_unlock_bh(&mux->lock);
  1155. } else {
  1156. no_reserved:
  1157. if (!psock->tx_stopped)
  1158. list_del(&psock->psock_avail_list);
  1159. list_del(&psock->psock_list);
  1160. mux->psocks_cnt--;
  1161. spin_unlock_bh(&mux->lock);
  1162. sock_put(csk);
  1163. fput(csk->sk_socket->file);
  1164. kmem_cache_free(kcm_psockp, psock);
  1165. }
  1166. release_sock(csk);
  1167. }
  1168. static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
  1169. {
  1170. struct kcm_sock *kcm = kcm_sk(sock->sk);
  1171. struct kcm_mux *mux = kcm->mux;
  1172. struct kcm_psock *psock;
  1173. struct socket *csock;
  1174. struct sock *csk;
  1175. int err;
  1176. csock = sockfd_lookup(info->fd, &err);
  1177. if (!csock)
  1178. return -ENOENT;
  1179. csk = csock->sk;
  1180. if (!csk) {
  1181. err = -EINVAL;
  1182. goto out;
  1183. }
  1184. err = -ENOENT;
  1185. spin_lock_bh(&mux->lock);
  1186. list_for_each_entry(psock, &mux->psocks, psock_list) {
  1187. if (psock->sk != csk)
  1188. continue;
  1189. /* Found the matching psock */
  1190. if (psock->unattaching || WARN_ON(psock->done)) {
  1191. err = -EALREADY;
  1192. break;
  1193. }
  1194. psock->unattaching = 1;
  1195. spin_unlock_bh(&mux->lock);
  1196. /* Lower socket lock should already be held */
  1197. kcm_unattach(psock);
  1198. err = 0;
  1199. goto out;
  1200. }
  1201. spin_unlock_bh(&mux->lock);
  1202. out:
  1203. sockfd_put(csock);
  1204. return err;
  1205. }
  1206. static struct proto kcm_proto = {
  1207. .name = "KCM",
  1208. .owner = THIS_MODULE,
  1209. .obj_size = sizeof(struct kcm_sock),
  1210. };
  1211. /* Clone a kcm socket. */
  1212. static struct file *kcm_clone(struct socket *osock)
  1213. {
  1214. struct socket *newsock;
  1215. struct sock *newsk;
  1216. newsock = sock_alloc();
  1217. if (!newsock)
  1218. return ERR_PTR(-ENFILE);
  1219. newsock->type = osock->type;
  1220. newsock->ops = osock->ops;
  1221. __module_get(newsock->ops->owner);
  1222. newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
  1223. &kcm_proto, false);
  1224. if (!newsk) {
  1225. sock_release(newsock);
  1226. return ERR_PTR(-ENOMEM);
  1227. }
  1228. sock_init_data(newsock, newsk);
  1229. init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
  1230. return sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
  1231. }
  1232. static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
  1233. {
  1234. int err;
  1235. switch (cmd) {
  1236. case SIOCKCMATTACH: {
  1237. struct kcm_attach info;
  1238. if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
  1239. return -EFAULT;
  1240. err = kcm_attach_ioctl(sock, &info);
  1241. break;
  1242. }
  1243. case SIOCKCMUNATTACH: {
  1244. struct kcm_unattach info;
  1245. if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
  1246. return -EFAULT;
  1247. err = kcm_unattach_ioctl(sock, &info);
  1248. break;
  1249. }
  1250. case SIOCKCMCLONE: {
  1251. struct kcm_clone info;
  1252. struct file *file;
  1253. info.fd = get_unused_fd_flags(0);
  1254. if (unlikely(info.fd < 0))
  1255. return info.fd;
  1256. file = kcm_clone(sock);
  1257. if (IS_ERR(file)) {
  1258. put_unused_fd(info.fd);
  1259. return PTR_ERR(file);
  1260. }
  1261. if (copy_to_user((void __user *)arg, &info,
  1262. sizeof(info))) {
  1263. put_unused_fd(info.fd);
  1264. fput(file);
  1265. return -EFAULT;
  1266. }
  1267. fd_install(info.fd, file);
  1268. err = 0;
  1269. break;
  1270. }
  1271. default:
  1272. err = -ENOIOCTLCMD;
  1273. break;
  1274. }
  1275. return err;
  1276. }
  1277. static void free_mux(struct rcu_head *rcu)
  1278. {
  1279. struct kcm_mux *mux = container_of(rcu,
  1280. struct kcm_mux, rcu);
  1281. kmem_cache_free(kcm_muxp, mux);
  1282. }
  1283. static void release_mux(struct kcm_mux *mux)
  1284. {
  1285. struct kcm_net *knet = mux->knet;
  1286. struct kcm_psock *psock, *tmp_psock;
  1287. /* Release psocks */
  1288. list_for_each_entry_safe(psock, tmp_psock,
  1289. &mux->psocks, psock_list) {
  1290. if (!WARN_ON(psock->unattaching))
  1291. kcm_unattach(psock);
  1292. }
  1293. if (WARN_ON(mux->psocks_cnt))
  1294. return;
  1295. __skb_queue_purge(&mux->rx_hold_queue);
  1296. mutex_lock(&knet->mutex);
  1297. aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
  1298. aggregate_psock_stats(&mux->aggregate_psock_stats,
  1299. &knet->aggregate_psock_stats);
  1300. aggregate_strp_stats(&mux->aggregate_strp_stats,
  1301. &knet->aggregate_strp_stats);
  1302. list_del_rcu(&mux->kcm_mux_list);
  1303. knet->count--;
  1304. mutex_unlock(&knet->mutex);
  1305. call_rcu(&mux->rcu, free_mux);
  1306. }
  1307. static void kcm_done(struct kcm_sock *kcm)
  1308. {
  1309. struct kcm_mux *mux = kcm->mux;
  1310. struct sock *sk = &kcm->sk;
  1311. int socks_cnt;
  1312. spin_lock_bh(&mux->rx_lock);
  1313. if (kcm->rx_psock) {
  1314. /* Cleanup in unreserve_rx_kcm */
  1315. WARN_ON(kcm->done);
  1316. kcm->rx_disabled = 1;
  1317. kcm->done = 1;
  1318. spin_unlock_bh(&mux->rx_lock);
  1319. return;
  1320. }
  1321. if (kcm->rx_wait) {
  1322. list_del(&kcm->wait_rx_list);
  1323. /* paired with lockless reads in kcm_rfree() */
  1324. WRITE_ONCE(kcm->rx_wait, false);
  1325. }
  1326. /* Move any pending receive messages to other kcm sockets */
  1327. requeue_rx_msgs(mux, &sk->sk_receive_queue);
  1328. spin_unlock_bh(&mux->rx_lock);
  1329. if (WARN_ON(sk_rmem_alloc_get(sk)))
  1330. return;
  1331. /* Detach from MUX */
  1332. spin_lock_bh(&mux->lock);
  1333. list_del(&kcm->kcm_sock_list);
  1334. mux->kcm_socks_cnt--;
  1335. socks_cnt = mux->kcm_socks_cnt;
  1336. spin_unlock_bh(&mux->lock);
  1337. if (!socks_cnt) {
  1338. /* We are done with the mux now. */
  1339. release_mux(mux);
  1340. }
  1341. WARN_ON(kcm->rx_wait);
  1342. sock_put(&kcm->sk);
  1343. }
  1344. /* Called by kcm_release to close a KCM socket.
  1345. * If this is the last KCM socket on the MUX, destroy the MUX.
  1346. */
  1347. static int kcm_release(struct socket *sock)
  1348. {
  1349. struct sock *sk = sock->sk;
  1350. struct kcm_sock *kcm;
  1351. struct kcm_mux *mux;
  1352. struct kcm_psock *psock;
  1353. if (!sk)
  1354. return 0;
  1355. kcm = kcm_sk(sk);
  1356. mux = kcm->mux;
  1357. lock_sock(sk);
  1358. sock_orphan(sk);
  1359. kfree_skb(kcm->seq_skb);
  1360. /* Purge queue under lock to avoid race condition with tx_work trying
  1361. * to act when queue is nonempty. If tx_work runs after this point
  1362. * it will just return.
  1363. */
  1364. __skb_queue_purge(&sk->sk_write_queue);
  1365. release_sock(sk);
  1366. spin_lock_bh(&mux->lock);
  1367. if (kcm->tx_wait) {
  1368. /* Take of tx_wait list, after this point there should be no way
  1369. * that a psock will be assigned to this kcm.
  1370. */
  1371. list_del(&kcm->wait_psock_list);
  1372. kcm->tx_wait = false;
  1373. }
  1374. spin_unlock_bh(&mux->lock);
  1375. /* Cancel work. After this point there should be no outside references
  1376. * to the kcm socket.
  1377. */
  1378. disable_work_sync(&kcm->tx_work);
  1379. lock_sock(sk);
  1380. psock = kcm->tx_psock;
  1381. if (psock) {
  1382. /* A psock was reserved, so we need to kill it since it
  1383. * may already have some bytes queued from a message. We
  1384. * need to do this after removing kcm from tx_wait list.
  1385. */
  1386. kcm_abort_tx_psock(psock, EPIPE, false);
  1387. unreserve_psock(kcm);
  1388. }
  1389. release_sock(sk);
  1390. WARN_ON(kcm->tx_wait);
  1391. WARN_ON(kcm->tx_psock);
  1392. sock->sk = NULL;
  1393. kcm_done(kcm);
  1394. return 0;
  1395. }
  1396. static const struct proto_ops kcm_dgram_ops = {
  1397. .family = PF_KCM,
  1398. .owner = THIS_MODULE,
  1399. .release = kcm_release,
  1400. .bind = sock_no_bind,
  1401. .connect = sock_no_connect,
  1402. .socketpair = sock_no_socketpair,
  1403. .accept = sock_no_accept,
  1404. .getname = sock_no_getname,
  1405. .poll = datagram_poll,
  1406. .ioctl = kcm_ioctl,
  1407. .listen = sock_no_listen,
  1408. .shutdown = sock_no_shutdown,
  1409. .setsockopt = kcm_setsockopt,
  1410. .getsockopt = kcm_getsockopt,
  1411. .sendmsg = kcm_sendmsg,
  1412. .recvmsg = kcm_recvmsg,
  1413. .mmap = sock_no_mmap,
  1414. .splice_eof = kcm_splice_eof,
  1415. };
  1416. static const struct proto_ops kcm_seqpacket_ops = {
  1417. .family = PF_KCM,
  1418. .owner = THIS_MODULE,
  1419. .release = kcm_release,
  1420. .bind = sock_no_bind,
  1421. .connect = sock_no_connect,
  1422. .socketpair = sock_no_socketpair,
  1423. .accept = sock_no_accept,
  1424. .getname = sock_no_getname,
  1425. .poll = datagram_poll,
  1426. .ioctl = kcm_ioctl,
  1427. .listen = sock_no_listen,
  1428. .shutdown = sock_no_shutdown,
  1429. .setsockopt = kcm_setsockopt,
  1430. .getsockopt = kcm_getsockopt,
  1431. .sendmsg = kcm_sendmsg,
  1432. .recvmsg = kcm_recvmsg,
  1433. .mmap = sock_no_mmap,
  1434. .splice_eof = kcm_splice_eof,
  1435. .splice_read = kcm_splice_read,
  1436. };
  1437. /* Create proto operation for kcm sockets */
  1438. static int kcm_create(struct net *net, struct socket *sock,
  1439. int protocol, int kern)
  1440. {
  1441. struct kcm_net *knet = net_generic(net, kcm_net_id);
  1442. struct sock *sk;
  1443. struct kcm_mux *mux;
  1444. switch (sock->type) {
  1445. case SOCK_DGRAM:
  1446. sock->ops = &kcm_dgram_ops;
  1447. break;
  1448. case SOCK_SEQPACKET:
  1449. sock->ops = &kcm_seqpacket_ops;
  1450. break;
  1451. default:
  1452. return -ESOCKTNOSUPPORT;
  1453. }
  1454. if (protocol != KCMPROTO_CONNECTED)
  1455. return -EPROTONOSUPPORT;
  1456. sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
  1457. if (!sk)
  1458. return -ENOMEM;
  1459. /* Allocate a kcm mux, shared between KCM sockets */
  1460. mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
  1461. if (!mux) {
  1462. sk_free(sk);
  1463. return -ENOMEM;
  1464. }
  1465. spin_lock_init(&mux->lock);
  1466. spin_lock_init(&mux->rx_lock);
  1467. INIT_LIST_HEAD(&mux->kcm_socks);
  1468. INIT_LIST_HEAD(&mux->kcm_rx_waiters);
  1469. INIT_LIST_HEAD(&mux->kcm_tx_waiters);
  1470. INIT_LIST_HEAD(&mux->psocks);
  1471. INIT_LIST_HEAD(&mux->psocks_ready);
  1472. INIT_LIST_HEAD(&mux->psocks_avail);
  1473. mux->knet = knet;
  1474. /* Add new MUX to list */
  1475. mutex_lock(&knet->mutex);
  1476. list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
  1477. knet->count++;
  1478. mutex_unlock(&knet->mutex);
  1479. skb_queue_head_init(&mux->rx_hold_queue);
  1480. /* Init KCM socket */
  1481. sock_init_data(sock, sk);
  1482. init_kcm_sock(kcm_sk(sk), mux);
  1483. return 0;
  1484. }
  1485. static const struct net_proto_family kcm_family_ops = {
  1486. .family = PF_KCM,
  1487. .create = kcm_create,
  1488. .owner = THIS_MODULE,
  1489. };
  1490. static __net_init int kcm_init_net(struct net *net)
  1491. {
  1492. struct kcm_net *knet = net_generic(net, kcm_net_id);
  1493. INIT_LIST_HEAD_RCU(&knet->mux_list);
  1494. mutex_init(&knet->mutex);
  1495. return 0;
  1496. }
  1497. static __net_exit void kcm_exit_net(struct net *net)
  1498. {
  1499. struct kcm_net *knet = net_generic(net, kcm_net_id);
  1500. /* All KCM sockets should be closed at this point, which should mean
  1501. * that all multiplexors and psocks have been destroyed.
  1502. */
  1503. WARN_ON(!list_empty(&knet->mux_list));
  1504. mutex_destroy(&knet->mutex);
  1505. }
  1506. static struct pernet_operations kcm_net_ops = {
  1507. .init = kcm_init_net,
  1508. .exit = kcm_exit_net,
  1509. .id = &kcm_net_id,
  1510. .size = sizeof(struct kcm_net),
  1511. };
  1512. static int __init kcm_init(void)
  1513. {
  1514. int err = -ENOMEM;
  1515. kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN);
  1516. if (!kcm_muxp)
  1517. goto fail;
  1518. kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN);
  1519. if (!kcm_psockp)
  1520. goto fail;
  1521. kcm_wq = create_singlethread_workqueue("kkcmd");
  1522. if (!kcm_wq)
  1523. goto fail;
  1524. err = proto_register(&kcm_proto, 1);
  1525. if (err)
  1526. goto fail;
  1527. err = register_pernet_device(&kcm_net_ops);
  1528. if (err)
  1529. goto net_ops_fail;
  1530. err = sock_register(&kcm_family_ops);
  1531. if (err)
  1532. goto sock_register_fail;
  1533. err = kcm_proc_init();
  1534. if (err)
  1535. goto proc_init_fail;
  1536. return 0;
  1537. proc_init_fail:
  1538. sock_unregister(PF_KCM);
  1539. sock_register_fail:
  1540. unregister_pernet_device(&kcm_net_ops);
  1541. net_ops_fail:
  1542. proto_unregister(&kcm_proto);
  1543. fail:
  1544. kmem_cache_destroy(kcm_muxp);
  1545. kmem_cache_destroy(kcm_psockp);
  1546. if (kcm_wq)
  1547. destroy_workqueue(kcm_wq);
  1548. return err;
  1549. }
  1550. static void __exit kcm_exit(void)
  1551. {
  1552. kcm_proc_exit();
  1553. sock_unregister(PF_KCM);
  1554. unregister_pernet_device(&kcm_net_ops);
  1555. proto_unregister(&kcm_proto);
  1556. destroy_workqueue(kcm_wq);
  1557. kmem_cache_destroy(kcm_muxp);
  1558. kmem_cache_destroy(kcm_psockp);
  1559. }
  1560. module_init(kcm_init);
  1561. module_exit(kcm_exit);
  1562. MODULE_LICENSE("GPL");
  1563. MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets");
  1564. MODULE_ALIAS_NETPROTO(PF_KCM);