sock_reuseport.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * To speed up listener socket lookup, create an array to store all sockets
  4. * listening on the same port. This allows a decision to be made after finding
  5. * the first socket. An optional BPF program can also be configured for
  6. * selecting the socket index from the array of available sockets.
  7. */
  8. #include <net/ip.h>
  9. #include <net/sock_reuseport.h>
  10. #include <linux/bpf.h>
  11. #include <linux/idr.h>
  12. #include <linux/filter.h>
  13. #include <linux/rcupdate.h>
  14. #define INIT_SOCKS 128
  15. DEFINE_SPINLOCK(reuseport_lock);
  16. static DEFINE_IDA(reuseport_ida);
  17. static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
  18. struct sock_reuseport *reuse, bool bind_inany);
  19. void reuseport_has_conns_set(struct sock *sk)
  20. {
  21. struct sock_reuseport *reuse;
  22. if (!rcu_access_pointer(sk->sk_reuseport_cb))
  23. return;
  24. spin_lock_bh(&reuseport_lock);
  25. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  26. lockdep_is_held(&reuseport_lock));
  27. if (likely(reuse))
  28. reuse->has_conns = 1;
  29. spin_unlock_bh(&reuseport_lock);
  30. }
  31. EXPORT_SYMBOL(reuseport_has_conns_set);
  32. static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse)
  33. {
  34. /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
  35. WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1);
  36. }
  37. static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse)
  38. {
  39. /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
  40. WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1);
  41. }
  42. static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
  43. {
  44. if (sk->sk_incoming_cpu >= 0)
  45. __reuseport_get_incoming_cpu(reuse);
  46. }
  47. static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
  48. {
  49. if (sk->sk_incoming_cpu >= 0)
  50. __reuseport_put_incoming_cpu(reuse);
  51. }
  52. void reuseport_update_incoming_cpu(struct sock *sk, int val)
  53. {
  54. struct sock_reuseport *reuse;
  55. int old_sk_incoming_cpu;
  56. if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) {
  57. /* Paired with REAE_ONCE() in sk_incoming_cpu_update()
  58. * and compute_score().
  59. */
  60. WRITE_ONCE(sk->sk_incoming_cpu, val);
  61. return;
  62. }
  63. spin_lock_bh(&reuseport_lock);
  64. /* This must be done under reuseport_lock to avoid a race with
  65. * reuseport_grow(), which accesses sk->sk_incoming_cpu without
  66. * lock_sock() when detaching a shutdown()ed sk.
  67. *
  68. * Paired with READ_ONCE() in reuseport_select_sock_by_hash().
  69. */
  70. old_sk_incoming_cpu = sk->sk_incoming_cpu;
  71. WRITE_ONCE(sk->sk_incoming_cpu, val);
  72. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  73. lockdep_is_held(&reuseport_lock));
  74. /* reuseport_grow() has detached a closed sk. */
  75. if (!reuse)
  76. goto out;
  77. if (old_sk_incoming_cpu < 0 && val >= 0)
  78. __reuseport_get_incoming_cpu(reuse);
  79. else if (old_sk_incoming_cpu >= 0 && val < 0)
  80. __reuseport_put_incoming_cpu(reuse);
  81. out:
  82. spin_unlock_bh(&reuseport_lock);
  83. }
  84. static int reuseport_sock_index(struct sock *sk,
  85. const struct sock_reuseport *reuse,
  86. bool closed)
  87. {
  88. int left, right;
  89. if (!closed) {
  90. left = 0;
  91. right = reuse->num_socks;
  92. } else {
  93. left = reuse->max_socks - reuse->num_closed_socks;
  94. right = reuse->max_socks;
  95. }
  96. for (; left < right; left++)
  97. if (reuse->socks[left] == sk)
  98. return left;
  99. return -1;
  100. }
  101. static void __reuseport_add_sock(struct sock *sk,
  102. struct sock_reuseport *reuse)
  103. {
  104. reuse->socks[reuse->num_socks] = sk;
  105. /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
  106. smp_wmb();
  107. reuse->num_socks++;
  108. reuseport_get_incoming_cpu(sk, reuse);
  109. }
  110. static bool __reuseport_detach_sock(struct sock *sk,
  111. struct sock_reuseport *reuse)
  112. {
  113. int i = reuseport_sock_index(sk, reuse, false);
  114. if (i == -1)
  115. return false;
  116. reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
  117. reuse->num_socks--;
  118. reuseport_put_incoming_cpu(sk, reuse);
  119. return true;
  120. }
  121. static void __reuseport_add_closed_sock(struct sock *sk,
  122. struct sock_reuseport *reuse)
  123. {
  124. reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
  125. /* paired with READ_ONCE() in inet_csk_bind_conflict() */
  126. WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
  127. reuseport_get_incoming_cpu(sk, reuse);
  128. }
  129. static bool __reuseport_detach_closed_sock(struct sock *sk,
  130. struct sock_reuseport *reuse)
  131. {
  132. int i = reuseport_sock_index(sk, reuse, true);
  133. if (i == -1)
  134. return false;
  135. reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
  136. /* paired with READ_ONCE() in inet_csk_bind_conflict() */
  137. WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
  138. reuseport_put_incoming_cpu(sk, reuse);
  139. return true;
  140. }
  141. static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
  142. {
  143. struct sock_reuseport *reuse;
  144. reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC);
  145. if (!reuse)
  146. return NULL;
  147. reuse->max_socks = max_socks;
  148. RCU_INIT_POINTER(reuse->prog, NULL);
  149. return reuse;
  150. }
  151. int reuseport_alloc(struct sock *sk, bool bind_inany)
  152. {
  153. struct sock_reuseport *reuse;
  154. int id, ret = 0;
  155. /* bh lock used since this function call may precede hlist lock in
  156. * soft irq of receive path or setsockopt from process context
  157. */
  158. spin_lock_bh(&reuseport_lock);
  159. /* Allocation attempts can occur concurrently via the setsockopt path
  160. * and the bind/hash path. Nothing to do when we lose the race.
  161. */
  162. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  163. lockdep_is_held(&reuseport_lock));
  164. if (reuse) {
  165. if (reuse->num_closed_socks) {
  166. /* sk was shutdown()ed before */
  167. ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
  168. goto out;
  169. }
  170. /* Only set reuse->bind_inany if the bind_inany is true.
  171. * Otherwise, it will overwrite the reuse->bind_inany
  172. * which was set by the bind/hash path.
  173. */
  174. if (bind_inany)
  175. reuse->bind_inany = bind_inany;
  176. goto out;
  177. }
  178. reuse = __reuseport_alloc(INIT_SOCKS);
  179. if (!reuse) {
  180. ret = -ENOMEM;
  181. goto out;
  182. }
  183. id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
  184. if (id < 0) {
  185. kfree(reuse);
  186. ret = id;
  187. goto out;
  188. }
  189. reuse->reuseport_id = id;
  190. reuse->bind_inany = bind_inany;
  191. reuse->socks[0] = sk;
  192. reuse->num_socks = 1;
  193. reuseport_get_incoming_cpu(sk, reuse);
  194. rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
  195. out:
  196. spin_unlock_bh(&reuseport_lock);
  197. return ret;
  198. }
  199. EXPORT_SYMBOL(reuseport_alloc);
  200. static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
  201. {
  202. struct sock_reuseport *more_reuse;
  203. u32 more_socks_size, i;
  204. more_socks_size = reuse->max_socks * 2U;
  205. if (more_socks_size > U16_MAX) {
  206. if (reuse->num_closed_socks) {
  207. /* Make room by removing a closed sk.
  208. * The child has already been migrated.
  209. * Only reqsk left at this point.
  210. */
  211. struct sock *sk;
  212. sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
  213. RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
  214. __reuseport_detach_closed_sock(sk, reuse);
  215. return reuse;
  216. }
  217. return NULL;
  218. }
  219. more_reuse = __reuseport_alloc(more_socks_size);
  220. if (!more_reuse)
  221. return NULL;
  222. more_reuse->num_socks = reuse->num_socks;
  223. more_reuse->num_closed_socks = reuse->num_closed_socks;
  224. more_reuse->prog = reuse->prog;
  225. more_reuse->reuseport_id = reuse->reuseport_id;
  226. more_reuse->bind_inany = reuse->bind_inany;
  227. more_reuse->has_conns = reuse->has_conns;
  228. more_reuse->incoming_cpu = reuse->incoming_cpu;
  229. memcpy(more_reuse->socks, reuse->socks,
  230. reuse->num_socks * sizeof(struct sock *));
  231. memcpy(more_reuse->socks +
  232. (more_reuse->max_socks - more_reuse->num_closed_socks),
  233. reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
  234. reuse->num_closed_socks * sizeof(struct sock *));
  235. more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
  236. for (i = 0; i < reuse->max_socks; ++i)
  237. rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
  238. more_reuse);
  239. /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
  240. * that reuse and more_reuse can temporarily share a reference
  241. * to prog.
  242. */
  243. kfree_rcu(reuse, rcu);
  244. return more_reuse;
  245. }
  246. static void reuseport_free_rcu(struct rcu_head *head)
  247. {
  248. struct sock_reuseport *reuse;
  249. reuse = container_of(head, struct sock_reuseport, rcu);
  250. sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
  251. ida_free(&reuseport_ida, reuse->reuseport_id);
  252. kfree(reuse);
  253. }
  254. /**
  255. * reuseport_add_sock - Add a socket to the reuseport group of another.
  256. * @sk: New socket to add to the group.
  257. * @sk2: Socket belonging to the existing reuseport group.
  258. * @bind_inany: Whether or not the group is bound to a local INANY address.
  259. *
  260. * May return ENOMEM and not add socket to group under memory pressure.
  261. */
  262. int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
  263. {
  264. struct sock_reuseport *old_reuse, *reuse;
  265. if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
  266. int err = reuseport_alloc(sk2, bind_inany);
  267. if (err)
  268. return err;
  269. }
  270. spin_lock_bh(&reuseport_lock);
  271. reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
  272. lockdep_is_held(&reuseport_lock));
  273. old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  274. lockdep_is_held(&reuseport_lock));
  275. if (old_reuse && old_reuse->num_closed_socks) {
  276. /* sk was shutdown()ed before */
  277. int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
  278. spin_unlock_bh(&reuseport_lock);
  279. return err;
  280. }
  281. if (old_reuse && old_reuse->num_socks != 1) {
  282. spin_unlock_bh(&reuseport_lock);
  283. return -EBUSY;
  284. }
  285. if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
  286. reuse = reuseport_grow(reuse);
  287. if (!reuse) {
  288. spin_unlock_bh(&reuseport_lock);
  289. return -ENOMEM;
  290. }
  291. }
  292. __reuseport_add_sock(sk, reuse);
  293. rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
  294. spin_unlock_bh(&reuseport_lock);
  295. if (old_reuse)
  296. call_rcu(&old_reuse->rcu, reuseport_free_rcu);
  297. return 0;
  298. }
  299. EXPORT_SYMBOL(reuseport_add_sock);
  300. static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
  301. struct sock_reuseport *reuse, bool bind_inany)
  302. {
  303. if (old_reuse == reuse) {
  304. /* If sk was in the same reuseport group, just pop sk out of
  305. * the closed section and push sk into the listening section.
  306. */
  307. __reuseport_detach_closed_sock(sk, old_reuse);
  308. __reuseport_add_sock(sk, old_reuse);
  309. return 0;
  310. }
  311. if (!reuse) {
  312. /* In bind()/listen() path, we cannot carry over the eBPF prog
  313. * for the shutdown()ed socket. In setsockopt() path, we should
  314. * not change the eBPF prog of listening sockets by attaching a
  315. * prog to the shutdown()ed socket. Thus, we will allocate a new
  316. * reuseport group and detach sk from the old group.
  317. */
  318. int id;
  319. reuse = __reuseport_alloc(INIT_SOCKS);
  320. if (!reuse)
  321. return -ENOMEM;
  322. id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
  323. if (id < 0) {
  324. kfree(reuse);
  325. return id;
  326. }
  327. reuse->reuseport_id = id;
  328. reuse->bind_inany = bind_inany;
  329. } else {
  330. /* Move sk from the old group to the new one if
  331. * - all the other listeners in the old group were close()d or
  332. * shutdown()ed, and then sk2 has listen()ed on the same port
  333. * OR
  334. * - sk listen()ed without bind() (or with autobind), was
  335. * shutdown()ed, and then listen()s on another port which
  336. * sk2 listen()s on.
  337. */
  338. if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
  339. reuse = reuseport_grow(reuse);
  340. if (!reuse)
  341. return -ENOMEM;
  342. }
  343. }
  344. __reuseport_detach_closed_sock(sk, old_reuse);
  345. __reuseport_add_sock(sk, reuse);
  346. rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
  347. if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
  348. call_rcu(&old_reuse->rcu, reuseport_free_rcu);
  349. return 0;
  350. }
  351. void reuseport_detach_sock(struct sock *sk)
  352. {
  353. struct sock_reuseport *reuse;
  354. spin_lock_bh(&reuseport_lock);
  355. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  356. lockdep_is_held(&reuseport_lock));
  357. /* reuseport_grow() has detached a closed sk */
  358. if (!reuse)
  359. goto out;
  360. /* Notify the bpf side. The sk may be added to a sockarray
  361. * map. If so, sockarray logic will remove it from the map.
  362. *
  363. * Other bpf map types that work with reuseport, like sockmap,
  364. * don't need an explicit callback from here. They override sk
  365. * unhash/close ops to remove the sk from the map before we
  366. * get to this point.
  367. */
  368. bpf_sk_reuseport_detach(sk);
  369. rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
  370. if (!__reuseport_detach_closed_sock(sk, reuse))
  371. __reuseport_detach_sock(sk, reuse);
  372. if (reuse->num_socks + reuse->num_closed_socks == 0)
  373. call_rcu(&reuse->rcu, reuseport_free_rcu);
  374. out:
  375. spin_unlock_bh(&reuseport_lock);
  376. }
  377. EXPORT_SYMBOL(reuseport_detach_sock);
  378. void reuseport_stop_listen_sock(struct sock *sk)
  379. {
  380. if (sk->sk_protocol == IPPROTO_TCP) {
  381. struct sock_reuseport *reuse;
  382. struct bpf_prog *prog;
  383. spin_lock_bh(&reuseport_lock);
  384. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  385. lockdep_is_held(&reuseport_lock));
  386. prog = rcu_dereference_protected(reuse->prog,
  387. lockdep_is_held(&reuseport_lock));
  388. if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
  389. (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
  390. /* Migration capable, move sk from the listening section
  391. * to the closed section.
  392. */
  393. bpf_sk_reuseport_detach(sk);
  394. __reuseport_detach_sock(sk, reuse);
  395. __reuseport_add_closed_sock(sk, reuse);
  396. spin_unlock_bh(&reuseport_lock);
  397. return;
  398. }
  399. spin_unlock_bh(&reuseport_lock);
  400. }
  401. /* Not capable to do migration, detach immediately */
  402. reuseport_detach_sock(sk);
  403. }
  404. EXPORT_SYMBOL(reuseport_stop_listen_sock);
  405. static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
  406. struct bpf_prog *prog, struct sk_buff *skb,
  407. int hdr_len)
  408. {
  409. struct sk_buff *nskb = NULL;
  410. u32 index;
  411. if (skb_shared(skb)) {
  412. nskb = skb_clone(skb, GFP_ATOMIC);
  413. if (!nskb)
  414. return NULL;
  415. skb = nskb;
  416. }
  417. /* temporarily advance data past protocol header */
  418. if (!pskb_pull(skb, hdr_len)) {
  419. kfree_skb(nskb);
  420. return NULL;
  421. }
  422. index = bpf_prog_run_save_cb(prog, skb);
  423. __skb_push(skb, hdr_len);
  424. consume_skb(nskb);
  425. if (index >= socks)
  426. return NULL;
  427. return reuse->socks[index];
  428. }
  429. static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
  430. u32 hash, u16 num_socks)
  431. {
  432. struct sock *first_valid_sk = NULL;
  433. int i, j;
  434. i = j = reciprocal_scale(hash, num_socks);
  435. do {
  436. struct sock *sk = reuse->socks[i];
  437. if (sk->sk_state != TCP_ESTABLISHED) {
  438. /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */
  439. if (!READ_ONCE(reuse->incoming_cpu))
  440. return sk;
  441. /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */
  442. if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
  443. return sk;
  444. if (!first_valid_sk)
  445. first_valid_sk = sk;
  446. }
  447. i++;
  448. if (i >= num_socks)
  449. i = 0;
  450. } while (i != j);
  451. return first_valid_sk;
  452. }
  453. /**
  454. * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
  455. * @sk: First socket in the group.
  456. * @hash: When no BPF filter is available, use this hash to select.
  457. * @skb: skb to run through BPF filter.
  458. * @hdr_len: BPF filter expects skb data pointer at payload data. If
  459. * the skb does not yet point at the payload, this parameter represents
  460. * how far the pointer needs to advance to reach the payload.
  461. * Returns a socket that should receive the packet (or NULL on error).
  462. */
  463. struct sock *reuseport_select_sock(struct sock *sk,
  464. u32 hash,
  465. struct sk_buff *skb,
  466. int hdr_len)
  467. {
  468. struct sock_reuseport *reuse;
  469. struct bpf_prog *prog;
  470. struct sock *sk2 = NULL;
  471. u16 socks;
  472. rcu_read_lock();
  473. reuse = rcu_dereference(sk->sk_reuseport_cb);
  474. /* if memory allocation failed or add call is not yet complete */
  475. if (!reuse)
  476. goto out;
  477. prog = rcu_dereference(reuse->prog);
  478. socks = READ_ONCE(reuse->num_socks);
  479. if (likely(socks)) {
  480. /* paired with smp_wmb() in __reuseport_add_sock() */
  481. smp_rmb();
  482. if (!prog || !skb)
  483. goto select_by_hash;
  484. if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
  485. sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
  486. else
  487. sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
  488. select_by_hash:
  489. /* no bpf or invalid bpf result: fall back to hash usage */
  490. if (!sk2)
  491. sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
  492. }
  493. out:
  494. rcu_read_unlock();
  495. return sk2;
  496. }
  497. EXPORT_SYMBOL(reuseport_select_sock);
  498. /**
  499. * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
  500. * @sk: close()ed or shutdown()ed socket in the group.
  501. * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
  502. * NEW_SYN_RECV request socket during 3WHS.
  503. * @skb: skb to run through BPF filter.
  504. * Returns a socket (with sk_refcnt +1) that should accept the child socket
  505. * (or NULL on error).
  506. */
  507. struct sock *reuseport_migrate_sock(struct sock *sk,
  508. struct sock *migrating_sk,
  509. struct sk_buff *skb)
  510. {
  511. struct sock_reuseport *reuse;
  512. struct sock *nsk = NULL;
  513. bool allocated = false;
  514. struct bpf_prog *prog;
  515. u16 socks;
  516. u32 hash;
  517. rcu_read_lock();
  518. reuse = rcu_dereference(sk->sk_reuseport_cb);
  519. if (!reuse)
  520. goto out;
  521. socks = READ_ONCE(reuse->num_socks);
  522. if (unlikely(!socks))
  523. goto failure;
  524. /* paired with smp_wmb() in __reuseport_add_sock() */
  525. smp_rmb();
  526. hash = migrating_sk->sk_hash;
  527. prog = rcu_dereference(reuse->prog);
  528. if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
  529. if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
  530. goto select_by_hash;
  531. goto failure;
  532. }
  533. if (!skb) {
  534. skb = alloc_skb(0, GFP_ATOMIC);
  535. if (!skb)
  536. goto failure;
  537. allocated = true;
  538. }
  539. nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
  540. if (allocated)
  541. kfree_skb(skb);
  542. select_by_hash:
  543. if (!nsk)
  544. nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
  545. if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
  546. nsk = NULL;
  547. goto failure;
  548. }
  549. out:
  550. rcu_read_unlock();
  551. return nsk;
  552. failure:
  553. __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
  554. goto out;
  555. }
  556. EXPORT_SYMBOL(reuseport_migrate_sock);
  557. int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
  558. {
  559. struct sock_reuseport *reuse;
  560. struct bpf_prog *old_prog;
  561. if (sk_unhashed(sk)) {
  562. int err;
  563. if (!sk->sk_reuseport)
  564. return -EINVAL;
  565. err = reuseport_alloc(sk, false);
  566. if (err)
  567. return err;
  568. } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
  569. /* The socket wasn't bound with SO_REUSEPORT */
  570. return -EINVAL;
  571. }
  572. spin_lock_bh(&reuseport_lock);
  573. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  574. lockdep_is_held(&reuseport_lock));
  575. old_prog = rcu_dereference_protected(reuse->prog,
  576. lockdep_is_held(&reuseport_lock));
  577. rcu_assign_pointer(reuse->prog, prog);
  578. spin_unlock_bh(&reuseport_lock);
  579. sk_reuseport_prog_free(old_prog);
  580. return 0;
  581. }
  582. EXPORT_SYMBOL(reuseport_attach_prog);
  583. int reuseport_detach_prog(struct sock *sk)
  584. {
  585. struct sock_reuseport *reuse;
  586. struct bpf_prog *old_prog;
  587. old_prog = NULL;
  588. spin_lock_bh(&reuseport_lock);
  589. reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
  590. lockdep_is_held(&reuseport_lock));
  591. /* reuse must be checked after acquiring the reuseport_lock
  592. * because reuseport_grow() can detach a closed sk.
  593. */
  594. if (!reuse) {
  595. spin_unlock_bh(&reuseport_lock);
  596. return sk->sk_reuseport ? -ENOENT : -EINVAL;
  597. }
  598. if (sk_unhashed(sk) && reuse->num_closed_socks) {
  599. spin_unlock_bh(&reuseport_lock);
  600. return -ENOENT;
  601. }
  602. old_prog = rcu_replace_pointer(reuse->prog, old_prog,
  603. lockdep_is_held(&reuseport_lock));
  604. spin_unlock_bh(&reuseport_lock);
  605. if (!old_prog)
  606. return -ENOENT;
  607. sk_reuseport_prog_free(old_prog);
  608. return 0;
  609. }
  610. EXPORT_SYMBOL(reuseport_detach_prog);