udp.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * INET An implementation of the TCP/IP protocol suite for the LINUX
  4. * operating system. INET is implemented using the BSD Socket
  5. * interface as the means of communication with the user level.
  6. *
  7. * The User Datagram Protocol (UDP).
  8. *
  9. * Authors: Ross Biro
  10. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11. * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  12. * Alan Cox, <alan@lxorguk.ukuu.org.uk>
  13. * Hirokazu Takahashi, <taka@valinux.co.jp>
  14. *
  15. * Fixes:
  16. * Alan Cox : verify_area() calls
  17. * Alan Cox : stopped close while in use off icmp
  18. * messages. Not a fix but a botch that
  19. * for udp at least is 'valid'.
  20. * Alan Cox : Fixed icmp handling properly
  21. * Alan Cox : Correct error for oversized datagrams
  22. * Alan Cox : Tidied select() semantics.
  23. * Alan Cox : udp_err() fixed properly, also now
  24. * select and read wake correctly on errors
  25. * Alan Cox : udp_send verify_area moved to avoid mem leak
  26. * Alan Cox : UDP can count its memory
  27. * Alan Cox : send to an unknown connection causes
  28. * an ECONNREFUSED off the icmp, but
  29. * does NOT close.
  30. * Alan Cox : Switched to new sk_buff handlers. No more backlog!
  31. * Alan Cox : Using generic datagram code. Even smaller and the PEEK
  32. * bug no longer crashes it.
  33. * Fred Van Kempen : Net2e support for sk->broadcast.
  34. * Alan Cox : Uses skb_free_datagram
  35. * Alan Cox : Added get/set sockopt support.
  36. * Alan Cox : Broadcasting without option set returns EACCES.
  37. * Alan Cox : No wakeup calls. Instead we now use the callbacks.
  38. * Alan Cox : Use ip_tos and ip_ttl
  39. * Alan Cox : SNMP Mibs
  40. * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
  41. * Matt Dillon : UDP length checks.
  42. * Alan Cox : Smarter af_inet used properly.
  43. * Alan Cox : Use new kernel side addressing.
  44. * Alan Cox : Incorrect return on truncated datagram receive.
  45. * Arnt Gulbrandsen : New udp_send and stuff
  46. * Alan Cox : Cache last socket
  47. * Alan Cox : Route cache
  48. * Jon Peatfield : Minor efficiency fix to sendto().
  49. * Mike Shaver : RFC1122 checks.
  50. * Alan Cox : Nonblocking error fix.
  51. * Willy Konynenberg : Transparent proxying support.
  52. * Mike McLagan : Routing by source
  53. * David S. Miller : New socket lookup architecture.
  54. * Last socket cache retained as it
  55. * does have a high hit rate.
  56. * Olaf Kirch : Don't linearise iovec on sendmsg.
  57. * Andi Kleen : Some cleanups, cache destination entry
  58. * for connect.
  59. * Vitaly E. Lavrov : Transparent proxy revived after year coma.
  60. * Melvin Smith : Check msg_name not msg_namelen in sendto(),
  61. * return ENOTCONN for unconnected sockets (POSIX)
  62. * Janos Farkas : don't deliver multi/broadcasts to a different
  63. * bound-to-device socket
  64. * Hirokazu Takahashi : HW checksumming for outgoing UDP
  65. * datagrams.
  66. * Hirokazu Takahashi : sendfile() on UDP works now.
  67. * Arnaldo C. Melo : convert /proc/net/udp to seq_file
  68. * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
  69. * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
  70. * a single port at the same time.
  71. * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
  72. * James Chapman : Add L2TP encapsulation type.
  73. */
  74. #define pr_fmt(fmt) "UDP: " fmt
  75. #include <linux/bpf-cgroup.h>
  76. #include <linux/uaccess.h>
  77. #include <asm/ioctls.h>
  78. #include <linux/memblock.h>
  79. #include <linux/highmem.h>
  80. #include <linux/types.h>
  81. #include <linux/fcntl.h>
  82. #include <linux/module.h>
  83. #include <linux/socket.h>
  84. #include <linux/sockios.h>
  85. #include <linux/igmp.h>
  86. #include <linux/inetdevice.h>
  87. #include <linux/in.h>
  88. #include <linux/errno.h>
  89. #include <linux/timer.h>
  90. #include <linux/mm.h>
  91. #include <linux/inet.h>
  92. #include <linux/netdevice.h>
  93. #include <linux/slab.h>
  94. #include <net/tcp_states.h>
  95. #include <linux/skbuff.h>
  96. #include <linux/proc_fs.h>
  97. #include <linux/seq_file.h>
  98. #include <net/net_namespace.h>
  99. #include <net/icmp.h>
  100. #include <net/inet_hashtables.h>
  101. #include <net/ip_tunnels.h>
  102. #include <net/route.h>
  103. #include <net/checksum.h>
  104. #include <net/gso.h>
  105. #include <net/xfrm.h>
  106. #include <trace/events/udp.h>
  107. #include <linux/static_key.h>
  108. #include <linux/btf_ids.h>
  109. #include <trace/events/skb.h>
  110. #include <net/busy_poll.h>
  111. #include "udp_impl.h"
  112. #include <net/sock_reuseport.h>
  113. #include <net/addrconf.h>
  114. #include <net/udp_tunnel.h>
  115. #include <net/gro.h>
  116. #include <net/inet_dscp.h>
  117. #if IS_ENABLED(CONFIG_IPV6)
  118. #include <net/ipv6_stubs.h>
  119. #endif
  120. struct udp_table udp_table __read_mostly;
  121. EXPORT_SYMBOL(udp_table);
  122. long sysctl_udp_mem[3] __read_mostly;
  123. EXPORT_SYMBOL(sysctl_udp_mem);
  124. atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
  125. EXPORT_SYMBOL(udp_memory_allocated);
  126. DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
  127. EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
  128. #define MAX_UDP_PORTS 65536
  129. #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
  130. static struct udp_table *udp_get_table_prot(struct sock *sk)
  131. {
  132. return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
  133. }
  134. static int udp_lib_lport_inuse(struct net *net, __u16 num,
  135. const struct udp_hslot *hslot,
  136. unsigned long *bitmap,
  137. struct sock *sk, unsigned int log)
  138. {
  139. struct sock *sk2;
  140. kuid_t uid = sock_i_uid(sk);
  141. sk_for_each(sk2, &hslot->head) {
  142. if (net_eq(sock_net(sk2), net) &&
  143. sk2 != sk &&
  144. (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
  145. (!sk2->sk_reuse || !sk->sk_reuse) &&
  146. (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
  147. sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
  148. inet_rcv_saddr_equal(sk, sk2, true)) {
  149. if (sk2->sk_reuseport && sk->sk_reuseport &&
  150. !rcu_access_pointer(sk->sk_reuseport_cb) &&
  151. uid_eq(uid, sock_i_uid(sk2))) {
  152. if (!bitmap)
  153. return 0;
  154. } else {
  155. if (!bitmap)
  156. return 1;
  157. __set_bit(udp_sk(sk2)->udp_port_hash >> log,
  158. bitmap);
  159. }
  160. }
  161. }
  162. return 0;
  163. }
  164. /*
  165. * Note: we still hold spinlock of primary hash chain, so no other writer
  166. * can insert/delete a socket with local_port == num
  167. */
  168. static int udp_lib_lport_inuse2(struct net *net, __u16 num,
  169. struct udp_hslot *hslot2,
  170. struct sock *sk)
  171. {
  172. struct sock *sk2;
  173. kuid_t uid = sock_i_uid(sk);
  174. int res = 0;
  175. spin_lock(&hslot2->lock);
  176. udp_portaddr_for_each_entry(sk2, &hslot2->head) {
  177. if (net_eq(sock_net(sk2), net) &&
  178. sk2 != sk &&
  179. (udp_sk(sk2)->udp_port_hash == num) &&
  180. (!sk2->sk_reuse || !sk->sk_reuse) &&
  181. (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
  182. sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
  183. inet_rcv_saddr_equal(sk, sk2, true)) {
  184. if (sk2->sk_reuseport && sk->sk_reuseport &&
  185. !rcu_access_pointer(sk->sk_reuseport_cb) &&
  186. uid_eq(uid, sock_i_uid(sk2))) {
  187. res = 0;
  188. } else {
  189. res = 1;
  190. }
  191. break;
  192. }
  193. }
  194. spin_unlock(&hslot2->lock);
  195. return res;
  196. }
  197. static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
  198. {
  199. struct net *net = sock_net(sk);
  200. kuid_t uid = sock_i_uid(sk);
  201. struct sock *sk2;
  202. sk_for_each(sk2, &hslot->head) {
  203. if (net_eq(sock_net(sk2), net) &&
  204. sk2 != sk &&
  205. sk2->sk_family == sk->sk_family &&
  206. ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
  207. (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
  208. (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
  209. sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
  210. inet_rcv_saddr_equal(sk, sk2, false)) {
  211. return reuseport_add_sock(sk, sk2,
  212. inet_rcv_saddr_any(sk));
  213. }
  214. }
  215. return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
  216. }
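/*
 * Illustrative sketch (not part of this file): the reuseport group that
 * udp_reuseport_add_sock() attaches to is created from user space by setting
 * SO_REUSEPORT on every socket before bind(). A minimal sketch, assuming
 * Linux and an arbitrary port 5000; all members must be created by the same
 * uid (see the uid_eq() check above):
 *
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int open_reuseport_member(void)
 *	{
 *		struct sockaddr_in a;
 *		int one = 1;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&a, 0, sizeof(a));
 *		a.sin_family = AF_INET;
 *		a.sin_addr.s_addr = htonl(INADDR_ANY);
 *		a.sin_port = htons(5000);
 *		// SO_REUSEPORT must be set before bind() so the port lookup
 *		// joins the existing group instead of failing with EADDRINUSE
 *		if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) ||
 *		    bind(fd, (struct sockaddr *)&a, sizeof(a))) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */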
  217. /**
  218. * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
  219. *
  220. * @sk: socket struct in question
  221. * @snum: port number to look up
  222. * @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
  223. * with NULL address
  224. */
  225. int udp_lib_get_port(struct sock *sk, unsigned short snum,
  226. unsigned int hash2_nulladdr)
  227. {
  228. struct udp_table *udptable = udp_get_table_prot(sk);
  229. struct udp_hslot *hslot, *hslot2;
  230. struct net *net = sock_net(sk);
  231. int error = -EADDRINUSE;
  232. if (!snum) {
  233. DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
  234. unsigned short first, last;
  235. int low, high, remaining;
  236. unsigned int rand;
  237. inet_sk_get_local_port_range(sk, &low, &high);
  238. remaining = (high - low) + 1;
  239. rand = get_random_u32();
  240. first = reciprocal_scale(rand, remaining) + low;
  241. /*
  242. * force rand to be an odd multiple of UDP_HTABLE_SIZE
  243. */
  244. rand = (rand | 1) * (udptable->mask + 1);
  245. last = first + udptable->mask + 1;
  246. do {
  247. hslot = udp_hashslot(udptable, net, first);
  248. bitmap_zero(bitmap, PORTS_PER_CHAIN);
  249. spin_lock_bh(&hslot->lock);
  250. udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
  251. udptable->log);
  252. snum = first;
  253. /*
  254. * Iterate on all possible values of snum for this hash.
  255. * Using steps of an odd multiple of UDP_HTABLE_SIZE
  256. * gives us randomization and full range coverage.
  257. */
  258. do {
  259. if (low <= snum && snum <= high &&
  260. !test_bit(snum >> udptable->log, bitmap) &&
  261. !inet_is_local_reserved_port(net, snum))
  262. goto found;
  263. snum += rand;
  264. } while (snum != first);
  265. spin_unlock_bh(&hslot->lock);
  266. cond_resched();
  267. } while (++first != last);
  268. goto fail;
  269. } else {
  270. hslot = udp_hashslot(udptable, net, snum);
  271. spin_lock_bh(&hslot->lock);
  272. if (hslot->count > 10) {
  273. int exist;
  274. unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
  275. slot2 &= udptable->mask;
  276. hash2_nulladdr &= udptable->mask;
  277. hslot2 = udp_hashslot2(udptable, slot2);
  278. if (hslot->count < hslot2->count)
  279. goto scan_primary_hash;
  280. exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
  281. if (!exist && (hash2_nulladdr != slot2)) {
  282. hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
  283. exist = udp_lib_lport_inuse2(net, snum, hslot2,
  284. sk);
  285. }
  286. if (exist)
  287. goto fail_unlock;
  288. else
  289. goto found;
  290. }
  291. scan_primary_hash:
  292. if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
  293. goto fail_unlock;
  294. }
  295. found:
  296. inet_sk(sk)->inet_num = snum;
  297. udp_sk(sk)->udp_port_hash = snum;
  298. udp_sk(sk)->udp_portaddr_hash ^= snum;
  299. if (sk_unhashed(sk)) {
  300. if (sk->sk_reuseport &&
  301. udp_reuseport_add_sock(sk, hslot)) {
  302. inet_sk(sk)->inet_num = 0;
  303. udp_sk(sk)->udp_port_hash = 0;
  304. udp_sk(sk)->udp_portaddr_hash ^= snum;
  305. goto fail_unlock;
  306. }
  307. sock_set_flag(sk, SOCK_RCU_FREE);
  308. sk_add_node_rcu(sk, &hslot->head);
  309. hslot->count++;
  310. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  311. hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
  312. spin_lock(&hslot2->lock);
  313. if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
  314. sk->sk_family == AF_INET6)
  315. hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
  316. &hslot2->head);
  317. else
  318. hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
  319. &hslot2->head);
  320. hslot2->count++;
  321. spin_unlock(&hslot2->lock);
  322. }
  323. error = 0;
  324. fail_unlock:
  325. spin_unlock_bh(&hslot->lock);
  326. fail:
  327. return error;
  328. }
  329. EXPORT_SYMBOL(udp_lib_get_port);
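/*
 * Illustrative sketch (not part of this file): binding with port 0 is what
 * drives the randomized ephemeral-port search above; the port that
 * udp_lib_get_port() settles on can be read back with getsockname().
 * A minimal user-space sketch, assuming Linux:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int main(void)
 *	{
 *		struct sockaddr_in a;
 *		socklen_t alen = sizeof(a);
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		memset(&a, 0, sizeof(a));
 *		a.sin_family = AF_INET;
 *		a.sin_addr.s_addr = htonl(INADDR_ANY);
 *		a.sin_port = 0;		// let the kernel pick a port
 *		if (fd < 0 || bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0)
 *			return 1;
 *		getsockname(fd, (struct sockaddr *)&a, &alen);
 *		printf("bound to UDP port %u\n", ntohs(a.sin_port));
 *		return 0;
 *	}
 */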
  330. int udp_v4_get_port(struct sock *sk, unsigned short snum)
  331. {
  332. unsigned int hash2_nulladdr =
  333. ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
  334. unsigned int hash2_partial =
  335. ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
  336. /* precompute partial secondary hash */
  337. udp_sk(sk)->udp_portaddr_hash = hash2_partial;
  338. return udp_lib_get_port(sk, snum, hash2_nulladdr);
  339. }
  340. static int compute_score(struct sock *sk, const struct net *net,
  341. __be32 saddr, __be16 sport,
  342. __be32 daddr, unsigned short hnum,
  343. int dif, int sdif)
  344. {
  345. int score;
  346. struct inet_sock *inet;
  347. bool dev_match;
  348. if (!net_eq(sock_net(sk), net) ||
  349. udp_sk(sk)->udp_port_hash != hnum ||
  350. ipv6_only_sock(sk))
  351. return -1;
  352. if (sk->sk_rcv_saddr != daddr)
  353. return -1;
  354. score = (sk->sk_family == PF_INET) ? 2 : 1;
  355. inet = inet_sk(sk);
  356. if (inet->inet_daddr) {
  357. if (inet->inet_daddr != saddr)
  358. return -1;
  359. score += 4;
  360. }
  361. if (inet->inet_dport) {
  362. if (inet->inet_dport != sport)
  363. return -1;
  364. score += 4;
  365. }
  366. dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
  367. dif, sdif);
  368. if (!dev_match)
  369. return -1;
  370. if (sk->sk_bound_dev_if)
  371. score += 4;
  372. if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
  373. score++;
  374. return score;
  375. }
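/*
 * Illustrative sketch (not part of this file): compute_score() prefers
 * sockets whose connected 4-tuple matches the incoming datagram (the
 * inet_daddr/inet_dport checks) over wildcard binds. From user space this is
 * what connect() on a UDP socket buys: a fixed default destination, and
 * delivery restricted to datagrams from that peer. A minimal sketch, with a
 * hypothetical peer 192.0.2.1:53:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int udp_connect_peer(int fd)
 *	{
 *		struct sockaddr_in peer;
 *
 *		memset(&peer, 0, sizeof(peer));
 *		peer.sin_family = AF_INET;
 *		peer.sin_port = htons(53);
 *		inet_pton(AF_INET, "192.0.2.1", &peer.sin_addr);
 *		// after this, send() needs no address and recv() only sees
 *		// datagrams from 192.0.2.1:53
 *		return connect(fd, (struct sockaddr *)&peer, sizeof(peer));
 *	}
 */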
  376. INDIRECT_CALLABLE_SCOPE
  377. u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
  378. const __be32 faddr, const __be16 fport)
  379. {
  380. net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
  381. return __inet_ehashfn(laddr, lport, faddr, fport,
  382. udp_ehash_secret + net_hash_mix(net));
  383. }
  384. /**
  385. * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port)
  386. * @net: Network namespace
  387. * @saddr: Source address, network order
  388. * @sport: Source port, network order
  389. * @daddr: Destination address, network order
  390. * @hnum: Destination port, host order
  391. * @dif: Destination interface index
  392. * @sdif: Destination bridge port index, if relevant
  393. * @udptable: Set of UDP hash tables
  394. *
  395. * Simplified lookup to be used as fallback if no sockets are found due to a
  396. * potential race between (receive) address change, and lookup happening before
  397. * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
  398. * result sockets, because if we have one, we don't need the fallback at all.
  399. *
  400. * Called under rcu_read_lock().
  401. *
  402. * Return: socket with highest matching score if any, NULL if none
  403. */
  404. static struct sock *udp4_lib_lookup1(const struct net *net,
  405. __be32 saddr, __be16 sport,
  406. __be32 daddr, unsigned int hnum,
  407. int dif, int sdif,
  408. const struct udp_table *udptable)
  409. {
  410. unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
  411. struct udp_hslot *hslot = &udptable->hash[slot];
  412. struct sock *sk, *result = NULL;
  413. int score, badness = 0;
  414. sk_for_each_rcu(sk, &hslot->head) {
  415. score = compute_score(sk, net,
  416. saddr, sport, daddr, hnum, dif, sdif);
  417. if (score > badness) {
  418. result = sk;
  419. badness = score;
  420. }
  421. }
  422. return result;
  423. }
  424. /* called with rcu_read_lock() */
  425. static struct sock *udp4_lib_lookup2(const struct net *net,
  426. __be32 saddr, __be16 sport,
  427. __be32 daddr, unsigned int hnum,
  428. int dif, int sdif,
  429. struct udp_hslot *hslot2,
  430. struct sk_buff *skb)
  431. {
  432. struct sock *sk, *result;
  433. int score, badness;
  434. bool need_rescore;
  435. result = NULL;
  436. badness = 0;
  437. udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
  438. need_rescore = false;
  439. rescore:
  440. score = compute_score(need_rescore ? result : sk, net, saddr,
  441. sport, daddr, hnum, dif, sdif);
  442. if (score > badness) {
  443. badness = score;
  444. if (need_rescore)
  445. continue;
  446. if (sk->sk_state == TCP_ESTABLISHED) {
  447. result = sk;
  448. continue;
  449. }
  450. result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
  451. saddr, sport, daddr, hnum, udp_ehashfn);
  452. if (!result) {
  453. result = sk;
  454. continue;
  455. }
  456. /* Fall back to scoring if group has connections */
  457. if (!reuseport_has_conns(sk))
  458. return result;
  459. /* Reuseport logic returned an error, keep original score. */
  460. if (IS_ERR(result))
  461. continue;
  462. /* compute_score is too long of a function to be
  463. * inlined, and calling it again here yields
  464. * measurable overhead for some
  465. * workloads. Work around it by jumping
  466. * backwards to rescore 'result'.
  467. */
  468. need_rescore = true;
  469. goto rescore;
  470. }
  471. }
  472. return result;
  473. }
  474. /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  475. * harder than this. -DaveM
  476. */
  477. struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
  478. __be16 sport, __be32 daddr, __be16 dport, int dif,
  479. int sdif, struct udp_table *udptable, struct sk_buff *skb)
  480. {
  481. unsigned short hnum = ntohs(dport);
  482. unsigned int hash2, slot2;
  483. struct udp_hslot *hslot2;
  484. struct sock *result, *sk;
  485. hash2 = ipv4_portaddr_hash(net, daddr, hnum);
  486. slot2 = hash2 & udptable->mask;
  487. hslot2 = &udptable->hash2[slot2];
  488. /* Lookup connected or non-wildcard socket */
  489. result = udp4_lib_lookup2(net, saddr, sport,
  490. daddr, hnum, dif, sdif,
  491. hslot2, skb);
  492. if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
  493. goto done;
  494. /* Lookup redirect from BPF */
  495. if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
  496. udptable == net->ipv4.udp_table) {
  497. sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
  498. saddr, sport, daddr, hnum, dif,
  499. udp_ehashfn);
  500. if (sk) {
  501. result = sk;
  502. goto done;
  503. }
  504. }
  505. /* Got non-wildcard socket or error on first lookup */
  506. if (result)
  507. goto done;
  508. /* Lookup wildcard sockets */
  509. hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
  510. slot2 = hash2 & udptable->mask;
  511. hslot2 = &udptable->hash2[slot2];
  512. result = udp4_lib_lookup2(net, saddr, sport,
  513. htonl(INADDR_ANY), hnum, dif, sdif,
  514. hslot2, skb);
  515. if (!IS_ERR_OR_NULL(result))
  516. goto done;
  517. /* Primary hash (destination port) lookup as fallback for this race:
  518. * 1. __ip4_datagram_connect() sets sk_rcv_saddr
  519. * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet
  520. * 3. rehash operation updating _secondary and four-tuple_ hashes
  521. * The primary hash doesn't need an update after 1., so, thanks to this
  522. * further step, 1. and 3. don't need to be atomic against the lookup.
  523. */
  524. result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
  525. udptable);
  526. done:
  527. if (IS_ERR(result))
  528. return NULL;
  529. return result;
  530. }
  531. EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
  532. static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
  533. __be16 sport, __be16 dport,
  534. struct udp_table *udptable)
  535. {
  536. const struct iphdr *iph = ip_hdr(skb);
  537. return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
  538. iph->daddr, dport, inet_iif(skb),
  539. inet_sdif(skb), udptable, skb);
  540. }
  541. struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
  542. __be16 sport, __be16 dport)
  543. {
  544. const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
  545. const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
  546. struct net *net = dev_net(skb->dev);
  547. int iif, sdif;
  548. inet_get_iif_sdif(skb, &iif, &sdif);
  549. return __udp4_lib_lookup(net, iph->saddr, sport,
  550. iph->daddr, dport, iif,
  551. sdif, net->ipv4.udp_table, NULL);
  552. }
  553. /* Must be called under rcu_read_lock().
  554. * Does increment socket refcount.
  555. */
  556. #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
  557. struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
  558. __be32 daddr, __be16 dport, int dif)
  559. {
  560. struct sock *sk;
  561. sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
  562. dif, 0, net->ipv4.udp_table, NULL);
  563. if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
  564. sk = NULL;
  565. return sk;
  566. }
  567. EXPORT_SYMBOL_GPL(udp4_lib_lookup);
  568. #endif
  569. static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk,
  570. __be16 loc_port, __be32 loc_addr,
  571. __be16 rmt_port, __be32 rmt_addr,
  572. int dif, int sdif, unsigned short hnum)
  573. {
  574. const struct inet_sock *inet = inet_sk(sk);
  575. if (!net_eq(sock_net(sk), net) ||
  576. udp_sk(sk)->udp_port_hash != hnum ||
  577. (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
  578. (inet->inet_dport != rmt_port && inet->inet_dport) ||
  579. (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
  580. ipv6_only_sock(sk) ||
  581. !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
  582. return false;
  583. if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
  584. return false;
  585. return true;
  586. }
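/*
 * Illustrative sketch (not part of this file): __udp_is_mcast_sock() only
 * passes delivery for groups the socket has actually joined (the
 * ip_mc_sf_allow() check). The user-space side is IP_ADD_MEMBERSHIP on a
 * socket bound to the group's port. A minimal sketch, with a hypothetical
 * group 239.1.1.1:
 *
 *	#include <arpa/inet.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int join_group(int fd)
 *	{
 *		struct ip_mreqn mreq;
 *
 *		memset(&mreq, 0, sizeof(mreq));
 *		inet_pton(AF_INET, "239.1.1.1", &mreq.imr_multiaddr);
 *		mreq.imr_address.s_addr = htonl(INADDR_ANY);
 *		mreq.imr_ifindex = 0;	// let routing pick the interface
 *		return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
 *				  &mreq, sizeof(mreq));
 *	}
 */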
  587. DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
  588. EXPORT_SYMBOL(udp_encap_needed_key);
  589. #if IS_ENABLED(CONFIG_IPV6)
  590. DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
  591. EXPORT_SYMBOL(udpv6_encap_needed_key);
  592. #endif
  593. void udp_encap_enable(void)
  594. {
  595. static_branch_inc(&udp_encap_needed_key);
  596. }
  597. EXPORT_SYMBOL(udp_encap_enable);
  598. void udp_encap_disable(void)
  599. {
  600. static_branch_dec(&udp_encap_needed_key);
  601. }
  602. EXPORT_SYMBOL(udp_encap_disable);
  603. /* Handler for tunnels with arbitrary destination ports: no socket lookup, go
  604. * through error handlers in encapsulations looking for a match.
  605. */
  606. static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
  607. {
  608. int i;
  609. for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
  610. int (*handler)(struct sk_buff *skb, u32 info);
  611. const struct ip_tunnel_encap_ops *encap;
  612. encap = rcu_dereference(iptun_encaps[i]);
  613. if (!encap)
  614. continue;
  615. handler = encap->err_handler;
  616. if (handler && !handler(skb, info))
  617. return 0;
  618. }
  619. return -ENOENT;
  620. }
  621. /* Try to match ICMP errors to UDP tunnels by looking up a socket without
  622. * reversing source and destination port: this will match tunnels that force the
  623. * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
  624. * lwtunnels might actually break this assumption by being configured with
  625. * different destination ports on endpoints, in this case we won't be able to
  626. * trace ICMP messages back to them.
  627. *
  628. * If this doesn't match any socket, probe tunnels with arbitrary destination
  629. * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
  630. * we've sent packets to won't necessarily match the local destination port.
  631. *
  632. * Then ask the tunnel implementation to match the error against a valid
  633. * association.
  634. *
  635. * Return an error if we can't find a match, the socket if we need further
  636. * processing, zero otherwise.
  637. */
  638. static struct sock *__udp4_lib_err_encap(struct net *net,
  639. const struct iphdr *iph,
  640. struct udphdr *uh,
  641. struct udp_table *udptable,
  642. struct sock *sk,
  643. struct sk_buff *skb, u32 info)
  644. {
  645. int (*lookup)(struct sock *sk, struct sk_buff *skb);
  646. int network_offset, transport_offset;
  647. struct udp_sock *up;
  648. network_offset = skb_network_offset(skb);
  649. transport_offset = skb_transport_offset(skb);
  650. /* Network header needs to point to the outer IPv4 header inside ICMP */
  651. skb_reset_network_header(skb);
  652. /* Transport header needs to point to the UDP header */
  653. skb_set_transport_header(skb, iph->ihl << 2);
  654. if (sk) {
  655. up = udp_sk(sk);
  656. lookup = READ_ONCE(up->encap_err_lookup);
  657. if (lookup && lookup(sk, skb))
  658. sk = NULL;
  659. goto out;
  660. }
  661. sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
  662. iph->saddr, uh->dest, skb->dev->ifindex, 0,
  663. udptable, NULL);
  664. if (sk) {
  665. up = udp_sk(sk);
  666. lookup = READ_ONCE(up->encap_err_lookup);
  667. if (!lookup || lookup(sk, skb))
  668. sk = NULL;
  669. }
  670. out:
  671. if (!sk)
  672. sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
  673. skb_set_transport_header(skb, transport_offset);
  674. skb_set_network_header(skb, network_offset);
  675. return sk;
  676. }
  677. /*
  678. * This routine is called by the ICMP module when it gets some
  679. * sort of error condition. If err < 0 then the socket should
  680. * be closed and the error returned to the user. If err > 0
  681. * it's just the icmp type << 8 | icmp code.
  682. * Header points to the ip header of the error packet. We move
  683. * on past this. Then (as it used to claim before adjustment)
  684. * header points to the first 8 bytes of the udp header. We need
  685. * to find the appropriate port.
  686. */
  687. int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
  688. {
  689. struct inet_sock *inet;
  690. const struct iphdr *iph = (const struct iphdr *)skb->data;
  691. struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
  692. const int type = icmp_hdr(skb)->type;
  693. const int code = icmp_hdr(skb)->code;
  694. bool tunnel = false;
  695. struct sock *sk;
  696. int harderr;
  697. int err;
  698. struct net *net = dev_net(skb->dev);
  699. sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
  700. iph->saddr, uh->source, skb->dev->ifindex,
  701. inet_sdif(skb), udptable, NULL);
  702. if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
  703. /* No socket for error: try tunnels before discarding */
  704. if (static_branch_unlikely(&udp_encap_needed_key)) {
  705. sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
  706. info);
  707. if (!sk)
  708. return 0;
  709. } else
  710. sk = ERR_PTR(-ENOENT);
  711. if (IS_ERR(sk)) {
  712. __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
  713. return PTR_ERR(sk);
  714. }
  715. tunnel = true;
  716. }
  717. err = 0;
  718. harderr = 0;
  719. inet = inet_sk(sk);
  720. switch (type) {
  721. default:
  722. case ICMP_TIME_EXCEEDED:
  723. err = EHOSTUNREACH;
  724. break;
  725. case ICMP_SOURCE_QUENCH:
  726. goto out;
  727. case ICMP_PARAMETERPROB:
  728. err = EPROTO;
  729. harderr = 1;
  730. break;
  731. case ICMP_DEST_UNREACH:
  732. if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
  733. ipv4_sk_update_pmtu(skb, sk, info);
  734. if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
  735. err = EMSGSIZE;
  736. harderr = 1;
  737. break;
  738. }
  739. goto out;
  740. }
  741. err = EHOSTUNREACH;
  742. if (code <= NR_ICMP_UNREACH) {
  743. harderr = icmp_err_convert[code].fatal;
  744. err = icmp_err_convert[code].errno;
  745. }
  746. break;
  747. case ICMP_REDIRECT:
  748. ipv4_sk_redirect(skb, sk);
  749. goto out;
  750. }
  751. /*
  752. * RFC1122: OK. Passes ICMP errors back to application, as per
  753. * 4.1.3.3.
  754. */
  755. if (tunnel) {
  756. /* ...not for tunnels though: we don't have a sending socket */
  757. if (udp_sk(sk)->encap_err_rcv)
  758. udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
  759. (u8 *)(uh+1));
  760. goto out;
  761. }
  762. if (!inet_test_bit(RECVERR, sk)) {
  763. if (!harderr || sk->sk_state != TCP_ESTABLISHED)
  764. goto out;
  765. } else
  766. ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
  767. sk->sk_err = err;
  768. sk_error_report(sk);
  769. out:
  770. return 0;
  771. }
  772. int udp_err(struct sk_buff *skb, u32 info)
  773. {
  774. return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
  775. }
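/*
 * Illustrative sketch (not part of this file): __udp4_lib_err() only queues
 * the ICMP error for the application when IP_RECVERR is enabled; without it,
 * a hard error on a connected socket merely sets sk_err. A minimal user-space
 * sketch that drains one queued error from MSG_ERRQUEUE, assuming IP_RECVERR
 * was enabled with setsockopt(fd, IPPROTO_IP, IP_RECVERR, ...) before the
 * failing send:
 *
 *	#include <linux/errqueue.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int read_udp_error(int fd)	// returns an errno value or 0
 *	{
 *		struct sock_extended_err *serr;
 *		union { char buf[512]; struct cmsghdr align; } cbuf;
 *		char data[64];
 *		struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *				      .msg_control = cbuf.buf,
 *				      .msg_controllen = sizeof(cbuf.buf) };
 *		struct cmsghdr *cm;
 *
 *		if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
 *			return 0;
 *		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
 *			if (cm->cmsg_level == IPPROTO_IP &&
 *			    cm->cmsg_type == IP_RECVERR) {
 *				serr = (struct sock_extended_err *)CMSG_DATA(cm);
 *				if (serr->ee_origin == SO_EE_ORIGIN_ICMP)
 *					return serr->ee_errno;
 *			}
 *		}
 *		return 0;
 *	}
 */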
  776. /*
  777. * Throw away all pending data and cancel the corking. Socket is locked.
  778. */
  779. void udp_flush_pending_frames(struct sock *sk)
  780. {
  781. struct udp_sock *up = udp_sk(sk);
  782. if (up->pending) {
  783. up->len = 0;
  784. WRITE_ONCE(up->pending, 0);
  785. ip_flush_pending_frames(sk);
  786. }
  787. }
  788. EXPORT_SYMBOL(udp_flush_pending_frames);
  789. /**
  790. * udp4_hwcsum - handle outgoing HW checksumming
  791. * @skb: sk_buff containing the filled-in UDP header
  792. * (checksum field must be zeroed out)
  793. * @src: source IP address
  794. * @dst: destination IP address
  795. */
  796. void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
  797. {
  798. struct udphdr *uh = udp_hdr(skb);
  799. int offset = skb_transport_offset(skb);
  800. int len = skb->len - offset;
  801. int hlen = len;
  802. __wsum csum = 0;
  803. if (!skb_has_frag_list(skb)) {
  804. /*
  805. * Only one fragment on the socket.
  806. */
  807. skb->csum_start = skb_transport_header(skb) - skb->head;
  808. skb->csum_offset = offsetof(struct udphdr, check);
  809. uh->check = ~csum_tcpudp_magic(src, dst, len,
  810. IPPROTO_UDP, 0);
  811. } else {
  812. struct sk_buff *frags;
  813. /*
  814. * HW-checksum won't work as there are two or more
  815. * fragments on the socket so that all csums of sk_buffs
  816. * should be together
  817. */
  818. skb_walk_frags(skb, frags) {
  819. csum = csum_add(csum, frags->csum);
  820. hlen -= frags->len;
  821. }
  822. csum = skb_checksum(skb, offset, hlen, csum);
  823. skb->ip_summed = CHECKSUM_NONE;
  824. uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
  825. if (uh->check == 0)
  826. uh->check = CSUM_MANGLED_0;
  827. }
  828. }
  829. EXPORT_SYMBOL_GPL(udp4_hwcsum);
  830. /* Function to set UDP checksum for an IPv4 UDP packet. This is intended
  831. * for the simple case like when setting the checksum for a UDP tunnel.
  832. */
  833. void udp_set_csum(bool nocheck, struct sk_buff *skb,
  834. __be32 saddr, __be32 daddr, int len)
  835. {
  836. struct udphdr *uh = udp_hdr(skb);
  837. if (nocheck) {
  838. uh->check = 0;
  839. } else if (skb_is_gso(skb)) {
  840. uh->check = ~udp_v4_check(len, saddr, daddr, 0);
  841. } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
  842. uh->check = 0;
  843. uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
  844. if (uh->check == 0)
  845. uh->check = CSUM_MANGLED_0;
  846. } else {
  847. skb->ip_summed = CHECKSUM_PARTIAL;
  848. skb->csum_start = skb_transport_header(skb) - skb->head;
  849. skb->csum_offset = offsetof(struct udphdr, check);
  850. uh->check = ~udp_v4_check(len, saddr, daddr, 0);
  851. }
  852. }
  853. EXPORT_SYMBOL(udp_set_csum);
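/*
 * Illustrative sketch (not part of this file): both helpers above fold the
 * IPv4 pseudo-header (addresses, protocol 17, UDP length) into the
 * one's-complement checksum, and a result of 0 is transmitted as 0xffff
 * (CSUM_MANGLED_0) because 0 means "no checksum" for UDP over IPv4. The same
 * RFC 768 arithmetic written out in plain C, with saddr/daddr in host byte
 * order and the result to be stored with htons():
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static uint16_t udp4_checksum(uint32_t saddr, uint32_t daddr,
 *				      const uint8_t *udp, size_t len)
 *	{
 *		uint32_t sum = 0;
 *		size_t i;
 *
 *		// pseudo-header: saddr, daddr, zero+proto(17), UDP length
 *		sum += (saddr >> 16) + (saddr & 0xffff);
 *		sum += (daddr >> 16) + (daddr & 0xffff);
 *		sum += 17 + len;
 *
 *		// UDP header + payload as big-endian 16-bit words
 *		for (i = 0; i + 1 < len; i += 2)
 *			sum += (udp[i] << 8) | udp[i + 1];
 *		if (len & 1)
 *			sum += udp[len - 1] << 8;
 *
 *		while (sum >> 16)		// end-around carry
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		sum = (uint16_t)~sum;
 *		return sum ? sum : 0xffff;	// 0 would mean "no checksum"
 *	}
 */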
  854. static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
  855. struct inet_cork *cork)
  856. {
  857. struct sock *sk = skb->sk;
  858. struct inet_sock *inet = inet_sk(sk);
  859. struct udphdr *uh;
  860. int err;
  861. int is_udplite = IS_UDPLITE(sk);
  862. int offset = skb_transport_offset(skb);
  863. int len = skb->len - offset;
  864. int datalen = len - sizeof(*uh);
  865. __wsum csum = 0;
  866. /*
  867. * Create a UDP header
  868. */
  869. uh = udp_hdr(skb);
  870. uh->source = inet->inet_sport;
  871. uh->dest = fl4->fl4_dport;
  872. uh->len = htons(len);
  873. uh->check = 0;
  874. if (cork->gso_size) {
  875. const int hlen = skb_network_header_len(skb) +
  876. sizeof(struct udphdr);
  877. if (hlen + min(datalen, cork->gso_size) > cork->fragsize) {
  878. kfree_skb(skb);
  879. return -EMSGSIZE;
  880. }
  881. if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
  882. kfree_skb(skb);
  883. return -EINVAL;
  884. }
  885. if (sk->sk_no_check_tx) {
  886. kfree_skb(skb);
  887. return -EINVAL;
  888. }
  889. if (is_udplite || dst_xfrm(skb_dst(skb))) {
  890. kfree_skb(skb);
  891. return -EIO;
  892. }
  893. if (datalen > cork->gso_size) {
  894. skb_shinfo(skb)->gso_size = cork->gso_size;
  895. skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
  896. skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
  897. cork->gso_size);
  898. /* Don't checksum the payload, skb will get segmented */
  899. goto csum_partial;
  900. }
  901. }
  902. if (is_udplite) /* UDP-Lite */
  903. csum = udplite_csum(skb);
  904. else if (sk->sk_no_check_tx) { /* UDP csum off */
  905. skb->ip_summed = CHECKSUM_NONE;
  906. goto send;
  907. } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
  908. csum_partial:
  909. udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
  910. goto send;
  911. } else
  912. csum = udp_csum(skb);
  913. /* add protocol-dependent pseudo-header */
  914. uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
  915. sk->sk_protocol, csum);
  916. if (uh->check == 0)
  917. uh->check = CSUM_MANGLED_0;
  918. send:
  919. err = ip_send_skb(sock_net(sk), skb);
  920. if (err) {
  921. if (err == -ENOBUFS &&
  922. !inet_test_bit(RECVERR, sk)) {
  923. UDP_INC_STATS(sock_net(sk),
  924. UDP_MIB_SNDBUFERRORS, is_udplite);
  925. err = 0;
  926. }
  927. } else
  928. UDP_INC_STATS(sock_net(sk),
  929. UDP_MIB_OUTDATAGRAMS, is_udplite);
  930. return err;
  931. }
  932. /*
  933. * Push out all pending data as one UDP datagram. Socket is locked.
  934. */
  935. int udp_push_pending_frames(struct sock *sk)
  936. {
  937. struct udp_sock *up = udp_sk(sk);
  938. struct inet_sock *inet = inet_sk(sk);
  939. struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
  940. struct sk_buff *skb;
  941. int err = 0;
  942. skb = ip_finish_skb(sk, fl4);
  943. if (!skb)
  944. goto out;
  945. err = udp_send_skb(skb, fl4, &inet->cork.base);
  946. out:
  947. up->len = 0;
  948. WRITE_ONCE(up->pending, 0);
  949. return err;
  950. }
  951. EXPORT_SYMBOL(udp_push_pending_frames);
  952. static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
  953. {
  954. switch (cmsg->cmsg_type) {
  955. case UDP_SEGMENT:
  956. if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
  957. return -EINVAL;
  958. *gso_size = *(__u16 *)CMSG_DATA(cmsg);
  959. return 0;
  960. default:
  961. return -EINVAL;
  962. }
  963. }
  964. int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
  965. {
  966. struct cmsghdr *cmsg;
  967. bool need_ip = false;
  968. int err;
  969. for_each_cmsghdr(cmsg, msg) {
  970. if (!CMSG_OK(msg, cmsg))
  971. return -EINVAL;
  972. if (cmsg->cmsg_level != SOL_UDP) {
  973. need_ip = true;
  974. continue;
  975. }
  976. err = __udp_cmsg_send(cmsg, gso_size);
  977. if (err)
  978. return err;
  979. }
  980. return need_ip;
  981. }
  982. EXPORT_SYMBOL_GPL(udp_cmsg_send);
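/*
 * Illustrative sketch (not part of this file): the UDP_SEGMENT cmsg parsed
 * above sets cork->gso_size, which udp_send_skb() later turns into
 * SKB_GSO_UDP_L4 segments. From user space the same thing can be requested
 * per sendmsg() call; a minimal sketch, assuming Linux >= 4.18, a connected
 * socket, and an arbitrary 1200-byte segment size:
 *
 *	#include <netinet/in.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#ifndef UDP_SEGMENT
 *	#define UDP_SEGMENT 103			// from linux/udp.h
 *	#endif
 *
 *	static ssize_t send_gso(int fd, const void *buf, size_t len)
 *	{
 *		uint16_t gso_size = 1200;
 *		union { char buf[CMSG_SPACE(sizeof(uint16_t))];
 *			struct cmsghdr align; } cbuf;
 *		struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
 *		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *				      .msg_control = cbuf.buf,
 *				      .msg_controllen = sizeof(cbuf.buf) };
 *		struct cmsghdr *cm;
 *
 *		memset(&cbuf, 0, sizeof(cbuf));
 *		cm = CMSG_FIRSTHDR(&msg);
 *		cm->cmsg_level = IPPROTO_UDP;	// == SOL_UDP
 *		cm->cmsg_type = UDP_SEGMENT;
 *		cm->cmsg_len = CMSG_LEN(sizeof(gso_size));
 *		memcpy(CMSG_DATA(cm), &gso_size, sizeof(gso_size));
 *		return sendmsg(fd, &msg, 0);
 *	}
 */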
  983. int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
  984. {
  985. struct inet_sock *inet = inet_sk(sk);
  986. struct udp_sock *up = udp_sk(sk);
  987. DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
  988. struct flowi4 fl4_stack;
  989. struct flowi4 *fl4;
  990. int ulen = len;
  991. struct ipcm_cookie ipc;
  992. struct rtable *rt = NULL;
  993. int free = 0;
  994. int connected = 0;
  995. __be32 daddr, faddr, saddr;
  996. u8 tos, scope;
  997. __be16 dport;
  998. int err, is_udplite = IS_UDPLITE(sk);
  999. int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
  1000. int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
  1001. struct sk_buff *skb;
  1002. struct ip_options_data opt_copy;
  1003. int uc_index;
  1004. if (len > 0xFFFF)
  1005. return -EMSGSIZE;
  1006. /*
  1007. * Check the flags.
  1008. */
  1009. if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
  1010. return -EOPNOTSUPP;
  1011. getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
  1012. fl4 = &inet->cork.fl.u.ip4;
  1013. if (READ_ONCE(up->pending)) {
  1014. /*
  1015. * There are pending frames.
  1016. * The socket lock must be held while it's corked.
  1017. */
  1018. lock_sock(sk);
  1019. if (likely(up->pending)) {
  1020. if (unlikely(up->pending != AF_INET)) {
  1021. release_sock(sk);
  1022. return -EINVAL;
  1023. }
  1024. goto do_append_data;
  1025. }
  1026. release_sock(sk);
  1027. }
  1028. ulen += sizeof(struct udphdr);
  1029. /*
  1030. * Get and verify the address.
  1031. */
  1032. if (usin) {
  1033. if (msg->msg_namelen < sizeof(*usin))
  1034. return -EINVAL;
  1035. if (usin->sin_family != AF_INET) {
  1036. if (usin->sin_family != AF_UNSPEC)
  1037. return -EAFNOSUPPORT;
  1038. }
  1039. daddr = usin->sin_addr.s_addr;
  1040. dport = usin->sin_port;
  1041. if (dport == 0)
  1042. return -EINVAL;
  1043. } else {
  1044. if (sk->sk_state != TCP_ESTABLISHED)
  1045. return -EDESTADDRREQ;
  1046. daddr = inet->inet_daddr;
  1047. dport = inet->inet_dport;
  1048. /* Open fast path for connected socket.
  1049. Route will not be used, if at least one option is set.
  1050. */
  1051. connected = 1;
  1052. }
  1053. ipcm_init_sk(&ipc, inet);
  1054. ipc.gso_size = READ_ONCE(up->gso_size);
  1055. if (msg->msg_controllen) {
  1056. err = udp_cmsg_send(sk, msg, &ipc.gso_size);
  1057. if (err > 0) {
  1058. err = ip_cmsg_send(sk, msg, &ipc,
  1059. sk->sk_family == AF_INET6);
  1060. connected = 0;
  1061. }
  1062. if (unlikely(err < 0)) {
  1063. kfree(ipc.opt);
  1064. return err;
  1065. }
  1066. if (ipc.opt)
  1067. free = 1;
  1068. }
  1069. if (!ipc.opt) {
  1070. struct ip_options_rcu *inet_opt;
  1071. rcu_read_lock();
  1072. inet_opt = rcu_dereference(inet->inet_opt);
  1073. if (inet_opt) {
  1074. memcpy(&opt_copy, inet_opt,
  1075. sizeof(*inet_opt) + inet_opt->opt.optlen);
  1076. ipc.opt = &opt_copy.opt;
  1077. }
  1078. rcu_read_unlock();
  1079. }
  1080. if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
  1081. err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
  1082. (struct sockaddr *)usin,
  1083. &msg->msg_namelen,
  1084. &ipc.addr);
  1085. if (err)
  1086. goto out_free;
  1087. if (usin) {
  1088. if (usin->sin_port == 0) {
  1089. /* BPF program set invalid port. Reject it. */
  1090. err = -EINVAL;
  1091. goto out_free;
  1092. }
  1093. daddr = usin->sin_addr.s_addr;
  1094. dport = usin->sin_port;
  1095. }
  1096. }
  1097. saddr = ipc.addr;
  1098. ipc.addr = faddr = daddr;
  1099. if (ipc.opt && ipc.opt->opt.srr) {
  1100. if (!daddr) {
  1101. err = -EINVAL;
  1102. goto out_free;
  1103. }
  1104. faddr = ipc.opt->opt.faddr;
  1105. connected = 0;
  1106. }
  1107. tos = get_rttos(&ipc, inet);
  1108. scope = ip_sendmsg_scope(inet, &ipc, msg);
  1109. if (scope == RT_SCOPE_LINK)
  1110. connected = 0;
  1111. uc_index = READ_ONCE(inet->uc_index);
  1112. if (ipv4_is_multicast(daddr)) {
  1113. if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
  1114. ipc.oif = READ_ONCE(inet->mc_index);
  1115. if (!saddr)
  1116. saddr = READ_ONCE(inet->mc_addr);
  1117. connected = 0;
  1118. } else if (!ipc.oif) {
  1119. ipc.oif = uc_index;
  1120. } else if (ipv4_is_lbcast(daddr) && uc_index) {
  1121. /* oif is set, packet is to local broadcast and
  1122. * uc_index is set. oif is most likely set
  1123. * by sk_bound_dev_if. If uc_index != oif check if the
  1124. * oif is an L3 master and uc_index is an L3 slave.
  1125. * If so, we want to allow the send using the uc_index.
  1126. */
  1127. if (ipc.oif != uc_index &&
  1128. ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
  1129. uc_index)) {
  1130. ipc.oif = uc_index;
  1131. }
  1132. }
  1133. if (connected)
  1134. rt = dst_rtable(sk_dst_check(sk, 0));
  1135. if (!rt) {
  1136. struct net *net = sock_net(sk);
  1137. __u8 flow_flags = inet_sk_flowi_flags(sk);
  1138. fl4 = &fl4_stack;
  1139. flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope,
  1140. sk->sk_protocol, flow_flags, faddr, saddr,
  1141. dport, inet->inet_sport, sk->sk_uid);
  1142. security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
  1143. rt = ip_route_output_flow(net, fl4, sk);
  1144. if (IS_ERR(rt)) {
  1145. err = PTR_ERR(rt);
  1146. rt = NULL;
  1147. if (err == -ENETUNREACH)
  1148. IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
  1149. goto out;
  1150. }
  1151. err = -EACCES;
  1152. if ((rt->rt_flags & RTCF_BROADCAST) &&
  1153. !sock_flag(sk, SOCK_BROADCAST))
  1154. goto out;
  1155. if (connected)
  1156. sk_dst_set(sk, dst_clone(&rt->dst));
  1157. }
  1158. if (msg->msg_flags&MSG_CONFIRM)
  1159. goto do_confirm;
  1160. back_from_confirm:
  1161. saddr = fl4->saddr;
  1162. if (!ipc.addr)
  1163. daddr = ipc.addr = fl4->daddr;
  1164. /* Lockless fast path for the non-corking case. */
  1165. if (!corkreq) {
  1166. struct inet_cork cork;
  1167. skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
  1168. sizeof(struct udphdr), &ipc, &rt,
  1169. &cork, msg->msg_flags);
  1170. err = PTR_ERR(skb);
  1171. if (!IS_ERR_OR_NULL(skb))
  1172. err = udp_send_skb(skb, fl4, &cork);
  1173. goto out;
  1174. }
  1175. lock_sock(sk);
  1176. if (unlikely(up->pending)) {
  1177. /* The socket is already corked while preparing it. */
  1178. /* ... which is an evident application bug. --ANK */
  1179. release_sock(sk);
  1180. net_dbg_ratelimited("socket already corked\n");
  1181. err = -EINVAL;
  1182. goto out;
  1183. }
  1184. /*
  1185. * Now cork the socket to pend data.
  1186. */
  1187. fl4 = &inet->cork.fl.u.ip4;
  1188. fl4->daddr = daddr;
  1189. fl4->saddr = saddr;
  1190. fl4->fl4_dport = dport;
  1191. fl4->fl4_sport = inet->inet_sport;
  1192. WRITE_ONCE(up->pending, AF_INET);
  1193. do_append_data:
  1194. up->len += ulen;
  1195. err = ip_append_data(sk, fl4, getfrag, msg, ulen,
  1196. sizeof(struct udphdr), &ipc, &rt,
  1197. corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
  1198. if (err)
  1199. udp_flush_pending_frames(sk);
  1200. else if (!corkreq)
  1201. err = udp_push_pending_frames(sk);
  1202. else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
  1203. WRITE_ONCE(up->pending, 0);
  1204. release_sock(sk);
  1205. out:
  1206. ip_rt_put(rt);
  1207. out_free:
  1208. if (free)
  1209. kfree(ipc.opt);
  1210. if (!err)
  1211. return len;
  1212. /*
  1213. * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
  1214. * ENOBUFS might not be good (it's not tunable per se), but otherwise
  1215. * we don't have a good statistic (IpOutDiscards but it can be too many
  1216. * things). We could add another new stat but at least for now that
  1217. * seems like overkill.
  1218. */
  1219. if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  1220. UDP_INC_STATS(sock_net(sk),
  1221. UDP_MIB_SNDBUFERRORS, is_udplite);
  1222. }
  1223. return err;
  1224. do_confirm:
  1225. if (msg->msg_flags & MSG_PROBE)
  1226. dst_confirm_neigh(&rt->dst, &fl4->daddr);
  1227. if (!(msg->msg_flags&MSG_PROBE) || len)
  1228. goto back_from_confirm;
  1229. err = 0;
  1230. goto out;
  1231. }
  1232. EXPORT_SYMBOL(udp_sendmsg);
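/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * corking logic above is driven by the UDP_CORK socket option or the
 * MSG_MORE flag.  The fd below is assumed to be a connected UDP socket
 * and error handling is omitted; UDP_CORK comes from <netinet/udp.h>
 * (or <linux/udp.h> on older libcs).
 *
 *	#include <netinet/in.h>
 *	#include <netinet/udp.h>
 *	#include <sys/socket.h>
 *
 *	static void send_as_one_datagram(int fd)
 *	{
 *		int on = 1, off = 0;
 *
 *		// Cork: queue the next writes instead of sending them.
 *		setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
 *		send(fd, "hello ", 6, 0);
 *		send(fd, "world", 5, 0);
 *		// Uncork: both writes leave as a single datagram.
 *		setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
 *
 *		// Equivalent without UDP_CORK: flag all but the last write.
 *		send(fd, "hello ", 6, MSG_MORE);
 *		send(fd, "world", 5, 0);
 *	}
 */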
  1233. void udp_splice_eof(struct socket *sock)
  1234. {
  1235. struct sock *sk = sock->sk;
  1236. struct udp_sock *up = udp_sk(sk);
  1237. if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
  1238. return;
  1239. lock_sock(sk);
  1240. if (up->pending && !udp_test_bit(CORK, sk))
  1241. udp_push_pending_frames(sk);
  1242. release_sock(sk);
  1243. }
  1244. EXPORT_SYMBOL_GPL(udp_splice_eof);
  1245. #define UDP_SKB_IS_STATELESS 0x80000000
  1246. /* all head states (dst, sk, nf conntrack) except skb extensions are
  1247. * cleared by udp_rcv().
  1248. *
  1249. * We need to preserve secpath, if present, to eventually process
  1250. * IP_CMSG_PASSSEC at recvmsg() time.
  1251. *
  1252. * Other extensions can be cleared.
  1253. */
  1254. static bool udp_try_make_stateless(struct sk_buff *skb)
  1255. {
  1256. if (!skb_has_extensions(skb))
  1257. return true;
  1258. if (!secpath_exists(skb)) {
  1259. skb_ext_reset(skb);
  1260. return true;
  1261. }
  1262. return false;
  1263. }
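/* Cache a few hot skb fields in the long-sized scratch area behind
 * skb->dev_scratch: truesize always, plus len / checksum state /
 * linearity on 64-bit.  Consumers can then read the cached copy and
 * avoid cold cache line misses while the receive queue lock is held
 * (see udp_skb_truesize() and udp_skb_destructor() below).
 */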
  1264. static void udp_set_dev_scratch(struct sk_buff *skb)
  1265. {
  1266. struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
  1267. BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
  1268. scratch->_tsize_state = skb->truesize;
  1269. #if BITS_PER_LONG == 64
  1270. scratch->len = skb->len;
  1271. scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
  1272. scratch->is_linear = !skb_is_nonlinear(skb);
  1273. #endif
  1274. if (udp_try_make_stateless(skb))
  1275. scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
  1276. }
  1277. static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
  1278. {
  1279. /* We come here after udp_lib_checksum_complete() returned 0.
  1280. * This means that __skb_checksum_complete() might have
  1281. * set skb->csum_valid to 1.
  1282. * On 64bit platforms, we can set csum_unnecessary
  1283. * to true, but only if the skb is not shared.
  1284. */
  1285. #if BITS_PER_LONG == 64
  1286. if (!skb_shared(skb))
  1287. udp_skb_scratch(skb)->csum_unnecessary = true;
  1288. #endif
  1289. }
  1290. static int udp_skb_truesize(struct sk_buff *skb)
  1291. {
  1292. return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
  1293. }
  1294. static bool udp_skb_has_head_state(struct sk_buff *skb)
  1295. {
  1296. return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
  1297. }
  1298. /* fully reclaim rmem/fwd memory allocated for skb */
  1299. static void udp_rmem_release(struct sock *sk, unsigned int size,
  1300. int partial, bool rx_queue_lock_held)
  1301. {
  1302. struct udp_sock *up = udp_sk(sk);
  1303. struct sk_buff_head *sk_queue;
  1304. unsigned int amt;
  1305. if (likely(partial)) {
  1306. up->forward_deficit += size;
  1307. size = up->forward_deficit;
  1308. if (size < READ_ONCE(up->forward_threshold) &&
  1309. !skb_queue_empty(&up->reader_queue))
  1310. return;
  1311. } else {
  1312. size += up->forward_deficit;
  1313. }
  1314. up->forward_deficit = 0;
1315. /* acquire the sk_receive_queue lock for fwd allocated memory scheduling,
1316. * if the caller does not hold it already
  1317. */
  1318. sk_queue = &sk->sk_receive_queue;
  1319. if (!rx_queue_lock_held)
  1320. spin_lock(&sk_queue->lock);
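/* Give back only whole pages of the released forward allocation to the
 * global memory accounting; the remainder that does not round down to
 * a page (plus the "partial" slack) stays cached in sk_forward_alloc
 * for future skbs.
 */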
  1321. amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
  1322. sk_forward_alloc_add(sk, size - amt);
  1323. if (amt)
  1324. __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
  1325. atomic_sub(size, &sk->sk_rmem_alloc);
  1326. /* this can save us from acquiring the rx queue lock on next receive */
  1327. skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
  1328. if (!rx_queue_lock_held)
  1329. spin_unlock(&sk_queue->lock);
  1330. }
  1331. /* Note: called with reader_queue.lock held.
1332. * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch.
1333. * This avoids a cache line miss while the receive_queue lock is held.
  1334. * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
  1335. */
  1336. void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
  1337. {
  1338. prefetch(&skb->data);
  1339. udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
  1340. }
  1341. EXPORT_SYMBOL(udp_skb_destructor);
  1342. /* as above, but the caller held the rx queue lock, too */
  1343. static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
  1344. {
  1345. prefetch(&skb->data);
  1346. udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
  1347. }
  1348. /* Idea of busylocks is to let producers grab an extra spinlock
  1349. * to relieve pressure on the receive_queue spinlock shared by consumer.
  1350. * Under flood, this means that only one producer can be in line
  1351. * trying to acquire the receive_queue spinlock.
1352. * These busylocks are allocated in a per-cpu manner, instead of a
1353. * per-socket one (which would consume a cache line per socket).
  1354. */
  1355. static int udp_busylocks_log __read_mostly;
  1356. static spinlock_t *udp_busylocks __read_mostly;
  1357. static spinlock_t *busylock_acquire(void *ptr)
  1358. {
  1359. spinlock_t *busy;
  1360. busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
  1361. spin_lock(busy);
  1362. return busy;
  1363. }
  1364. static void busylock_release(spinlock_t *busy)
  1365. {
  1366. if (busy)
  1367. spin_unlock(busy);
  1368. }
  1369. static int udp_rmem_schedule(struct sock *sk, int size)
  1370. {
  1371. int delta;
  1372. delta = size - sk->sk_forward_alloc;
  1373. if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV))
  1374. return -ENOBUFS;
  1375. return 0;
  1376. }
  1377. int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
  1378. {
  1379. struct sk_buff_head *list = &sk->sk_receive_queue;
  1380. unsigned int rmem, rcvbuf;
  1381. spinlock_t *busy = NULL;
  1382. int size, err = -ENOMEM;
  1383. rmem = atomic_read(&sk->sk_rmem_alloc);
  1384. rcvbuf = READ_ONCE(sk->sk_rcvbuf);
  1385. size = skb->truesize;
  1386. /* Immediately drop when the receive queue is full.
  1387. * Cast to unsigned int performs the boundary check for INT_MAX.
  1388. */
  1389. if (rmem + size > rcvbuf) {
  1390. if (rcvbuf > INT_MAX >> 1)
  1391. goto drop;
1392. /* Always allow at least one packet for a small buffer. */
  1393. if (rmem > rcvbuf)
  1394. goto drop;
  1395. }
1396. /* Under memory pressure, it helps udp_recvmsg() to be fed
1397. * linear skbs:
  1398. * - Reduce memory overhead and thus increase receive queue capacity
  1399. * - Less cache line misses at copyout() time
  1400. * - Less work at consume_skb() (less alien page frag freeing)
  1401. */
  1402. if (rmem > (rcvbuf >> 1)) {
  1403. skb_condense(skb);
  1404. size = skb->truesize;
  1405. busy = busylock_acquire(sk);
  1406. }
  1407. udp_set_dev_scratch(skb);
  1408. atomic_add(size, &sk->sk_rmem_alloc);
  1409. spin_lock(&list->lock);
  1410. err = udp_rmem_schedule(sk, size);
  1411. if (err) {
  1412. spin_unlock(&list->lock);
  1413. goto uncharge_drop;
  1414. }
  1415. sk_forward_alloc_add(sk, -size);
1416. /* no need to set up a destructor; we will explicitly release the
  1417. * forward allocated memory on dequeue
  1418. */
  1419. sock_skb_set_dropcount(sk, skb);
  1420. __skb_queue_tail(list, skb);
  1421. spin_unlock(&list->lock);
  1422. if (!sock_flag(sk, SOCK_DEAD))
  1423. INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
  1424. busylock_release(busy);
  1425. return 0;
  1426. uncharge_drop:
  1427. atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
  1428. drop:
  1429. atomic_inc(&sk->sk_drops);
  1430. busylock_release(busy);
  1431. return err;
  1432. }
  1433. EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
  1434. void udp_destruct_common(struct sock *sk)
  1435. {
1436. /* completely reclaim the forward allocated memory */
  1437. struct udp_sock *up = udp_sk(sk);
  1438. unsigned int total = 0;
  1439. struct sk_buff *skb;
  1440. skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue);
  1441. while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) {
  1442. total += skb->truesize;
  1443. kfree_skb(skb);
  1444. }
  1445. udp_rmem_release(sk, total, 0, true);
  1446. }
  1447. EXPORT_SYMBOL_GPL(udp_destruct_common);
  1448. static void udp_destruct_sock(struct sock *sk)
  1449. {
  1450. udp_destruct_common(sk);
  1451. inet_sock_destruct(sk);
  1452. }
  1453. int udp_init_sock(struct sock *sk)
  1454. {
  1455. udp_lib_init_sock(sk);
  1456. sk->sk_destruct = udp_destruct_sock;
  1457. set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
  1458. return 0;
  1459. }
  1460. void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
  1461. {
  1462. if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
  1463. sk_peek_offset_bwd(sk, len);
  1464. if (!skb_unref(skb))
  1465. return;
  1466. /* In the more common cases we cleared the head states previously,
  1467. * see __udp_queue_rcv_skb().
  1468. */
  1469. if (unlikely(udp_skb_has_head_state(skb)))
  1470. skb_release_head_state(skb);
  1471. __consume_stateless_skb(skb);
  1472. }
  1473. EXPORT_SYMBOL_GPL(skb_consume_udp);
  1474. static struct sk_buff *__first_packet_length(struct sock *sk,
  1475. struct sk_buff_head *rcvq,
  1476. unsigned int *total)
  1477. {
  1478. struct sk_buff *skb;
  1479. while ((skb = skb_peek(rcvq)) != NULL) {
  1480. if (udp_lib_checksum_complete(skb)) {
  1481. __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
  1482. IS_UDPLITE(sk));
  1483. __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
  1484. IS_UDPLITE(sk));
  1485. atomic_inc(&sk->sk_drops);
  1486. __skb_unlink(skb, rcvq);
  1487. *total += skb->truesize;
  1488. kfree_skb(skb);
  1489. } else {
  1490. udp_skb_csum_unnecessary_set(skb);
  1491. break;
  1492. }
  1493. }
  1494. return skb;
  1495. }
  1496. /**
  1497. * first_packet_length - return length of first packet in receive queue
  1498. * @sk: socket
  1499. *
1500. * Drops all frames with a bad checksum until a valid one is found.
1501. * Returns the length of the found skb, or -1 if none is found.
  1502. */
  1503. static int first_packet_length(struct sock *sk)
  1504. {
  1505. struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
  1506. struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
  1507. unsigned int total = 0;
  1508. struct sk_buff *skb;
  1509. int res;
  1510. spin_lock_bh(&rcvq->lock);
  1511. skb = __first_packet_length(sk, rcvq, &total);
  1512. if (!skb && !skb_queue_empty_lockless(sk_queue)) {
  1513. spin_lock(&sk_queue->lock);
  1514. skb_queue_splice_tail_init(sk_queue, rcvq);
  1515. spin_unlock(&sk_queue->lock);
  1516. skb = __first_packet_length(sk, rcvq, &total);
  1517. }
  1518. res = skb ? skb->len : -1;
  1519. if (total)
  1520. udp_rmem_release(sk, total, 1, false);
  1521. spin_unlock_bh(&rcvq->lock);
  1522. return res;
  1523. }
  1524. /*
  1525. * IOCTL requests applicable to the UDP protocol
  1526. */
  1527. int udp_ioctl(struct sock *sk, int cmd, int *karg)
  1528. {
  1529. switch (cmd) {
  1530. case SIOCOUTQ:
  1531. {
  1532. *karg = sk_wmem_alloc_get(sk);
  1533. return 0;
  1534. }
  1535. case SIOCINQ:
  1536. {
  1537. *karg = max_t(int, 0, first_packet_length(sk));
  1538. return 0;
  1539. }
  1540. default:
  1541. return -ENOIOCTLCMD;
  1542. }
  1543. return 0;
  1544. }
  1545. EXPORT_SYMBOL(udp_ioctl);
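/*
 * Illustrative userspace sketch (not part of this kernel source):
 * SIOCOUTQ and SIOCINQ, handled above, report the bytes still queued
 * in the send buffer and the size of the next pending datagram.  The
 * fd is assumed to be a UDP socket; error handling is omitted.
 *
 *	#include <linux/sockios.h>
 *	#include <sys/ioctl.h>
 *
 *	static int next_datagram_size(int fd)
 *	{
 *		int inq = 0;
 *
 *		ioctl(fd, SIOCINQ, &inq);	// 0 when nothing is queued
 *		return inq;
 *	}
 */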
  1546. struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
  1547. int *off, int *err)
  1548. {
  1549. struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
  1550. struct sk_buff_head *queue;
  1551. struct sk_buff *last;
  1552. long timeo;
  1553. int error;
  1554. queue = &udp_sk(sk)->reader_queue;
  1555. timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
  1556. do {
  1557. struct sk_buff *skb;
  1558. error = sock_error(sk);
  1559. if (error)
  1560. break;
  1561. error = -EAGAIN;
  1562. do {
  1563. spin_lock_bh(&queue->lock);
  1564. skb = __skb_try_recv_from_queue(sk, queue, flags, off,
  1565. err, &last);
  1566. if (skb) {
  1567. if (!(flags & MSG_PEEK))
  1568. udp_skb_destructor(sk, skb);
  1569. spin_unlock_bh(&queue->lock);
  1570. return skb;
  1571. }
  1572. if (skb_queue_empty_lockless(sk_queue)) {
  1573. spin_unlock_bh(&queue->lock);
  1574. goto busy_check;
  1575. }
1576. /* refill the reader queue and walk it again.
1577. * Keep both queues locked to avoid re-acquiring
  1578. * the sk_receive_queue lock if fwd memory scheduling
  1579. * is needed.
  1580. */
  1581. spin_lock(&sk_queue->lock);
  1582. skb_queue_splice_tail_init(sk_queue, queue);
  1583. skb = __skb_try_recv_from_queue(sk, queue, flags, off,
  1584. err, &last);
  1585. if (skb && !(flags & MSG_PEEK))
  1586. udp_skb_dtor_locked(sk, skb);
  1587. spin_unlock(&sk_queue->lock);
  1588. spin_unlock_bh(&queue->lock);
  1589. if (skb)
  1590. return skb;
  1591. busy_check:
  1592. if (!sk_can_busy_loop(sk))
  1593. break;
  1594. sk_busy_loop(sk, flags & MSG_DONTWAIT);
  1595. } while (!skb_queue_empty_lockless(sk_queue));
  1596. /* sk_queue is empty, reader_queue may contain peeked packets */
  1597. } while (timeo &&
  1598. !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
  1599. &error, &timeo,
  1600. (struct sk_buff *)sk_queue));
  1601. *err = error;
  1602. return NULL;
  1603. }
  1604. EXPORT_SYMBOL(__skb_recv_udp);
  1605. int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
  1606. {
  1607. struct sk_buff *skb;
  1608. int err;
  1609. try_again:
  1610. skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
  1611. if (!skb)
  1612. return err;
  1613. if (udp_lib_checksum_complete(skb)) {
  1614. int is_udplite = IS_UDPLITE(sk);
  1615. struct net *net = sock_net(sk);
  1616. __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
  1617. __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
  1618. atomic_inc(&sk->sk_drops);
  1619. kfree_skb(skb);
  1620. goto try_again;
  1621. }
  1622. WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
  1623. return recv_actor(sk, skb);
  1624. }
  1625. EXPORT_SYMBOL(udp_read_skb);
  1626. /*
1627. * This should be easy: if there is something there, we
1628. * return it; otherwise we block.
  1629. */
  1630. int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
  1631. int *addr_len)
  1632. {
  1633. struct inet_sock *inet = inet_sk(sk);
  1634. DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
  1635. struct sk_buff *skb;
  1636. unsigned int ulen, copied;
  1637. int off, err, peeking = flags & MSG_PEEK;
  1638. int is_udplite = IS_UDPLITE(sk);
  1639. bool checksum_valid = false;
  1640. if (flags & MSG_ERRQUEUE)
  1641. return ip_recv_error(sk, msg, len, addr_len);
  1642. try_again:
  1643. off = sk_peek_offset(sk, flags);
  1644. skb = __skb_recv_udp(sk, flags, &off, &err);
  1645. if (!skb)
  1646. return err;
  1647. ulen = udp_skb_len(skb);
  1648. copied = len;
  1649. if (copied > ulen - off)
  1650. copied = ulen - off;
  1651. else if (copied < ulen)
  1652. msg->msg_flags |= MSG_TRUNC;
  1653. /*
  1654. * If checksum is needed at all, try to do it while copying the
  1655. * data. If the data is truncated, or if we only want a partial
  1656. * coverage checksum (UDP-Lite), do it before the copy.
  1657. */
  1658. if (copied < ulen || peeking ||
  1659. (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
  1660. checksum_valid = udp_skb_csum_unnecessary(skb) ||
  1661. !__udp_lib_checksum_complete(skb);
  1662. if (!checksum_valid)
  1663. goto csum_copy_err;
  1664. }
  1665. if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
  1666. if (udp_skb_is_linear(skb))
  1667. err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
  1668. else
  1669. err = skb_copy_datagram_msg(skb, off, msg, copied);
  1670. } else {
  1671. err = skb_copy_and_csum_datagram_msg(skb, off, msg);
  1672. if (err == -EINVAL)
  1673. goto csum_copy_err;
  1674. }
  1675. if (unlikely(err)) {
  1676. if (!peeking) {
  1677. atomic_inc(&sk->sk_drops);
  1678. UDP_INC_STATS(sock_net(sk),
  1679. UDP_MIB_INERRORS, is_udplite);
  1680. }
  1681. kfree_skb(skb);
  1682. return err;
  1683. }
  1684. if (!peeking)
  1685. UDP_INC_STATS(sock_net(sk),
  1686. UDP_MIB_INDATAGRAMS, is_udplite);
  1687. sock_recv_cmsgs(msg, sk, skb);
  1688. /* Copy the address. */
  1689. if (sin) {
  1690. sin->sin_family = AF_INET;
  1691. sin->sin_port = udp_hdr(skb)->source;
  1692. sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
  1693. memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
  1694. *addr_len = sizeof(*sin);
  1695. BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
  1696. (struct sockaddr *)sin,
  1697. addr_len);
  1698. }
  1699. if (udp_test_bit(GRO_ENABLED, sk))
  1700. udp_cmsg_recv(msg, sk, skb);
  1701. if (inet_cmsg_flags(inet))
  1702. ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
  1703. err = copied;
  1704. if (flags & MSG_TRUNC)
  1705. err = ulen;
  1706. skb_consume_udp(sk, skb, peeking ? -err : err);
  1707. return err;
  1708. csum_copy_err:
  1709. if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
  1710. udp_skb_destructor)) {
  1711. UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
  1712. UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
  1713. }
  1714. kfree_skb(skb);
  1715. /* starting over for a new packet, but check if we need to yield */
  1716. cond_resched();
  1717. msg->msg_flags &= ~MSG_TRUNC;
  1718. goto try_again;
  1719. }
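/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * MSG_TRUNC handling above means recv() with MSG_PEEK | MSG_TRUNC
 * reports the full datagram length even into a tiny buffer, so an
 * application can size its buffer before the real read.  Error
 * handling is omitted.
 *
 *	#include <stdlib.h>
 *	#include <sys/socket.h>
 *
 *	static void *read_whole_datagram(int fd, ssize_t *out_len)
 *	{
 *		char probe;
 *		ssize_t full = recv(fd, &probe, 1, MSG_PEEK | MSG_TRUNC);
 *		size_t size = full > 0 ? (size_t)full : 1;
 *		void *buf = malloc(size);
 *
 *		*out_len = recv(fd, buf, size, 0);
 *		return buf;
 *	}
 */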
  1720. int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  1721. {
  1722. /* This check is replicated from __ip4_datagram_connect() and
1723. * intended to prevent the BPF program called below from accessing bytes
1724. * that are outside the bound specified by the user in addr_len.
  1725. */
  1726. if (addr_len < sizeof(struct sockaddr_in))
  1727. return -EINVAL;
  1728. return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
  1729. }
  1730. EXPORT_SYMBOL(udp_pre_connect);
  1731. int __udp_disconnect(struct sock *sk, int flags)
  1732. {
  1733. struct inet_sock *inet = inet_sk(sk);
  1734. /*
  1735. * 1003.1g - break association.
  1736. */
  1737. sk->sk_state = TCP_CLOSE;
  1738. inet->inet_daddr = 0;
  1739. inet->inet_dport = 0;
  1740. sock_rps_reset_rxhash(sk);
  1741. sk->sk_bound_dev_if = 0;
  1742. if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
  1743. inet_reset_saddr(sk);
  1744. if (sk->sk_prot->rehash &&
  1745. (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
  1746. sk->sk_prot->rehash(sk);
  1747. }
  1748. if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
  1749. sk->sk_prot->unhash(sk);
  1750. inet->inet_sport = 0;
  1751. }
  1752. sk_dst_reset(sk);
  1753. return 0;
  1754. }
  1755. EXPORT_SYMBOL(__udp_disconnect);
  1756. int udp_disconnect(struct sock *sk, int flags)
  1757. {
  1758. lock_sock(sk);
  1759. __udp_disconnect(sk, flags);
  1760. release_sock(sk);
  1761. return 0;
  1762. }
  1763. EXPORT_SYMBOL(udp_disconnect);
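/*
 * Illustrative userspace sketch (not part of this kernel source): per
 * the 1003.1g note above, an application breaks a UDP association by
 * connecting to an address whose family is AF_UNSPEC, which reaches
 * __udp_disconnect() through the datagram connect path.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int udp_dissolve(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */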
  1764. void udp_lib_unhash(struct sock *sk)
  1765. {
  1766. if (sk_hashed(sk)) {
  1767. struct udp_table *udptable = udp_get_table_prot(sk);
  1768. struct udp_hslot *hslot, *hslot2;
  1769. hslot = udp_hashslot(udptable, sock_net(sk),
  1770. udp_sk(sk)->udp_port_hash);
  1771. hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
  1772. spin_lock_bh(&hslot->lock);
  1773. if (rcu_access_pointer(sk->sk_reuseport_cb))
  1774. reuseport_detach_sock(sk);
  1775. if (sk_del_node_init_rcu(sk)) {
  1776. hslot->count--;
  1777. inet_sk(sk)->inet_num = 0;
  1778. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  1779. spin_lock(&hslot2->lock);
  1780. hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
  1781. hslot2->count--;
  1782. spin_unlock(&hslot2->lock);
  1783. }
  1784. spin_unlock_bh(&hslot->lock);
  1785. }
  1786. }
  1787. EXPORT_SYMBOL(udp_lib_unhash);
  1788. /*
1789. * inet_rcv_saddr was changed; we must rehash the secondary hash
  1790. */
  1791. void udp_lib_rehash(struct sock *sk, u16 newhash)
  1792. {
  1793. if (sk_hashed(sk)) {
  1794. struct udp_table *udptable = udp_get_table_prot(sk);
  1795. struct udp_hslot *hslot, *hslot2, *nhslot2;
  1796. hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
  1797. nhslot2 = udp_hashslot2(udptable, newhash);
  1798. udp_sk(sk)->udp_portaddr_hash = newhash;
  1799. if (hslot2 != nhslot2 ||
  1800. rcu_access_pointer(sk->sk_reuseport_cb)) {
  1801. hslot = udp_hashslot(udptable, sock_net(sk),
  1802. udp_sk(sk)->udp_port_hash);
  1803. /* we must lock primary chain too */
  1804. spin_lock_bh(&hslot->lock);
  1805. if (rcu_access_pointer(sk->sk_reuseport_cb))
  1806. reuseport_detach_sock(sk);
  1807. if (hslot2 != nhslot2) {
  1808. spin_lock(&hslot2->lock);
  1809. hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
  1810. hslot2->count--;
  1811. spin_unlock(&hslot2->lock);
  1812. spin_lock(&nhslot2->lock);
  1813. hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
  1814. &nhslot2->head);
  1815. nhslot2->count++;
  1816. spin_unlock(&nhslot2->lock);
  1817. }
  1818. spin_unlock_bh(&hslot->lock);
  1819. }
  1820. }
  1821. }
  1822. EXPORT_SYMBOL(udp_lib_rehash);
  1823. void udp_v4_rehash(struct sock *sk)
  1824. {
  1825. u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
  1826. inet_sk(sk)->inet_rcv_saddr,
  1827. inet_sk(sk)->inet_num);
  1828. udp_lib_rehash(sk, new_hash);
  1829. }
  1830. static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  1831. {
  1832. int rc;
  1833. if (inet_sk(sk)->inet_daddr) {
  1834. sock_rps_save_rxhash(sk, skb);
  1835. sk_mark_napi_id(sk, skb);
  1836. sk_incoming_cpu_update(sk);
  1837. } else {
  1838. sk_mark_napi_id_once(sk, skb);
  1839. }
  1840. rc = __udp_enqueue_schedule_skb(sk, skb);
  1841. if (rc < 0) {
  1842. int is_udplite = IS_UDPLITE(sk);
  1843. int drop_reason;
  1844. /* Note that an ENOMEM error is charged twice */
  1845. if (rc == -ENOMEM) {
  1846. UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
  1847. is_udplite);
  1848. drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
  1849. } else {
  1850. UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
  1851. is_udplite);
  1852. drop_reason = SKB_DROP_REASON_PROTO_MEM;
  1853. }
  1854. UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
  1855. trace_udp_fail_queue_rcv_skb(rc, sk, skb);
  1856. sk_skb_reason_drop(sk, skb, drop_reason);
  1857. return -1;
  1858. }
  1859. return 0;
  1860. }
  1861. /* returns:
  1862. * -1: error
  1863. * 0: success
  1864. * >0: "udp encap" protocol resubmission
  1865. *
  1866. * Note that in the success and error cases, the skb is assumed to
  1867. * have either been requeued or freed.
  1868. */
  1869. static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
  1870. {
  1871. int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
  1872. struct udp_sock *up = udp_sk(sk);
  1873. int is_udplite = IS_UDPLITE(sk);
  1874. /*
  1875. * Charge it to the socket, dropping if the queue is full.
  1876. */
  1877. if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
  1878. drop_reason = SKB_DROP_REASON_XFRM_POLICY;
  1879. goto drop;
  1880. }
  1881. nf_reset_ct(skb);
  1882. if (static_branch_unlikely(&udp_encap_needed_key) &&
  1883. READ_ONCE(up->encap_type)) {
  1884. int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
  1885. /*
  1886. * This is an encapsulation socket so pass the skb to
  1887. * the socket's udp_encap_rcv() hook. Otherwise, just
  1888. * fall through and pass this up the UDP socket.
  1889. * up->encap_rcv() returns the following value:
  1890. * =0 if skb was successfully passed to the encap
  1891. * handler or was discarded by it.
  1892. * >0 if skb should be passed on to UDP.
  1893. * <0 if skb should be resubmitted as proto -N
  1894. */
  1895. /* if we're overly short, let UDP handle it */
  1896. encap_rcv = READ_ONCE(up->encap_rcv);
  1897. if (encap_rcv) {
  1898. int ret;
  1899. /* Verify checksum before giving to encap */
  1900. if (udp_lib_checksum_complete(skb))
  1901. goto csum_error;
  1902. ret = encap_rcv(sk, skb);
  1903. if (ret <= 0) {
  1904. __UDP_INC_STATS(sock_net(sk),
  1905. UDP_MIB_INDATAGRAMS,
  1906. is_udplite);
  1907. return -ret;
  1908. }
  1909. }
  1910. /* FALLTHROUGH -- it's a UDP Packet */
  1911. }
  1912. /*
  1913. * UDP-Lite specific tests, ignored on UDP sockets
  1914. */
  1915. if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
  1916. u16 pcrlen = READ_ONCE(up->pcrlen);
  1917. /*
  1918. * MIB statistics other than incrementing the error count are
  1919. * disabled for the following two types of errors: these depend
  1920. * on the application settings, not on the functioning of the
  1921. * protocol stack as such.
  1922. *
  1923. * RFC 3828 here recommends (sec 3.3): "There should also be a
  1924. * way ... to ... at least let the receiving application block
  1925. * delivery of packets with coverage values less than a value
  1926. * provided by the application."
  1927. */
  1928. if (pcrlen == 0) { /* full coverage was set */
  1929. net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
  1930. UDP_SKB_CB(skb)->cscov, skb->len);
  1931. goto drop;
  1932. }
  1933. /* The next case involves violating the min. coverage requested
1934. * by the receiver. This is subtle: if the receiver wants x and x is
1935. * greater than the buffer size/MTU, then the receiver will complain
1936. * that it wants x while the sender emits packets of smaller size y.
  1937. * Therefore the above ...()->partial_cov statement is essential.
  1938. */
  1939. if (UDP_SKB_CB(skb)->cscov < pcrlen) {
  1940. net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
  1941. UDP_SKB_CB(skb)->cscov, pcrlen);
  1942. goto drop;
  1943. }
  1944. }
  1945. prefetch(&sk->sk_rmem_alloc);
  1946. if (rcu_access_pointer(sk->sk_filter) &&
  1947. udp_lib_checksum_complete(skb))
  1948. goto csum_error;
  1949. if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
  1950. drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
  1951. goto drop;
  1952. }
  1953. udp_csum_pull_header(skb);
  1954. ipv4_pktinfo_prepare(sk, skb, true);
  1955. return __udp_queue_rcv_skb(sk, skb);
  1956. csum_error:
  1957. drop_reason = SKB_DROP_REASON_UDP_CSUM;
  1958. __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
  1959. drop:
  1960. __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
  1961. atomic_inc(&sk->sk_drops);
  1962. sk_skb_reason_drop(sk, skb, drop_reason);
  1963. return -1;
  1964. }
  1965. static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  1966. {
  1967. struct sk_buff *next, *segs;
  1968. int ret;
  1969. if (likely(!udp_unexpected_gso(sk, skb)))
  1970. return udp_queue_rcv_one_skb(sk, skb);
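/* A GSO/GRO aggregate reached a socket that cannot accept it as-is:
 * segment it in software and feed each resulting skb to the regular
 * per-packet path.  A positive return from that path asks for an
 * encapsulation protocol resubmission, handled right below.
 */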
  1971. BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
  1972. __skb_push(skb, -skb_mac_offset(skb));
  1973. segs = udp_rcv_segment(sk, skb, true);
  1974. skb_list_walk_safe(segs, skb, next) {
  1975. __skb_pull(skb, skb_transport_offset(skb));
  1976. udp_post_segment_fix_csum(skb);
  1977. ret = udp_queue_rcv_one_skb(sk, skb);
  1978. if (ret > 0)
  1979. ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
  1980. }
  1981. return 0;
  1982. }
1983. /* For TCP sockets, sk_rx_dst is protected by the socket lock.
  1984. * For UDP, we use xchg() to guard against concurrent changes.
  1985. */
  1986. bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
  1987. {
  1988. struct dst_entry *old;
  1989. if (dst_hold_safe(dst)) {
  1990. old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst)));
  1991. dst_release(old);
  1992. return old != dst;
  1993. }
  1994. return false;
  1995. }
  1996. EXPORT_SYMBOL(udp_sk_rx_dst_set);
  1997. /*
  1998. * Multicasts and broadcasts go to each listener.
  1999. *
  2000. * Note: called only from the BH handler context.
  2001. */
  2002. static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
  2003. struct udphdr *uh,
  2004. __be32 saddr, __be32 daddr,
  2005. struct udp_table *udptable,
  2006. int proto)
  2007. {
  2008. struct sock *sk, *first = NULL;
  2009. unsigned short hnum = ntohs(uh->dest);
  2010. struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
  2011. unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
  2012. unsigned int offset = offsetof(typeof(*sk), sk_node);
  2013. int dif = skb->dev->ifindex;
  2014. int sdif = inet_sdif(skb);
  2015. struct hlist_node *node;
  2016. struct sk_buff *nskb;
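/* When the port-only chain is long (more than 10 entries), switch to
 * the secondary hash keyed on (address, port): scan the exact-address
 * bucket first and then, if it differs, the INADDR_ANY bucket via the
 * start_lookup label below.
 */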
  2017. if (use_hash2) {
  2018. hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
  2019. udptable->mask;
  2020. hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
  2021. start_lookup:
  2022. hslot = &udptable->hash2[hash2];
  2023. offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
  2024. }
  2025. sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
  2026. if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
  2027. uh->source, saddr, dif, sdif, hnum))
  2028. continue;
  2029. if (!first) {
  2030. first = sk;
  2031. continue;
  2032. }
  2033. nskb = skb_clone(skb, GFP_ATOMIC);
  2034. if (unlikely(!nskb)) {
  2035. atomic_inc(&sk->sk_drops);
  2036. __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
  2037. IS_UDPLITE(sk));
  2038. __UDP_INC_STATS(net, UDP_MIB_INERRORS,
  2039. IS_UDPLITE(sk));
  2040. continue;
  2041. }
  2042. if (udp_queue_rcv_skb(sk, nskb) > 0)
  2043. consume_skb(nskb);
  2044. }
  2045. /* Also lookup *:port if we are using hash2 and haven't done so yet. */
  2046. if (use_hash2 && hash2 != hash2_any) {
  2047. hash2 = hash2_any;
  2048. goto start_lookup;
  2049. }
  2050. if (first) {
  2051. if (udp_queue_rcv_skb(first, skb) > 0)
  2052. consume_skb(skb);
  2053. } else {
  2054. kfree_skb(skb);
  2055. __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
  2056. proto == IPPROTO_UDPLITE);
  2057. }
  2058. return 0;
  2059. }
2060. /* Initialize the UDP checksum. If this returns zero (success),
2061. * CHECKSUM_UNNECESSARY means that no more checks are required.
2062. * Otherwise, csum completion requires checksumming the packet body,
2063. * including the UDP header, and folding the result into skb->csum.
  2064. */
  2065. static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
  2066. int proto)
  2067. {
  2068. int err;
  2069. UDP_SKB_CB(skb)->partial_cov = 0;
  2070. UDP_SKB_CB(skb)->cscov = skb->len;
  2071. if (proto == IPPROTO_UDPLITE) {
  2072. err = udplite_checksum_init(skb, uh);
  2073. if (err)
  2074. return err;
  2075. if (UDP_SKB_CB(skb)->partial_cov) {
  2076. skb->csum = inet_compute_pseudo(skb, proto);
  2077. return 0;
  2078. }
  2079. }
  2080. /* Note, we are only interested in != 0 or == 0, thus the
  2081. * force to int.
  2082. */
  2083. err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
  2084. inet_compute_pseudo);
  2085. if (err)
  2086. return err;
  2087. if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
  2088. /* If SW calculated the value, we know it's bad */
  2089. if (skb->csum_complete_sw)
  2090. return 1;
  2091. /* HW says the value is bad. Let's validate that.
  2092. * skb->csum is no longer the full packet checksum,
  2093. * so don't treat it as such.
  2094. */
  2095. skb_checksum_complete_unset(skb);
  2096. }
  2097. return 0;
  2098. }
2099. /* wrapper for udp_queue_rcv_skb() taking care of csum conversion and
2100. * return code conversion for IP layer consumption
  2101. */
  2102. static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
  2103. struct udphdr *uh)
  2104. {
  2105. int ret;
  2106. if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
  2107. skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
  2108. ret = udp_queue_rcv_skb(sk, skb);
  2109. /* a return value > 0 means to resubmit the input, but
2110. * the IP layer wants the return value to be -protocol or 0
  2111. */
  2112. if (ret > 0)
  2113. return -ret;
  2114. return 0;
  2115. }
  2116. /*
  2117. * All we need to do is get the socket, and then do a checksum.
  2118. */
  2119. int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
  2120. int proto)
  2121. {
  2122. struct sock *sk = NULL;
  2123. struct udphdr *uh;
  2124. unsigned short ulen;
  2125. struct rtable *rt = skb_rtable(skb);
  2126. __be32 saddr, daddr;
  2127. struct net *net = dev_net(skb->dev);
  2128. bool refcounted;
  2129. int drop_reason;
  2130. drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
  2131. /*
  2132. * Validate the packet.
  2133. */
  2134. if (!pskb_may_pull(skb, sizeof(struct udphdr)))
  2135. goto drop; /* No space for header. */
  2136. uh = udp_hdr(skb);
  2137. ulen = ntohs(uh->len);
  2138. saddr = ip_hdr(skb)->saddr;
  2139. daddr = ip_hdr(skb)->daddr;
  2140. if (ulen > skb->len)
  2141. goto short_packet;
  2142. if (proto == IPPROTO_UDP) {
  2143. /* UDP validates ulen. */
  2144. if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
  2145. goto short_packet;
  2146. uh = udp_hdr(skb);
  2147. }
  2148. if (udp4_csum_init(skb, uh, proto))
  2149. goto csum_error;
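/* A socket may already be attached to the skb, e.g. by early demux or
 * a BPF sk_assign program; inet_steal_sock() validates and returns it
 * so the lookup below can be skipped.
 */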
  2150. sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
  2151. &refcounted, udp_ehashfn);
  2152. if (IS_ERR(sk))
  2153. goto no_sk;
  2154. if (sk) {
  2155. struct dst_entry *dst = skb_dst(skb);
  2156. int ret;
  2157. if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
  2158. udp_sk_rx_dst_set(sk, dst);
  2159. ret = udp_unicast_rcv_skb(sk, skb, uh);
  2160. if (refcounted)
  2161. sock_put(sk);
  2162. return ret;
  2163. }
  2164. if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
  2165. return __udp4_lib_mcast_deliver(net, skb, uh,
  2166. saddr, daddr, udptable, proto);
  2167. sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
  2168. if (sk)
  2169. return udp_unicast_rcv_skb(sk, skb, uh);
  2170. no_sk:
  2171. if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
  2172. goto drop;
  2173. nf_reset_ct(skb);
2174. /* No socket. Drop the packet silently if the checksum is wrong. */
  2175. if (udp_lib_checksum_complete(skb))
  2176. goto csum_error;
  2177. drop_reason = SKB_DROP_REASON_NO_SOCKET;
  2178. __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
  2179. icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
  2180. /*
2181. * Hmm. We got a UDP packet to a port on which we
2182. * don't want to listen. Ignore it.
  2183. */
  2184. sk_skb_reason_drop(sk, skb, drop_reason);
  2185. return 0;
  2186. short_packet:
  2187. drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
  2188. net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
  2189. proto == IPPROTO_UDPLITE ? "Lite" : "",
  2190. &saddr, ntohs(uh->source),
  2191. ulen, skb->len,
  2192. &daddr, ntohs(uh->dest));
  2193. goto drop;
  2194. csum_error:
  2195. /*
  2196. * RFC1122: OK. Discards the bad packet silently (as far as
  2197. * the network is concerned, anyway) as per 4.1.3.4 (MUST).
  2198. */
  2199. drop_reason = SKB_DROP_REASON_UDP_CSUM;
  2200. net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
  2201. proto == IPPROTO_UDPLITE ? "Lite" : "",
  2202. &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
  2203. ulen);
  2204. __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
  2205. drop:
  2206. __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
  2207. sk_skb_reason_drop(sk, skb, drop_reason);
  2208. return 0;
  2209. }
  2210. /* We can only early demux multicast if there is a single matching socket.
2211. * If more than one socket is found, return NULL.
  2212. */
  2213. static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
  2214. __be16 loc_port, __be32 loc_addr,
  2215. __be16 rmt_port, __be32 rmt_addr,
  2216. int dif, int sdif)
  2217. {
  2218. struct udp_table *udptable = net->ipv4.udp_table;
  2219. unsigned short hnum = ntohs(loc_port);
  2220. struct sock *sk, *result;
  2221. struct udp_hslot *hslot;
  2222. unsigned int slot;
  2223. slot = udp_hashfn(net, hnum, udptable->mask);
  2224. hslot = &udptable->hash[slot];
2225. /* Do not bother scanning an overly long list */
  2226. if (hslot->count > 10)
  2227. return NULL;
  2228. result = NULL;
  2229. sk_for_each_rcu(sk, &hslot->head) {
  2230. if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
  2231. rmt_port, rmt_addr, dif, sdif, hnum)) {
  2232. if (result)
  2233. return NULL;
  2234. result = sk;
  2235. }
  2236. }
  2237. return result;
  2238. }
  2239. /* For unicast we should only early demux connected sockets or we can
2240. * break forwarding setups. The chains here can be long, so only check
2241. * whether the first socket is an exact match and, if not, move on.
  2242. */
  2243. static struct sock *__udp4_lib_demux_lookup(struct net *net,
  2244. __be16 loc_port, __be32 loc_addr,
  2245. __be16 rmt_port, __be32 rmt_addr,
  2246. int dif, int sdif)
  2247. {
  2248. struct udp_table *udptable = net->ipv4.udp_table;
  2249. INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
  2250. unsigned short hnum = ntohs(loc_port);
  2251. unsigned int hash2, slot2;
  2252. struct udp_hslot *hslot2;
  2253. __portpair ports;
  2254. struct sock *sk;
  2255. hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
  2256. slot2 = hash2 & udptable->mask;
  2257. hslot2 = &udptable->hash2[slot2];
  2258. ports = INET_COMBINED_PORTS(rmt_port, hnum);
  2259. udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
  2260. if (inet_match(net, sk, acookie, ports, dif, sdif))
  2261. return sk;
  2262. /* Only check first socket in chain */
  2263. break;
  2264. }
  2265. return NULL;
  2266. }
  2267. int udp_v4_early_demux(struct sk_buff *skb)
  2268. {
  2269. struct net *net = dev_net(skb->dev);
  2270. struct in_device *in_dev = NULL;
  2271. const struct iphdr *iph;
  2272. const struct udphdr *uh;
  2273. struct sock *sk = NULL;
  2274. struct dst_entry *dst;
  2275. int dif = skb->dev->ifindex;
  2276. int sdif = inet_sdif(skb);
  2277. int ours;
  2278. /* validate the packet */
  2279. if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
  2280. return 0;
  2281. iph = ip_hdr(skb);
  2282. uh = udp_hdr(skb);
  2283. if (skb->pkt_type == PACKET_MULTICAST) {
  2284. in_dev = __in_dev_get_rcu(skb->dev);
  2285. if (!in_dev)
  2286. return 0;
  2287. ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
  2288. iph->protocol);
  2289. if (!ours)
  2290. return 0;
  2291. sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
  2292. uh->source, iph->saddr,
  2293. dif, sdif);
  2294. } else if (skb->pkt_type == PACKET_HOST) {
  2295. sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
  2296. uh->source, iph->saddr, dif, sdif);
  2297. }
  2298. if (!sk)
  2299. return 0;
  2300. skb->sk = sk;
  2301. DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
  2302. skb->destructor = sock_pfree;
  2303. dst = rcu_dereference(sk->sk_rx_dst);
  2304. if (dst)
  2305. dst = dst_check(dst, 0);
  2306. if (dst) {
  2307. u32 itag = 0;
2308. /* Set noref for now.
2309. * Any place that wants to hold the dst has to call
2310. * dst_hold_safe().
  2311. */
  2312. skb_dst_set_noref(skb, dst);
  2313. /* for unconnected multicast sockets we need to validate
  2314. * the source on each packet
  2315. */
  2316. if (!inet_sk(sk)->inet_daddr && in_dev)
  2317. return ip_mc_validate_source(skb, iph->daddr,
  2318. iph->saddr,
  2319. iph->tos & INET_DSCP_MASK,
  2320. skb->dev, in_dev, &itag);
  2321. }
  2322. return 0;
  2323. }
  2324. int udp_rcv(struct sk_buff *skb)
  2325. {
  2326. return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
  2327. }
  2328. void udp_destroy_sock(struct sock *sk)
  2329. {
  2330. struct udp_sock *up = udp_sk(sk);
  2331. bool slow = lock_sock_fast(sk);
  2332. /* protects from races with udp_abort() */
  2333. sock_set_flag(sk, SOCK_DEAD);
  2334. udp_flush_pending_frames(sk);
  2335. unlock_sock_fast(sk, slow);
  2336. if (static_branch_unlikely(&udp_encap_needed_key)) {
  2337. if (up->encap_type) {
  2338. void (*encap_destroy)(struct sock *sk);
  2339. encap_destroy = READ_ONCE(up->encap_destroy);
  2340. if (encap_destroy)
  2341. encap_destroy(sk);
  2342. }
  2343. if (udp_test_bit(ENCAP_ENABLED, sk))
  2344. static_branch_dec(&udp_encap_needed_key);
  2345. }
  2346. }
  2347. static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family,
  2348. struct sock *sk)
  2349. {
  2350. #ifdef CONFIG_XFRM
  2351. if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) {
  2352. if (family == AF_INET)
  2353. WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv);
  2354. else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
  2355. WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv);
  2356. }
  2357. #endif
  2358. }
  2359. /*
  2360. * Socket option code for UDP
  2361. */
  2362. int udp_lib_setsockopt(struct sock *sk, int level, int optname,
  2363. sockptr_t optval, unsigned int optlen,
  2364. int (*push_pending_frames)(struct sock *))
  2365. {
  2366. struct udp_sock *up = udp_sk(sk);
  2367. int val, valbool;
  2368. int err = 0;
  2369. int is_udplite = IS_UDPLITE(sk);
  2370. if (level == SOL_SOCKET) {
  2371. err = sk_setsockopt(sk, level, optname, optval, optlen);
  2372. if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
  2373. sockopt_lock_sock(sk);
  2374. /* paired with READ_ONCE in udp_rmem_release() */
  2375. WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
  2376. sockopt_release_sock(sk);
  2377. }
  2378. return err;
  2379. }
  2380. if (optlen < sizeof(int))
  2381. return -EINVAL;
  2382. if (copy_from_sockptr(&val, optval, sizeof(val)))
  2383. return -EFAULT;
  2384. valbool = val ? 1 : 0;
  2385. switch (optname) {
  2386. case UDP_CORK:
  2387. if (val != 0) {
  2388. udp_set_bit(CORK, sk);
  2389. } else {
  2390. udp_clear_bit(CORK, sk);
  2391. lock_sock(sk);
  2392. push_pending_frames(sk);
  2393. release_sock(sk);
  2394. }
  2395. break;
  2396. case UDP_ENCAP:
  2397. switch (val) {
  2398. case 0:
  2399. #ifdef CONFIG_XFRM
  2400. case UDP_ENCAP_ESPINUDP:
  2401. set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk);
  2402. #if IS_ENABLED(CONFIG_IPV6)
  2403. if (sk->sk_family == AF_INET6)
  2404. WRITE_ONCE(up->encap_rcv,
  2405. ipv6_stub->xfrm6_udp_encap_rcv);
  2406. else
  2407. #endif
  2408. WRITE_ONCE(up->encap_rcv,
  2409. xfrm4_udp_encap_rcv);
  2410. #endif
  2411. fallthrough;
  2412. case UDP_ENCAP_L2TPINUDP:
  2413. WRITE_ONCE(up->encap_type, val);
  2414. udp_tunnel_encap_enable(sk);
  2415. break;
  2416. default:
  2417. err = -ENOPROTOOPT;
  2418. break;
  2419. }
  2420. break;
  2421. case UDP_NO_CHECK6_TX:
  2422. udp_set_no_check6_tx(sk, valbool);
  2423. break;
  2424. case UDP_NO_CHECK6_RX:
  2425. udp_set_no_check6_rx(sk, valbool);
  2426. break;
  2427. case UDP_SEGMENT:
  2428. if (val < 0 || val > USHRT_MAX)
  2429. return -EINVAL;
  2430. WRITE_ONCE(up->gso_size, val);
  2431. break;
  2432. case UDP_GRO:
  2433. /* when enabling GRO, accept the related GSO packet type */
  2434. if (valbool)
  2435. udp_tunnel_encap_enable(sk);
  2436. udp_assign_bit(GRO_ENABLED, sk, valbool);
  2437. udp_assign_bit(ACCEPT_L4, sk, valbool);
  2438. set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
  2439. break;
  2440. /*
  2441. * UDP-Lite's partial checksum coverage (RFC 3828).
  2442. */
2443. /* The sender sets the actual checksum coverage length via this option.
2444. * The case coverage > packet length is handled by the send module. */
  2445. case UDPLITE_SEND_CSCOV:
  2446. if (!is_udplite) /* Disable the option on UDP sockets */
  2447. return -ENOPROTOOPT;
  2448. if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
  2449. val = 8;
  2450. else if (val > USHRT_MAX)
  2451. val = USHRT_MAX;
  2452. WRITE_ONCE(up->pcslen, val);
  2453. udp_set_bit(UDPLITE_SEND_CC, sk);
  2454. break;
  2455. /* The receiver specifies a minimum checksum coverage value. To make
  2456. * sense, this should be set to at least 8 (as done below). If zero is
  2457. * used, this again means full checksum coverage. */
  2458. case UDPLITE_RECV_CSCOV:
  2459. if (!is_udplite) /* Disable the option on UDP sockets */
  2460. return -ENOPROTOOPT;
  2461. if (val != 0 && val < 8) /* Avoid silly minimal values. */
  2462. val = 8;
  2463. else if (val > USHRT_MAX)
  2464. val = USHRT_MAX;
  2465. WRITE_ONCE(up->pcrlen, val);
  2466. udp_set_bit(UDPLITE_RECV_CC, sk);
  2467. break;
  2468. default:
  2469. err = -ENOPROTOOPT;
  2470. break;
  2471. }
  2472. return err;
  2473. }
  2474. EXPORT_SYMBOL(udp_lib_setsockopt);
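/*
 * Illustrative userspace sketch (not part of this kernel source): the
 * UDP_SEGMENT and UDP_GRO cases above are plain SOL_UDP options; the
 * first takes the per-segment payload size, the second a boolean.
 * IPPROTO_UDP equals SOL_UDP, the defines come from <netinet/udp.h>
 * (or <linux/udp.h> on older libcs), and error handling is omitted.
 *
 *	#include <netinet/in.h>
 *	#include <netinet/udp.h>
 *	#include <sys/socket.h>
 *
 *	static void enable_udp_gso_gro(int fd)
 *	{
 *		int gso_size = 1400;	// payload bytes per emitted segment
 *		int on = 1;
 *
 *		setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size));
 *		setsockopt(fd, IPPROTO_UDP, UDP_GRO, &on, sizeof(on));
 *	}
 */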
  2475. int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
  2476. unsigned int optlen)
  2477. {
  2478. if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
  2479. return udp_lib_setsockopt(sk, level, optname,
  2480. optval, optlen,
  2481. udp_push_pending_frames);
  2482. return ip_setsockopt(sk, level, optname, optval, optlen);
  2483. }
  2484. int udp_lib_getsockopt(struct sock *sk, int level, int optname,
  2485. char __user *optval, int __user *optlen)
  2486. {
  2487. struct udp_sock *up = udp_sk(sk);
  2488. int val, len;
  2489. if (get_user(len, optlen))
  2490. return -EFAULT;
  2491. if (len < 0)
  2492. return -EINVAL;
  2493. len = min_t(unsigned int, len, sizeof(int));
  2494. switch (optname) {
  2495. case UDP_CORK:
  2496. val = udp_test_bit(CORK, sk);
  2497. break;
  2498. case UDP_ENCAP:
  2499. val = READ_ONCE(up->encap_type);
  2500. break;
  2501. case UDP_NO_CHECK6_TX:
  2502. val = udp_get_no_check6_tx(sk);
  2503. break;
  2504. case UDP_NO_CHECK6_RX:
  2505. val = udp_get_no_check6_rx(sk);
  2506. break;
  2507. case UDP_SEGMENT:
  2508. val = READ_ONCE(up->gso_size);
  2509. break;
  2510. case UDP_GRO:
  2511. val = udp_test_bit(GRO_ENABLED, sk);
  2512. break;
2513. /* The following two cannot be changed on UDP sockets; the return is
  2514. * always 0 (which corresponds to the full checksum coverage of UDP). */
  2515. case UDPLITE_SEND_CSCOV:
  2516. val = READ_ONCE(up->pcslen);
  2517. break;
  2518. case UDPLITE_RECV_CSCOV:
  2519. val = READ_ONCE(up->pcrlen);
  2520. break;
  2521. default:
  2522. return -ENOPROTOOPT;
  2523. }
  2524. if (put_user(len, optlen))
  2525. return -EFAULT;
  2526. if (copy_to_user(optval, &val, len))
  2527. return -EFAULT;
  2528. return 0;
  2529. }
  2530. EXPORT_SYMBOL(udp_lib_getsockopt);
  2531. int udp_getsockopt(struct sock *sk, int level, int optname,
  2532. char __user *optval, int __user *optlen)
  2533. {
  2534. if (level == SOL_UDP || level == SOL_UDPLITE)
  2535. return udp_lib_getsockopt(sk, level, optname, optval, optlen);
  2536. return ip_getsockopt(sk, level, optname, optval, optlen);
  2537. }
  2538. /**
  2539. * udp_poll - wait for a UDP event.
  2540. * @file: - file struct
  2541. * @sock: - socket
  2542. * @wait: - poll table
  2543. *
2544. * This is the same as datagram poll, except for the special case of
2545. * blocking sockets. If an application is using a blocking fd
2546. * and a packet with a checksum error is in the queue,
2547. * it could get a return from select() indicating data is available,
2548. * but then block when reading it. Add special-case code
  2549. * to work around these arguably broken applications.
  2550. */
  2551. __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
  2552. {
  2553. __poll_t mask = datagram_poll(file, sock, wait);
  2554. struct sock *sk = sock->sk;
  2555. if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
  2556. mask |= EPOLLIN | EPOLLRDNORM;
  2557. /* Check for false positives due to checksum errors */
  2558. if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
  2559. !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
  2560. mask &= ~(EPOLLIN | EPOLLRDNORM);
  2561. /* psock ingress_msg queue should not contain any bad checksum frames */
  2562. if (sk_is_readable(sk))
  2563. mask |= EPOLLIN | EPOLLRDNORM;
  2564. return mask;
  2565. }
  2566. EXPORT_SYMBOL(udp_poll);
  2567. int udp_abort(struct sock *sk, int err)
  2568. {
  2569. if (!has_current_bpf_ctx())
  2570. lock_sock(sk);
  2571. /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
  2572. * with close()
  2573. */
  2574. if (sock_flag(sk, SOCK_DEAD))
  2575. goto out;
  2576. sk->sk_err = err;
  2577. sk_error_report(sk);
  2578. __udp_disconnect(sk, 0);
  2579. out:
  2580. if (!has_current_bpf_ctx())
  2581. release_sock(sk);
  2582. return 0;
  2583. }
  2584. EXPORT_SYMBOL_GPL(udp_abort);
  2585. struct proto udp_prot = {
  2586. .name = "UDP",
  2587. .owner = THIS_MODULE,
  2588. .close = udp_lib_close,
  2589. .pre_connect = udp_pre_connect,
  2590. .connect = ip4_datagram_connect,
  2591. .disconnect = udp_disconnect,
  2592. .ioctl = udp_ioctl,
  2593. .init = udp_init_sock,
  2594. .destroy = udp_destroy_sock,
  2595. .setsockopt = udp_setsockopt,
  2596. .getsockopt = udp_getsockopt,
  2597. .sendmsg = udp_sendmsg,
  2598. .recvmsg = udp_recvmsg,
  2599. .splice_eof = udp_splice_eof,
  2600. .release_cb = ip4_datagram_release_cb,
  2601. .hash = udp_lib_hash,
  2602. .unhash = udp_lib_unhash,
  2603. .rehash = udp_v4_rehash,
  2604. .get_port = udp_v4_get_port,
  2605. .put_port = udp_lib_unhash,
  2606. #ifdef CONFIG_BPF_SYSCALL
  2607. .psock_update_sk_prot = udp_bpf_update_proto,
  2608. #endif
  2609. .memory_allocated = &udp_memory_allocated,
  2610. .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
  2611. .sysctl_mem = sysctl_udp_mem,
  2612. .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
  2613. .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
  2614. .obj_size = sizeof(struct udp_sock),
  2615. .h.udp_table = NULL,
  2616. .diag_destroy = udp_abort,
  2617. };
  2618. EXPORT_SYMBOL(udp_prot);
  2619. /* ------------------------------------------------------------------------ */
  2620. #ifdef CONFIG_PROC_FS
  2621. static unsigned short seq_file_family(const struct seq_file *seq);
  2622. static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
  2623. {
  2624. unsigned short family = seq_file_family(seq);
  2625. /* AF_UNSPEC is used as a match all */
  2626. return ((family == AF_UNSPEC || family == sk->sk_family) &&
  2627. net_eq(sock_net(sk), seq_file_net(seq)));
  2628. }
  2629. #ifdef CONFIG_BPF_SYSCALL
  2630. static const struct seq_operations bpf_iter_udp_seq_ops;
  2631. #endif
  2632. static struct udp_table *udp_get_table_seq(struct seq_file *seq,
  2633. struct net *net)
  2634. {
  2635. const struct udp_seq_afinfo *afinfo;
  2636. #ifdef CONFIG_BPF_SYSCALL
  2637. if (seq->op == &bpf_iter_udp_seq_ops)
  2638. return net->ipv4.udp_table;
  2639. #endif
  2640. afinfo = pde_data(file_inode(seq->file));
  2641. return afinfo->udp_table ? : net->ipv4.udp_table;
  2642. }
  2643. static struct sock *udp_get_first(struct seq_file *seq, int start)
  2644. {
  2645. struct udp_iter_state *state = seq->private;
  2646. struct net *net = seq_file_net(seq);
  2647. struct udp_table *udptable;
  2648. struct sock *sk;
  2649. udptable = udp_get_table_seq(seq, net);
  2650. for (state->bucket = start; state->bucket <= udptable->mask;
  2651. ++state->bucket) {
  2652. struct udp_hslot *hslot = &udptable->hash[state->bucket];
  2653. if (hlist_empty(&hslot->head))
  2654. continue;
  2655. spin_lock_bh(&hslot->lock);
  2656. sk_for_each(sk, &hslot->head) {
  2657. if (seq_sk_match(seq, sk))
  2658. goto found;
  2659. }
  2660. spin_unlock_bh(&hslot->lock);
  2661. }
  2662. sk = NULL;
  2663. found:
  2664. return sk;
  2665. }
  2666. static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
  2667. {
  2668. struct udp_iter_state *state = seq->private;
  2669. struct net *net = seq_file_net(seq);
  2670. struct udp_table *udptable;
  2671. do {
  2672. sk = sk_next(sk);
  2673. } while (sk && !seq_sk_match(seq, sk));
  2674. if (!sk) {
  2675. udptable = udp_get_table_seq(seq, net);
  2676. if (state->bucket <= udptable->mask)
  2677. spin_unlock_bh(&udptable->hash[state->bucket].lock);
  2678. return udp_get_first(seq, state->bucket + 1);
  2679. }
  2680. return sk;
  2681. }
  2682. static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
  2683. {
  2684. struct sock *sk = udp_get_first(seq, 0);
  2685. if (sk)
  2686. while (pos && (sk = udp_get_next(seq, sk)) != NULL)
  2687. --pos;
  2688. return pos ? NULL : sk;
  2689. }
  2690. void *udp_seq_start(struct seq_file *seq, loff_t *pos)
  2691. {
  2692. struct udp_iter_state *state = seq->private;
  2693. state->bucket = MAX_UDP_PORTS;
  2694. return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
  2695. }
  2696. EXPORT_SYMBOL(udp_seq_start);
  2697. void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  2698. {
  2699. struct sock *sk;
  2700. if (v == SEQ_START_TOKEN)
  2701. sk = udp_get_idx(seq, 0);
  2702. else
  2703. sk = udp_get_next(seq, v);
  2704. ++*pos;
  2705. return sk;
  2706. }
  2707. EXPORT_SYMBOL(udp_seq_next);
  2708. void udp_seq_stop(struct seq_file *seq, void *v)
  2709. {
  2710. struct udp_iter_state *state = seq->private;
  2711. struct udp_table *udptable;
  2712. udptable = udp_get_table_seq(seq, seq_file_net(seq));
  2713. if (state->bucket <= udptable->mask)
  2714. spin_unlock_bh(&udptable->hash[state->bucket].lock);
  2715. }
  2716. EXPORT_SYMBOL(udp_seq_stop);
  2717. /* ------------------------------------------------------------------------ */
  2718. static void udp4_format_sock(struct sock *sp, struct seq_file *f,
  2719. int bucket)
  2720. {
  2721. struct inet_sock *inet = inet_sk(sp);
  2722. __be32 dest = inet->inet_daddr;
  2723. __be32 src = inet->inet_rcv_saddr;
  2724. __u16 destp = ntohs(inet->inet_dport);
  2725. __u16 srcp = ntohs(inet->inet_sport);
  2726. seq_printf(f, "%5d: %08X:%04X %08X:%04X"
  2727. " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
  2728. bucket, src, srcp, dest, destp, sp->sk_state,
  2729. sk_wmem_alloc_get(sp),
  2730. udp_rqueue_get(sp),
  2731. 0, 0L, 0,
  2732. from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
  2733. 0, sock_i_ino(sp),
  2734. refcount_read(&sp->sk_refcnt), sp,
  2735. atomic_read(&sp->sk_drops));
  2736. }
  2737. int udp4_seq_show(struct seq_file *seq, void *v)
  2738. {
  2739. seq_setwidth(seq, 127);
  2740. if (v == SEQ_START_TOKEN)
  2741. seq_puts(seq, " sl local_address rem_address st tx_queue "
  2742. "rx_queue tr tm->when retrnsmt uid timeout "
  2743. "inode ref pointer drops");
  2744. else {
  2745. struct udp_iter_state *state = seq->private;
  2746. udp4_format_sock(v, seq, state->bucket);
  2747. }
  2748. seq_pad(seq, '\n');
  2749. return 0;
  2750. }
  2751. #ifdef CONFIG_BPF_SYSCALL
  2752. struct bpf_iter__udp {
  2753. __bpf_md_ptr(struct bpf_iter_meta *, meta);
  2754. __bpf_md_ptr(struct udp_sock *, udp_sk);
  2755. uid_t uid __aligned(8);
  2756. int bucket __aligned(8);
  2757. };
  2758. struct bpf_udp_iter_state {
  2759. struct udp_iter_state state;
  2760. unsigned int cur_sk;
  2761. unsigned int end_sk;
  2762. unsigned int max_sk;
  2763. int offset;
  2764. struct sock **batch;
  2765. bool st_bucket_done;
  2766. };
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
				      unsigned int new_batch_sz);

static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
{
	struct bpf_udp_iter_state *iter = seq->private;
	struct udp_iter_state *state = &iter->state;
	struct net *net = seq_file_net(seq);
	int resume_bucket, resume_offset;
	struct udp_table *udptable;
	unsigned int batch_sks = 0;
	bool resized = false;
	struct sock *sk;

	resume_bucket = state->bucket;
	resume_offset = iter->offset;

	/* The current batch is done, so advance the bucket. */
	if (iter->st_bucket_done)
		state->bucket++;

	udptable = udp_get_table_seq(seq, net);

again:
	/* New batch for the next bucket.
	 * Iterate over the hash table to find a bucket with sockets matching
	 * the iterator attributes, and return the first matching socket from
	 * the bucket. The remaining matched sockets from the bucket are batched
	 * before releasing the bucket lock. This allows BPF programs that are
	 * called in seq_show to acquire the bucket lock if needed.
	 */
	iter->cur_sk = 0;
	iter->end_sk = 0;
	iter->st_bucket_done = false;
	batch_sks = 0;

	for (; state->bucket <= udptable->mask; state->bucket++) {
		struct udp_hslot *hslot2 = &udptable->hash2[state->bucket];

		if (hlist_empty(&hslot2->head))
			continue;

		iter->offset = 0;
		spin_lock_bh(&hslot2->lock);
		udp_portaddr_for_each_entry(sk, &hslot2->head) {
			if (seq_sk_match(seq, sk)) {
				/* Resume from the last iterated socket at the
				 * offset in the bucket before the iterator was
				 * stopped.
				 */
				if (state->bucket == resume_bucket &&
				    iter->offset < resume_offset) {
					++iter->offset;
					continue;
				}
				if (iter->end_sk < iter->max_sk) {
					sock_hold(sk);
					iter->batch[iter->end_sk++] = sk;
				}
				batch_sks++;
			}
		}
		spin_unlock_bh(&hslot2->lock);

		if (iter->end_sk)
			break;
	}

	/* All done: no batch made. */
	if (!iter->end_sk)
		return NULL;

	if (iter->end_sk == batch_sks) {
		/* Batching is done for the current bucket; return the first
		 * socket to be iterated from the batch.
		 */
		iter->st_bucket_done = true;
		goto done;
	}
	if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
		resized = true;
		/* After allocating a larger batch, retry one more time to grab
		 * the whole bucket.
		 */
		goto again;
	}
done:
	return iter->batch[0];
}
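
/* Every socket placed in iter->batch[] above carries a reference taken with
 * sock_hold(); bpf_iter_udp_seq_next() and bpf_iter_udp_put_batch() below
 * drop those references with sock_put() once the socket has been shown or
 * the iterator is stopped early.
 */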
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_udp_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so unref the iter->cur_sk.
	 */
	if (iter->cur_sk < iter->end_sk) {
		sock_put(iter->batch[iter->cur_sk++]);
		++iter->offset;
	}

	/* After updating iter->cur_sk, check if there are more sockets
	 * available in the current bucket batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		/* Prepare a new batch. */
		sk = bpf_iter_udp_batch(seq);

	++*pos;
	return sk;
}

static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	if (*pos)
		return bpf_iter_udp_batch(seq);

	return SEQ_START_TOKEN;
}

static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			     struct udp_sock *udp_sk, uid_t uid, int bucket)
{
	struct bpf_iter__udp ctx;

	meta->seq_num--;	/* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.udp_sk = udp_sk;
	ctx.uid = uid;
	ctx.bucket = bucket;
	return bpf_iter_run_prog(prog, &ctx);
}

static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
{
	struct udp_iter_state *state = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	lock_sock(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);

unlock:
	release_sock(sk);
	return ret;
}

static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}

static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_udp_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
	}

	if (iter->cur_sk < iter->end_sk) {
		bpf_iter_udp_put_batch(iter);
		iter->st_bucket_done = false;
	}
}
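
/* When ->stop() is reached with v == NULL the iteration ran to completion,
 * so the BPF program gets one final call with a NULL socket (uid and bucket
 * forced to 0), which gives it a chance to emit end-of-dump output.  Any
 * sockets still sitting in the batch are released, and st_bucket_done is
 * cleared so the next ->start() resumes in the same bucket at the saved
 * offset.
 */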
static const struct seq_operations bpf_iter_udp_seq_ops = {
	.start		= bpf_iter_udp_seq_start,
	.next		= bpf_iter_udp_seq_next,
	.stop		= bpf_iter_udp_seq_stop,
	.show		= bpf_iter_udp_seq_show,
};
#endif

static unsigned short seq_file_family(const struct seq_file *seq)
{
	const struct udp_seq_afinfo *afinfo;

#ifdef CONFIG_BPF_SYSCALL
	/* BPF iterator: bpf programs to filter sockets. */
	if (seq->op == &bpf_iter_udp_seq_ops)
		return AF_UNSPEC;
#endif

	/* Proc fs iterator */
	afinfo = pde_data(file_inode(seq->file));
	return afinfo->family;
}

const struct seq_operations udp_seq_ops = {
	.start		= udp_seq_start,
	.next		= udp_seq_next,
	.stop		= udp_seq_stop,
	.show		= udp4_seq_show,
};
EXPORT_SYMBOL(udp_seq_ops);

static struct udp_seq_afinfo udp4_seq_afinfo = {
	.family		= AF_INET,
	.udp_table	= NULL,
};

static int __net_init udp4_proc_init_net(struct net *net)
{
	if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
			sizeof(struct udp_iter_state), &udp4_seq_afinfo))
		return -ENOMEM;
	return 0;
}

static void __net_exit udp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("udp", net->proc_net);
}

static struct pernet_operations udp4_net_ops = {
	.init = udp4_proc_init_net,
	.exit = udp4_proc_exit_net,
};

int __init udp4_proc_init(void)
{
	return register_pernet_subsys(&udp4_net_ops);
}

void udp4_proc_exit(void)
{
	unregister_pernet_subsys(&udp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

static __initdata unsigned long uhash_entries;
static int __init set_uhash_entries(char *str)
{
	ssize_t ret;

	if (!str)
		return 0;

	ret = kstrtoul(str, 0, &uhash_entries);
	if (ret)
		return 0;

	if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
		uhash_entries = UDP_HTABLE_SIZE_MIN;
	return 1;
}
__setup("uhash_entries=", set_uhash_entries);
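
/* "uhash_entries" is an early boot parameter.  For example, booting with
 * uhash_entries=65536 (an illustrative value, not a recommendation) requests
 * that many hash slots, with non-zero values below UDP_HTABLE_SIZE_MIN
 * silently raised to that minimum.  Leaving it at 0 lets
 * alloc_large_system_hash() size the table from available memory in
 * udp_table_init() below.
 */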
void __init udp_table_init(struct udp_table *table, const char *name)
{
	unsigned int i;

	table->hash = alloc_large_system_hash(name,
					      2 * sizeof(struct udp_hslot),
					      uhash_entries,
					      21, /* one slot per 2 MB */
					      0,
					      &table->log,
					      &table->mask,
					      UDP_HTABLE_SIZE_MIN,
					      UDP_HTABLE_SIZE_MAX);

	table->hash2 = table->hash + (table->mask + 1);
	for (i = 0; i <= table->mask; i++) {
		INIT_HLIST_HEAD(&table->hash[i].head);
		table->hash[i].count = 0;
		spin_lock_init(&table->hash[i].lock);
	}
	for (i = 0; i <= table->mask; i++) {
		INIT_HLIST_HEAD(&table->hash2[i].head);
		table->hash2[i].count = 0;
		spin_lock_init(&table->hash2[i].lock);
	}
}
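
/* Both hash arrays live in the single allocation made above: the bucket size
 * is passed as 2 * sizeof(struct udp_hslot), and hash2 simply starts
 * (mask + 1) slots past hash.  The per-netns tables created by
 * udp_pernet_table_alloc() below use the same doubled layout.
 */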
u32 udp_flow_hashrnd(void)
{
	static u32 hashrnd __read_mostly;

	net_get_random_once(&hashrnd, sizeof(hashrnd));

	return hashrnd;
}
EXPORT_SYMBOL(udp_flow_hashrnd);

static void __net_init udp_sysctl_init(struct net *net)
{
	net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
	net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;

#ifdef CONFIG_NET_L3_MASTER_DEV
	net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
}

static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
{
	struct udp_table *udptable;
	int i;

	udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
	if (!udptable)
		goto out;

	udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot),
				      GFP_KERNEL_ACCOUNT);
	if (!udptable->hash)
		goto free_table;

	udptable->hash2 = udptable->hash + hash_entries;
	udptable->mask = hash_entries - 1;
	udptable->log = ilog2(hash_entries);

	for (i = 0; i < hash_entries; i++) {
		INIT_HLIST_HEAD(&udptable->hash[i].head);
		udptable->hash[i].count = 0;
		spin_lock_init(&udptable->hash[i].lock);

		INIT_HLIST_HEAD(&udptable->hash2[i].head);
		udptable->hash2[i].count = 0;
		spin_lock_init(&udptable->hash2[i].lock);
	}

	return udptable;

free_table:
	kfree(udptable);
out:
	return NULL;
}

static void __net_exit udp_pernet_table_free(struct net *net)
{
	struct udp_table *udptable = net->ipv4.udp_table;

	if (udptable == &udp_table)
		return;

	kvfree(udptable->hash);
	kfree(udptable);
}
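
/* A namespace that fell back to the global udp_table must not free it, hence
 * the early return above.  For a private table only udptable->hash is passed
 * to kvfree(); hash2 points into the same vmalloc_huge() block and has no
 * allocation of its own.
 */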
static void __net_init udp_set_table(struct net *net)
{
	struct udp_table *udptable;
	unsigned int hash_entries;
	struct net *old_net;

	if (net_eq(net, &init_net))
		goto fallback;

	old_net = current->nsproxy->net_ns;
	hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
	if (!hash_entries)
		goto fallback;

	/* Set min to keep the bitmap on stack in udp_lib_get_port() */
	if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
		hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
	else
		hash_entries = roundup_pow_of_two(hash_entries);

	udptable = udp_pernet_table_alloc(hash_entries);
	if (udptable) {
		net->ipv4.udp_table = udptable;
	} else {
		pr_warn("Failed to allocate UDP hash table (entries: %u) "
			"for a netns, fallback to the global one\n",
			hash_entries);
fallback:
		net->ipv4.udp_table = &udp_table;
	}
}
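
/* The size request is read from the creating namespace's
 * sysctl_udp_child_hash_entries (current->nsproxy->net_ns), not from the new
 * netns itself: the parent decides how large a table its children get.  A
 * value of 0 means the child keeps sharing the global udp_table; non-zero
 * values are rounded up to a power of two.
 */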
static int __net_init udp_pernet_init(struct net *net)
{
	udp_sysctl_init(net);
	udp_set_table(net);

	return 0;
}

static void __net_exit udp_pernet_exit(struct net *net)
{
	udp_pernet_table_free(net);
}

static struct pernet_operations __net_initdata udp_sysctl_ops = {
	.init	= udp_pernet_init,
	.exit	= udp_pernet_exit,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
		     struct udp_sock *udp_sk, uid_t uid, int bucket)

static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
				      unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
				   GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_udp_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}
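
/* The batch array starts at INIT_BATCH_SZ entries and, when a bucket holds
 * more matching sockets than fit, bpf_iter_udp_batch() retries once with a
 * batch resized to 3/2 of the bucket's matching-socket count.  Note that the
 * realloc above drops the references on any sockets still held in the old
 * batch before freeing it.
 */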
#define INIT_BATCH_SZ 16

static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_udp_iter_state *iter = priv_data;
	int ret;

	ret = bpf_iter_init_seq_net(priv_data, aux);
	if (ret)
		return ret;

	ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
	if (ret)
		bpf_iter_fini_seq_net(priv_data);

	return ret;
}

static void bpf_iter_fini_udp(void *priv_data)
{
	struct bpf_udp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}

static const struct bpf_iter_seq_info udp_seq_info = {
	.seq_ops		= &bpf_iter_udp_seq_ops,
	.init_seq_private	= bpf_iter_init_udp,
	.fini_seq_private	= bpf_iter_fini_udp,
	.seq_priv_size		= sizeof(struct bpf_udp_iter_state),
};

static struct bpf_iter_reg udp_reg_info = {
	.target			= "udp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__udp, udp_sk),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.seq_info		= &udp_seq_info,
};

static void __init bpf_iter_register(void)
{
	udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
	if (bpf_iter_reg_target(&udp_reg_info))
		pr_warn("Warning: could not register bpf iterator udp\n");
}
#endif
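
/* Boot-time initialization: udp_init() sizes the global hash table, derives
 * the sysctl_udp_mem pressure thresholds from the number of free buffer
 * pages, allocates the busylock spinlock array (16 locks per possible CPU),
 * and registers the per-netns init/exit hooks; allocation or registration
 * failures here are fatal.
 */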
void __init udp_init(void)
{
	unsigned long limit;
	unsigned int i;

	udp_table_init(&udp_table, "UDP");
	limit = nr_free_buffer_pages() / 8;
	limit = max(limit, 128UL);
	sysctl_udp_mem[0] = limit / 4 * 3;
	sysctl_udp_mem[1] = limit;
	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;

	/* 16 spinlocks per cpu */
	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;

	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
				GFP_KERNEL);
	if (!udp_busylocks)
		panic("UDP: failed to alloc udp_busylocks\n");

	for (i = 0; i < (1U << udp_busylocks_log); i++)
		spin_lock_init(udp_busylocks + i);

	if (register_pernet_subsys(&udp_sysctl_ops))
		panic("UDP: failed to init sysctl parameters.\n");

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif
}