route.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * INET An implementation of the TCP/IP protocol suite for the LINUX
  4. * operating system. INET is implemented using the BSD Socket
  5. * interface as the means of communication with the user level.
  6. *
  7. * ROUTE - implementation of the IP router.
  8. *
  9. * Authors: Ross Biro
  10. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11. * Alan Cox, <gw4pts@gw4pts.ampr.org>
  12. * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13. * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14. *
  15. * Fixes:
  16. * Alan Cox : Verify area fixes.
  17. * Alan Cox : cli() protects routing changes
  18. * Rui Oliveira : ICMP routing table updates
  19. * (rco@di.uminho.pt) Routing table insertion and update
  20. * Linus Torvalds : Rewrote bits to be sensible
  21. * Alan Cox : Added BSD route gw semantics
  22. * Alan Cox : Super /proc >4K
  23. * Alan Cox : MTU in route table
  24. * Alan Cox : MSS actually. Also added the window
  25. * clamper.
  26. * Sam Lantinga : Fixed route matching in rt_del()
  27. * Alan Cox : Routing cache support.
  28. * Alan Cox : Removed compatibility cruft.
  29. * Alan Cox : RTF_REJECT support.
  30. * Alan Cox : TCP irtt support.
  31. * Jonathan Naylor : Added Metric support.
  32. * Miquel van Smoorenburg : BSD API fixes.
  33. * Miquel van Smoorenburg : Metrics.
  34. * Alan Cox : Use __u32 properly
  35. * Alan Cox : Aligned routing errors more closely with BSD
  36. * our system is still very different.
  37. * Alan Cox : Faster /proc handling
  38. * Alexey Kuznetsov : Massive rework to support tree based routing,
  39. * routing caches and better behaviour.
  40. *
  41. * Olaf Erb : irtt wasn't being copied right.
  42. * Bjorn Ekwall : Kerneld route support.
  43. * Alan Cox : Multicast fixed (I hope)
  44. * Pavel Krauz : Limited broadcast fixed
  45. * Mike McLagan : Routing by source
  46. * Alexey Kuznetsov : End of old history. Split to fib.c and
  47. * route.c and rewritten from scratch.
  48. * Andi Kleen : Load-limit warning messages.
  49. * Vitaly E. Lavrov : Transparent proxy revived after year coma.
  50. * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
  51. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
  52. * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
  53. * Marc Boucher : routing by fwmark
  54. * Robert Olsson : Added rt_cache statistics
  55. * Arnaldo C. Melo : Convert proc stuff to seq_file
  56. * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
  57. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
  58. * Ilia Sotnikov : Removed TOS from hash calculations
  59. */
  60. #define pr_fmt(fmt) "IPv4: " fmt
  61. #include <linux/module.h>
  62. #include <linux/bitops.h>
  63. #include <linux/kernel.h>
  64. #include <linux/mm.h>
  65. #include <linux/memblock.h>
  66. #include <linux/socket.h>
  67. #include <linux/errno.h>
  68. #include <linux/in.h>
  69. #include <linux/inet.h>
  70. #include <linux/netdevice.h>
  71. #include <linux/proc_fs.h>
  72. #include <linux/init.h>
  73. #include <linux/skbuff.h>
  74. #include <linux/inetdevice.h>
  75. #include <linux/igmp.h>
  76. #include <linux/pkt_sched.h>
  77. #include <linux/mroute.h>
  78. #include <linux/netfilter_ipv4.h>
  79. #include <linux/random.h>
  80. #include <linux/rcupdate.h>
  81. #include <linux/slab.h>
  82. #include <linux/jhash.h>
  83. #include <net/dst.h>
  84. #include <net/dst_metadata.h>
  85. #include <net/inet_dscp.h>
  86. #include <net/net_namespace.h>
  87. #include <net/ip.h>
  88. #include <net/route.h>
  89. #include <net/inetpeer.h>
  90. #include <net/sock.h>
  91. #include <net/ip_fib.h>
  92. #include <net/nexthop.h>
  93. #include <net/tcp.h>
  94. #include <net/icmp.h>
  95. #include <net/xfrm.h>
  96. #include <net/lwtunnel.h>
  97. #include <net/netevent.h>
  98. #include <net/rtnetlink.h>
  99. #ifdef CONFIG_SYSCTL
  100. #include <linux/sysctl.h>
  101. #endif
  102. #include <net/secure_seq.h>
  103. #include <net/ip_tunnels.h>
  104. #include "fib_lookup.h"
  105. #define RT_GC_TIMEOUT (300*HZ)
  106. #define DEFAULT_MIN_PMTU (512 + 20 + 20)
  107. #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
  108. #define DEFAULT_MIN_ADVMSS 256
  109. static int ip_rt_max_size;
  110. static int ip_rt_redirect_number __read_mostly = 9;
  111. static int ip_rt_redirect_load __read_mostly = HZ / 50;
  112. static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
  113. static int ip_rt_error_cost __read_mostly = HZ;
  114. static int ip_rt_error_burst __read_mostly = 5 * HZ;
  115. static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
  116. /*
  117. * Interface to generic destination cache.
  118. */
  119. INDIRECT_CALLABLE_SCOPE
  120. struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
  121. static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
  122. INDIRECT_CALLABLE_SCOPE
  123. unsigned int ipv4_mtu(const struct dst_entry *dst);
  124. static void ipv4_negative_advice(struct sock *sk,
  125. struct dst_entry *dst);
  126. static void ipv4_link_failure(struct sk_buff *skb);
  127. static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
  128. struct sk_buff *skb, u32 mtu,
  129. bool confirm_neigh);
  130. static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
  131. struct sk_buff *skb);
  132. static void ipv4_dst_destroy(struct dst_entry *dst);
  133. static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
  134. {
  135. WARN_ON(1);
  136. return NULL;
  137. }
  138. static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
  139. struct sk_buff *skb,
  140. const void *daddr);
  141. static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
  142. static struct dst_ops ipv4_dst_ops = {
  143. .family = AF_INET,
  144. .check = ipv4_dst_check,
  145. .default_advmss = ipv4_default_advmss,
  146. .mtu = ipv4_mtu,
  147. .cow_metrics = ipv4_cow_metrics,
  148. .destroy = ipv4_dst_destroy,
  149. .negative_advice = ipv4_negative_advice,
  150. .link_failure = ipv4_link_failure,
  151. .update_pmtu = ip_rt_update_pmtu,
  152. .redirect = ip_do_redirect,
  153. .local_out = __ip_local_out,
  154. .neigh_lookup = ipv4_neigh_lookup,
  155. .confirm_neigh = ipv4_confirm_neigh,
  156. };
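/* ip_tos2prio[] maps the TOS bits of the IPv4 header to a traffic-control
 * priority band (TC_PRIO_*); rt_tos2priority() in <net/route.h> indexes it,
 * with ECN_OR_COST() covering the odd entries.
 */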
  157. #define ECN_OR_COST(class) TC_PRIO_##class
  158. const __u8 ip_tos2prio[16] = {
  159. TC_PRIO_BESTEFFORT,
  160. ECN_OR_COST(BESTEFFORT),
  161. TC_PRIO_BESTEFFORT,
  162. ECN_OR_COST(BESTEFFORT),
  163. TC_PRIO_BULK,
  164. ECN_OR_COST(BULK),
  165. TC_PRIO_BULK,
  166. ECN_OR_COST(BULK),
  167. TC_PRIO_INTERACTIVE,
  168. ECN_OR_COST(INTERACTIVE),
  169. TC_PRIO_INTERACTIVE,
  170. ECN_OR_COST(INTERACTIVE),
  171. TC_PRIO_INTERACTIVE_BULK,
  172. ECN_OR_COST(INTERACTIVE_BULK),
  173. TC_PRIO_INTERACTIVE_BULK,
  174. ECN_OR_COST(INTERACTIVE_BULK)
  175. };
  176. EXPORT_SYMBOL(ip_tos2prio);
  177. static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
  178. #ifndef CONFIG_PREEMPT_RT
  179. #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
  180. #else
  181. #define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
  182. #endif
  183. #ifdef CONFIG_PROC_FS
  184. static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
  185. {
  186. if (*pos)
  187. return NULL;
  188. return SEQ_START_TOKEN;
  189. }
  190. static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  191. {
  192. ++*pos;
  193. return NULL;
  194. }
  195. static void rt_cache_seq_stop(struct seq_file *seq, void *v)
  196. {
  197. }
  198. static int rt_cache_seq_show(struct seq_file *seq, void *v)
  199. {
  200. if (v == SEQ_START_TOKEN)
  201. seq_printf(seq, "%-127s\n",
  202. "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
  203. "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
  204. "HHUptod\tSpecDst");
  205. return 0;
  206. }
  207. static const struct seq_operations rt_cache_seq_ops = {
  208. .start = rt_cache_seq_start,
  209. .next = rt_cache_seq_next,
  210. .stop = rt_cache_seq_stop,
  211. .show = rt_cache_seq_show,
  212. };
  213. static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
  214. {
  215. int cpu;
  216. if (*pos == 0)
  217. return SEQ_START_TOKEN;
  218. for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
  219. if (!cpu_possible(cpu))
  220. continue;
  221. *pos = cpu+1;
  222. return &per_cpu(rt_cache_stat, cpu);
  223. }
  224. return NULL;
  225. }
  226. static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  227. {
  228. int cpu;
  229. for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
  230. if (!cpu_possible(cpu))
  231. continue;
  232. *pos = cpu+1;
  233. return &per_cpu(rt_cache_stat, cpu);
  234. }
  235. (*pos)++;
  236. return NULL;
  237. }
  238. static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
  239. {
  240. }
  241. static int rt_cpu_seq_show(struct seq_file *seq, void *v)
  242. {
  243. struct rt_cache_stat *st = v;
  244. if (v == SEQ_START_TOKEN) {
  245. seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
  246. return 0;
  247. }
  248. seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
  249. "%08x %08x %08x %08x %08x %08x "
  250. "%08x %08x %08x %08x\n",
  251. dst_entries_get_slow(&ipv4_dst_ops),
  252. 0, /* st->in_hit */
  253. st->in_slow_tot,
  254. st->in_slow_mc,
  255. st->in_no_route,
  256. st->in_brd,
  257. st->in_martian_dst,
  258. st->in_martian_src,
  259. 0, /* st->out_hit */
  260. st->out_slow_tot,
  261. st->out_slow_mc,
  262. 0, /* st->gc_total */
  263. 0, /* st->gc_ignored */
  264. 0, /* st->gc_goal_miss */
  265. 0, /* st->gc_dst_overflow */
  266. 0, /* st->in_hlist_search */
  267. 0 /* st->out_hlist_search */
  268. );
  269. return 0;
  270. }
  271. static const struct seq_operations rt_cpu_seq_ops = {
  272. .start = rt_cpu_seq_start,
  273. .next = rt_cpu_seq_next,
  274. .stop = rt_cpu_seq_stop,
  275. .show = rt_cpu_seq_show,
  276. };
  277. #ifdef CONFIG_IP_ROUTE_CLASSID
  278. static int rt_acct_proc_show(struct seq_file *m, void *v)
  279. {
  280. struct ip_rt_acct *dst, *src;
  281. unsigned int i, j;
  282. dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
  283. if (!dst)
  284. return -ENOMEM;
  285. for_each_possible_cpu(i) {
  286. src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
  287. for (j = 0; j < 256; j++) {
  288. dst[j].o_bytes += src[j].o_bytes;
  289. dst[j].o_packets += src[j].o_packets;
  290. dst[j].i_bytes += src[j].i_bytes;
  291. dst[j].i_packets += src[j].i_packets;
  292. }
  293. }
  294. seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
  295. kfree(dst);
  296. return 0;
  297. }
  298. #endif
  299. static int __net_init ip_rt_do_proc_init(struct net *net)
  300. {
  301. struct proc_dir_entry *pde;
  302. pde = proc_create_seq("rt_cache", 0444, net->proc_net,
  303. &rt_cache_seq_ops);
  304. if (!pde)
  305. goto err1;
  306. pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
  307. &rt_cpu_seq_ops);
  308. if (!pde)
  309. goto err2;
  310. #ifdef CONFIG_IP_ROUTE_CLASSID
  311. pde = proc_create_single("rt_acct", 0, net->proc_net,
  312. rt_acct_proc_show);
  313. if (!pde)
  314. goto err3;
  315. #endif
  316. return 0;
  317. #ifdef CONFIG_IP_ROUTE_CLASSID
  318. err3:
  319. remove_proc_entry("rt_cache", net->proc_net_stat);
  320. #endif
  321. err2:
  322. remove_proc_entry("rt_cache", net->proc_net);
  323. err1:
  324. return -ENOMEM;
  325. }
  326. static void __net_exit ip_rt_do_proc_exit(struct net *net)
  327. {
  328. remove_proc_entry("rt_cache", net->proc_net_stat);
  329. remove_proc_entry("rt_cache", net->proc_net);
  330. #ifdef CONFIG_IP_ROUTE_CLASSID
  331. remove_proc_entry("rt_acct", net->proc_net);
  332. #endif
  333. }
  334. static struct pernet_operations ip_rt_proc_ops __net_initdata = {
  335. .init = ip_rt_do_proc_init,
  336. .exit = ip_rt_do_proc_exit,
  337. };
  338. static int __init ip_rt_proc_init(void)
  339. {
  340. return register_pernet_subsys(&ip_rt_proc_ops);
  341. }
  342. #else
  343. static inline int ip_rt_proc_init(void)
  344. {
  345. return 0;
  346. }
  347. #endif /* CONFIG_PROC_FS */
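/* A cached route is stale once its generation id no longer matches the
 * per-netns IPv4 routing genid; rt_cache_flush() simply bumps that genid,
 * which invalidates every cached route at once.
 */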
  348. static inline bool rt_is_expired(const struct rtable *rth)
  349. {
  350. bool res;
  351. rcu_read_lock();
  352. res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
  353. rcu_read_unlock();
  354. return res;
  355. }
  356. void rt_cache_flush(struct net *net)
  357. {
  358. rt_genid_bump_ipv4(net);
  359. }
  360. static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
  361. struct sk_buff *skb,
  362. const void *daddr)
  363. {
  364. const struct rtable *rt = container_of(dst, struct rtable, dst);
  365. struct net_device *dev = dst_dev(dst);
  366. struct neighbour *n;
  367. rcu_read_lock();
  368. if (likely(rt->rt_gw_family == AF_INET)) {
  369. n = ip_neigh_gw4(dev, rt->rt_gw4);
  370. } else if (rt->rt_gw_family == AF_INET6) {
  371. n = ip_neigh_gw6(dev, &rt->rt_gw6);
  372. } else {
  373. __be32 pkey;
  374. pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
  375. n = ip_neigh_gw4(dev, pkey);
  376. }
  377. if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
  378. n = NULL;
  379. rcu_read_unlock();
  380. return n;
  381. }
  382. static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
  383. {
  384. const struct rtable *rt = container_of(dst, struct rtable, dst);
  385. struct net_device *dev = dst_dev(dst);
  386. const __be32 *pkey = daddr;
  387. if (rt->rt_gw_family == AF_INET) {
  388. pkey = (const __be32 *)&rt->rt_gw4;
  389. } else if (rt->rt_gw_family == AF_INET6) {
  390. return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
  391. } else if (!daddr ||
  392. (rt->rt_flags &
  393. (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
  394. return;
  395. }
  396. __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
  397. }
  398. /* Hash tables of size 2048..262144 depending on RAM size.
  399. * Each bucket uses 8 bytes.
  400. */
  401. static u32 ip_idents_mask __read_mostly;
  402. static atomic_t *ip_idents __read_mostly;
  403. static u32 *ip_tstamps __read_mostly;
  404. /* In order to protect privacy, we add a perturbation to identifiers
405. * if one generator is seldom used. This makes it hard for an attacker
  406. * to infer how many packets were sent between two points in time.
  407. */
  408. static u32 ip_idents_reserve(u32 hash, int segs)
  409. {
  410. u32 bucket, old, now = (u32)jiffies;
  411. atomic_t *p_id;
  412. u32 *p_tstamp;
  413. u32 delta = 0;
  414. bucket = hash & ip_idents_mask;
  415. p_tstamp = ip_tstamps + bucket;
  416. p_id = ip_idents + bucket;
  417. old = READ_ONCE(*p_tstamp);
  418. if (old != now && cmpxchg(p_tstamp, old, now) == old)
  419. delta = get_random_u32_below(now - old);
  420. /* If UBSAN reports an error there, please make sure your compiler
421. * supports -fno-strict-overflow before reporting it; that was a bug
  422. * in UBSAN, and it has been fixed in GCC-8.
  423. */
  424. return atomic_add_return(segs + delta, p_id) - segs;
  425. }
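/* Pick the IP ID for a locally generated datagram: hash (daddr, saddr,
 * protocol) with a lazily initialised per-netns siphash key, then reserve
 * 'segs' consecutive ids from the matching bucket above.
 */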
  426. void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
  427. {
  428. u32 hash, id;
  429. /* Note the following code is not safe, but this is okay. */
  430. if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
  431. get_random_bytes(&net->ipv4.ip_id_key,
  432. sizeof(net->ipv4.ip_id_key));
  433. hash = siphash_3u32((__force u32)iph->daddr,
  434. (__force u32)iph->saddr,
  435. iph->protocol,
  436. &net->ipv4.ip_id_key);
  437. id = ip_idents_reserve(hash, segs);
  438. iph->id = htons(id);
  439. }
  440. EXPORT_SYMBOL(__ip_select_ident);
  441. static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
  442. const struct sock *sk, const struct iphdr *iph,
  443. int oif, __u8 tos, u8 prot, u32 mark,
  444. int flow_flags)
  445. {
  446. __u8 scope = RT_SCOPE_UNIVERSE;
  447. if (sk) {
  448. oif = sk->sk_bound_dev_if;
  449. mark = READ_ONCE(sk->sk_mark);
  450. tos = ip_sock_rt_tos(sk);
  451. scope = ip_sock_rt_scope(sk);
  452. prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
  453. sk->sk_protocol;
  454. }
  455. flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
  456. prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
  457. sock_net_uid(net, sk));
  458. }
  459. static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
  460. const struct sock *sk)
  461. {
  462. const struct net *net = dev_net(skb->dev);
  463. const struct iphdr *iph = ip_hdr(skb);
  464. int oif = skb->dev->ifindex;
  465. u8 prot = iph->protocol;
  466. u32 mark = skb->mark;
  467. __u8 tos = iph->tos;
  468. __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
  469. }
  470. static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
  471. {
  472. const struct inet_sock *inet = inet_sk(sk);
  473. const struct ip_options_rcu *inet_opt;
  474. __be32 daddr = inet->inet_daddr;
  475. rcu_read_lock();
  476. inet_opt = rcu_dereference(inet->inet_opt);
  477. if (inet_opt && inet_opt->opt.srr)
  478. daddr = inet_opt->opt.faddr;
  479. flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
  480. ip_sock_rt_tos(sk),
  481. ip_sock_rt_scope(sk),
  482. inet_test_bit(HDRINCL, sk) ?
  483. IPPROTO_RAW : sk->sk_protocol,
  484. inet_sk_flowi_flags(sk),
  485. daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
  486. rcu_read_unlock();
  487. }
  488. static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
  489. const struct sk_buff *skb)
  490. {
  491. if (skb)
  492. build_skb_flow_key(fl4, skb, sk);
  493. else
  494. build_sk_flow_key(fl4, sk);
  495. }
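/* Per-nexthop exceptions (fnhe): redirected gateways and learned PMTU values
 * are kept per destination in a small hash attached to the nexthop, updated
 * under fnhe_lock and read under RCU.
 */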
  496. static DEFINE_SPINLOCK(fnhe_lock);
  497. static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
  498. {
  499. struct rtable *rt;
  500. rt = rcu_dereference(fnhe->fnhe_rth_input);
  501. if (rt) {
  502. RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
  503. dst_dev_put(&rt->dst);
  504. dst_release(&rt->dst);
  505. }
  506. rt = rcu_dereference(fnhe->fnhe_rth_output);
  507. if (rt) {
  508. RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
  509. dst_dev_put(&rt->dst);
  510. dst_release(&rt->dst);
  511. }
  512. }
  513. static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
  514. {
  515. struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
  516. struct fib_nh_exception *fnhe, *oldest = NULL;
  517. for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
  518. fnhe = rcu_dereference_protected(*fnhe_p,
  519. lockdep_is_held(&fnhe_lock));
  520. if (!fnhe)
  521. break;
  522. if (!oldest ||
  523. time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
  524. oldest = fnhe;
  525. oldest_p = fnhe_p;
  526. }
  527. }
  528. fnhe_flush_routes(oldest);
  529. *oldest_p = oldest->fnhe_next;
  530. kfree_rcu(oldest, rcu);
  531. }
  532. static u32 fnhe_hashfun(__be32 daddr)
  533. {
  534. static siphash_aligned_key_t fnhe_hash_key;
  535. u64 hval;
  536. net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
  537. hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
  538. return hash_64(hval, FNHE_HASH_SHIFT);
  539. }
  540. static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
  541. {
  542. rt->rt_pmtu = fnhe->fnhe_pmtu;
  543. rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
  544. rt->dst.expires = fnhe->fnhe_expires;
  545. if (fnhe->fnhe_gw) {
  546. rt->rt_flags |= RTCF_REDIRECTED;
  547. rt->rt_uses_gateway = 1;
  548. rt->rt_gw_family = AF_INET;
  549. rt->rt_gw4 = fnhe->fnhe_gw;
  550. }
  551. }
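/* Record or refresh the exception for daddr on this nexthop: a redirect
 * gateway and/or a PMTU value with its expiry. The bucket array is allocated
 * on demand, chain depth is bounded by evicting the oldest entry, and when a
 * new entry is created the nexthop's cached routes are marked
 * DST_OBSOLETE_KILL so they get revalidated.
 */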
  552. static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
  553. __be32 gw, u32 pmtu, bool lock,
  554. unsigned long expires)
  555. {
  556. struct fnhe_hash_bucket *hash;
  557. struct fib_nh_exception *fnhe;
  558. struct rtable *rt;
  559. u32 genid, hval;
  560. unsigned int i;
  561. int depth;
  562. genid = fnhe_genid(dev_net(nhc->nhc_dev));
  563. hval = fnhe_hashfun(daddr);
  564. spin_lock_bh(&fnhe_lock);
  565. hash = rcu_dereference(nhc->nhc_exceptions);
  566. if (!hash) {
  567. hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
  568. if (!hash)
  569. goto out_unlock;
  570. rcu_assign_pointer(nhc->nhc_exceptions, hash);
  571. }
  572. hash += hval;
  573. depth = 0;
  574. for (fnhe = rcu_dereference(hash->chain); fnhe;
  575. fnhe = rcu_dereference(fnhe->fnhe_next)) {
  576. if (fnhe->fnhe_daddr == daddr)
  577. break;
  578. depth++;
  579. }
  580. if (fnhe) {
  581. if (fnhe->fnhe_genid != genid)
  582. fnhe->fnhe_genid = genid;
  583. if (gw)
  584. fnhe->fnhe_gw = gw;
  585. if (pmtu) {
  586. fnhe->fnhe_pmtu = pmtu;
  587. fnhe->fnhe_mtu_locked = lock;
  588. }
  589. fnhe->fnhe_expires = max(1UL, expires);
  590. /* Update all cached dsts too */
  591. rt = rcu_dereference(fnhe->fnhe_rth_input);
  592. if (rt)
  593. fill_route_from_fnhe(rt, fnhe);
  594. rt = rcu_dereference(fnhe->fnhe_rth_output);
  595. if (rt)
  596. fill_route_from_fnhe(rt, fnhe);
  597. } else {
  598. /* Randomize max depth to avoid some side channels attacks. */
  599. int max_depth = FNHE_RECLAIM_DEPTH +
  600. get_random_u32_below(FNHE_RECLAIM_DEPTH);
  601. while (depth > max_depth) {
  602. fnhe_remove_oldest(hash);
  603. depth--;
  604. }
  605. fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
  606. if (!fnhe)
  607. goto out_unlock;
  608. fnhe->fnhe_next = hash->chain;
  609. fnhe->fnhe_genid = genid;
  610. fnhe->fnhe_daddr = daddr;
  611. fnhe->fnhe_gw = gw;
  612. fnhe->fnhe_pmtu = pmtu;
  613. fnhe->fnhe_mtu_locked = lock;
  614. fnhe->fnhe_expires = max(1UL, expires);
  615. rcu_assign_pointer(hash->chain, fnhe);
  616. /* Exception created; mark the cached routes for the nexthop
  617. * stale, so anyone caching it rechecks if this exception
  618. * applies to them.
  619. */
  620. rt = rcu_dereference(nhc->nhc_rth_input);
  621. if (rt)
  622. rt->dst.obsolete = DST_OBSOLETE_KILL;
  623. for_each_possible_cpu(i) {
  624. struct rtable __rcu **prt;
  625. prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
  626. rt = rcu_dereference(*prt);
  627. if (rt)
  628. rt->dst.obsolete = DST_OBSOLETE_KILL;
  629. }
  630. }
  631. fnhe->fnhe_stamp = jiffies;
  632. out_unlock:
  633. spin_unlock_bh(&fnhe_lock);
  634. }
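/* Handle an ICMP redirect: accept it only from the route's current gateway,
 * reject obviously bogus new gateways (multicast, broadcast, zeronet or
 * off-link addresses), then record the new gateway as a nexthop exception
 * and, when kill_route is set, mark the cached route obsolete.
 */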
  635. static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
  636. bool kill_route)
  637. {
  638. __be32 new_gw = icmp_hdr(skb)->un.gateway;
  639. __be32 old_gw = ip_hdr(skb)->saddr;
  640. struct net_device *dev = skb->dev;
  641. struct in_device *in_dev;
  642. struct fib_result res;
  643. struct neighbour *n;
  644. struct net *net;
  645. switch (icmp_hdr(skb)->code & 7) {
  646. case ICMP_REDIR_NET:
  647. case ICMP_REDIR_NETTOS:
  648. case ICMP_REDIR_HOST:
  649. case ICMP_REDIR_HOSTTOS:
  650. break;
  651. default:
  652. return;
  653. }
  654. if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
  655. return;
  656. in_dev = __in_dev_get_rcu(dev);
  657. if (!in_dev)
  658. return;
  659. net = dev_net(dev);
  660. if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
  661. ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
  662. ipv4_is_zeronet(new_gw))
  663. goto reject_redirect;
  664. if (!IN_DEV_SHARED_MEDIA(in_dev)) {
  665. if (!inet_addr_onlink(in_dev, new_gw, old_gw))
  666. goto reject_redirect;
  667. if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
  668. goto reject_redirect;
  669. } else {
  670. if (inet_addr_type(net, new_gw) != RTN_UNICAST)
  671. goto reject_redirect;
  672. }
  673. n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
  674. if (!n)
  675. n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
  676. if (!IS_ERR(n)) {
  677. if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
  678. neigh_event_send(n, NULL);
  679. } else {
  680. if (fib_lookup(net, fl4, &res, 0) == 0) {
  681. struct fib_nh_common *nhc;
  682. fib_select_path(net, &res, fl4, skb);
  683. nhc = FIB_RES_NHC(res);
  684. update_or_create_fnhe(nhc, fl4->daddr, new_gw,
  685. 0, false,
  686. jiffies + ip_rt_gc_timeout);
  687. }
  688. if (kill_route)
  689. rt->dst.obsolete = DST_OBSOLETE_KILL;
  690. call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
  691. }
  692. neigh_release(n);
  693. }
  694. return;
  695. reject_redirect:
  696. #ifdef CONFIG_IP_ROUTE_VERBOSE
  697. if (IN_DEV_LOG_MARTIANS(in_dev)) {
  698. const struct iphdr *iph = (const struct iphdr *) skb->data;
  699. __be32 daddr = iph->daddr;
  700. __be32 saddr = iph->saddr;
  701. net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
  702. " Advised path = %pI4 -> %pI4\n",
  703. &old_gw, dev->name, &new_gw,
  704. &saddr, &daddr);
  705. }
  706. #endif
  707. ;
  708. }
  709. static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
  710. {
  711. struct rtable *rt;
  712. struct flowi4 fl4;
  713. const struct iphdr *iph = (const struct iphdr *) skb->data;
  714. struct net *net = dev_net(skb->dev);
  715. int oif = skb->dev->ifindex;
  716. u8 prot = iph->protocol;
  717. u32 mark = skb->mark;
  718. __u8 tos = iph->tos;
  719. rt = dst_rtable(dst);
  720. __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
  721. __ip_do_redirect(rt, skb, &fl4, true);
  722. }
  723. static void ipv4_negative_advice(struct sock *sk,
  724. struct dst_entry *dst)
  725. {
  726. struct rtable *rt = dst_rtable(dst);
  727. if ((dst->obsolete > 0) ||
  728. (rt->rt_flags & RTCF_REDIRECTED) ||
  729. rt->dst.expires)
  730. sk_dst_reset(sk);
  731. }
  732. /*
  733. * Algorithm:
  734. * 1. The first ip_rt_redirect_number redirects are sent
  735. * with exponential backoff, then we stop sending them at all,
  736. * assuming that the host ignores our redirects.
  737. * 2. If we did not see packets requiring redirects
  738. * during ip_rt_redirect_silence, we assume that the host
  739. * forgot redirected route and start to send redirects again.
  740. *
  741. * This algorithm is much cheaper and more intelligent than dumb load limiting
  742. * in icmp.c.
  743. *
  744. * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
  745. * and "frag. need" (breaks PMTU discovery) in icmp.c.
  746. */
  747. void ip_rt_send_redirect(struct sk_buff *skb)
  748. {
  749. struct rtable *rt = skb_rtable(skb);
  750. struct in_device *in_dev;
  751. struct inet_peer *peer;
  752. struct net *net;
  753. int log_martians;
  754. int vif;
  755. rcu_read_lock();
  756. in_dev = __in_dev_get_rcu(rt->dst.dev);
  757. if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
  758. rcu_read_unlock();
  759. return;
  760. }
  761. log_martians = IN_DEV_LOG_MARTIANS(in_dev);
  762. vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
  763. net = dev_net(rt->dst.dev);
  764. peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
  765. if (!peer) {
  766. rcu_read_unlock();
  767. icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
  768. rt_nexthop(rt, ip_hdr(skb)->daddr));
  769. return;
  770. }
  771. /* No redirected packets during ip_rt_redirect_silence;
  772. * reset the algorithm.
  773. */
  774. if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
  775. peer->rate_tokens = 0;
  776. peer->n_redirects = 0;
  777. }
778. /* Too many ignored redirects; do not send anything and
  779. * set dst.rate_last to the last seen redirected packet.
  780. */
  781. if (peer->n_redirects >= ip_rt_redirect_number) {
  782. peer->rate_last = jiffies;
  783. goto out_unlock;
  784. }
  785. /* Check for load limit; set rate_last to the latest sent
  786. * redirect.
  787. */
  788. if (peer->n_redirects == 0 ||
  789. time_after(jiffies,
  790. (peer->rate_last +
  791. (ip_rt_redirect_load << peer->n_redirects)))) {
  792. __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
  793. icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
  794. peer->rate_last = jiffies;
  795. ++peer->n_redirects;
  796. if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
  797. peer->n_redirects == ip_rt_redirect_number)
  798. net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
  799. &ip_hdr(skb)->saddr, inet_iif(skb),
  800. &ip_hdr(skb)->daddr, &gw);
  801. }
  802. out_unlock:
  803. rcu_read_unlock();
  804. }
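/* Input-path error handler: map rt->dst.error onto the matching ICMP
 * destination-unreachable code and send it, rate limited by a token bucket
 * kept in the sender's inet_peer entry.
 */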
  805. static int ip_error(struct sk_buff *skb)
  806. {
  807. struct rtable *rt = skb_rtable(skb);
  808. struct net_device *dev = skb->dev;
  809. struct in_device *in_dev;
  810. struct inet_peer *peer;
  811. unsigned long now;
  812. struct net *net;
  813. SKB_DR(reason);
  814. bool send;
  815. int code;
  816. if (netif_is_l3_master(skb->dev)) {
  817. dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
  818. if (!dev)
  819. goto out;
  820. }
  821. in_dev = __in_dev_get_rcu(dev);
  822. /* IP on this device is disabled. */
  823. if (!in_dev)
  824. goto out;
  825. net = dev_net(rt->dst.dev);
  826. if (!IN_DEV_FORWARD(in_dev)) {
  827. switch (rt->dst.error) {
  828. case EHOSTUNREACH:
  829. SKB_DR_SET(reason, IP_INADDRERRORS);
  830. __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
  831. break;
  832. case ENETUNREACH:
  833. SKB_DR_SET(reason, IP_INNOROUTES);
  834. __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
  835. break;
  836. }
  837. goto out;
  838. }
  839. switch (rt->dst.error) {
  840. case EINVAL:
  841. default:
  842. goto out;
  843. case EHOSTUNREACH:
  844. code = ICMP_HOST_UNREACH;
  845. break;
  846. case ENETUNREACH:
  847. code = ICMP_NET_UNREACH;
  848. SKB_DR_SET(reason, IP_INNOROUTES);
  849. __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
  850. break;
  851. case EACCES:
  852. code = ICMP_PKT_FILTERED;
  853. break;
  854. }
  855. rcu_read_lock();
  856. peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
  857. l3mdev_master_ifindex_rcu(skb->dev));
  858. send = true;
  859. if (peer) {
  860. now = jiffies;
  861. peer->rate_tokens += now - peer->rate_last;
  862. if (peer->rate_tokens > ip_rt_error_burst)
  863. peer->rate_tokens = ip_rt_error_burst;
  864. peer->rate_last = now;
  865. if (peer->rate_tokens >= ip_rt_error_cost)
  866. peer->rate_tokens -= ip_rt_error_cost;
  867. else
  868. send = false;
  869. }
  870. rcu_read_unlock();
  871. if (send)
  872. icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
  873. out: kfree_skb_reason(skb, reason);
  874. return 0;
  875. }
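/* Record a learned path MTU as a nexthop exception. Values below the netns
 * ip_rt_min_pmtu are clamped and locked; for multipath routes every sibling
 * nexthop receives the same exception.
 */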
  876. static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
  877. {
  878. struct dst_entry *dst = &rt->dst;
  879. struct fib_result res;
  880. bool lock = false;
  881. struct net *net;
  882. u32 old_mtu;
  883. if (ip_mtu_locked(dst))
  884. return;
  885. old_mtu = ipv4_mtu(dst);
  886. if (old_mtu < mtu)
  887. return;
  888. rcu_read_lock();
  889. net = dev_net_rcu(dst_dev(dst));
  890. if (mtu < net->ipv4.ip_rt_min_pmtu) {
  891. lock = true;
  892. mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
  893. }
  894. if (rt->rt_pmtu == mtu && !lock &&
  895. time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
  896. goto out;
  897. if (fib_lookup(net, fl4, &res, 0) == 0) {
  898. struct fib_nh_common *nhc;
  899. fib_select_path(net, &res, fl4, NULL);
  900. #ifdef CONFIG_IP_ROUTE_MULTIPATH
  901. if (fib_info_num_path(res.fi) > 1) {
  902. int nhsel;
  903. for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
  904. nhc = fib_info_nhc(res.fi, nhsel);
  905. update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
  906. jiffies + net->ipv4.ip_rt_mtu_expires);
  907. }
  908. goto out;
  909. }
  910. #endif /* CONFIG_IP_ROUTE_MULTIPATH */
  911. nhc = FIB_RES_NHC(res);
  912. update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
  913. jiffies + net->ipv4.ip_rt_mtu_expires);
  914. }
  915. out:
  916. rcu_read_unlock();
  917. }
  918. static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
  919. struct sk_buff *skb, u32 mtu,
  920. bool confirm_neigh)
  921. {
  922. struct rtable *rt = dst_rtable(dst);
  923. struct flowi4 fl4;
  924. ip_rt_build_flow_key(&fl4, sk, skb);
  925. /* Don't make lookup fail for bridged encapsulations */
  926. if (skb && netif_is_any_bridge_port(skb->dev))
  927. fl4.flowi4_oif = 0;
  928. __ip_rt_update_pmtu(rt, &fl4, mtu);
  929. }
  930. void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
  931. int oif, u8 protocol)
  932. {
  933. const struct iphdr *iph = (const struct iphdr *)skb->data;
  934. struct flowi4 fl4;
  935. struct rtable *rt;
  936. u32 mark = IP4_REPLY_MARK(net, skb->mark);
  937. __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
  938. 0);
  939. rt = __ip_route_output_key(net, &fl4);
  940. if (!IS_ERR(rt)) {
  941. __ip_rt_update_pmtu(rt, &fl4, mtu);
  942. ip_rt_put(rt);
  943. }
  944. }
  945. EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
  946. static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
  947. {
  948. const struct iphdr *iph = (const struct iphdr *)skb->data;
  949. struct flowi4 fl4;
  950. struct rtable *rt;
  951. __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
  952. if (!fl4.flowi4_mark)
  953. fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
  954. rt = __ip_route_output_key(sock_net(sk), &fl4);
  955. if (!IS_ERR(rt)) {
  956. __ip_rt_update_pmtu(rt, &fl4, mtu);
  957. ip_rt_put(rt);
  958. }
  959. }
  960. void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
  961. {
  962. const struct iphdr *iph = (const struct iphdr *)skb->data;
  963. struct flowi4 fl4;
  964. struct rtable *rt;
  965. struct dst_entry *odst = NULL;
  966. bool new = false;
  967. struct net *net = sock_net(sk);
  968. bh_lock_sock(sk);
  969. if (!ip_sk_accept_pmtu(sk))
  970. goto out;
  971. odst = sk_dst_get(sk);
  972. if (sock_owned_by_user(sk) || !odst) {
  973. __ipv4_sk_update_pmtu(skb, sk, mtu);
  974. goto out;
  975. }
  976. __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
  977. rt = dst_rtable(odst);
  978. if (odst->obsolete && !odst->ops->check(odst, 0)) {
  979. rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
  980. if (IS_ERR(rt))
  981. goto out;
  982. new = true;
  983. }
  984. __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
  985. if (!dst_check(&rt->dst, 0)) {
  986. if (new)
  987. dst_release(&rt->dst);
  988. rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
  989. if (IS_ERR(rt))
  990. goto out;
  991. new = true;
  992. }
  993. if (new)
  994. sk_dst_set(sk, &rt->dst);
  995. out:
  996. bh_unlock_sock(sk);
  997. dst_release(odst);
  998. }
  999. EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
  1000. void ipv4_redirect(struct sk_buff *skb, struct net *net,
  1001. int oif, u8 protocol)
  1002. {
  1003. const struct iphdr *iph = (const struct iphdr *)skb->data;
  1004. struct flowi4 fl4;
  1005. struct rtable *rt;
  1006. __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
  1007. rt = __ip_route_output_key(net, &fl4);
  1008. if (!IS_ERR(rt)) {
  1009. __ip_do_redirect(rt, skb, &fl4, false);
  1010. ip_rt_put(rt);
  1011. }
  1012. }
  1013. EXPORT_SYMBOL_GPL(ipv4_redirect);
  1014. void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
  1015. {
  1016. const struct iphdr *iph = (const struct iphdr *)skb->data;
  1017. struct flowi4 fl4;
  1018. struct rtable *rt;
  1019. struct net *net = sock_net(sk);
  1020. __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
  1021. rt = __ip_route_output_key(net, &fl4);
  1022. if (!IS_ERR(rt)) {
  1023. __ip_do_redirect(rt, skb, &fl4, false);
  1024. ip_rt_put(rt);
  1025. }
  1026. }
  1027. EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
  1028. INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
  1029. u32 cookie)
  1030. {
  1031. struct rtable *rt = dst_rtable(dst);
  1032. /* All IPV4 dsts are created with ->obsolete set to the value
  1033. * DST_OBSOLETE_FORCE_CHK which forces validation calls down
  1034. * into this function always.
  1035. *
  1036. * When a PMTU/redirect information update invalidates a route,
  1037. * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
  1038. * DST_OBSOLETE_DEAD.
  1039. */
  1040. if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
  1041. return NULL;
  1042. return dst;
  1043. }
  1044. EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
  1045. static void ipv4_send_dest_unreach(struct sk_buff *skb)
  1046. {
  1047. struct net_device *dev;
  1048. struct ip_options opt;
  1049. int res;
  1050. /* Recompile ip options since IPCB may not be valid anymore.
  1051. * Also check we have a reasonable ipv4 header.
  1052. */
  1053. if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
  1054. ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
  1055. return;
  1056. memset(&opt, 0, sizeof(opt));
  1057. if (ip_hdr(skb)->ihl > 5) {
  1058. if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
  1059. return;
  1060. opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
  1061. rcu_read_lock();
  1062. dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
  1063. res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
  1064. rcu_read_unlock();
  1065. if (res)
  1066. return;
  1067. }
  1068. __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
  1069. }
  1070. static void ipv4_link_failure(struct sk_buff *skb)
  1071. {
  1072. struct rtable *rt;
  1073. ipv4_send_dest_unreach(skb);
  1074. rt = skb_rtable(skb);
  1075. if (rt)
  1076. dst_set_expires(&rt->dst, 0);
  1077. }
  1078. static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
  1079. {
  1080. pr_debug("%s: %pI4 -> %pI4, %s\n",
  1081. __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
  1082. skb->dev ? skb->dev->name : "?");
  1083. kfree_skb(skb);
  1084. WARN_ON(1);
  1085. return 0;
  1086. }
  1087. /*
  1088. * We do not cache source address of outgoing interface,
  1089. * because it is used only by IP RR, TS and SRR options,
1090. * so that it is out of the fast path.
  1091. *
  1092. * BTW remember: "addr" is allowed to be not aligned
  1093. * in IP options!
  1094. */
  1095. void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
  1096. {
  1097. __be32 src;
  1098. if (rt_is_output_route(rt))
  1099. src = ip_hdr(skb)->saddr;
  1100. else {
  1101. struct fib_result res;
  1102. struct iphdr *iph = ip_hdr(skb);
  1103. struct flowi4 fl4 = {
  1104. .daddr = iph->daddr,
  1105. .saddr = iph->saddr,
  1106. .flowi4_tos = iph->tos & INET_DSCP_MASK,
  1107. .flowi4_oif = rt->dst.dev->ifindex,
  1108. .flowi4_iif = skb->dev->ifindex,
  1109. .flowi4_mark = skb->mark,
  1110. };
  1111. rcu_read_lock();
  1112. if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
  1113. src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
  1114. else
  1115. src = inet_select_addr(rt->dst.dev,
  1116. rt_nexthop(rt, iph->daddr),
  1117. RT_SCOPE_UNIVERSE);
  1118. rcu_read_unlock();
  1119. }
  1120. memcpy(addr, &src, 4);
  1121. }
  1122. #ifdef CONFIG_IP_ROUTE_CLASSID
  1123. static void set_class_tag(struct rtable *rt, u32 tag)
  1124. {
  1125. if (!(rt->dst.tclassid & 0xFFFF))
  1126. rt->dst.tclassid |= tag & 0xFFFF;
  1127. if (!(rt->dst.tclassid & 0xFFFF0000))
  1128. rt->dst.tclassid |= tag & 0xFFFF0000;
  1129. }
  1130. #endif
  1131. static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
  1132. {
  1133. unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
  1134. unsigned int advmss;
  1135. struct net *net;
  1136. rcu_read_lock();
  1137. net = dev_net_rcu(dst_dev(dst));
  1138. advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
  1139. net->ipv4.ip_rt_min_advmss);
  1140. rcu_read_unlock();
  1141. return min(advmss, IPV4_MAX_PMTU - header_size);
  1142. }
  1143. INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
  1144. {
  1145. return ip_dst_mtu_maybe_forward(dst, false);
  1146. }
  1147. EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
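/* ip_del_fnhe() unlinks the exception for daddr under fnhe_lock;
 * find_exception() is the RCU lookup used on the fast path and prunes
 * entries whose expiry has passed.
 */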
  1148. static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
  1149. {
  1150. struct fnhe_hash_bucket *hash;
  1151. struct fib_nh_exception *fnhe, __rcu **fnhe_p;
  1152. u32 hval = fnhe_hashfun(daddr);
  1153. spin_lock_bh(&fnhe_lock);
  1154. hash = rcu_dereference_protected(nhc->nhc_exceptions,
  1155. lockdep_is_held(&fnhe_lock));
  1156. hash += hval;
  1157. fnhe_p = &hash->chain;
  1158. fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
  1159. while (fnhe) {
  1160. if (fnhe->fnhe_daddr == daddr) {
  1161. rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
  1162. fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
  1163. /* set fnhe_daddr to 0 to ensure it won't bind with
  1164. * new dsts in rt_bind_exception().
  1165. */
  1166. fnhe->fnhe_daddr = 0;
  1167. fnhe_flush_routes(fnhe);
  1168. kfree_rcu(fnhe, rcu);
  1169. break;
  1170. }
  1171. fnhe_p = &fnhe->fnhe_next;
  1172. fnhe = rcu_dereference_protected(fnhe->fnhe_next,
  1173. lockdep_is_held(&fnhe_lock));
  1174. }
  1175. spin_unlock_bh(&fnhe_lock);
  1176. }
  1177. static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
  1178. __be32 daddr)
  1179. {
  1180. struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
  1181. struct fib_nh_exception *fnhe;
  1182. u32 hval;
  1183. if (!hash)
  1184. return NULL;
  1185. hval = fnhe_hashfun(daddr);
  1186. for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
  1187. fnhe = rcu_dereference(fnhe->fnhe_next)) {
  1188. if (fnhe->fnhe_daddr == daddr) {
  1189. if (fnhe->fnhe_expires &&
  1190. time_after(jiffies, fnhe->fnhe_expires)) {
  1191. ip_del_fnhe(nhc, daddr);
  1192. break;
  1193. }
  1194. return fnhe;
  1195. }
  1196. }
  1197. return NULL;
  1198. }
  1199. /* MTU selection:
  1200. * 1. mtu on route is locked - use it
  1201. * 2. mtu from nexthop exception
  1202. * 3. mtu from egress device
  1203. */
  1204. u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
  1205. {
  1206. struct fib_nh_common *nhc = res->nhc;
  1207. struct net_device *dev = nhc->nhc_dev;
  1208. struct fib_info *fi = res->fi;
  1209. u32 mtu = 0;
  1210. if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
  1211. fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
  1212. mtu = fi->fib_mtu;
  1213. if (likely(!mtu)) {
  1214. struct fib_nh_exception *fnhe;
  1215. fnhe = find_exception(nhc, daddr);
  1216. if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
  1217. mtu = fnhe->fnhe_pmtu;
  1218. }
  1219. if (likely(!mtu))
  1220. mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
  1221. return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
  1222. }
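/* Bind a nexthop exception to a route: reset the exception if its genid is
 * stale, copy the cached PMTU/gateway data into the rtable and, when
 * do_cache is set, stash the route in the exception's input or output slot
 * so later lookups can reuse it.  Returns true only if the route was
 * actually cached.
 */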
  1223. static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
  1224. __be32 daddr, const bool do_cache)
  1225. {
  1226. bool ret = false;
  1227. spin_lock_bh(&fnhe_lock);
  1228. if (daddr == fnhe->fnhe_daddr) {
  1229. struct rtable __rcu **porig;
  1230. struct rtable *orig;
  1231. int genid = fnhe_genid(dev_net(rt->dst.dev));
  1232. if (rt_is_input_route(rt))
  1233. porig = &fnhe->fnhe_rth_input;
  1234. else
  1235. porig = &fnhe->fnhe_rth_output;
  1236. orig = rcu_dereference(*porig);
  1237. if (fnhe->fnhe_genid != genid) {
  1238. fnhe->fnhe_genid = genid;
  1239. fnhe->fnhe_gw = 0;
  1240. fnhe->fnhe_pmtu = 0;
  1241. fnhe->fnhe_expires = 0;
  1242. fnhe->fnhe_mtu_locked = false;
  1243. fnhe_flush_routes(fnhe);
  1244. orig = NULL;
  1245. }
  1246. fill_route_from_fnhe(rt, fnhe);
  1247. if (!rt->rt_gw4) {
  1248. rt->rt_gw4 = daddr;
  1249. rt->rt_gw_family = AF_INET;
  1250. }
  1251. if (do_cache) {
  1252. dst_hold(&rt->dst);
  1253. rcu_assign_pointer(*porig, rt);
  1254. if (orig) {
  1255. dst_dev_put(&orig->dst);
  1256. dst_release(&orig->dst);
  1257. }
  1258. ret = true;
  1259. }
  1260. fnhe->fnhe_stamp = jiffies;
  1261. }
  1262. spin_unlock_bh(&fnhe_lock);
  1263. return ret;
  1264. }
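/* Cache a route in the nexthop: input routes go into nhc_rth_input, output
 * routes into the per-CPU nhc_pcpu_rth_output slot.  The slot is updated
 * with cmpxchg(); if another CPU won the race the new route is released
 * and false is returned.
 */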
  1265. static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
  1266. {
  1267. struct rtable *orig, *prev, **p;
  1268. bool ret = true;
  1269. if (rt_is_input_route(rt)) {
  1270. p = (struct rtable **)&nhc->nhc_rth_input;
  1271. } else {
  1272. p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
  1273. }
  1274. orig = *p;
  1275. /* hold dst before doing cmpxchg() to avoid race condition
  1276. * on this dst
  1277. */
  1278. dst_hold(&rt->dst);
  1279. prev = cmpxchg(p, orig, rt);
  1280. if (prev == orig) {
  1281. if (orig) {
  1282. rt_add_uncached_list(orig);
  1283. dst_release(&orig->dst);
  1284. }
  1285. } else {
  1286. dst_release(&rt->dst);
  1287. ret = false;
  1288. }
  1289. return ret;
  1290. }
struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->dst.rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->dst.rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
        if (!list_empty(&rt->dst.rt_uncached)) {
                struct uncached_list *ul = rt->dst.rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del_init(&rt->dst.rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        ip_dst_metrics_put(dst);
        rt_del_uncached_list(dst_rtable(dst));
}
  1318. void rt_flush_dev(struct net_device *dev)
  1319. {
  1320. struct rtable *rt, *safe;
  1321. int cpu;
  1322. for_each_possible_cpu(cpu) {
  1323. struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
  1324. if (list_empty(&ul->head))
  1325. continue;
  1326. spin_lock_bh(&ul->lock);
  1327. list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
  1328. if (rt->dst.dev != dev)
  1329. continue;
  1330. rt->dst.dev = blackhole_netdev;
  1331. netdev_ref_replace(dev, blackhole_netdev,
  1332. &rt->dst.dev_tracker, GFP_ATOMIC);
  1333. list_del_init(&rt->dst.rt_uncached);
  1334. }
  1335. spin_unlock_bh(&ul->lock);
  1336. }
  1337. }
static bool rt_cache_valid(const struct rtable *rt)
{
        return rt &&
               rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
               !rt_is_expired(rt);
}
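/* Fill in the nexthop-derived fields of a freshly allocated route
 * (gateway, metrics, classid, lwtunnel state) and try to cache it, either
 * in the nexthop exception or in the FIB nexthop itself.  Routes that
 * could not be cached are put on the uncached list so rt_flush_dev() can
 * still find them.
 */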
  1344. static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
  1345. const struct fib_result *res,
  1346. struct fib_nh_exception *fnhe,
  1347. struct fib_info *fi, u16 type, u32 itag,
  1348. const bool do_cache)
  1349. {
  1350. bool cached = false;
  1351. if (fi) {
  1352. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  1353. if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
  1354. rt->rt_uses_gateway = 1;
  1355. rt->rt_gw_family = nhc->nhc_gw_family;
  1356. /* only INET and INET6 are supported */
  1357. if (likely(nhc->nhc_gw_family == AF_INET))
  1358. rt->rt_gw4 = nhc->nhc_gw.ipv4;
  1359. else
  1360. rt->rt_gw6 = nhc->nhc_gw.ipv6;
  1361. }
  1362. ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
  1363. #ifdef CONFIG_IP_ROUTE_CLASSID
  1364. if (nhc->nhc_family == AF_INET) {
  1365. struct fib_nh *nh;
  1366. nh = container_of(nhc, struct fib_nh, nh_common);
  1367. rt->dst.tclassid = nh->nh_tclassid;
  1368. }
  1369. #endif
  1370. rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
  1371. if (unlikely(fnhe))
  1372. cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
  1373. else if (do_cache)
  1374. cached = rt_cache_route(nhc, rt);
  1375. if (unlikely(!cached)) {
  1376. /* Routes we intend to cache in nexthop exception or
  1377. * FIB nexthop have the DST_NOCACHE bit clear.
  1378. * However, if we are unsuccessful at storing this
  1379. * route into the cache we really need to set it.
  1380. */
  1381. if (!rt->rt_gw4) {
  1382. rt->rt_gw_family = AF_INET;
  1383. rt->rt_gw4 = daddr;
  1384. }
  1385. rt_add_uncached_list(rt);
  1386. }
  1387. } else
  1388. rt_add_uncached_list(rt);
  1389. #ifdef CONFIG_IP_ROUTE_CLASSID
  1390. #ifdef CONFIG_IP_MULTIPLE_TABLES
  1391. set_class_tag(rt, res->tclassid);
  1392. #endif
  1393. set_class_tag(rt, itag);
  1394. #endif
  1395. }
struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool noxfrm)
{
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
                       (noxfrm ? DST_NOXFRM : 0));

        if (rt) {
                rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                rt->rt_flags = flags;
                rt->rt_type = type;
                rt->rt_is_input = 0;
                rt->rt_iif = 0;
                rt->rt_pmtu = 0;
                rt->rt_mtu_locked = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_gw_family = 0;
                rt->rt_gw4 = 0;

                rt->dst.output = ip_output;
                if (flags & RTCF_LOCAL)
                        rt->dst.input = ip_local_deliver;
        }

        return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
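/* Duplicate an existing route onto a new device, copying the routing
 * fields, handlers and lwtunnel state; dst.lastuse is reset to the
 * current jiffies.
 */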
  1421. struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
  1422. {
  1423. struct rtable *new_rt;
  1424. new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
  1425. rt->dst.flags);
  1426. if (new_rt) {
  1427. new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
  1428. new_rt->rt_flags = rt->rt_flags;
  1429. new_rt->rt_type = rt->rt_type;
  1430. new_rt->rt_is_input = rt->rt_is_input;
  1431. new_rt->rt_iif = rt->rt_iif;
  1432. new_rt->rt_pmtu = rt->rt_pmtu;
  1433. new_rt->rt_mtu_locked = rt->rt_mtu_locked;
  1434. new_rt->rt_gw_family = rt->rt_gw_family;
  1435. if (rt->rt_gw_family == AF_INET)
  1436. new_rt->rt_gw4 = rt->rt_gw4;
  1437. else if (rt->rt_gw_family == AF_INET6)
  1438. new_rt->rt_gw6 = rt->rt_gw6;
  1439. new_rt->dst.input = READ_ONCE(rt->dst.input);
  1440. new_rt->dst.output = READ_ONCE(rt->dst.output);
  1441. new_rt->dst.error = rt->dst.error;
  1442. new_rt->dst.lastuse = jiffies;
  1443. new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
  1444. }
  1445. return new_rt;
  1446. }
  1447. EXPORT_SYMBOL(rt_dst_clone);
  1448. /* called in rcu_read_lock() section */
  1449. int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1450. u8 tos, struct net_device *dev,
  1451. struct in_device *in_dev, u32 *itag)
  1452. {
  1453. int err;
  1454. /* Primary sanity checks. */
  1455. if (!in_dev)
  1456. return -EINVAL;
  1457. if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
  1458. skb->protocol != htons(ETH_P_IP))
  1459. return -EINVAL;
  1460. if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
  1461. return -EINVAL;
  1462. if (ipv4_is_zeronet(saddr)) {
  1463. if (!ipv4_is_local_multicast(daddr) &&
  1464. ip_hdr(skb)->protocol != IPPROTO_IGMP)
  1465. return -EINVAL;
  1466. } else {
  1467. err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
  1468. in_dev, itag);
  1469. if (err < 0)
  1470. return err;
  1471. }
  1472. return 0;
  1473. }
  1474. /* called in rcu_read_lock() section */
  1475. static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1476. u8 tos, struct net_device *dev, int our)
  1477. {
  1478. struct in_device *in_dev = __in_dev_get_rcu(dev);
  1479. unsigned int flags = RTCF_MULTICAST;
  1480. struct rtable *rth;
  1481. u32 itag = 0;
  1482. int err;
  1483. err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
  1484. if (err)
  1485. return err;
  1486. if (our)
  1487. flags |= RTCF_LOCAL;
  1488. if (IN_DEV_ORCONF(in_dev, NOPOLICY))
  1489. IPCB(skb)->flags |= IPSKB_NOPOLICY;
  1490. rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
  1491. false);
  1492. if (!rth)
  1493. return -ENOBUFS;
  1494. #ifdef CONFIG_IP_ROUTE_CLASSID
  1495. rth->dst.tclassid = itag;
  1496. #endif
  1497. rth->dst.output = ip_rt_bug;
  1498. rth->rt_is_input= 1;
  1499. #ifdef CONFIG_IP_MROUTE
  1500. if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
  1501. rth->dst.input = ip_mr_input;
  1502. #endif
  1503. RT_CACHE_STAT_INC(in_slow_mc);
  1504. skb_dst_drop(skb);
  1505. skb_dst_set(skb, &rth->dst);
  1506. return 0;
  1507. }
  1508. static void ip_handle_martian_source(struct net_device *dev,
  1509. struct in_device *in_dev,
  1510. struct sk_buff *skb,
  1511. __be32 daddr,
  1512. __be32 saddr)
  1513. {
  1514. RT_CACHE_STAT_INC(in_martian_src);
  1515. #ifdef CONFIG_IP_ROUTE_VERBOSE
  1516. if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 * Per the RFC 1812 recommendation, if the source is
                 * martian, the only hint is the MAC header.
                 */
  1521. pr_warn("martian source %pI4 from %pI4, on dev %s\n",
  1522. &daddr, &saddr, dev->name);
  1523. if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
  1524. print_hex_dump(KERN_WARNING, "ll header: ",
  1525. DUMP_PREFIX_OFFSET, 16, 1,
  1526. skb_mac_header(skb),
  1527. dev->hard_header_len, false);
  1528. }
  1529. }
  1530. #endif
  1531. }
  1532. /* called in rcu_read_lock() section */
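/* Build (or reuse from the nexthop cache) the forwarding route for an
 * input packet: validate the source address, decide whether the packet
 * should be flagged for an ICMP redirect, and attach the resulting dst
 * to the skb.
 */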
  1533. static int __mkroute_input(struct sk_buff *skb,
  1534. const struct fib_result *res,
  1535. struct in_device *in_dev,
  1536. __be32 daddr, __be32 saddr, u32 tos)
  1537. {
  1538. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  1539. struct net_device *dev = nhc->nhc_dev;
  1540. struct fib_nh_exception *fnhe;
  1541. struct rtable *rth;
  1542. int err;
  1543. struct in_device *out_dev;
  1544. bool do_cache;
  1545. u32 itag = 0;
  1546. /* get a working reference to the output device */
  1547. out_dev = __in_dev_get_rcu(dev);
  1548. if (!out_dev) {
  1549. net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
  1550. return -EINVAL;
  1551. }
  1552. err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
  1553. in_dev->dev, in_dev, &itag);
  1554. if (err < 0) {
  1555. ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
  1556. saddr);
  1557. goto cleanup;
  1558. }
  1559. do_cache = res->fi && !itag;
  1560. if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
  1561. skb->protocol == htons(ETH_P_IP)) {
  1562. __be32 gw;
  1563. gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
  1564. if (IN_DEV_SHARED_MEDIA(out_dev) ||
  1565. inet_addr_onlink(out_dev, saddr, gw))
  1566. IPCB(skb)->flags |= IPSKB_DOREDIRECT;
  1567. }
  1568. if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back on the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
  1576. if (out_dev == in_dev &&
  1577. IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
  1578. err = -EINVAL;
  1579. goto cleanup;
  1580. }
  1581. }
  1582. if (IN_DEV_ORCONF(in_dev, NOPOLICY))
  1583. IPCB(skb)->flags |= IPSKB_NOPOLICY;
  1584. fnhe = find_exception(nhc, daddr);
  1585. if (do_cache) {
  1586. if (fnhe)
  1587. rth = rcu_dereference(fnhe->fnhe_rth_input);
  1588. else
  1589. rth = rcu_dereference(nhc->nhc_rth_input);
  1590. if (rt_cache_valid(rth)) {
  1591. skb_dst_set_noref(skb, &rth->dst);
  1592. goto out;
  1593. }
  1594. }
  1595. rth = rt_dst_alloc(out_dev->dev, 0, res->type,
  1596. IN_DEV_ORCONF(out_dev, NOXFRM));
  1597. if (!rth) {
  1598. err = -ENOBUFS;
  1599. goto cleanup;
  1600. }
  1601. rth->rt_is_input = 1;
  1602. RT_CACHE_STAT_INC(in_slow_tot);
  1603. rth->dst.input = ip_forward;
  1604. rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
  1605. do_cache);
  1606. lwtunnel_set_redirect(&rth->dst);
  1607. skb_dst_set(skb, &rth->dst);
  1608. out:
  1609. err = 0;
  1610. cleanup:
  1611. return err;
  1612. }
  1613. #ifdef CONFIG_IP_ROUTE_MULTIPATH
  1614. /* To make ICMP packets follow the right flow, the multipath hash is
  1615. * calculated from the inner IP addresses.
  1616. */
  1617. static void ip_multipath_l3_keys(const struct sk_buff *skb,
  1618. struct flow_keys *hash_keys)
  1619. {
  1620. const struct iphdr *outer_iph = ip_hdr(skb);
  1621. const struct iphdr *key_iph = outer_iph;
  1622. const struct iphdr *inner_iph;
  1623. const struct icmphdr *icmph;
  1624. struct iphdr _inner_iph;
  1625. struct icmphdr _icmph;
  1626. if (likely(outer_iph->protocol != IPPROTO_ICMP))
  1627. goto out;
  1628. if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
  1629. goto out;
  1630. icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
  1631. &_icmph);
  1632. if (!icmph)
  1633. goto out;
  1634. if (!icmp_is_err(icmph->type))
  1635. goto out;
  1636. inner_iph = skb_header_pointer(skb,
  1637. outer_iph->ihl * 4 + sizeof(_icmph),
  1638. sizeof(_inner_iph), &_inner_iph);
  1639. if (!inner_iph)
  1640. goto out;
  1641. key_iph = inner_iph;
  1642. out:
  1643. hash_keys->addrs.v4addrs.src = key_iph->saddr;
  1644. hash_keys->addrs.v4addrs.dst = key_iph->daddr;
  1645. }
  1646. static u32 fib_multipath_custom_hash_outer(const struct net *net,
  1647. const struct sk_buff *skb,
  1648. bool *p_has_inner)
  1649. {
  1650. u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
  1651. struct flow_keys keys, hash_keys;
  1652. if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
  1653. return 0;
  1654. memset(&hash_keys, 0, sizeof(hash_keys));
  1655. skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
  1656. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1657. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
  1658. hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
  1659. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
  1660. hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
  1661. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
  1662. hash_keys.basic.ip_proto = keys.basic.ip_proto;
  1663. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
  1664. hash_keys.ports.src = keys.ports.src;
  1665. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
  1666. hash_keys.ports.dst = keys.ports.dst;
  1667. *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
  1668. return fib_multipath_hash_from_keys(net, &hash_keys);
  1669. }
  1670. static u32 fib_multipath_custom_hash_inner(const struct net *net,
  1671. const struct sk_buff *skb,
  1672. bool has_inner)
  1673. {
  1674. u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
  1675. struct flow_keys keys, hash_keys;
  1676. /* We assume the packet carries an encapsulation, but if none was
  1677. * encountered during dissection of the outer flow, then there is no
  1678. * point in calling the flow dissector again.
  1679. */
  1680. if (!has_inner)
  1681. return 0;
  1682. if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
  1683. return 0;
  1684. memset(&hash_keys, 0, sizeof(hash_keys));
  1685. skb_flow_dissect_flow_keys(skb, &keys, 0);
  1686. if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
  1687. return 0;
  1688. if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
  1689. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1690. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
  1691. hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
  1692. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
  1693. hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
  1694. } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
  1695. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
  1696. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
  1697. hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
  1698. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
  1699. hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
  1700. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
  1701. hash_keys.tags.flow_label = keys.tags.flow_label;
  1702. }
  1703. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
  1704. hash_keys.basic.ip_proto = keys.basic.ip_proto;
  1705. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
  1706. hash_keys.ports.src = keys.ports.src;
  1707. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
  1708. hash_keys.ports.dst = keys.ports.dst;
  1709. return fib_multipath_hash_from_keys(net, &hash_keys);
  1710. }
  1711. static u32 fib_multipath_custom_hash_skb(const struct net *net,
  1712. const struct sk_buff *skb)
  1713. {
  1714. u32 mhash, mhash_inner;
  1715. bool has_inner = true;
  1716. mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
  1717. mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
  1718. return jhash_2words(mhash, mhash_inner, 0);
  1719. }
  1720. static u32 fib_multipath_custom_hash_fl4(const struct net *net,
  1721. const struct flowi4 *fl4)
  1722. {
  1723. u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
  1724. struct flow_keys hash_keys;
  1725. if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
  1726. return 0;
  1727. memset(&hash_keys, 0, sizeof(hash_keys));
  1728. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1729. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
  1730. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1731. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
  1732. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1733. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
  1734. hash_keys.basic.ip_proto = fl4->flowi4_proto;
  1735. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
  1736. hash_keys.ports.src = fl4->fl4_sport;
  1737. if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
  1738. hash_keys.ports.dst = fl4->fl4_dport;
  1739. return fib_multipath_hash_from_keys(net, &hash_keys);
  1740. }
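/* Compute the multipath hash according to net.ipv4.fib_multipath_hash_policy:
 * 0 - L3 (source/destination addresses; inner addresses for ICMP errors)
 * 1 - L4 (addresses, ports and protocol)
 * 2 - L3, taken from the inner header when an encapsulation is present,
 *     otherwise from the outer header
 * 3 - custom field set selected by net.ipv4.fib_multipath_hash_fields
 */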
  1741. /* if skb is set it will be used and fl4 can be NULL */
  1742. int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
  1743. const struct sk_buff *skb, struct flow_keys *flkeys)
  1744. {
  1745. u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
  1746. struct flow_keys hash_keys;
  1747. u32 mhash = 0;
  1748. switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
  1749. case 0:
  1750. memset(&hash_keys, 0, sizeof(hash_keys));
  1751. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1752. if (skb) {
  1753. ip_multipath_l3_keys(skb, &hash_keys);
  1754. } else {
  1755. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1756. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1757. }
  1758. mhash = fib_multipath_hash_from_keys(net, &hash_keys);
  1759. break;
  1760. case 1:
  1761. /* skb is currently provided only when forwarding */
  1762. if (skb) {
  1763. unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
  1764. struct flow_keys keys;
  1765. /* short-circuit if we already have L4 hash present */
  1766. if (skb->l4_hash)
  1767. return skb_get_hash_raw(skb) >> 1;
  1768. memset(&hash_keys, 0, sizeof(hash_keys));
  1769. if (!flkeys) {
  1770. skb_flow_dissect_flow_keys(skb, &keys, flag);
  1771. flkeys = &keys;
  1772. }
  1773. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1774. hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
  1775. hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
  1776. hash_keys.ports.src = flkeys->ports.src;
  1777. hash_keys.ports.dst = flkeys->ports.dst;
  1778. hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
  1779. } else {
  1780. memset(&hash_keys, 0, sizeof(hash_keys));
  1781. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1782. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1783. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1784. hash_keys.ports.src = fl4->fl4_sport;
  1785. hash_keys.ports.dst = fl4->fl4_dport;
  1786. hash_keys.basic.ip_proto = fl4->flowi4_proto;
  1787. }
  1788. mhash = fib_multipath_hash_from_keys(net, &hash_keys);
  1789. break;
  1790. case 2:
  1791. memset(&hash_keys, 0, sizeof(hash_keys));
  1792. /* skb is currently provided only when forwarding */
  1793. if (skb) {
  1794. struct flow_keys keys;
  1795. skb_flow_dissect_flow_keys(skb, &keys, 0);
  1796. /* Inner can be v4 or v6 */
  1797. if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
  1798. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1799. hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
  1800. hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
  1801. } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
  1802. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
  1803. hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
  1804. hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
  1805. hash_keys.tags.flow_label = keys.tags.flow_label;
  1806. hash_keys.basic.ip_proto = keys.basic.ip_proto;
  1807. } else {
  1808. /* Same as case 0 */
  1809. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1810. ip_multipath_l3_keys(skb, &hash_keys);
  1811. }
  1812. } else {
  1813. /* Same as case 0 */
  1814. hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
  1815. hash_keys.addrs.v4addrs.src = fl4->saddr;
  1816. hash_keys.addrs.v4addrs.dst = fl4->daddr;
  1817. }
  1818. mhash = fib_multipath_hash_from_keys(net, &hash_keys);
  1819. break;
  1820. case 3:
  1821. if (skb)
  1822. mhash = fib_multipath_custom_hash_skb(net, skb);
  1823. else
  1824. mhash = fib_multipath_custom_hash_fl4(net, fl4);
  1825. break;
  1826. }
  1827. if (multipath_hash)
  1828. mhash = jhash_2words(mhash, multipath_hash, 0);
  1829. return mhash >> 1;
  1830. }
  1831. #endif /* CONFIG_IP_ROUTE_MULTIPATH */
  1832. static int ip_mkroute_input(struct sk_buff *skb,
  1833. struct fib_result *res,
  1834. struct in_device *in_dev,
  1835. __be32 daddr, __be32 saddr, u32 tos,
  1836. struct flow_keys *hkeys)
  1837. {
  1838. #ifdef CONFIG_IP_ROUTE_MULTIPATH
  1839. if (res->fi && fib_info_num_path(res->fi) > 1) {
  1840. int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
  1841. fib_select_multipath(res, h);
  1842. IPCB(skb)->flags |= IPSKB_MULTIPATH;
  1843. }
  1844. #endif
  1845. /* create a routing cache entry */
  1846. return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
  1847. }
  1848. /* Implements all the saddr-related checks as ip_route_input_slow(),
  1849. * assuming daddr is valid and the destination is not a local broadcast one.
  1850. * Uses the provided hint instead of performing a route lookup.
  1851. */
  1852. int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1853. u8 tos, struct net_device *dev,
  1854. const struct sk_buff *hint)
  1855. {
  1856. struct in_device *in_dev = __in_dev_get_rcu(dev);
  1857. struct rtable *rt = skb_rtable(hint);
  1858. struct net *net = dev_net(dev);
  1859. int err = -EINVAL;
  1860. u32 tag = 0;
  1861. if (!in_dev)
  1862. return -EINVAL;
  1863. if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
  1864. goto martian_source;
  1865. if (ipv4_is_zeronet(saddr))
  1866. goto martian_source;
  1867. if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
  1868. goto martian_source;
  1869. if (rt->rt_type != RTN_LOCAL)
  1870. goto skip_validate_source;
  1871. tos &= INET_DSCP_MASK;
  1872. err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
  1873. if (err < 0)
  1874. goto martian_source;
  1875. skip_validate_source:
  1876. skb_dst_copy(skb, hint);
  1877. return 0;
  1878. martian_source:
  1879. ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
  1880. return err;
  1881. }
/* get device for dst_alloc with local routes */
static struct net_device *ip_rt_get_dev(struct net *net,
                                         const struct fib_result *res)
{
        struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
        struct net_device *dev = NULL;

        if (nhc)
                dev = l3mdev_master_dev_rcu(nhc->nhc_dev);

        return dev ? : net->loopback_dev;
}
/*
 * NOTE. We drop all packets that have a local source address, because
 * every properly looped-back packet must already have the correct
 * destination attached by the output routine.  Changes in the enforced
 * policies must also be applied to ip_route_use_hint().
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 * called with rcu_read_lock()
 */
  1904. static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  1905. u8 tos, struct net_device *dev,
  1906. struct fib_result *res)
  1907. {
  1908. struct in_device *in_dev = __in_dev_get_rcu(dev);
  1909. struct flow_keys *flkeys = NULL, _flkeys;
  1910. struct net *net = dev_net(dev);
  1911. struct ip_tunnel_info *tun_info;
  1912. int err = -EINVAL;
  1913. unsigned int flags = 0;
  1914. u32 itag = 0;
  1915. struct rtable *rth;
  1916. struct flowi4 fl4;
  1917. bool do_cache = true;
  1918. /* IP on this device is disabled. */
  1919. if (!in_dev)
  1920. goto out;
  1921. /* Check for the most weird martians, which can be not detected
  1922. * by fib_lookup.
  1923. */
  1924. tun_info = skb_tunnel_info(skb);
  1925. if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
  1926. fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
  1927. else
  1928. fl4.flowi4_tun_key.tun_id = 0;
  1929. skb_dst_drop(skb);
  1930. if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
  1931. goto martian_source;
  1932. res->fi = NULL;
  1933. res->table = NULL;
  1934. if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
  1935. goto brd_input;
        /* Accept zero addresses only for limited broadcast;
         * I do not even know whether to fix it or not. Waiting for complaints :-)
         */
  1939. if (ipv4_is_zeronet(saddr))
  1940. goto martian_source;
  1941. if (ipv4_is_zeronet(daddr))
  1942. goto martian_destination;
        /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
         * more than once, when daddr and/or saddr are loopback addresses.
         */
  1946. if (ipv4_is_loopback(daddr)) {
  1947. if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
  1948. goto martian_destination;
  1949. } else if (ipv4_is_loopback(saddr)) {
  1950. if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
  1951. goto martian_source;
  1952. }
  1953. /*
  1954. * Now we are ready to route packet.
  1955. */
  1956. fl4.flowi4_l3mdev = 0;
  1957. fl4.flowi4_oif = 0;
  1958. fl4.flowi4_iif = dev->ifindex;
  1959. fl4.flowi4_mark = skb->mark;
  1960. fl4.flowi4_tos = tos;
  1961. fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
  1962. fl4.flowi4_flags = 0;
  1963. fl4.daddr = daddr;
  1964. fl4.saddr = saddr;
  1965. fl4.flowi4_uid = sock_net_uid(net, NULL);
  1966. fl4.flowi4_multipath_hash = 0;
  1967. if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
  1968. flkeys = &_flkeys;
  1969. } else {
  1970. fl4.flowi4_proto = 0;
  1971. fl4.fl4_sport = 0;
  1972. fl4.fl4_dport = 0;
  1973. }
  1974. err = fib_lookup(net, &fl4, res, 0);
  1975. if (err != 0) {
  1976. if (!IN_DEV_FORWARD(in_dev))
  1977. err = -EHOSTUNREACH;
  1978. goto no_route;
  1979. }
  1980. if (res->type == RTN_BROADCAST) {
  1981. if (IN_DEV_BFORWARD(in_dev))
  1982. goto make_route;
                /* do not cache if bc_forwarding is enabled */
  1984. if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
  1985. do_cache = false;
  1986. goto brd_input;
  1987. }
  1988. if (res->type == RTN_LOCAL) {
  1989. err = fib_validate_source(skb, saddr, daddr, tos,
  1990. 0, dev, in_dev, &itag);
  1991. if (err < 0)
  1992. goto martian_source;
  1993. goto local_input;
  1994. }
  1995. if (!IN_DEV_FORWARD(in_dev)) {
  1996. err = -EHOSTUNREACH;
  1997. goto no_route;
  1998. }
  1999. if (res->type != RTN_UNICAST)
  2000. goto martian_destination;
  2001. make_route:
  2002. err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
  2003. out: return err;
  2004. brd_input:
  2005. if (skb->protocol != htons(ETH_P_IP))
  2006. goto e_inval;
  2007. if (!ipv4_is_zeronet(saddr)) {
  2008. err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
  2009. in_dev, &itag);
  2010. if (err < 0)
  2011. goto martian_source;
  2012. }
  2013. flags |= RTCF_BROADCAST;
  2014. res->type = RTN_BROADCAST;
  2015. RT_CACHE_STAT_INC(in_brd);
  2016. local_input:
  2017. if (IN_DEV_ORCONF(in_dev, NOPOLICY))
  2018. IPCB(skb)->flags |= IPSKB_NOPOLICY;
  2019. do_cache &= res->fi && !itag;
  2020. if (do_cache) {
  2021. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  2022. rth = rcu_dereference(nhc->nhc_rth_input);
  2023. if (rt_cache_valid(rth)) {
  2024. skb_dst_set_noref(skb, &rth->dst);
  2025. err = 0;
  2026. goto out;
  2027. }
  2028. }
  2029. rth = rt_dst_alloc(ip_rt_get_dev(net, res),
  2030. flags | RTCF_LOCAL, res->type, false);
  2031. if (!rth)
  2032. goto e_nobufs;
  2033. rth->dst.output= ip_rt_bug;
  2034. #ifdef CONFIG_IP_ROUTE_CLASSID
  2035. rth->dst.tclassid = itag;
  2036. #endif
  2037. rth->rt_is_input = 1;
  2038. RT_CACHE_STAT_INC(in_slow_tot);
  2039. if (res->type == RTN_UNREACHABLE) {
  2040. rth->dst.input= ip_error;
  2041. rth->dst.error= -err;
  2042. rth->rt_flags &= ~RTCF_LOCAL;
  2043. }
  2044. if (do_cache) {
  2045. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  2046. rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
  2047. if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
  2048. WARN_ON(rth->dst.input == lwtunnel_input);
  2049. rth->dst.lwtstate->orig_input = rth->dst.input;
  2050. rth->dst.input = lwtunnel_input;
  2051. }
  2052. if (unlikely(!rt_cache_route(nhc, rth)))
  2053. rt_add_uncached_list(rth);
  2054. }
  2055. skb_dst_set(skb, &rth->dst);
  2056. err = 0;
  2057. goto out;
  2058. no_route:
  2059. RT_CACHE_STAT_INC(in_no_route);
  2060. res->type = RTN_UNREACHABLE;
  2061. res->fi = NULL;
  2062. res->table = NULL;
  2063. goto local_input;
  2064. /*
  2065. * Do not cache martian addresses: they should be logged (RFC1812)
  2066. */
  2067. martian_destination:
  2068. RT_CACHE_STAT_INC(in_martian_dst);
  2069. #ifdef CONFIG_IP_ROUTE_VERBOSE
  2070. if (IN_DEV_LOG_MARTIANS(in_dev))
  2071. net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
  2072. &daddr, &saddr, dev->name);
  2073. #endif
  2074. e_inval:
  2075. err = -EINVAL;
  2076. goto out;
  2077. e_nobufs:
  2078. err = -ENOBUFS;
  2079. goto out;
  2080. martian_source:
  2081. ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
  2082. goto out;
  2083. }
  2084. /* called with rcu_read_lock held */
  2085. static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  2086. u8 tos, struct net_device *dev, struct fib_result *res)
  2087. {
        /* Multicast recognition logic was moved from the route cache to here.
         * The problem was that too many Ethernet cards have broken/missing
         * hardware multicast filters :-( As a result, a host on a multicast
         * network acquires a lot of useless route cache entries, e.g. from
         * SDR messages from all over the world.  Now we try to get rid of
         * them.  Really, provided the software IP multicast filter is
         * organized reasonably (at least, hashed), this does not result in
         * a slowdown compared with route cache reject entries.
         * Note that multicast routers are not affected, because a route
         * cache entry is created eventually.
         */
  2099. if (ipv4_is_multicast(daddr)) {
  2100. struct in_device *in_dev = __in_dev_get_rcu(dev);
  2101. int our = 0;
  2102. int err = -EINVAL;
  2103. if (!in_dev)
  2104. return err;
  2105. our = ip_check_mc_rcu(in_dev, daddr, saddr,
  2106. ip_hdr(skb)->protocol);
  2107. /* check l3 master if no match yet */
  2108. if (!our && netif_is_l3_slave(dev)) {
  2109. struct in_device *l3_in_dev;
  2110. l3_in_dev = __in_dev_get_rcu(skb->dev);
  2111. if (l3_in_dev)
  2112. our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
  2113. ip_hdr(skb)->protocol);
  2114. }
  2115. if (our
  2116. #ifdef CONFIG_IP_MROUTE
  2117. ||
  2118. (!ipv4_is_local_multicast(daddr) &&
  2119. IN_DEV_MFORWARD(in_dev))
  2120. #endif
  2121. ) {
  2122. err = ip_route_input_mc(skb, daddr, saddr,
  2123. tos, dev, our);
  2124. }
  2125. return err;
  2126. }
  2127. return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
  2128. }
  2129. int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
  2130. u8 tos, struct net_device *dev)
  2131. {
  2132. struct fib_result res;
  2133. int err;
  2134. tos &= INET_DSCP_MASK;
  2135. rcu_read_lock();
  2136. err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
  2137. rcu_read_unlock();
  2138. return err;
  2139. }
  2140. EXPORT_SYMBOL(ip_route_input_noref);
  2141. /* called with rcu_read_lock() */
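/* Create (or reuse from the nexthop cache) the output route described by
 * the FIB result: classify broadcast/multicast/local destinations, decide
 * whether the result may be cached, and hand the nexthop details over to
 * rt_set_nexthop().
 */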
  2142. static struct rtable *__mkroute_output(const struct fib_result *res,
  2143. const struct flowi4 *fl4, int orig_oif,
  2144. struct net_device *dev_out,
  2145. unsigned int flags)
  2146. {
  2147. struct fib_info *fi = res->fi;
  2148. struct fib_nh_exception *fnhe;
  2149. struct in_device *in_dev;
  2150. u16 type = res->type;
  2151. struct rtable *rth;
  2152. bool do_cache;
  2153. in_dev = __in_dev_get_rcu(dev_out);
  2154. if (!in_dev)
  2155. return ERR_PTR(-EINVAL);
  2156. if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
  2157. if (ipv4_is_loopback(fl4->saddr) &&
  2158. !(dev_out->flags & IFF_LOOPBACK) &&
  2159. !netif_is_l3_master(dev_out))
  2160. return ERR_PTR(-EINVAL);
  2161. if (ipv4_is_lbcast(fl4->daddr)) {
  2162. type = RTN_BROADCAST;
  2163. /* reset fi to prevent gateway resolution */
  2164. fi = NULL;
  2165. } else if (ipv4_is_multicast(fl4->daddr)) {
  2166. type = RTN_MULTICAST;
  2167. } else if (ipv4_is_zeronet(fl4->daddr)) {
  2168. return ERR_PTR(-EINVAL);
  2169. }
  2170. if (dev_out->flags & IFF_LOOPBACK)
  2171. flags |= RTCF_LOCAL;
  2172. do_cache = true;
  2173. if (type == RTN_BROADCAST) {
  2174. flags |= RTCF_BROADCAST | RTCF_LOCAL;
  2175. } else if (type == RTN_MULTICAST) {
  2176. flags |= RTCF_MULTICAST | RTCF_LOCAL;
  2177. if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
  2178. fl4->flowi4_proto))
  2179. flags &= ~RTCF_LOCAL;
  2180. else
  2181. do_cache = false;
                /* If a multicast route does not exist, use the default
                 * one, but do not use a gateway in this case.
                 * Yes, it is a hack.
                 */
  2186. if (fi && res->prefixlen < 4)
  2187. fi = NULL;
  2188. } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
  2189. (orig_oif != dev_out->ifindex)) {
  2190. /* For local routes that require a particular output interface
  2191. * we do not want to cache the result. Caching the result
  2192. * causes incorrect behaviour when there are multiple source
  2193. * addresses on the interface, the end result being that if the
  2194. * intended recipient is waiting on that interface for the
  2195. * packet he won't receive it because it will be delivered on
  2196. * the loopback interface and the IP_PKTINFO ipi_ifindex will
  2197. * be set to the loopback interface as well.
  2198. */
  2199. do_cache = false;
  2200. }
  2201. fnhe = NULL;
  2202. do_cache &= fi != NULL;
  2203. if (fi) {
  2204. struct fib_nh_common *nhc = FIB_RES_NHC(*res);
  2205. struct rtable __rcu **prth;
  2206. fnhe = find_exception(nhc, fl4->daddr);
  2207. if (!do_cache)
  2208. goto add;
  2209. if (fnhe) {
  2210. prth = &fnhe->fnhe_rth_output;
  2211. } else {
  2212. if (unlikely(fl4->flowi4_flags &
  2213. FLOWI_FLAG_KNOWN_NH &&
  2214. !(nhc->nhc_gw_family &&
  2215. nhc->nhc_scope == RT_SCOPE_LINK))) {
  2216. do_cache = false;
  2217. goto add;
  2218. }
  2219. prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
  2220. }
  2221. rth = rcu_dereference(*prth);
  2222. if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
  2223. return rth;
  2224. }
  2225. add:
  2226. rth = rt_dst_alloc(dev_out, flags, type,
  2227. IN_DEV_ORCONF(in_dev, NOXFRM));
  2228. if (!rth)
  2229. return ERR_PTR(-ENOBUFS);
  2230. rth->rt_iif = orig_oif;
  2231. RT_CACHE_STAT_INC(out_slow_tot);
  2232. if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
  2233. if (flags & RTCF_LOCAL &&
  2234. !(dev_out->flags & IFF_LOOPBACK)) {
  2235. rth->dst.output = ip_mc_output;
  2236. RT_CACHE_STAT_INC(out_slow_mc);
  2237. }
  2238. #ifdef CONFIG_IP_MROUTE
  2239. if (type == RTN_MULTICAST) {
  2240. if (IN_DEV_MFORWARD(in_dev) &&
  2241. !ipv4_is_local_multicast(fl4->daddr)) {
  2242. rth->dst.input = ip_mr_input;
  2243. rth->dst.output = ip_mc_output;
  2244. }
  2245. }
  2246. #endif
  2247. }
  2248. rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
  2249. lwtunnel_set_redirect(&rth->dst);
  2250. return rth;
  2251. }
  2252. /*
  2253. * Major route resolver routine.
  2254. */
  2255. struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
  2256. const struct sk_buff *skb)
  2257. {
  2258. struct fib_result res = {
  2259. .type = RTN_UNSPEC,
  2260. .fi = NULL,
  2261. .table = NULL,
  2262. .tclassid = 0,
  2263. };
  2264. struct rtable *rth;
  2265. fl4->flowi4_iif = LOOPBACK_IFINDEX;
  2266. fl4->flowi4_tos &= INET_DSCP_MASK;
  2267. rcu_read_lock();
  2268. rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
  2269. rcu_read_unlock();
  2270. return rth;
  2271. }
  2272. EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
  2273. struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
  2274. struct fib_result *res,
  2275. const struct sk_buff *skb)
  2276. {
  2277. struct net_device *dev_out = NULL;
  2278. int orig_oif = fl4->flowi4_oif;
  2279. unsigned int flags = 0;
  2280. struct rtable *rth;
  2281. int err;
  2282. if (fl4->saddr) {
  2283. if (ipv4_is_multicast(fl4->saddr) ||
  2284. ipv4_is_lbcast(fl4->saddr) ||
  2285. ipv4_is_zeronet(fl4->saddr)) {
  2286. rth = ERR_PTR(-EINVAL);
  2287. goto out;
  2288. }
  2289. rth = ERR_PTR(-ENETUNREACH);
  2290. /* I removed check for oif == dev_out->oif here.
  2291. * It was wrong for two reasons:
  2292. * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
  2293. * is assigned to multiple interfaces.
  2294. * 2. Moreover, we are allowed to send packets with saddr
  2295. * of another iface. --ANK
  2296. */
  2297. if (fl4->flowi4_oif == 0 &&
  2298. (ipv4_is_multicast(fl4->daddr) ||
  2299. ipv4_is_lbcast(fl4->daddr))) {
  2300. /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
  2301. dev_out = __ip_dev_find(net, fl4->saddr, false);
  2302. if (!dev_out)
  2303. goto out;
  2304. /* Special hack: user can direct multicasts
  2305. * and limited broadcast via necessary interface
  2306. * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
  2307. * This hack is not just for fun, it allows
  2308. * vic,vat and friends to work.
  2309. * They bind socket to loopback, set ttl to zero
  2310. * and expect that it will work.
  2311. * From the viewpoint of routing cache they are broken,
  2312. * because we are not allowed to build multicast path
  2313. * with loopback source addr (look, routing cache
  2314. * cannot know, that ttl is zero, so that packet
  2315. * will not leave this host and route is valid).
  2316. * Luckily, this hack is good workaround.
  2317. */
  2318. fl4->flowi4_oif = dev_out->ifindex;
  2319. goto make_route;
  2320. }
  2321. if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
  2322. /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
  2323. if (!__ip_dev_find(net, fl4->saddr, false))
  2324. goto out;
  2325. }
  2326. }
  2327. if (fl4->flowi4_oif) {
  2328. dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
  2329. rth = ERR_PTR(-ENODEV);
  2330. if (!dev_out)
  2331. goto out;
  2332. /* RACE: Check return value of inet_select_addr instead. */
  2333. if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
  2334. rth = ERR_PTR(-ENETUNREACH);
  2335. goto out;
  2336. }
  2337. if (ipv4_is_local_multicast(fl4->daddr) ||
  2338. ipv4_is_lbcast(fl4->daddr) ||
  2339. fl4->flowi4_proto == IPPROTO_IGMP) {
  2340. if (!fl4->saddr)
  2341. fl4->saddr = inet_select_addr(dev_out, 0,
  2342. RT_SCOPE_LINK);
  2343. goto make_route;
  2344. }
  2345. if (!fl4->saddr) {
  2346. if (ipv4_is_multicast(fl4->daddr))
  2347. fl4->saddr = inet_select_addr(dev_out, 0,
  2348. fl4->flowi4_scope);
  2349. else if (!fl4->daddr)
  2350. fl4->saddr = inet_select_addr(dev_out, 0,
  2351. RT_SCOPE_HOST);
  2352. }
  2353. }
  2354. if (!fl4->daddr) {
  2355. fl4->daddr = fl4->saddr;
  2356. if (!fl4->daddr)
  2357. fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
  2358. dev_out = net->loopback_dev;
  2359. fl4->flowi4_oif = LOOPBACK_IFINDEX;
  2360. res->type = RTN_LOCAL;
  2361. flags |= RTCF_LOCAL;
  2362. goto make_route;
  2363. }
  2364. err = fib_lookup(net, fl4, res, 0);
  2365. if (err) {
  2366. res->fi = NULL;
  2367. res->table = NULL;
  2368. if (fl4->flowi4_oif &&
  2369. (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
  2370. /* Apparently, routing tables are wrong. Assume,
  2371. * that the destination is on link.
  2372. *
  2373. * WHY? DW.
  2374. * Because we are allowed to send to iface
  2375. * even if it has NO routes and NO assigned
  2376. * addresses. When oif is specified, routing
  2377. * tables are looked up with only one purpose:
  2378. * to catch if destination is gatewayed, rather than
  2379. * direct. Moreover, if MSG_DONTROUTE is set,
  2380. * we send packet, ignoring both routing tables
  2381. * and ifaddr state. --ANK
  2382. *
  2383. *
  2384. * We could make it even if oif is unknown,
  2385. * likely IPv6, but we do not.
  2386. */
  2387. if (fl4->saddr == 0)
  2388. fl4->saddr = inet_select_addr(dev_out, 0,
  2389. RT_SCOPE_LINK);
  2390. res->type = RTN_UNICAST;
  2391. goto make_route;
  2392. }
  2393. rth = ERR_PTR(err);
  2394. goto out;
  2395. }
  2396. if (res->type == RTN_LOCAL) {
  2397. if (!fl4->saddr) {
  2398. if (res->fi->fib_prefsrc)
  2399. fl4->saddr = res->fi->fib_prefsrc;
  2400. else
  2401. fl4->saddr = fl4->daddr;
  2402. }
  2403. /* L3 master device is the loopback for that domain */
  2404. dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
  2405. net->loopback_dev;
  2406. /* make sure orig_oif points to fib result device even
  2407. * though packet rx/tx happens over loopback or l3mdev
  2408. */
  2409. orig_oif = FIB_RES_OIF(*res);
  2410. fl4->flowi4_oif = dev_out->ifindex;
  2411. flags |= RTCF_LOCAL;
  2412. goto make_route;
  2413. }
  2414. fib_select_path(net, res, fl4, skb);
  2415. dev_out = FIB_RES_DEV(*res);
  2416. make_route:
  2417. rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
  2418. out:
  2419. return rth;
  2420. }
static struct dst_ops ipv4_dst_blackhole_ops = {
        .family         = AF_INET,
        .default_advmss = ipv4_default_advmss,
        .neigh_lookup   = ipv4_neigh_lookup,
        .check          = dst_blackhole_check,
        .cow_metrics    = dst_blackhole_cow_metrics,
        .update_pmtu    = dst_blackhole_update_pmtu,
        .redirect       = dst_blackhole_redirect,
        .mtu            = dst_blackhole_mtu,
};
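/* Clone a route into a blackhole dst: the copy keeps the original's
 * routing parameters, but its input/output handlers simply discard
 * packets, and the reference on the original dst is released.
 */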
  2431. struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
  2432. {
  2433. struct rtable *ort = dst_rtable(dst_orig);
  2434. struct rtable *rt;
  2435. rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
  2436. if (rt) {
  2437. struct dst_entry *new = &rt->dst;
  2438. new->__use = 1;
  2439. new->input = dst_discard;
  2440. new->output = dst_discard_out;
  2441. new->dev = net->loopback_dev;
  2442. netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
  2443. rt->rt_is_input = ort->rt_is_input;
  2444. rt->rt_iif = ort->rt_iif;
  2445. rt->rt_pmtu = ort->rt_pmtu;
  2446. rt->rt_mtu_locked = ort->rt_mtu_locked;
  2447. rt->rt_genid = rt_genid_ipv4(net);
  2448. rt->rt_flags = ort->rt_flags;
  2449. rt->rt_type = ort->rt_type;
  2450. rt->rt_uses_gateway = ort->rt_uses_gateway;
  2451. rt->rt_gw_family = ort->rt_gw_family;
  2452. if (rt->rt_gw_family == AF_INET)
  2453. rt->rt_gw4 = ort->rt_gw4;
  2454. else if (rt->rt_gw_family == AF_INET6)
  2455. rt->rt_gw6 = ort->rt_gw6;
  2456. }
  2457. dst_release(dst_orig);
  2458. return rt ? &rt->dst : ERR_PTR(-ENOMEM);
  2459. }
  2460. struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
  2461. const struct sock *sk)
  2462. {
  2463. struct rtable *rt = __ip_route_output_key(net, flp4);
  2464. if (IS_ERR(rt))
  2465. return rt;
  2466. if (flp4->flowi4_proto) {
  2467. flp4->flowi4_oif = rt->dst.dev->ifindex;
  2468. rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
  2469. flowi4_to_flowi(flp4),
  2470. sk, 0));
  2471. }
  2472. return rt;
  2473. }
  2474. EXPORT_SYMBOL_GPL(ip_route_output_flow);
  2475. /* called with rcu_read_lock held */
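/* Translate a struct rtable into an RTM_NEWROUTE netlink message:
 * destination, gateway, metrics, cache info and (optionally) the flow
 * parameters the route was looked up with.
 */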
  2476. static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
  2477. struct rtable *rt, u32 table_id, dscp_t dscp,
  2478. struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
  2479. u32 seq, unsigned int flags)
  2480. {
  2481. struct rtmsg *r;
  2482. struct nlmsghdr *nlh;
  2483. unsigned long expires = 0;
  2484. u32 error;
  2485. u32 metrics[RTAX_MAX];
  2486. nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
  2487. if (!nlh)
  2488. return -EMSGSIZE;
  2489. r = nlmsg_data(nlh);
  2490. r->rtm_family = AF_INET;
  2491. r->rtm_dst_len = 32;
  2492. r->rtm_src_len = 0;
  2493. r->rtm_tos = inet_dscp_to_dsfield(dscp);
  2494. r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
  2495. if (nla_put_u32(skb, RTA_TABLE, table_id))
  2496. goto nla_put_failure;
  2497. r->rtm_type = rt->rt_type;
  2498. r->rtm_scope = RT_SCOPE_UNIVERSE;
  2499. r->rtm_protocol = RTPROT_UNSPEC;
  2500. r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
  2501. if (rt->rt_flags & RTCF_NOTIFY)
  2502. r->rtm_flags |= RTM_F_NOTIFY;
  2503. if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
  2504. r->rtm_flags |= RTCF_DOREDIRECT;
  2505. if (nla_put_in_addr(skb, RTA_DST, dst))
  2506. goto nla_put_failure;
  2507. if (src) {
  2508. r->rtm_src_len = 32;
  2509. if (nla_put_in_addr(skb, RTA_SRC, src))
  2510. goto nla_put_failure;
  2511. }
  2512. if (rt->dst.dev &&
  2513. nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
  2514. goto nla_put_failure;
  2515. if (rt->dst.lwtstate &&
  2516. lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
  2517. goto nla_put_failure;
  2518. #ifdef CONFIG_IP_ROUTE_CLASSID
  2519. if (rt->dst.tclassid &&
  2520. nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
  2521. goto nla_put_failure;
  2522. #endif
  2523. if (fl4 && !rt_is_input_route(rt) &&
  2524. fl4->saddr != src) {
  2525. if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
  2526. goto nla_put_failure;
  2527. }
  2528. if (rt->rt_uses_gateway) {
  2529. if (rt->rt_gw_family == AF_INET &&
  2530. nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
  2531. goto nla_put_failure;
  2532. } else if (rt->rt_gw_family == AF_INET6) {
  2533. int alen = sizeof(struct in6_addr);
  2534. struct nlattr *nla;
  2535. struct rtvia *via;
  2536. nla = nla_reserve(skb, RTA_VIA, alen + 2);
  2537. if (!nla)
  2538. goto nla_put_failure;
  2539. via = nla_data(nla);
  2540. via->rtvia_family = AF_INET6;
  2541. memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
  2542. }
  2543. }
  2544. expires = rt->dst.expires;
  2545. if (expires) {
  2546. unsigned long now = jiffies;
  2547. if (time_before(now, expires))
  2548. expires -= now;
  2549. else
  2550. expires = 0;
  2551. }
  2552. memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
  2553. if (rt->rt_pmtu && expires)
  2554. metrics[RTAX_MTU - 1] = rt->rt_pmtu;
  2555. if (rt->rt_mtu_locked && expires)
  2556. metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
  2557. if (rtnetlink_put_metrics(skb, metrics) < 0)
  2558. goto nla_put_failure;
  2559. if (fl4) {
  2560. if (fl4->flowi4_mark &&
  2561. nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
  2562. goto nla_put_failure;
  2563. if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
  2564. nla_put_u32(skb, RTA_UID,
  2565. from_kuid_munged(current_user_ns(),
  2566. fl4->flowi4_uid)))
  2567. goto nla_put_failure;
  2568. if (rt_is_input_route(rt)) {
  2569. #ifdef CONFIG_IP_MROUTE
  2570. if (ipv4_is_multicast(dst) &&
  2571. !ipv4_is_local_multicast(dst) &&
  2572. IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
  2573. int err = ipmr_get_route(net, skb,
  2574. fl4->saddr, fl4->daddr,
  2575. r, portid);
  2576. if (err <= 0) {
  2577. if (err == 0)
  2578. return 0;
  2579. goto nla_put_failure;
  2580. }
  2581. } else
  2582. #endif
  2583. if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
  2584. goto nla_put_failure;
  2585. }
  2586. }
  2587. error = rt->dst.error;
  2588. if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
  2589. goto nla_put_failure;
  2590. nlmsg_end(skb, nlh);
  2591. return 0;
  2592. nla_put_failure:
  2593. nlmsg_cancel(skb, nlh);
  2594. return -EMSGSIZE;
  2595. }
  2596. static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
  2597. struct netlink_callback *cb, u32 table_id,
  2598. struct fnhe_hash_bucket *bucket, int genid,
  2599. int *fa_index, int fa_start, unsigned int flags)
  2600. {
  2601. int i;
  2602. for (i = 0; i < FNHE_HASH_SIZE; i++) {
  2603. struct fib_nh_exception *fnhe;
  2604. for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
  2605. fnhe = rcu_dereference(fnhe->fnhe_next)) {
  2606. struct rtable *rt;
  2607. int err;
  2608. if (*fa_index < fa_start)
  2609. goto next;
  2610. if (fnhe->fnhe_genid != genid)
  2611. goto next;
  2612. if (fnhe->fnhe_expires &&
  2613. time_after(jiffies, fnhe->fnhe_expires))
  2614. goto next;
  2615. rt = rcu_dereference(fnhe->fnhe_rth_input);
  2616. if (!rt)
  2617. rt = rcu_dereference(fnhe->fnhe_rth_output);
  2618. if (!rt)
  2619. goto next;
  2620. err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
  2621. table_id, 0, NULL, skb,
  2622. NETLINK_CB(cb->skb).portid,
  2623. cb->nlh->nlmsg_seq, flags);
  2624. if (err)
  2625. return err;
  2626. next:
  2627. (*fa_index)++;
  2628. }
  2629. }
  2630. return 0;
  2631. }
  2632. int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
  2633. u32 table_id, struct fib_info *fi,
  2634. int *fa_index, int fa_start, unsigned int flags)
  2635. {
  2636. struct net *net = sock_net(cb->skb->sk);
  2637. int nhsel, genid = fnhe_genid(net);
  2638. for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
  2639. struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
  2640. struct fnhe_hash_bucket *bucket;
  2641. int err;
  2642. if (nhc->nhc_flags & RTNH_F_DEAD)
  2643. continue;
  2644. rcu_read_lock();
  2645. bucket = rcu_dereference(nhc->nhc_exceptions);
  2646. err = 0;
  2647. if (bucket)
  2648. err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
  2649. genid, fa_index, fa_start,
  2650. flags);
  2651. rcu_read_unlock();
  2652. if (err)
  2653. return err;
  2654. }
  2655. return 0;
  2656. }
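/* Build a dummy skb carrying minimal IP (and UDP/TCP/ICMP) headers so
 * that RTM_GETROUTE requests can be run through the real input path.
 */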
  2657. static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
  2658. u8 ip_proto, __be16 sport,
  2659. __be16 dport)
  2660. {
  2661. struct sk_buff *skb;
  2662. struct iphdr *iph;
  2663. skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
  2664. if (!skb)
  2665. return NULL;
        /* Reserve room for dummy headers; this skb can pass
         * through a good chunk of the routing engine.
         */
  2669. skb_reset_mac_header(skb);
  2670. skb_reset_network_header(skb);
  2671. skb->protocol = htons(ETH_P_IP);
  2672. iph = skb_put(skb, sizeof(struct iphdr));
  2673. iph->protocol = ip_proto;
  2674. iph->saddr = src;
  2675. iph->daddr = dst;
  2676. iph->version = 0x4;
  2677. iph->frag_off = 0;
  2678. iph->ihl = 0x5;
  2679. skb_set_transport_header(skb, skb->len);
  2680. switch (iph->protocol) {
  2681. case IPPROTO_UDP: {
  2682. struct udphdr *udph;
  2683. udph = skb_put_zero(skb, sizeof(struct udphdr));
  2684. udph->source = sport;
  2685. udph->dest = dport;
  2686. udph->len = htons(sizeof(struct udphdr));
  2687. udph->check = 0;
  2688. break;
  2689. }
  2690. case IPPROTO_TCP: {
  2691. struct tcphdr *tcph;
  2692. tcph = skb_put_zero(skb, sizeof(struct tcphdr));
  2693. tcph->source = sport;
  2694. tcph->dest = dport;
  2695. tcph->doff = sizeof(struct tcphdr) / 4;
  2696. tcph->rst = 1;
  2697. tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
  2698. src, dst, 0);
  2699. break;
  2700. }
  2701. case IPPROTO_ICMP: {
  2702. struct icmphdr *icmph;
  2703. icmph = skb_put_zero(skb, sizeof(struct icmphdr));
  2704. icmph->type = ICMP_ECHO;
  2705. icmph->code = 0;
  2706. }
  2707. }
  2708. return skb;
  2709. }
  2710. static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
  2711. const struct nlmsghdr *nlh,
  2712. struct nlattr **tb,
  2713. struct netlink_ext_ack *extack)
  2714. {
  2715. struct rtmsg *rtm;
  2716. int i, err;
  2717. if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
  2718. NL_SET_ERR_MSG(extack,
  2719. "ipv4: Invalid header for route get request");
  2720. return -EINVAL;
  2721. }
  2722. if (!netlink_strict_get_check(skb))
  2723. return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
  2724. rtm_ipv4_policy, extack);
  2725. rtm = nlmsg_data(nlh);
  2726. if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
  2727. (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
  2728. rtm->rtm_table || rtm->rtm_protocol ||
  2729. rtm->rtm_scope || rtm->rtm_type) {
  2730. NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
  2731. return -EINVAL;
  2732. }
  2733. if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
  2734. RTM_F_LOOKUP_TABLE |
  2735. RTM_F_FIB_MATCH)) {
  2736. NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
  2737. return -EINVAL;
  2738. }
  2739. err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
  2740. rtm_ipv4_policy, extack);
  2741. if (err)
  2742. return err;
  2743. if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
  2744. (tb[RTA_DST] && !rtm->rtm_dst_len)) {
  2745. NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
  2746. return -EINVAL;
  2747. }
  2748. for (i = 0; i <= RTA_MAX; i++) {
  2749. if (!tb[i])
  2750. continue;
  2751. switch (i) {
  2752. case RTA_IIF:
  2753. case RTA_OIF:
  2754. case RTA_SRC:
  2755. case RTA_DST:
  2756. case RTA_IP_PROTO:
  2757. case RTA_SPORT:
  2758. case RTA_DPORT:
  2759. case RTA_MARK:
  2760. case RTA_UID:
  2761. break;
  2762. default:
  2763. NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
  2764. return -EINVAL;
  2765. }
  2766. }
  2767. return 0;
  2768. }
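/* RTM_GETROUTE handler: build a dummy packet from the request attributes,
 * resolve it through the input or output path depending on whether RTA_IIF
 * was given, and return the result as an RTM_NEWROUTE message.
 */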

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src,
					 rtm->rtm_tos & INET_DSCP_MASK, dev,
					 &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.dscp = res.dscp;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		fri.offload_failed = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_dscp == fri.dscp &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = READ_ONCE(fa->offload);
					fri.trap = READ_ONCE(fa->trap);
					fri.offload_failed =
						READ_ONCE(fa->offload_failed);
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
				   skb, NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
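
/* The tables below are exported under /proc/sys/net/ipv4/route/.  Writing
 * to the "flush" entry invalidates the cached routing information, e.g.
 * (illustrative):
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */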

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname = "gc_thresh",
		.data = &ipv4_dst_ops.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "max_size",
		.data = &ip_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname = "gc_min_interval",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &ip_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &ip_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &ip_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "redirect_load",
		.data = &ip_rt_redirect_load,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_number",
		.data = &ip_rt_redirect_number,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "redirect_silence",
		.data = &ip_rt_redirect_silence,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_cost",
		.data = &ip_rt_error_cost,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "error_burst",
		.data = &ip_rt_error_burst,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_elasticity",
		.data = &ip_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
};

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_netns_table[] = {
	{
		.procname = ipv4_route_flush_procname,
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv4_sysctl_rtcache_flush,
	},
	{
		.procname = "min_pmtu",
		.data = &init_net.ipv4.ip_rt_min_pmtu,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &ip_min_valid_pmtu,
	},
	{
		.procname = "mtu_expires",
		.data = &init_net.ipv4.ip_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_adv_mss",
		.data = &init_net.ipv4.ip_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
};
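
/* Per-namespace registration of the table above: every non-init netns gets
 * its own kmemdup'ed copy with the .data pointers rebased from init_net onto
 * that namespace (the first "flush" entry has no .data and is left alone).
 */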

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;
	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);

	tbl = ipv4_route_netns_table;
	if (!net_eq(net, &init_net)) {
		int i;

		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				table_size = 0;
		}

		/* Update the variables to point into the current struct net
		 * except for the first element flush
		 */
		for (i = 1; i < table_size; i++)
			tbl[i].data += (void *)net - (void *)&init_net;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
						     tbl, table_size);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_netns_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	const struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_netns_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int netns_ip_rt_init(struct net *net)
{
	/* Set default value for namespaceified sysctls */
	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;

	return 0;
}

static struct pernet_operations __net_initdata ip_rt_ops = {
	.init = netns_ip_rt_init,
};

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
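
/* Boot-time initialisation: allocate the IP ID hash and per-cpu uncached
 * route lists, create the dst slab caches, register the RTM_GETROUTE
 * handler and hook up the pernet init/exit operations declared above.
 */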

int __init ip_rt_init(void)
{
	void *idents_hash;
	int cpu;

	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      0,
					      16, /* one bucket per 64 KB */
					      HASH_ZERO,
					      NULL,
					      &ip_idents_mask,
					      2048,
					      256*1024);

	ip_idents = idents_hash;

	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&ip_rt_ops);
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif