vrf.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * vrf.c: device driver to encapsulate a VRF space
  4. *
  5. * Copyright (c) 2015 Cumulus Networks. All rights reserved.
  6. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
  7. * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
  8. *
  9. * Based on dummy, team and ipvlan drivers
  10. */
  11. #include <linux/ethtool.h>
  12. #include <linux/module.h>
  13. #include <linux/kernel.h>
  14. #include <linux/netdevice.h>
  15. #include <linux/etherdevice.h>
  16. #include <linux/ip.h>
  17. #include <linux/init.h>
  18. #include <linux/moduleparam.h>
  19. #include <linux/netfilter.h>
  20. #include <linux/rtnetlink.h>
  21. #include <net/rtnetlink.h>
  22. #include <linux/u64_stats_sync.h>
  23. #include <linux/hashtable.h>
  24. #include <linux/spinlock_types.h>
  25. #include <linux/inetdevice.h>
  26. #include <net/arp.h>
  27. #include <net/ip.h>
  28. #include <net/ip_fib.h>
  29. #include <net/ip6_fib.h>
  30. #include <net/ip6_route.h>
  31. #include <net/route.h>
  32. #include <net/addrconf.h>
  33. #include <net/l3mdev.h>
  34. #include <net/fib_rules.h>
  35. #include <net/sch_generic.h>
  36. #include <net/netns/generic.h>
  37. #include <net/netfilter/nf_conntrack.h>
  38. #include <net/inet_dscp.h>
  39. #define DRV_NAME "vrf"
  40. #define DRV_VERSION "1.1"
  41. #define FIB_RULE_PREF 1000 /* default preference for FIB rules */
  42. #define HT_MAP_BITS 4
  43. #define HASH_INITVAL ((u32)0xcafef00d)
  44. struct vrf_map {
  45. DECLARE_HASHTABLE(ht, HT_MAP_BITS);
  46. spinlock_t vmap_lock;
  47. /* shared_tables:
  48. * count how many distinct tables do not comply with the strict mode
  49. * requirement.
  50. * shared_tables value must be 0 in order to enable the strict mode.
  51. *
  52. * example of the evolution of shared_tables:
  53. * | time
  54. * add vrf0 --> table 100 shared_tables = 0 | t0
  55. * add vrf1 --> table 101 shared_tables = 0 | t1
  56. * add vrf2 --> table 100 shared_tables = 1 | t2
  57. * add vrf3 --> table 100 shared_tables = 1 | t3
  58. * add vrf4 --> table 101 shared_tables = 2 v t4
  59. *
  60. * shared_tables is a "step function" (or "staircase function")
  61. * and it is increased by one when the second vrf is associated to a
  62. * table.
  63. *
  64. * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1.
  65. *
  66. * at t3, another dev (vrf3) is bound to the same table 100 but the
  67. * value of shared_tables is still 1.
  68. * This means that no matter how many more vrfs register on table 100,
  69. * shared_tables will not increase any further (considering only
  70. * table 100).
  71. *
  72. * at t4, vrf4 is bound to table 101, and shared_tables = 2.
  73. *
  74. * Looking at the value of shared_tables we can immediately know if
  75. * the strict_mode can or cannot be enforced. Indeed, strict_mode
  76. * can be enforced iff shared_tables = 0.
  77. *
  78. * Conversely, shared_tables is decreased when a vrf is de-associated
  79. * from a table with exactly two associated vrfs.
  80. */
  81. u32 shared_tables;
  82. bool strict_mode;
  83. };
  84. struct vrf_map_elem {
  85. struct hlist_node hnode;
  86. struct list_head vrf_list; /* VRFs registered to this table */
  87. u32 table_id;
  88. int users;
  89. int ifindex;
  90. };
  91. static unsigned int vrf_net_id;
  92. /* per netns vrf data */
  93. struct netns_vrf {
  94. /* protected by rtnl lock */
  95. bool add_fib_rules;
  96. struct vrf_map vmap;
  97. struct ctl_table_header *ctl_hdr;
  98. };
  99. struct net_vrf {
  100. struct rtable __rcu *rth;
  101. struct rt6_info __rcu *rt6;
  102. #if IS_ENABLED(CONFIG_IPV6)
  103. struct fib6_table *fib6_table;
  104. #endif
  105. u32 tb_id;
  106. struct list_head me_list; /* entry in vrf_map_elem */
  107. int ifindex;
  108. };
  109. static void vrf_rx_stats(struct net_device *dev, int len)
  110. {
  111. struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
  112. u64_stats_update_begin(&dstats->syncp);
  113. u64_stats_inc(&dstats->rx_packets);
  114. u64_stats_add(&dstats->rx_bytes, len);
  115. u64_stats_update_end(&dstats->syncp);
  116. }
  117. static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
  118. {
  119. vrf_dev->stats.tx_errors++;
  120. kfree_skb(skb);
  121. }
  122. static struct vrf_map *netns_vrf_map(struct net *net)
  123. {
  124. struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
  125. return &nn_vrf->vmap;
  126. }
  127. static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
  128. {
  129. return netns_vrf_map(dev_net(dev));
  130. }
  131. static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me)
  132. {
  133. struct list_head *me_head = &me->vrf_list;
  134. struct net_vrf *vrf;
  135. if (list_empty(me_head))
  136. return -ENODEV;
  137. vrf = list_first_entry(me_head, struct net_vrf, me_list);
  138. return vrf->ifindex;
  139. }
  140. static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags)
  141. {
  142. struct vrf_map_elem *me;
  143. me = kmalloc(sizeof(*me), flags);
  144. if (!me)
  145. return NULL;
  146. return me;
  147. }
  148. static void vrf_map_elem_free(struct vrf_map_elem *me)
  149. {
  150. kfree(me);
  151. }
  152. static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id,
  153. int ifindex, int users)
  154. {
  155. me->table_id = table_id;
  156. me->ifindex = ifindex;
  157. me->users = users;
  158. INIT_LIST_HEAD(&me->vrf_list);
  159. }
  160. static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap,
  161. u32 table_id)
  162. {
  163. struct vrf_map_elem *me;
  164. u32 key;
  165. key = jhash_1word(table_id, HASH_INITVAL);
  166. hash_for_each_possible(vmap->ht, me, hnode, key) {
  167. if (me->table_id == table_id)
  168. return me;
  169. }
  170. return NULL;
  171. }
  172. static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me)
  173. {
  174. u32 table_id = me->table_id;
  175. u32 key;
  176. key = jhash_1word(table_id, HASH_INITVAL);
  177. hash_add(vmap->ht, &me->hnode, key);
  178. }
  179. static void vrf_map_del_elem(struct vrf_map_elem *me)
  180. {
  181. hash_del(&me->hnode);
  182. }
  183. static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock)
  184. {
  185. spin_lock(&vmap->vmap_lock);
  186. }
  187. static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock)
  188. {
  189. spin_unlock(&vmap->vmap_lock);
  190. }
  191. /* called with rtnl lock held */
  192. static int
  193. vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack)
  194. {
  195. struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
  196. struct net_vrf *vrf = netdev_priv(dev);
  197. struct vrf_map_elem *new_me, *me;
  198. u32 table_id = vrf->tb_id;
  199. bool free_new_me = false;
  200. int users;
  201. int res;
  202. /* we pre-allocate elements used in the spin-locked section (so that we
  203. * keep the spinlock as short as possible).
  204. */
  205. new_me = vrf_map_elem_alloc(GFP_KERNEL);
  206. if (!new_me)
  207. return -ENOMEM;
  208. vrf_map_elem_init(new_me, table_id, dev->ifindex, 0);
  209. vrf_map_lock(vmap);
  210. me = vrf_map_lookup_elem(vmap, table_id);
  211. if (!me) {
  212. me = new_me;
  213. vrf_map_add_elem(vmap, me);
  214. goto link_vrf;
  215. }
  216. /* we already have an entry in the vrf_map, so it means there is (at
  217. * least) a vrf registered on the specific table.
  218. */
  219. free_new_me = true;
  220. if (vmap->strict_mode) {
  221. /* vrfs cannot share the same table */
  222. NL_SET_ERR_MSG(extack, "Table is used by another VRF");
  223. res = -EBUSY;
  224. goto unlock;
  225. }
  226. link_vrf:
  227. users = ++me->users;
  228. if (users == 2)
  229. ++vmap->shared_tables;
  230. list_add(&vrf->me_list, &me->vrf_list);
  231. res = 0;
  232. unlock:
  233. vrf_map_unlock(vmap);
  234. /* clean-up, if needed */
  235. if (free_new_me)
  236. vrf_map_elem_free(new_me);
  237. return res;
  238. }
  239. /* called with rtnl lock held */
  240. static void vrf_map_unregister_dev(struct net_device *dev)
  241. {
  242. struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
  243. struct net_vrf *vrf = netdev_priv(dev);
  244. u32 table_id = vrf->tb_id;
  245. struct vrf_map_elem *me;
  246. int users;
  247. vrf_map_lock(vmap);
  248. me = vrf_map_lookup_elem(vmap, table_id);
  249. if (!me)
  250. goto unlock;
  251. list_del(&vrf->me_list);
  252. users = --me->users;
  253. if (users == 1) {
  254. --vmap->shared_tables;
  255. } else if (users == 0) {
  256. vrf_map_del_elem(me);
  257. /* no one will refer to this element anymore */
  258. vrf_map_elem_free(me);
  259. }
  260. unlock:
  261. vrf_map_unlock(vmap);
  262. }
  263. /* return the vrf device index associated with the table_id */
  264. static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
  265. {
  266. struct vrf_map *vmap = netns_vrf_map(net);
  267. struct vrf_map_elem *me;
  268. int ifindex;
  269. vrf_map_lock(vmap);
  270. if (!vmap->strict_mode) {
  271. ifindex = -EPERM;
  272. goto unlock;
  273. }
  274. me = vrf_map_lookup_elem(vmap, table_id);
  275. if (!me) {
  276. ifindex = -ENODEV;
  277. goto unlock;
  278. }
  279. ifindex = vrf_map_elem_get_vrf_ifindex(me);
  280. unlock:
  281. vrf_map_unlock(vmap);
  282. return ifindex;
  283. }
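/* A note on the helper above: it is registered via
 * l3mdev_table_lookup_register() in vrf_init_module() below, and it only
 * answers when strict mode is enabled, i.e. when a table id maps to exactly
 * one VRF; outside strict mode the mapping would be ambiguous, hence the
 * -EPERM.
 */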
  284. /* by default VRF devices do not have a qdisc and are expected
  285. * to be created with only a single queue.
  286. */
  287. static bool qdisc_tx_is_default(const struct net_device *dev)
  288. {
  289. struct netdev_queue *txq;
  290. struct Qdisc *qdisc;
  291. if (dev->num_tx_queues > 1)
  292. return false;
  293. txq = netdev_get_tx_queue(dev, 0);
  294. qdisc = rcu_access_pointer(txq->qdisc);
  295. return !qdisc->enqueue;
  296. }
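/* qdisc_tx_is_default() is what later lets vrf_ip_out()/vrf_ip6_out() choose
 * between the two transmit paths: with the default no-queue setup the skb
 * takes the "direct" path (vrf_ip_out_direct() and friends), while a
 * user-attached qdisc forces the "redirect" path that re-enters the stack
 * through the VRF device's own dst, so the qdisc actually sees the packet.
 */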
  297. /* Local traffic destined to local address. Reinsert the packet to rx
  298. * path, similar to loopback handling.
  299. */
  300. static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
  301. struct dst_entry *dst)
  302. {
  303. int len = skb->len;
  304. skb_orphan(skb);
  305. skb_dst_set(skb, dst);
  306. /* set pkt_type to avoid skb hitting packet taps twice -
  307. * once on Tx and again in Rx processing
  308. */
  309. skb->pkt_type = PACKET_LOOPBACK;
  310. skb->protocol = eth_type_trans(skb, dev);
  311. if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) {
  312. vrf_rx_stats(dev, len);
  313. } else {
  314. struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
  315. u64_stats_update_begin(&dstats->syncp);
  316. u64_stats_inc(&dstats->rx_drops);
  317. u64_stats_update_end(&dstats->syncp);
  318. }
  319. return NETDEV_TX_OK;
  320. }
  321. static void vrf_nf_set_untracked(struct sk_buff *skb)
  322. {
  323. if (skb_get_nfct(skb) == 0)
  324. nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
  325. }
  326. static void vrf_nf_reset_ct(struct sk_buff *skb)
  327. {
  328. if (skb_get_nfct(skb) == IP_CT_UNTRACKED)
  329. nf_reset_ct(skb);
  330. }
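/* Taken together, the two helpers above appear to make sure conntrack sees a
 * locally generated packet only once: vrf_ip_out()/vrf_ip6_out() mark the skb
 * as IP_CT_UNTRACKED while it traverses the output hooks in the VRF (l3mdev)
 * context, and vrf_nf_reset_ct() clears that mark again before the packet is
 * handed on, so that connection tracking can still run normally in the
 * context of the lower device.
 */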
  331. #if IS_ENABLED(CONFIG_IPV6)
  332. static int vrf_ip6_local_out(struct net *net, struct sock *sk,
  333. struct sk_buff *skb)
  334. {
  335. int err;
  336. vrf_nf_reset_ct(skb);
  337. err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
  338. sk, skb, NULL, skb_dst(skb)->dev, dst_output);
  339. if (likely(err == 1))
  340. err = dst_output(net, sk, skb);
  341. return err;
  342. }
  343. static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
  344. struct net_device *dev)
  345. {
  346. const struct ipv6hdr *iph;
  347. struct net *net = dev_net(skb->dev);
  348. struct flowi6 fl6;
  349. int ret = NET_XMIT_DROP;
  350. struct dst_entry *dst;
  351. struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
  352. if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
  353. goto err;
  354. iph = ipv6_hdr(skb);
  355. memset(&fl6, 0, sizeof(fl6));
  356. /* needed to match OIF rule */
  357. fl6.flowi6_l3mdev = dev->ifindex;
  358. fl6.flowi6_iif = LOOPBACK_IFINDEX;
  359. fl6.daddr = iph->daddr;
  360. fl6.saddr = iph->saddr;
  361. fl6.flowlabel = ip6_flowinfo(iph);
  362. fl6.flowi6_mark = skb->mark;
  363. fl6.flowi6_proto = iph->nexthdr;
  364. dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL);
  365. if (IS_ERR(dst) || dst == dst_null)
  366. goto err;
  367. skb_dst_drop(skb);
  368. /* if dst.dev is the VRF device again this is locally originated traffic
  369. * destined to a local address. Short circuit to Rx path.
  370. */
  371. if (dst->dev == dev)
  372. return vrf_local_xmit(skb, dev, dst);
  373. skb_dst_set(skb, dst);
  374. /* strip the ethernet header added for pass through VRF device */
  375. __skb_pull(skb, skb_network_offset(skb));
  376. memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
  377. ret = vrf_ip6_local_out(net, skb->sk, skb);
  378. if (unlikely(net_xmit_eval(ret)))
  379. dev->stats.tx_errors++;
  380. else
  381. ret = NET_XMIT_SUCCESS;
  382. return ret;
  383. err:
  384. vrf_tx_error(dev, skb);
  385. return NET_XMIT_DROP;
  386. }
  387. #else
  388. static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
  389. struct net_device *dev)
  390. {
  391. vrf_tx_error(dev, skb);
  392. return NET_XMIT_DROP;
  393. }
  394. #endif
  395. /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */
  396. static int vrf_ip_local_out(struct net *net, struct sock *sk,
  397. struct sk_buff *skb)
  398. {
  399. int err;
  400. vrf_nf_reset_ct(skb);
  401. err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
  402. skb, NULL, skb_dst(skb)->dev, dst_output);
  403. if (likely(err == 1))
  404. err = dst_output(net, sk, skb);
  405. return err;
  406. }
  407. static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
  408. struct net_device *vrf_dev)
  409. {
  410. struct iphdr *ip4h;
  411. int ret = NET_XMIT_DROP;
  412. struct flowi4 fl4;
  413. struct net *net = dev_net(vrf_dev);
  414. struct rtable *rt;
  415. if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
  416. goto err;
  417. ip4h = ip_hdr(skb);
  418. memset(&fl4, 0, sizeof(fl4));
  419. /* needed to match OIF rule */
  420. fl4.flowi4_l3mdev = vrf_dev->ifindex;
  421. fl4.flowi4_iif = LOOPBACK_IFINDEX;
  422. fl4.flowi4_tos = ip4h->tos & INET_DSCP_MASK;
  423. fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
  424. fl4.flowi4_proto = ip4h->protocol;
  425. fl4.daddr = ip4h->daddr;
  426. fl4.saddr = ip4h->saddr;
  427. rt = ip_route_output_flow(net, &fl4, NULL);
  428. if (IS_ERR(rt))
  429. goto err;
  430. skb_dst_drop(skb);
  431. /* if dst.dev is the VRF device again this is locally originated traffic
  432. * destined to a local address. Short circuit to Rx path.
  433. */
  434. if (rt->dst.dev == vrf_dev)
  435. return vrf_local_xmit(skb, vrf_dev, &rt->dst);
  436. skb_dst_set(skb, &rt->dst);
  437. /* strip the ethernet header added for pass through VRF device */
  438. __skb_pull(skb, skb_network_offset(skb));
  439. if (!ip4h->saddr) {
  440. ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
  441. RT_SCOPE_LINK);
  442. }
  443. memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
  444. ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
  445. if (unlikely(net_xmit_eval(ret)))
  446. vrf_dev->stats.tx_errors++;
  447. else
  448. ret = NET_XMIT_SUCCESS;
  449. out:
  450. return ret;
  451. err:
  452. vrf_tx_error(vrf_dev, skb);
  453. goto out;
  454. }
  455. static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
  456. {
  457. switch (skb->protocol) {
  458. case htons(ETH_P_IP):
  459. return vrf_process_v4_outbound(skb, dev);
  460. case htons(ETH_P_IPV6):
  461. return vrf_process_v6_outbound(skb, dev);
  462. default:
  463. vrf_tx_error(dev, skb);
  464. return NET_XMIT_DROP;
  465. }
  466. }
  467. static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
  468. {
  469. struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
  470. int len = skb->len;
  471. netdev_tx_t ret = is_ip_tx_frame(skb, dev);
  472. u64_stats_update_begin(&dstats->syncp);
  473. if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
  474. u64_stats_inc(&dstats->tx_packets);
  475. u64_stats_add(&dstats->tx_bytes, len);
  476. } else {
  477. u64_stats_inc(&dstats->tx_drops);
  478. }
  479. u64_stats_update_end(&dstats->syncp);
  480. return ret;
  481. }
  482. static void vrf_finish_direct(struct sk_buff *skb)
  483. {
  484. struct net_device *vrf_dev = skb->dev;
  485. if (!list_empty(&vrf_dev->ptype_all) &&
  486. likely(skb_headroom(skb) >= ETH_HLEN)) {
  487. struct ethhdr *eth = skb_push(skb, ETH_HLEN);
  488. ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
  489. eth_zero_addr(eth->h_dest);
  490. eth->h_proto = skb->protocol;
  491. rcu_read_lock_bh();
  492. dev_queue_xmit_nit(skb, vrf_dev);
  493. rcu_read_unlock_bh();
  494. skb_pull(skb, ETH_HLEN);
  495. }
  496. vrf_nf_reset_ct(skb);
  497. }
  498. #if IS_ENABLED(CONFIG_IPV6)
  499. /* modelled after ip6_finish_output2 */
  500. static int vrf_finish_output6(struct net *net, struct sock *sk,
  501. struct sk_buff *skb)
  502. {
  503. struct dst_entry *dst = skb_dst(skb);
  504. struct net_device *dev = dst->dev;
  505. const struct in6_addr *nexthop;
  506. struct neighbour *neigh;
  507. int ret;
  508. vrf_nf_reset_ct(skb);
  509. skb->protocol = htons(ETH_P_IPV6);
  510. skb->dev = dev;
  511. rcu_read_lock();
  512. nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr);
  513. neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
  514. if (unlikely(!neigh))
  515. neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
  516. if (!IS_ERR(neigh)) {
  517. sock_confirm_neigh(skb, neigh);
  518. ret = neigh_output(neigh, skb, false);
  519. rcu_read_unlock();
  520. return ret;
  521. }
  522. rcu_read_unlock();
  523. IP6_INC_STATS(dev_net(dst->dev),
  524. ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
  525. kfree_skb(skb);
  526. return -EINVAL;
  527. }
  528. /* modelled after ip6_output */
  529. static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
  530. {
  531. return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  532. net, sk, skb, NULL, skb_dst(skb)->dev,
  533. vrf_finish_output6,
  534. !(IP6CB(skb)->flags & IP6SKB_REROUTED));
  535. }
  536. /* set dst on skb to send packet to us via dev_xmit path. Allows
  537. * packet to go through device based features such as qdisc, netfilter
  538. * hooks and packet sockets with skb->dev set to vrf device.
  539. */
  540. static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
  541. struct sk_buff *skb)
  542. {
  543. struct net_vrf *vrf = netdev_priv(vrf_dev);
  544. struct dst_entry *dst = NULL;
  545. struct rt6_info *rt6;
  546. rcu_read_lock();
  547. rt6 = rcu_dereference(vrf->rt6);
  548. if (likely(rt6)) {
  549. dst = &rt6->dst;
  550. dst_hold(dst);
  551. }
  552. rcu_read_unlock();
  553. if (unlikely(!dst)) {
  554. vrf_tx_error(vrf_dev, skb);
  555. return NULL;
  556. }
  557. skb_dst_drop(skb);
  558. skb_dst_set(skb, dst);
  559. return skb;
  560. }
  561. static int vrf_output6_direct_finish(struct net *net, struct sock *sk,
  562. struct sk_buff *skb)
  563. {
  564. vrf_finish_direct(skb);
  565. return vrf_ip6_local_out(net, sk, skb);
  566. }
  567. static int vrf_output6_direct(struct net *net, struct sock *sk,
  568. struct sk_buff *skb)
  569. {
  570. int err = 1;
  571. skb->protocol = htons(ETH_P_IPV6);
  572. if (!(IPCB(skb)->flags & IPSKB_REROUTED))
  573. err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
  574. NULL, skb->dev, vrf_output6_direct_finish);
  575. if (likely(err == 1))
  576. vrf_finish_direct(skb);
  577. return err;
  578. }
  579. static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk,
  580. struct sk_buff *skb)
  581. {
  582. int err;
  583. err = vrf_output6_direct(net, sk, skb);
  584. if (likely(err == 1))
  585. err = vrf_ip6_local_out(net, sk, skb);
  586. return err;
  587. }
  588. static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
  589. struct sock *sk,
  590. struct sk_buff *skb)
  591. {
  592. struct net *net = dev_net(vrf_dev);
  593. int err;
  594. skb->dev = vrf_dev;
  595. err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
  596. skb, NULL, vrf_dev, vrf_ip6_out_direct_finish);
  597. if (likely(err == 1))
  598. err = vrf_output6_direct(net, sk, skb);
  599. if (likely(err == 1))
  600. return skb;
  601. return NULL;
  602. }
  603. static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
  604. struct sock *sk,
  605. struct sk_buff *skb)
  606. {
  607. /* don't divert link scope packets */
  608. if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
  609. return skb;
  610. vrf_nf_set_untracked(skb);
  611. if (qdisc_tx_is_default(vrf_dev) ||
  612. IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
  613. return vrf_ip6_out_direct(vrf_dev, sk, skb);
  614. return vrf_ip6_out_redirect(vrf_dev, skb);
  615. }
  616. /* holding rtnl */
  617. static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
  618. {
  619. struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
  620. struct net *net = dev_net(dev);
  621. struct dst_entry *dst;
  622. RCU_INIT_POINTER(vrf->rt6, NULL);
  623. synchronize_rcu();
  624. /* move dev in dst's to loopback so this VRF device can be deleted
  625. * - based on dst_ifdown
  626. */
  627. if (rt6) {
  628. dst = &rt6->dst;
  629. netdev_ref_replace(dst->dev, net->loopback_dev,
  630. &dst->dev_tracker, GFP_KERNEL);
  631. dst->dev = net->loopback_dev;
  632. dst_release(dst);
  633. }
  634. }
  635. static int vrf_rt6_create(struct net_device *dev)
  636. {
  637. int flags = DST_NOPOLICY | DST_NOXFRM;
  638. struct net_vrf *vrf = netdev_priv(dev);
  639. struct net *net = dev_net(dev);
  640. struct rt6_info *rt6;
  641. int rc = -ENOMEM;
  642. /* IPv6 can be CONFIG enabled and then disabled runtime */
  643. if (!ipv6_mod_enabled())
  644. return 0;
  645. vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
  646. if (!vrf->fib6_table)
  647. goto out;
  648. /* create a dst for routing packets out a VRF device */
  649. rt6 = ip6_dst_alloc(net, dev, flags);
  650. if (!rt6)
  651. goto out;
  652. rt6->dst.output = vrf_output6;
  653. rcu_assign_pointer(vrf->rt6, rt6);
  654. rc = 0;
  655. out:
  656. return rc;
  657. }
  658. #else
  659. static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
  660. struct sock *sk,
  661. struct sk_buff *skb)
  662. {
  663. return skb;
  664. }
  665. static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
  666. {
  667. }
  668. static int vrf_rt6_create(struct net_device *dev)
  669. {
  670. return 0;
  671. }
  672. #endif
  673. /* modelled after ip_finish_output2 */
  674. static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
  675. {
  676. struct dst_entry *dst = skb_dst(skb);
  677. struct rtable *rt = dst_rtable(dst);
  678. struct net_device *dev = dst->dev;
  679. unsigned int hh_len = LL_RESERVED_SPACE(dev);
  680. struct neighbour *neigh;
  681. bool is_v6gw = false;
  682. vrf_nf_reset_ct(skb);
  683. /* Be paranoid, rather than too clever. */
  684. if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
  685. skb = skb_expand_head(skb, hh_len);
  686. if (!skb) {
  687. dev->stats.tx_errors++;
  688. return -ENOMEM;
  689. }
  690. }
  691. rcu_read_lock();
  692. neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
  693. if (!IS_ERR(neigh)) {
  694. int ret;
  695. sock_confirm_neigh(skb, neigh);
  696. /* if crossing protocols, can not use the cached header */
  697. ret = neigh_output(neigh, skb, is_v6gw);
  698. rcu_read_unlock();
  699. return ret;
  700. }
  701. rcu_read_unlock();
  702. vrf_tx_error(skb->dev, skb);
  703. return -EINVAL;
  704. }
  705. static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
  706. {
  707. struct net_device *dev = skb_dst(skb)->dev;
  708. IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
  709. skb->dev = dev;
  710. skb->protocol = htons(ETH_P_IP);
  711. return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
  712. net, sk, skb, NULL, dev,
  713. vrf_finish_output,
  714. !(IPCB(skb)->flags & IPSKB_REROUTED));
  715. }
  716. /* set dst on skb to send packet to us via dev_xmit path. Allows
  717. * packet to go through device based features such as qdisc, netfilter
  718. * hooks and packet sockets with skb->dev set to vrf device.
  719. */
  720. static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
  721. struct sk_buff *skb)
  722. {
  723. struct net_vrf *vrf = netdev_priv(vrf_dev);
  724. struct dst_entry *dst = NULL;
  725. struct rtable *rth;
  726. rcu_read_lock();
  727. rth = rcu_dereference(vrf->rth);
  728. if (likely(rth)) {
  729. dst = &rth->dst;
  730. dst_hold(dst);
  731. }
  732. rcu_read_unlock();
  733. if (unlikely(!dst)) {
  734. vrf_tx_error(vrf_dev, skb);
  735. return NULL;
  736. }
  737. skb_dst_drop(skb);
  738. skb_dst_set(skb, dst);
  739. return skb;
  740. }
  741. static int vrf_output_direct_finish(struct net *net, struct sock *sk,
  742. struct sk_buff *skb)
  743. {
  744. vrf_finish_direct(skb);
  745. return vrf_ip_local_out(net, sk, skb);
  746. }
  747. static int vrf_output_direct(struct net *net, struct sock *sk,
  748. struct sk_buff *skb)
  749. {
  750. int err = 1;
  751. skb->protocol = htons(ETH_P_IP);
  752. if (!(IPCB(skb)->flags & IPSKB_REROUTED))
  753. err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
  754. NULL, skb->dev, vrf_output_direct_finish);
  755. if (likely(err == 1))
  756. vrf_finish_direct(skb);
  757. return err;
  758. }
  759. static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk,
  760. struct sk_buff *skb)
  761. {
  762. int err;
  763. err = vrf_output_direct(net, sk, skb);
  764. if (likely(err == 1))
  765. err = vrf_ip_local_out(net, sk, skb);
  766. return err;
  767. }
  768. static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
  769. struct sock *sk,
  770. struct sk_buff *skb)
  771. {
  772. struct net *net = dev_net(vrf_dev);
  773. int err;
  774. skb->dev = vrf_dev;
  775. err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
  776. skb, NULL, vrf_dev, vrf_ip_out_direct_finish);
  777. if (likely(err == 1))
  778. err = vrf_output_direct(net, sk, skb);
  779. if (likely(err == 1))
  780. return skb;
  781. return NULL;
  782. }
  783. static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
  784. struct sock *sk,
  785. struct sk_buff *skb)
  786. {
  787. /* don't divert multicast or local broadcast */
  788. if (ipv4_is_multicast(ip_hdr(skb)->daddr) ||
  789. ipv4_is_lbcast(ip_hdr(skb)->daddr))
  790. return skb;
  791. vrf_nf_set_untracked(skb);
  792. if (qdisc_tx_is_default(vrf_dev) ||
  793. IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
  794. return vrf_ip_out_direct(vrf_dev, sk, skb);
  795. return vrf_ip_out_redirect(vrf_dev, skb);
  796. }
  797. /* called with rcu lock held */
  798. static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
  799. struct sock *sk,
  800. struct sk_buff *skb,
  801. u16 proto)
  802. {
  803. switch (proto) {
  804. case AF_INET:
  805. return vrf_ip_out(vrf_dev, sk, skb);
  806. case AF_INET6:
  807. return vrf_ip6_out(vrf_dev, sk, skb);
  808. }
  809. return skb;
  810. }
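/* vrf_l3_out() is wired up as the l3mdev_l3_out handler in vrf_l3mdev_ops
 * below. The core stack calls it from the IPv4/IPv6 local-output path for
 * locally generated traffic associated with the VRF (typically a socket
 * bound to the vrf device, e.g. via SO_BINDTODEVICE), which is what diverts
 * such packets into the VRF's routing table.
 */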
  811. /* holding rtnl */
  812. static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf)
  813. {
  814. struct rtable *rth = rtnl_dereference(vrf->rth);
  815. struct net *net = dev_net(dev);
  816. struct dst_entry *dst;
  817. RCU_INIT_POINTER(vrf->rth, NULL);
  818. synchronize_rcu();
  819. /* move dev in dst's to loopback so this VRF device can be deleted
  820. * - based on dst_ifdown
  821. */
  822. if (rth) {
  823. dst = &rth->dst;
  824. netdev_ref_replace(dst->dev, net->loopback_dev,
  825. &dst->dev_tracker, GFP_KERNEL);
  826. dst->dev = net->loopback_dev;
  827. dst_release(dst);
  828. }
  829. }
  830. static int vrf_rtable_create(struct net_device *dev)
  831. {
  832. struct net_vrf *vrf = netdev_priv(dev);
  833. struct rtable *rth;
  834. if (!fib_new_table(dev_net(dev), vrf->tb_id))
  835. return -ENOMEM;
  836. /* create a dst for routing packets out through a VRF device */
  837. rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1);
  838. if (!rth)
  839. return -ENOMEM;
  840. rth->dst.output = vrf_output;
  841. rcu_assign_pointer(vrf->rth, rth);
  842. return 0;
  843. }
  844. /**************************** device handling ********************/
  845. /* cycle interface to flush neighbor cache and move routes across tables */
  846. static void cycle_netdev(struct net_device *dev,
  847. struct netlink_ext_ack *extack)
  848. {
  849. unsigned int flags = dev->flags;
  850. int ret;
  851. if (!netif_running(dev))
  852. return;
  853. ret = dev_change_flags(dev, flags & ~IFF_UP, extack);
  854. if (ret >= 0)
  855. ret = dev_change_flags(dev, flags, extack);
  856. if (ret < 0) {
  857. netdev_err(dev,
  858. "Failed to cycle device %s; route tables might be wrong!\n",
  859. dev->name);
  860. }
  861. }
  862. static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
  863. struct netlink_ext_ack *extack)
  864. {
  865. int ret;
  866. /* do not allow loopback device to be enslaved to a VRF.
  867. * The vrf device acts as the loopback for the vrf.
  868. */
  869. if (port_dev == dev_net(dev)->loopback_dev) {
  870. NL_SET_ERR_MSG(extack,
  871. "Can not enslave loopback device to a VRF");
  872. return -EOPNOTSUPP;
  873. }
  874. port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
  875. ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
  876. if (ret < 0)
  877. goto err;
  878. cycle_netdev(port_dev, extack);
  879. return 0;
  880. err:
  881. port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
  882. return ret;
  883. }
  884. static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
  885. struct netlink_ext_ack *extack)
  886. {
  887. if (netif_is_l3_master(port_dev)) {
  888. NL_SET_ERR_MSG(extack,
  889. "Can not enslave an L3 master device to a VRF");
  890. return -EINVAL;
  891. }
  892. if (netif_is_l3_slave(port_dev))
  893. return -EINVAL;
  894. return do_vrf_add_slave(dev, port_dev, extack);
  895. }
  896. /* inverse of do_vrf_add_slave */
  897. static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
  898. {
  899. netdev_upper_dev_unlink(port_dev, dev);
  900. port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
  901. cycle_netdev(port_dev, NULL);
  902. return 0;
  903. }
  904. static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
  905. {
  906. return do_vrf_del_slave(dev, port_dev);
  907. }
  908. static void vrf_dev_uninit(struct net_device *dev)
  909. {
  910. struct net_vrf *vrf = netdev_priv(dev);
  911. vrf_rtable_release(dev, vrf);
  912. vrf_rt6_release(dev, vrf);
  913. }
  914. static int vrf_dev_init(struct net_device *dev)
  915. {
  916. struct net_vrf *vrf = netdev_priv(dev);
  917. /* create the default dst which points back to us */
  918. if (vrf_rtable_create(dev) != 0)
  919. goto out_nomem;
  920. if (vrf_rt6_create(dev) != 0)
  921. goto out_rth;
  922. dev->flags = IFF_MASTER | IFF_NOARP;
  923. /* similarly, oper state is irrelevant; set to up to avoid confusion */
  924. dev->operstate = IF_OPER_UP;
  925. netdev_lockdep_set_classes(dev);
  926. return 0;
  927. out_rth:
  928. vrf_rtable_release(dev, vrf);
  929. out_nomem:
  930. return -ENOMEM;
  931. }
  932. static const struct net_device_ops vrf_netdev_ops = {
  933. .ndo_init = vrf_dev_init,
  934. .ndo_uninit = vrf_dev_uninit,
  935. .ndo_start_xmit = vrf_xmit,
  936. .ndo_set_mac_address = eth_mac_addr,
  937. .ndo_add_slave = vrf_add_slave,
  938. .ndo_del_slave = vrf_del_slave,
  939. };
  940. static u32 vrf_fib_table(const struct net_device *dev)
  941. {
  942. struct net_vrf *vrf = netdev_priv(dev);
  943. return vrf->tb_id;
  944. }
  945. static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
  946. {
  947. kfree_skb(skb);
  948. return 0;
  949. }
  950. static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
  951. struct sk_buff *skb,
  952. struct net_device *dev)
  953. {
  954. struct net *net = dev_net(dev);
  955. if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
  956. skb = NULL; /* kfree_skb(skb) handled by nf code */
  957. return skb;
  958. }
  959. static int vrf_prepare_mac_header(struct sk_buff *skb,
  960. struct net_device *vrf_dev, u16 proto)
  961. {
  962. struct ethhdr *eth;
  963. int err;
  964. /* in general, we do not know if there is enough space in the head of
  965. * the packet for hosting the mac header.
  966. */
  967. err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev));
  968. if (unlikely(err))
  969. /* no space in the skb head */
  970. return -ENOBUFS;
  971. __skb_push(skb, ETH_HLEN);
  972. eth = (struct ethhdr *)skb->data;
  973. skb_reset_mac_header(skb);
  974. skb_reset_mac_len(skb);
  975. /* we set the ethernet destination and the source addresses to the
  976. * address of the VRF device.
  977. */
  978. ether_addr_copy(eth->h_dest, vrf_dev->dev_addr);
  979. ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
  980. eth->h_proto = htons(proto);
  981. /* the destination address of the Ethernet frame corresponds to the
  982. * address set on the VRF interface; therefore, the packet is intended
  983. * to be processed locally.
  984. */
  985. skb->protocol = eth->h_proto;
  986. skb->pkt_type = PACKET_HOST;
  987. skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
  988. skb_pull_inline(skb, ETH_HLEN);
  989. return 0;
  990. }
  991. /* prepare and add the mac header to the packet if it was not set previously.
  992. * In this way, packet sniffers such as tcpdump can parse the packet correctly.
  993. * If the mac header was already set, the original mac header is left
  994. * untouched and the function returns immediately.
  995. */
  996. static int vrf_add_mac_header_if_unset(struct sk_buff *skb,
  997. struct net_device *vrf_dev,
  998. u16 proto, struct net_device *orig_dev)
  999. {
  1000. if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev))
  1001. return 0;
  1002. return vrf_prepare_mac_header(skb, vrf_dev, proto);
  1003. }
  1004. #if IS_ENABLED(CONFIG_IPV6)
  1005. /* neighbor handling is done with actual device; do not want
  1006. * to flip skb->dev for those ndisc packets. This really fails
  1007. * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
  1008. * a start.
  1009. */
  1010. static bool ipv6_ndisc_frame(const struct sk_buff *skb)
  1011. {
  1012. const struct ipv6hdr *iph = ipv6_hdr(skb);
  1013. bool rc = false;
  1014. if (iph->nexthdr == NEXTHDR_ICMP) {
  1015. const struct icmp6hdr *icmph;
  1016. struct icmp6hdr _icmph;
  1017. icmph = skb_header_pointer(skb, sizeof(*iph),
  1018. sizeof(_icmph), &_icmph);
  1019. if (!icmph)
  1020. goto out;
  1021. switch (icmph->icmp6_type) {
  1022. case NDISC_ROUTER_SOLICITATION:
  1023. case NDISC_ROUTER_ADVERTISEMENT:
  1024. case NDISC_NEIGHBOUR_SOLICITATION:
  1025. case NDISC_NEIGHBOUR_ADVERTISEMENT:
  1026. case NDISC_REDIRECT:
  1027. rc = true;
  1028. break;
  1029. }
  1030. }
  1031. out:
  1032. return rc;
  1033. }
  1034. static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
  1035. const struct net_device *dev,
  1036. struct flowi6 *fl6,
  1037. int ifindex,
  1038. const struct sk_buff *skb,
  1039. int flags)
  1040. {
  1041. struct net_vrf *vrf = netdev_priv(dev);
  1042. return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags);
  1043. }
  1044. static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
  1045. int ifindex)
  1046. {
  1047. const struct ipv6hdr *iph = ipv6_hdr(skb);
  1048. struct flowi6 fl6 = {
  1049. .flowi6_iif = ifindex,
  1050. .flowi6_mark = skb->mark,
  1051. .flowi6_proto = iph->nexthdr,
  1052. .daddr = iph->daddr,
  1053. .saddr = iph->saddr,
  1054. .flowlabel = ip6_flowinfo(iph),
  1055. };
  1056. struct net *net = dev_net(vrf_dev);
  1057. struct rt6_info *rt6;
  1058. skb_dst_drop(skb);
  1059. rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb,
  1060. RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE);
  1061. if (unlikely(!rt6))
  1062. return;
  1063. if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
  1064. return;
  1065. skb_dst_set(skb, &rt6->dst);
  1066. }
  1067. static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
  1068. struct sk_buff *skb)
  1069. {
  1070. int orig_iif = skb->skb_iif;
  1071. bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
  1072. bool is_ndisc = ipv6_ndisc_frame(skb);
  1073. /* loopback, multicast & non-ND link-local traffic; do not push through
  1074. * packet taps again. Reset pkt_type for upper layers to process skb.
  1075. * For non-loopback strict packets, determine the dst using the original
  1076. * ifindex.
  1077. */
  1078. if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
  1079. skb->dev = vrf_dev;
  1080. skb->skb_iif = vrf_dev->ifindex;
  1081. IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
  1082. if (skb->pkt_type == PACKET_LOOPBACK)
  1083. skb->pkt_type = PACKET_HOST;
  1084. else
  1085. vrf_ip6_input_dst(skb, vrf_dev, orig_iif);
  1086. goto out;
  1087. }
  1088. /* if packet is NDISC then keep the ingress interface */
  1089. if (!is_ndisc) {
  1090. struct net_device *orig_dev = skb->dev;
  1091. vrf_rx_stats(vrf_dev, skb->len);
  1092. skb->dev = vrf_dev;
  1093. skb->skb_iif = vrf_dev->ifindex;
  1094. if (!list_empty(&vrf_dev->ptype_all)) {
  1095. int err;
  1096. err = vrf_add_mac_header_if_unset(skb, vrf_dev,
  1097. ETH_P_IPV6,
  1098. orig_dev);
  1099. if (likely(!err)) {
  1100. skb_push(skb, skb->mac_len);
  1101. dev_queue_xmit_nit(skb, vrf_dev);
  1102. skb_pull(skb, skb->mac_len);
  1103. }
  1104. }
  1105. IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
  1106. }
  1107. if (need_strict)
  1108. vrf_ip6_input_dst(skb, vrf_dev, orig_iif);
  1109. skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
  1110. out:
  1111. return skb;
  1112. }
  1113. #else
  1114. static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
  1115. struct sk_buff *skb)
  1116. {
  1117. return skb;
  1118. }
  1119. #endif
  1120. static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
  1121. struct sk_buff *skb)
  1122. {
  1123. struct net_device *orig_dev = skb->dev;
  1124. skb->dev = vrf_dev;
  1125. skb->skb_iif = vrf_dev->ifindex;
  1126. IPCB(skb)->flags |= IPSKB_L3SLAVE;
  1127. if (ipv4_is_multicast(ip_hdr(skb)->daddr))
  1128. goto out;
  1129. /* loopback traffic; do not push through packet taps again.
  1130. * Reset pkt_type for upper layers to process skb
  1131. */
  1132. if (skb->pkt_type == PACKET_LOOPBACK) {
  1133. skb->pkt_type = PACKET_HOST;
  1134. goto out;
  1135. }
  1136. vrf_rx_stats(vrf_dev, skb->len);
  1137. if (!list_empty(&vrf_dev->ptype_all)) {
  1138. int err;
  1139. err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP,
  1140. orig_dev);
  1141. if (likely(!err)) {
  1142. skb_push(skb, skb->mac_len);
  1143. dev_queue_xmit_nit(skb, vrf_dev);
  1144. skb_pull(skb, skb->mac_len);
  1145. }
  1146. }
  1147. skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
  1148. out:
  1149. return skb;
  1150. }
  1151. /* called with rcu lock held */
  1152. static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
  1153. struct sk_buff *skb,
  1154. u16 proto)
  1155. {
  1156. switch (proto) {
  1157. case AF_INET:
  1158. return vrf_ip_rcv(vrf_dev, skb);
  1159. case AF_INET6:
  1160. return vrf_ip6_rcv(vrf_dev, skb);
  1161. }
  1162. return skb;
  1163. }
  1164. #if IS_ENABLED(CONFIG_IPV6)
  1165. /* send to link-local or multicast address via interface enslaved to
  1166. * VRF device. Force lookup to VRF table without changing flow struct
  1167. * Note: Caller to this function must hold rcu_read_lock() and no refcnt
  1168. * is taken on the dst by this function.
  1169. */
  1170. static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
  1171. struct flowi6 *fl6)
  1172. {
  1173. struct net *net = dev_net(dev);
  1174. int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
  1175. struct dst_entry *dst = NULL;
  1176. struct rt6_info *rt;
  1177. /* VRF device does not have a link-local address and
  1178. * sending packets to link-local or mcast addresses over
  1179. * a VRF device does not make sense
  1180. */
  1181. if (fl6->flowi6_oif == dev->ifindex) {
  1182. dst = &net->ipv6.ip6_null_entry->dst;
  1183. return dst;
  1184. }
  1185. if (!ipv6_addr_any(&fl6->saddr))
  1186. flags |= RT6_LOOKUP_F_HAS_SADDR;
  1187. rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags);
  1188. if (rt)
  1189. dst = &rt->dst;
  1190. return dst;
  1191. }
  1192. #endif
  1193. static const struct l3mdev_ops vrf_l3mdev_ops = {
  1194. .l3mdev_fib_table = vrf_fib_table,
  1195. .l3mdev_l3_rcv = vrf_l3_rcv,
  1196. .l3mdev_l3_out = vrf_l3_out,
  1197. #if IS_ENABLED(CONFIG_IPV6)
  1198. .l3mdev_link_scope_lookup = vrf_link_scope_lookup,
  1199. #endif
  1200. };
  1201. static void vrf_get_drvinfo(struct net_device *dev,
  1202. struct ethtool_drvinfo *info)
  1203. {
  1204. strscpy(info->driver, DRV_NAME, sizeof(info->driver));
  1205. strscpy(info->version, DRV_VERSION, sizeof(info->version));
  1206. }
  1207. static const struct ethtool_ops vrf_ethtool_ops = {
  1208. .get_drvinfo = vrf_get_drvinfo,
  1209. };
  1210. static inline size_t vrf_fib_rule_nl_size(void)
  1211. {
  1212. size_t sz;
  1213. sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
  1214. sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */
  1215. sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */
  1216. sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */
  1217. return sz;
  1218. }
  1219. static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
  1220. {
  1221. struct fib_rule_hdr *frh;
  1222. struct nlmsghdr *nlh;
  1223. struct sk_buff *skb;
  1224. int err;
  1225. if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
  1226. !ipv6_mod_enabled())
  1227. return 0;
  1228. skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
  1229. if (!skb)
  1230. return -ENOMEM;
  1231. nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
  1232. if (!nlh)
  1233. goto nla_put_failure;
  1234. /* rule only needs to appear once */
  1235. nlh->nlmsg_flags |= NLM_F_EXCL;
  1236. frh = nlmsg_data(nlh);
  1237. memset(frh, 0, sizeof(*frh));
  1238. frh->family = family;
  1239. frh->action = FR_ACT_TO_TBL;
  1240. if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL))
  1241. goto nla_put_failure;
  1242. if (nla_put_u8(skb, FRA_L3MDEV, 1))
  1243. goto nla_put_failure;
  1244. if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
  1245. goto nla_put_failure;
  1246. nlmsg_end(skb, nlh);
  1247. /* fib_nl_{new,del}rule handling looks for net from skb->sk */
  1248. skb->sk = dev_net(dev)->rtnl;
  1249. if (add_it) {
  1250. err = fib_nl_newrule(skb, nlh, NULL);
  1251. if (err == -EEXIST)
  1252. err = 0;
  1253. } else {
  1254. err = fib_nl_delrule(skb, nlh, NULL);
  1255. if (err == -ENOENT)
  1256. err = 0;
  1257. }
  1258. nlmsg_free(skb);
  1259. return err;
  1260. nla_put_failure:
  1261. nlmsg_free(skb);
  1262. return -EMSGSIZE;
  1263. }
  1264. static int vrf_add_fib_rules(const struct net_device *dev)
  1265. {
  1266. int err;
  1267. err = vrf_fib_rule(dev, AF_INET, true);
  1268. if (err < 0)
  1269. goto out_err;
  1270. err = vrf_fib_rule(dev, AF_INET6, true);
  1271. if (err < 0)
  1272. goto ipv6_err;
  1273. #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
  1274. err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true);
  1275. if (err < 0)
  1276. goto ipmr_err;
  1277. #endif
  1278. #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
  1279. err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
  1280. if (err < 0)
  1281. goto ip6mr_err;
  1282. #endif
  1283. return 0;
  1284. #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
  1285. ip6mr_err:
  1286. vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false);
  1287. #endif
  1288. #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
  1289. ipmr_err:
  1290. vrf_fib_rule(dev, AF_INET6, false);
  1291. #endif
  1292. ipv6_err:
  1293. vrf_fib_rule(dev, AF_INET, false);
  1294. out_err:
  1295. netdev_err(dev, "Failed to add FIB rules.\n");
  1296. return err;
  1297. }
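/* The rules installed above are the l3mdev policy rules a user would
 * otherwise add by hand; with FIB_RULE_PREF they are expected to show up in
 * "ip rule" output along the lines of:
 *
 *   1000: from all lookup [l3mdev-table]
 *
 * They are created once per network namespace, when the first VRF device is
 * registered (see the add_fib_rules handling in vrf_newlink()).
 */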
  1298. static void vrf_setup(struct net_device *dev)
  1299. {
  1300. ether_setup(dev);
  1301. /* Initialize the device structure. */
  1302. dev->netdev_ops = &vrf_netdev_ops;
  1303. dev->l3mdev_ops = &vrf_l3mdev_ops;
  1304. dev->ethtool_ops = &vrf_ethtool_ops;
  1305. dev->needs_free_netdev = true;
  1306. /* Fill in device structure with ethernet-generic values. */
  1307. eth_hw_addr_random(dev);
  1308. /* don't acquire vrf device's netif_tx_lock when transmitting */
  1309. dev->lltx = true;
  1310. /* don't allow vrf devices to change network namespaces. */
  1311. dev->netns_local = true;
  1312. /* does not make sense for a VLAN to be added to a vrf device */
  1313. dev->features |= NETIF_F_VLAN_CHALLENGED;
  1314. /* enable offload features */
  1315. dev->features |= NETIF_F_GSO_SOFTWARE;
  1316. dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
  1317. dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
  1318. dev->hw_features = dev->features;
  1319. dev->hw_enc_features = dev->features;
  1320. /* default to no qdisc; user can add if desired */
  1321. dev->priv_flags |= IFF_NO_QUEUE;
  1322. dev->priv_flags |= IFF_NO_RX_HANDLER;
  1323. dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
  1324. /* VRF devices do not care about MTU, but if the MTU is set
  1325. * too low then the ipv4 and ipv6 protocols are disabled
  1326. * which breaks networking.
  1327. */
  1328. dev->min_mtu = IPV6_MIN_MTU;
  1329. dev->max_mtu = IP6_MAX_MTU;
  1330. dev->mtu = dev->max_mtu;
  1331. dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
  1332. }
  1333. static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
  1334. struct netlink_ext_ack *extack)
  1335. {
  1336. if (tb[IFLA_ADDRESS]) {
  1337. if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
  1338. NL_SET_ERR_MSG(extack, "Invalid hardware address");
  1339. return -EINVAL;
  1340. }
  1341. if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
  1342. NL_SET_ERR_MSG(extack, "Invalid hardware address");
  1343. return -EADDRNOTAVAIL;
  1344. }
  1345. }
  1346. return 0;
  1347. }
  1348. static void vrf_dellink(struct net_device *dev, struct list_head *head)
  1349. {
  1350. struct net_device *port_dev;
  1351. struct list_head *iter;
  1352. netdev_for_each_lower_dev(dev, port_dev, iter)
  1353. vrf_del_slave(dev, port_dev);
  1354. vrf_map_unregister_dev(dev);
  1355. unregister_netdevice_queue(dev, head);
  1356. }
  1357. static int vrf_newlink(struct net *src_net, struct net_device *dev,
  1358. struct nlattr *tb[], struct nlattr *data[],
  1359. struct netlink_ext_ack *extack)
  1360. {
  1361. struct net_vrf *vrf = netdev_priv(dev);
  1362. struct netns_vrf *nn_vrf;
  1363. bool *add_fib_rules;
  1364. struct net *net;
  1365. int err;
  1366. if (!data || !data[IFLA_VRF_TABLE]) {
  1367. NL_SET_ERR_MSG(extack, "VRF table id is missing");
  1368. return -EINVAL;
  1369. }
  1370. vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
  1371. if (vrf->tb_id == RT_TABLE_UNSPEC) {
  1372. NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
  1373. "Invalid VRF table id");
  1374. return -EINVAL;
  1375. }
  1376. dev->priv_flags |= IFF_L3MDEV_MASTER;
  1377. err = register_netdevice(dev);
  1378. if (err)
  1379. goto out;
  1380. /* mapping between table_id and vrf;
  1381. * note: such binding could not be done in the dev init function
  1382. * because dev->ifindex id is not available yet.
  1383. */
  1384. vrf->ifindex = dev->ifindex;
  1385. err = vrf_map_register_dev(dev, extack);
  1386. if (err) {
  1387. unregister_netdevice(dev);
  1388. goto out;
  1389. }
  1390. net = dev_net(dev);
  1391. nn_vrf = net_generic(net, vrf_net_id);
  1392. add_fib_rules = &nn_vrf->add_fib_rules;
  1393. if (*add_fib_rules) {
  1394. err = vrf_add_fib_rules(dev);
  1395. if (err) {
  1396. vrf_map_unregister_dev(dev);
  1397. unregister_netdevice(dev);
  1398. goto out;
  1399. }
  1400. *add_fib_rules = false;
  1401. }
  1402. out:
  1403. return err;
  1404. }
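/* A minimal usage sketch (standard iproute2 commands, not part of this
 * driver; device and table names are only examples):
 *
 *   ip link add vrf-blue type vrf table 10    # ends up in vrf_newlink()
 *   ip link set dev vrf-blue up
 *   ip link set dev eth1 master vrf-blue      # ends up in vrf_add_slave()
 *
 * The "table 10" argument is what arrives here as the IFLA_VRF_TABLE
 * attribute and becomes vrf->tb_id.
 */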
  1405. static size_t vrf_nl_getsize(const struct net_device *dev)
  1406. {
  1407. return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */
  1408. }
  1409. static int vrf_fillinfo(struct sk_buff *skb,
  1410. const struct net_device *dev)
  1411. {
  1412. struct net_vrf *vrf = netdev_priv(dev);
  1413. return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
  1414. }
  1415. static size_t vrf_get_slave_size(const struct net_device *bond_dev,
  1416. const struct net_device *slave_dev)
  1417. {
  1418. return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */
  1419. }
  1420. static int vrf_fill_slave_info(struct sk_buff *skb,
  1421. const struct net_device *vrf_dev,
  1422. const struct net_device *slave_dev)
  1423. {
  1424. struct net_vrf *vrf = netdev_priv(vrf_dev);
  1425. if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
  1426. return -EMSGSIZE;
  1427. return 0;
  1428. }
  1429. static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
  1430. [IFLA_VRF_TABLE] = { .type = NLA_U32 },
  1431. };
  1432. static struct rtnl_link_ops vrf_link_ops __read_mostly = {
  1433. .kind = DRV_NAME,
  1434. .priv_size = sizeof(struct net_vrf),
  1435. .get_size = vrf_nl_getsize,
  1436. .policy = vrf_nl_policy,
  1437. .validate = vrf_validate,
  1438. .fill_info = vrf_fillinfo,
  1439. .get_slave_size = vrf_get_slave_size,
  1440. .fill_slave_info = vrf_fill_slave_info,
  1441. .newlink = vrf_newlink,
  1442. .dellink = vrf_dellink,
  1443. .setup = vrf_setup,
  1444. .maxtype = IFLA_VRF_MAX,
  1445. };
  1446. static int vrf_device_event(struct notifier_block *unused,
  1447. unsigned long event, void *ptr)
  1448. {
  1449. struct net_device *dev = netdev_notifier_info_to_dev(ptr);
  1450. /* only care about unregister events to drop slave references */
  1451. if (event == NETDEV_UNREGISTER) {
  1452. struct net_device *vrf_dev;
  1453. if (!netif_is_l3_slave(dev))
  1454. goto out;
  1455. vrf_dev = netdev_master_upper_dev_get(dev);
  1456. vrf_del_slave(vrf_dev, dev);
  1457. }
  1458. out:
  1459. return NOTIFY_DONE;
  1460. }
  1461. static struct notifier_block vrf_notifier_block __read_mostly = {
  1462. .notifier_call = vrf_device_event,
  1463. };
  1464. static int vrf_map_init(struct vrf_map *vmap)
  1465. {
  1466. spin_lock_init(&vmap->vmap_lock);
  1467. hash_init(vmap->ht);
  1468. vmap->strict_mode = false;
  1469. return 0;
  1470. }
  1471. #ifdef CONFIG_SYSCTL
  1472. static bool vrf_strict_mode(struct vrf_map *vmap)
  1473. {
  1474. bool strict_mode;
  1475. vrf_map_lock(vmap);
  1476. strict_mode = vmap->strict_mode;
  1477. vrf_map_unlock(vmap);
  1478. return strict_mode;
  1479. }
  1480. static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
  1481. {
  1482. bool *cur_mode;
  1483. int res = 0;
  1484. vrf_map_lock(vmap);
  1485. cur_mode = &vmap->strict_mode;
  1486. if (*cur_mode == new_mode)
  1487. goto unlock;
  1488. if (*cur_mode) {
  1489. /* disable strict mode */
  1490. *cur_mode = false;
  1491. } else {
  1492. if (vmap->shared_tables) {
  1493. /* we cannot allow strict_mode because there are some
  1494. * vrfs that share one or more tables.
  1495. */
  1496. res = -EBUSY;
  1497. goto unlock;
  1498. }
  1499. /* no tables are shared among vrfs, so we can go back
  1500. * to 1:1 association between a vrf with its table.
  1501. */
  1502. *cur_mode = true;
  1503. }
  1504. unlock:
  1505. vrf_map_unlock(vmap);
  1506. return res;
  1507. }
  1508. static int vrf_shared_table_handler(const struct ctl_table *table, int write,
  1509. void *buffer, size_t *lenp, loff_t *ppos)
  1510. {
  1511. struct net *net = (struct net *)table->extra1;
  1512. struct vrf_map *vmap = netns_vrf_map(net);
  1513. int proc_strict_mode = 0;
  1514. struct ctl_table tmp = {
  1515. .procname = table->procname,
  1516. .data = &proc_strict_mode,
  1517. .maxlen = sizeof(int),
  1518. .mode = table->mode,
  1519. .extra1 = SYSCTL_ZERO,
  1520. .extra2 = SYSCTL_ONE,
  1521. };
  1522. int ret;
  1523. if (!write)
  1524. proc_strict_mode = vrf_strict_mode(vmap);
  1525. ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
  1526. if (write && ret == 0)
  1527. ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode);
  1528. return ret;
  1529. }
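/* The handler above backs the per-namespace "strict_mode" entry declared in
 * vrf_table[] below. Given the "net/vrf" path used at registration time, it
 * should be reachable as something like:
 *
 *   sysctl -w net.vrf.strict_mode=1
 *
 * Turning strict mode on only succeeds while shared_tables is zero, i.e.
 * while no table is shared by more than one VRF.
 */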
  1530. static const struct ctl_table vrf_table[] = {
  1531. {
  1532. .procname = "strict_mode",
  1533. .data = NULL,
  1534. .maxlen = sizeof(int),
  1535. .mode = 0644,
  1536. .proc_handler = vrf_shared_table_handler,
  1537. /* set by the vrf_netns_init */
  1538. .extra1 = NULL,
  1539. },
  1540. };
  1541. static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
  1542. {
  1543. struct ctl_table *table;
  1544. table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL);
  1545. if (!table)
  1546. return -ENOMEM;
  1547. /* init the extra1 parameter with the reference to current netns */
  1548. table[0].extra1 = net;
  1549. nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table,
  1550. ARRAY_SIZE(vrf_table));
  1551. if (!nn_vrf->ctl_hdr) {
  1552. kfree(table);
  1553. return -ENOMEM;
  1554. }
  1555. return 0;
  1556. }
  1557. static void vrf_netns_exit_sysctl(struct net *net)
  1558. {
  1559. struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
  1560. const struct ctl_table *table;
  1561. table = nn_vrf->ctl_hdr->ctl_table_arg;
  1562. unregister_net_sysctl_table(nn_vrf->ctl_hdr);
  1563. kfree(table);
  1564. }
  1565. #else
  1566. static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
  1567. {
  1568. return 0;
  1569. }
  1570. static void vrf_netns_exit_sysctl(struct net *net)
  1571. {
  1572. }
  1573. #endif
  1574. /* Initialize per network namespace state */
  1575. static int __net_init vrf_netns_init(struct net *net)
  1576. {
  1577. struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
  1578. nn_vrf->add_fib_rules = true;
  1579. vrf_map_init(&nn_vrf->vmap);
  1580. return vrf_netns_init_sysctl(net, nn_vrf);
  1581. }
  1582. static void __net_exit vrf_netns_exit(struct net *net)
  1583. {
  1584. vrf_netns_exit_sysctl(net);
  1585. }
  1586. static struct pernet_operations vrf_net_ops __net_initdata = {
  1587. .init = vrf_netns_init,
  1588. .exit = vrf_netns_exit,
  1589. .id = &vrf_net_id,
  1590. .size = sizeof(struct netns_vrf),
  1591. };
  1592. static int __init vrf_init_module(void)
  1593. {
  1594. int rc;
  1595. register_netdevice_notifier(&vrf_notifier_block);
  1596. rc = register_pernet_subsys(&vrf_net_ops);
  1597. if (rc < 0)
  1598. goto error;
  1599. rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
  1600. vrf_ifindex_lookup_by_table_id);
  1601. if (rc < 0)
  1602. goto unreg_pernet;
  1603. rc = rtnl_link_register(&vrf_link_ops);
  1604. if (rc < 0)
  1605. goto table_lookup_unreg;
  1606. return 0;
  1607. table_lookup_unreg:
  1608. l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF,
  1609. vrf_ifindex_lookup_by_table_id);
  1610. unreg_pernet:
  1611. unregister_pernet_subsys(&vrf_net_ops);
  1612. error:
  1613. unregister_netdevice_notifier(&vrf_notifier_block);
  1614. return rc;
  1615. }
  1616. module_init(vrf_init_module);
  1617. MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
  1618. MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
  1619. MODULE_LICENSE("GPL");
  1620. MODULE_ALIAS_RTNL_LINK(DRV_NAME);
  1621. MODULE_VERSION(DRV_VERSION);