drivers/net/veth.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * drivers/net/veth.c
  4. *
  5. * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
  6. *
  7. * Author: Pavel Emelianov <xemul@openvz.org>
  8. * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
  9. *
  10. */
  11. #include <linux/netdevice.h>
  12. #include <linux/slab.h>
  13. #include <linux/ethtool.h>
  14. #include <linux/etherdevice.h>
  15. #include <linux/u64_stats_sync.h>
  16. #include <net/rtnetlink.h>
  17. #include <net/dst.h>
  18. #include <net/xfrm.h>
  19. #include <net/xdp.h>
  20. #include <linux/veth.h>
  21. #include <linux/module.h>
  22. #include <linux/bpf.h>
  23. #include <linux/filter.h>
  24. #include <linux/ptr_ring.h>
  25. #include <linux/bpf_trace.h>
  26. #include <linux/net_tstamp.h>
  27. #include <linux/skbuff_ref.h>
  28. #include <net/page_pool/helpers.h>
  29. #define DRV_NAME "veth"
  30. #define DRV_VERSION "1.0"
  31. #define VETH_XDP_FLAG BIT(0)
  32. #define VETH_RING_SIZE 256
  33. #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
  34. #define VETH_XDP_TX_BULK_SIZE 16
  35. #define VETH_XDP_BATCH 16
  36. struct veth_stats {
  37. u64 rx_drops;
  38. /* xdp */
  39. u64 xdp_packets;
  40. u64 xdp_bytes;
  41. u64 xdp_redirect;
  42. u64 xdp_drops;
  43. u64 xdp_tx;
  44. u64 xdp_tx_err;
  45. u64 peer_tq_xdp_xmit;
  46. u64 peer_tq_xdp_xmit_err;
  47. };
  48. struct veth_rq_stats {
  49. struct veth_stats vs;
  50. struct u64_stats_sync syncp;
  51. };
  52. struct veth_rq {
  53. struct napi_struct xdp_napi;
  54. struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
  55. struct net_device *dev;
  56. struct bpf_prog __rcu *xdp_prog;
  57. struct xdp_mem_info xdp_mem;
  58. struct veth_rq_stats stats;
  59. bool rx_notify_masked;
  60. struct ptr_ring xdp_ring;
  61. struct xdp_rxq_info xdp_rxq;
  62. struct page_pool *page_pool;
  63. };
  64. struct veth_priv {
  65. struct net_device __rcu *peer;
  66. atomic64_t dropped;
  67. struct bpf_prog *_xdp_prog;
  68. struct veth_rq *rq;
  69. unsigned int requested_headroom;
  70. };
  71. struct veth_xdp_tx_bq {
  72. struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
  73. unsigned int count;
  74. };
  75. /*
  76. * ethtool interface
  77. */
  78. struct veth_q_stat_desc {
  79. char desc[ETH_GSTRING_LEN];
  80. size_t offset;
  81. };
  82. #define VETH_RQ_STAT(m) offsetof(struct veth_stats, m)
  83. static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
  84. { "xdp_packets", VETH_RQ_STAT(xdp_packets) },
  85. { "xdp_bytes", VETH_RQ_STAT(xdp_bytes) },
  86. { "drops", VETH_RQ_STAT(rx_drops) },
  87. { "xdp_redirect", VETH_RQ_STAT(xdp_redirect) },
  88. { "xdp_drops", VETH_RQ_STAT(xdp_drops) },
  89. { "xdp_tx", VETH_RQ_STAT(xdp_tx) },
  90. { "xdp_tx_errors", VETH_RQ_STAT(xdp_tx_err) },
  91. };
  92. #define VETH_RQ_STATS_LEN ARRAY_SIZE(veth_rq_stats_desc)
  93. static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
  94. { "xdp_xmit", VETH_RQ_STAT(peer_tq_xdp_xmit) },
  95. { "xdp_xmit_errors", VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
  96. };
  97. #define VETH_TQ_STATS_LEN ARRAY_SIZE(veth_tq_stats_desc)
  98. static struct {
  99. const char string[ETH_GSTRING_LEN];
  100. } ethtool_stats_keys[] = {
  101. { "peer_ifindex" },
  102. };
  103. struct veth_xdp_buff {
  104. struct xdp_buff xdp;
  105. struct sk_buff *skb;
  106. };
  107. static int veth_get_link_ksettings(struct net_device *dev,
  108. struct ethtool_link_ksettings *cmd)
  109. {
  110. cmd->base.speed = SPEED_10000;
  111. cmd->base.duplex = DUPLEX_FULL;
  112. cmd->base.port = PORT_TP;
  113. cmd->base.autoneg = AUTONEG_DISABLE;
  114. return 0;
  115. }
  116. static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
  117. {
  118. strscpy(info->driver, DRV_NAME, sizeof(info->driver));
  119. strscpy(info->version, DRV_VERSION, sizeof(info->version));
  120. }
  121. static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
  122. {
  123. u8 *p = buf;
  124. int i, j;
  125. switch(stringset) {
  126. case ETH_SS_STATS:
  127. memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
  128. p += sizeof(ethtool_stats_keys);
  129. for (i = 0; i < dev->real_num_rx_queues; i++)
  130. for (j = 0; j < VETH_RQ_STATS_LEN; j++)
  131. ethtool_sprintf(&p, "rx_queue_%u_%.18s",
  132. i, veth_rq_stats_desc[j].desc);
  133. for (i = 0; i < dev->real_num_tx_queues; i++)
  134. for (j = 0; j < VETH_TQ_STATS_LEN; j++)
  135. ethtool_sprintf(&p, "tx_queue_%u_%.18s",
  136. i, veth_tq_stats_desc[j].desc);
  137. page_pool_ethtool_stats_get_strings(p);
  138. break;
  139. }
  140. }
  141. static int veth_get_sset_count(struct net_device *dev, int sset)
  142. {
  143. switch (sset) {
  144. case ETH_SS_STATS:
  145. return ARRAY_SIZE(ethtool_stats_keys) +
  146. VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
  147. VETH_TQ_STATS_LEN * dev->real_num_tx_queues +
  148. page_pool_ethtool_stats_get_count();
  149. default:
  150. return -EOPNOTSUPP;
  151. }
  152. }
  153. static void veth_get_page_pool_stats(struct net_device *dev, u64 *data)
  154. {
  155. #ifdef CONFIG_PAGE_POOL_STATS
  156. struct veth_priv *priv = netdev_priv(dev);
  157. struct page_pool_stats pp_stats = {};
  158. int i;
  159. for (i = 0; i < dev->real_num_rx_queues; i++) {
  160. if (!priv->rq[i].page_pool)
  161. continue;
  162. page_pool_get_stats(priv->rq[i].page_pool, &pp_stats);
  163. }
  164. page_pool_ethtool_stats_get(data, &pp_stats);
  165. #endif /* CONFIG_PAGE_POOL_STATS */
  166. }
  167. static void veth_get_ethtool_stats(struct net_device *dev,
  168. struct ethtool_stats *stats, u64 *data)
  169. {
  170. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  171. struct net_device *peer = rtnl_dereference(priv->peer);
  172. int i, j, idx, pp_idx;
  173. data[0] = peer ? peer->ifindex : 0;
  174. idx = 1;
  175. for (i = 0; i < dev->real_num_rx_queues; i++) {
  176. const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
  177. const void *stats_base = (void *)&rq_stats->vs;
  178. unsigned int start;
  179. size_t offset;
  180. do {
  181. start = u64_stats_fetch_begin(&rq_stats->syncp);
  182. for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
  183. offset = veth_rq_stats_desc[j].offset;
  184. data[idx + j] = *(u64 *)(stats_base + offset);
  185. }
  186. } while (u64_stats_fetch_retry(&rq_stats->syncp, start));
  187. idx += VETH_RQ_STATS_LEN;
  188. }
  189. pp_idx = idx;
  190. if (!peer)
  191. goto page_pool_stats;
  192. rcv_priv = netdev_priv(peer);
  193. for (i = 0; i < peer->real_num_rx_queues; i++) {
  194. const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
  195. const void *base = (void *)&rq_stats->vs;
  196. unsigned int start, tx_idx = idx;
  197. size_t offset;
  198. tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
  199. do {
  200. start = u64_stats_fetch_begin(&rq_stats->syncp);
  201. for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
  202. offset = veth_tq_stats_desc[j].offset;
  203. data[tx_idx + j] += *(u64 *)(base + offset);
  204. }
  205. } while (u64_stats_fetch_retry(&rq_stats->syncp, start));
  206. }
  207. pp_idx = idx + dev->real_num_tx_queues * VETH_TQ_STATS_LEN;
  208. page_pool_stats:
  209. veth_get_page_pool_stats(dev, &data[pp_idx]);
  210. }
  211. static void veth_get_channels(struct net_device *dev,
  212. struct ethtool_channels *channels)
  213. {
  214. channels->tx_count = dev->real_num_tx_queues;
  215. channels->rx_count = dev->real_num_rx_queues;
  216. channels->max_tx = dev->num_tx_queues;
  217. channels->max_rx = dev->num_rx_queues;
  218. }
  219. static int veth_set_channels(struct net_device *dev,
  220. struct ethtool_channels *ch);
  221. static const struct ethtool_ops veth_ethtool_ops = {
  222. .get_drvinfo = veth_get_drvinfo,
  223. .get_link = ethtool_op_get_link,
  224. .get_strings = veth_get_strings,
  225. .get_sset_count = veth_get_sset_count,
  226. .get_ethtool_stats = veth_get_ethtool_stats,
  227. .get_link_ksettings = veth_get_link_ksettings,
  228. .get_ts_info = ethtool_op_get_ts_info,
  229. .get_channels = veth_get_channels,
  230. .set_channels = veth_set_channels,
  231. };
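/* The ethtool_ops above are exercised from userspace with the standard
 * ethtool utility, e.g. (interface name is an example; exact output depends
 * on the ethtool version):
 *   ethtool -S veth0             # peer_ifindex plus per-queue XDP/page_pool stats
 *   ethtool -l veth0             # show current/maximum channel (queue) counts
 *   ethtool -L veth0 rx 4 tx 4   # resize the queues via veth_set_channels()
 */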
  232. /* general routines */
  233. static bool veth_is_xdp_frame(void *ptr)
  234. {
  235. return (unsigned long)ptr & VETH_XDP_FLAG;
  236. }
  237. static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
  238. {
  239. return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
  240. }
  241. static void *veth_xdp_to_ptr(struct xdp_frame *xdp)
  242. {
  243. return (void *)((unsigned long)xdp | VETH_XDP_FLAG);
  244. }
  245. static void veth_ptr_free(void *ptr)
  246. {
  247. if (veth_is_xdp_frame(ptr))
  248. xdp_return_frame(veth_ptr_to_xdp(ptr));
  249. else
  250. kfree_skb(ptr);
  251. }
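/* A single ptr_ring per rx queue carries both sk_buff pointers (from
 * ndo_start_xmit) and xdp_frame pointers (from ndo_xdp_xmit).  The two are
 * told apart by tagging bit 0 of the pointer with VETH_XDP_FLAG in the
 * helpers above, which is safe because both object types are at least
 * word-aligned, so bit 0 of a real pointer is always zero.
 */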
  252. static void __veth_xdp_flush(struct veth_rq *rq)
  253. {
  254. /* Write ptr_ring before reading rx_notify_masked */
  255. smp_mb();
  256. if (!READ_ONCE(rq->rx_notify_masked) &&
  257. napi_schedule_prep(&rq->xdp_napi)) {
  258. WRITE_ONCE(rq->rx_notify_masked, true);
  259. __napi_schedule(&rq->xdp_napi);
  260. }
  261. }
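/* __veth_xdp_flush() above pairs with veth_poll(): the producer first
 * publishes ring entries, then (after smp_mb()) checks rx_notify_masked,
 * while veth_poll() clears rx_notify_masked with smp_store_mb() before
 * re-checking the ring.  Either the producer sees the cleared flag and
 * schedules NAPI, or the poller sees the new entry and reschedules itself,
 * so a wakeup is never lost.
 */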
  262. static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
  263. {
  264. if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
  265. dev_kfree_skb_any(skb);
  266. return NET_RX_DROP;
  267. }
  268. return NET_RX_SUCCESS;
  269. }
  270. static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
  271. struct veth_rq *rq, bool xdp)
  272. {
  273. return __dev_forward_skb(dev, skb) ?: xdp ?
  274. veth_xdp_rx(rq, skb) :
  275. __netif_rx(skb);
  276. }
  277. /* return true if the specified skb has chances of GRO aggregation
  278. * Don't strive for accuracy, but try to avoid GRO overhead in the most
  279. * common scenarios.
  280. * When XDP is enabled, all traffic is considered eligible, as the xmit
  281. * device has TSO off.
  282. * When TSO is enabled on the xmit device, we are likely interested only
  283. * in UDP aggregation; explicitly check for that if the skb is suspected
  284. * to belong to locally generated UDP traffic (the sock_wfree
  285. * destructor is used by UDP, ICMP and XDP sockets).
  286. */
  287. static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
  288. const struct net_device *rcv,
  289. const struct sk_buff *skb)
  290. {
  291. return !(dev->features & NETIF_F_ALL_TSO) ||
  292. (skb->destructor == sock_wfree &&
  293. rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
  294. }
  295. static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
  296. {
  297. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  298. struct veth_rq *rq = NULL;
  299. int ret = NETDEV_TX_OK;
  300. struct net_device *rcv;
  301. int length = skb->len;
  302. bool use_napi = false;
  303. int rxq;
  304. rcu_read_lock();
  305. rcv = rcu_dereference(priv->peer);
  306. if (unlikely(!rcv) || !pskb_may_pull(skb, ETH_HLEN)) {
  307. kfree_skb(skb);
  308. goto drop;
  309. }
  310. rcv_priv = netdev_priv(rcv);
  311. rxq = skb_get_queue_mapping(skb);
  312. if (rxq < rcv->real_num_rx_queues) {
  313. rq = &rcv_priv->rq[rxq];
  314. /* The napi pointer is available when an XDP program is
  315. * attached or when GRO is enabled
  316. * Don't bother with napi/GRO if the skb can't be aggregated
  317. */
  318. use_napi = rcu_access_pointer(rq->napi) &&
  319. veth_skb_is_eligible_for_gro(dev, rcv, skb);
  320. }
  321. skb_tx_timestamp(skb);
  322. if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
  323. if (!use_napi)
  324. dev_sw_netstats_tx_add(dev, 1, length);
  325. else
  326. __veth_xdp_flush(rq);
  327. } else {
  328. drop:
  329. atomic64_inc(&priv->dropped);
  330. ret = NET_XMIT_DROP;
  331. }
  332. rcu_read_unlock();
  333. return ret;
  334. }
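/* A veth pair is normally created from userspace, e.g. with iproute2
 * (interface names are examples):
 *   ip link add veth0 type veth peer name veth1
 *   ip link set veth0 up
 *   ip link set veth1 up
 * Every frame transmitted on one end goes through veth_xmit() above and is
 * delivered to the peer, either directly via __netif_rx() or, when the peer
 * has NAPI active (GRO or XDP), through the peer's xdp_ring and veth_poll().
 */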
  335. static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
  336. {
  337. struct veth_priv *priv = netdev_priv(dev);
  338. int i;
  339. result->peer_tq_xdp_xmit_err = 0;
  340. result->xdp_packets = 0;
  341. result->xdp_tx_err = 0;
  342. result->xdp_bytes = 0;
  343. result->rx_drops = 0;
  344. for (i = 0; i < dev->num_rx_queues; i++) {
  345. u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
  346. struct veth_rq_stats *stats = &priv->rq[i].stats;
  347. unsigned int start;
  348. do {
  349. start = u64_stats_fetch_begin(&stats->syncp);
  350. peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
  351. xdp_tx_err = stats->vs.xdp_tx_err;
  352. packets = stats->vs.xdp_packets;
  353. bytes = stats->vs.xdp_bytes;
  354. drops = stats->vs.rx_drops;
  355. } while (u64_stats_fetch_retry(&stats->syncp, start));
  356. result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
  357. result->xdp_tx_err += xdp_tx_err;
  358. result->xdp_packets += packets;
  359. result->xdp_bytes += bytes;
  360. result->rx_drops += drops;
  361. }
  362. }
  363. static void veth_get_stats64(struct net_device *dev,
  364. struct rtnl_link_stats64 *tot)
  365. {
  366. struct veth_priv *priv = netdev_priv(dev);
  367. struct net_device *peer;
  368. struct veth_stats rx;
  369. tot->tx_dropped = atomic64_read(&priv->dropped);
  370. dev_fetch_sw_netstats(tot, dev->tstats);
  371. veth_stats_rx(&rx, dev);
  372. tot->tx_dropped += rx.xdp_tx_err;
  373. tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
  374. tot->rx_bytes += rx.xdp_bytes;
  375. tot->rx_packets += rx.xdp_packets;
  376. rcu_read_lock();
  377. peer = rcu_dereference(priv->peer);
  378. if (peer) {
  379. struct rtnl_link_stats64 tot_peer = {};
  380. dev_fetch_sw_netstats(&tot_peer, peer->tstats);
  381. tot->rx_bytes += tot_peer.tx_bytes;
  382. tot->rx_packets += tot_peer.tx_packets;
  383. veth_stats_rx(&rx, peer);
  384. tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
  385. tot->rx_dropped += rx.xdp_tx_err;
  386. tot->tx_bytes += rx.xdp_bytes;
  387. tot->tx_packets += rx.xdp_packets;
  388. }
  389. rcu_read_unlock();
  390. }
  391. /* fake multicast ability */
  392. static void veth_set_multicast_list(struct net_device *dev)
  393. {
  394. }
  395. static int veth_select_rxq(struct net_device *dev)
  396. {
  397. return smp_processor_id() % dev->real_num_rx_queues;
  398. }
  399. static struct net_device *veth_peer_dev(struct net_device *dev)
  400. {
  401. struct veth_priv *priv = netdev_priv(dev);
  402. /* Callers must be under RCU read side. */
  403. return rcu_dereference(priv->peer);
  404. }
  405. static int veth_xdp_xmit(struct net_device *dev, int n,
  406. struct xdp_frame **frames,
  407. u32 flags, bool ndo_xmit)
  408. {
  409. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  410. int i, ret = -ENXIO, nxmit = 0;
  411. struct net_device *rcv;
  412. unsigned int max_len;
  413. struct veth_rq *rq;
  414. if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
  415. return -EINVAL;
  416. rcu_read_lock();
  417. rcv = rcu_dereference(priv->peer);
  418. if (unlikely(!rcv))
  419. goto out;
  420. rcv_priv = netdev_priv(rcv);
  421. rq = &rcv_priv->rq[veth_select_rxq(rcv)];
  422. /* The napi pointer is set if NAPI is enabled, which ensures that
  423. * xdp_ring is initialized on receive side and the peer device is up.
  424. */
  425. if (!rcu_access_pointer(rq->napi))
  426. goto out;
  427. max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
  428. spin_lock(&rq->xdp_ring.producer_lock);
  429. for (i = 0; i < n; i++) {
  430. struct xdp_frame *frame = frames[i];
  431. void *ptr = veth_xdp_to_ptr(frame);
  432. if (unlikely(xdp_get_frame_len(frame) > max_len ||
  433. __ptr_ring_produce(&rq->xdp_ring, ptr)))
  434. break;
  435. nxmit++;
  436. }
  437. spin_unlock(&rq->xdp_ring.producer_lock);
  438. if (flags & XDP_XMIT_FLUSH)
  439. __veth_xdp_flush(rq);
  440. ret = nxmit;
  441. if (ndo_xmit) {
  442. u64_stats_update_begin(&rq->stats.syncp);
  443. rq->stats.vs.peer_tq_xdp_xmit += nxmit;
  444. rq->stats.vs.peer_tq_xdp_xmit_err += n - nxmit;
  445. u64_stats_update_end(&rq->stats.syncp);
  446. }
  447. out:
  448. rcu_read_unlock();
  449. return ret;
  450. }
  451. static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
  452. struct xdp_frame **frames, u32 flags)
  453. {
  454. int err;
  455. err = veth_xdp_xmit(dev, n, frames, flags, true);
  456. if (err < 0) {
  457. struct veth_priv *priv = netdev_priv(dev);
  458. atomic64_add(n, &priv->dropped);
  459. }
  460. return err;
  461. }
  462. static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
  463. {
  464. int sent, i, err = 0, drops;
  465. sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
  466. if (sent < 0) {
  467. err = sent;
  468. sent = 0;
  469. }
  470. for (i = sent; unlikely(i < bq->count); i++)
  471. xdp_return_frame(bq->q[i]);
  472. drops = bq->count - sent;
  473. trace_xdp_bulk_tx(rq->dev, sent, drops, err);
  474. u64_stats_update_begin(&rq->stats.syncp);
  475. rq->stats.vs.xdp_tx += sent;
  476. rq->stats.vs.xdp_tx_err += drops;
  477. u64_stats_update_end(&rq->stats.syncp);
  478. bq->count = 0;
  479. }
  480. static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
  481. {
  482. struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
  483. struct net_device *rcv;
  484. struct veth_rq *rcv_rq;
  485. rcu_read_lock();
  486. veth_xdp_flush_bq(rq, bq);
  487. rcv = rcu_dereference(priv->peer);
  488. if (unlikely(!rcv))
  489. goto out;
  490. rcv_priv = netdev_priv(rcv);
  491. rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
  492. /* is the xdp_ring initialized on the receive side? */
  493. if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
  494. goto out;
  495. __veth_xdp_flush(rcv_rq);
  496. out:
  497. rcu_read_unlock();
  498. }
  499. static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
  500. struct veth_xdp_tx_bq *bq)
  501. {
  502. struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
  503. if (unlikely(!frame))
  504. return -EOVERFLOW;
  505. if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
  506. veth_xdp_flush_bq(rq, bq);
  507. bq->q[bq->count++] = frame;
  508. return 0;
  509. }
  510. static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
  511. struct xdp_frame *frame,
  512. struct veth_xdp_tx_bq *bq,
  513. struct veth_stats *stats)
  514. {
  515. struct xdp_frame orig_frame;
  516. struct bpf_prog *xdp_prog;
  517. rcu_read_lock();
  518. xdp_prog = rcu_dereference(rq->xdp_prog);
  519. if (likely(xdp_prog)) {
  520. struct veth_xdp_buff vxbuf;
  521. struct xdp_buff *xdp = &vxbuf.xdp;
  522. u32 act;
  523. xdp_convert_frame_to_buff(frame, xdp);
  524. xdp->rxq = &rq->xdp_rxq;
  525. vxbuf.skb = NULL;
  526. act = bpf_prog_run_xdp(xdp_prog, xdp);
  527. switch (act) {
  528. case XDP_PASS:
  529. if (xdp_update_frame_from_buff(xdp, frame))
  530. goto err_xdp;
  531. break;
  532. case XDP_TX:
  533. orig_frame = *frame;
  534. xdp->rxq->mem = frame->mem;
  535. if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
  536. trace_xdp_exception(rq->dev, xdp_prog, act);
  537. frame = &orig_frame;
  538. stats->rx_drops++;
  539. goto err_xdp;
  540. }
  541. stats->xdp_tx++;
  542. rcu_read_unlock();
  543. goto xdp_xmit;
  544. case XDP_REDIRECT:
  545. orig_frame = *frame;
  546. xdp->rxq->mem = frame->mem;
  547. if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
  548. frame = &orig_frame;
  549. stats->rx_drops++;
  550. goto err_xdp;
  551. }
  552. stats->xdp_redirect++;
  553. rcu_read_unlock();
  554. goto xdp_xmit;
  555. default:
  556. bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
  557. fallthrough;
  558. case XDP_ABORTED:
  559. trace_xdp_exception(rq->dev, xdp_prog, act);
  560. fallthrough;
  561. case XDP_DROP:
  562. stats->xdp_drops++;
  563. goto err_xdp;
  564. }
  565. }
  566. rcu_read_unlock();
  567. return frame;
  568. err_xdp:
  569. rcu_read_unlock();
  570. xdp_return_frame(frame);
  571. xdp_xmit:
  572. return NULL;
  573. }
  574. /* the frames array contains at most VETH_XDP_BATCH entries */
  575. static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
  576. int n_xdpf, struct veth_xdp_tx_bq *bq,
  577. struct veth_stats *stats)
  578. {
  579. void *skbs[VETH_XDP_BATCH];
  580. int i;
  581. if (xdp_alloc_skb_bulk(skbs, n_xdpf,
  582. GFP_ATOMIC | __GFP_ZERO) < 0) {
  583. for (i = 0; i < n_xdpf; i++)
  584. xdp_return_frame(frames[i]);
  585. stats->rx_drops += n_xdpf;
  586. return;
  587. }
  588. for (i = 0; i < n_xdpf; i++) {
  589. struct sk_buff *skb = skbs[i];
  590. skb = __xdp_build_skb_from_frame(frames[i], skb,
  591. rq->dev);
  592. if (!skb) {
  593. xdp_return_frame(frames[i]);
  594. stats->rx_drops++;
  595. continue;
  596. }
  597. napi_gro_receive(&rq->xdp_napi, skb);
  598. }
  599. }
  600. static void veth_xdp_get(struct xdp_buff *xdp)
  601. {
  602. struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
  603. int i;
  604. get_page(virt_to_page(xdp->data));
  605. if (likely(!xdp_buff_has_frags(xdp)))
  606. return;
  607. for (i = 0; i < sinfo->nr_frags; i++)
  608. __skb_frag_ref(&sinfo->frags[i]);
  609. }
  610. static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
  611. struct xdp_buff *xdp,
  612. struct sk_buff **pskb)
  613. {
  614. struct sk_buff *skb = *pskb;
  615. u32 frame_sz;
  616. if (skb_shared(skb) || skb_head_is_locked(skb) ||
  617. skb_shinfo(skb)->nr_frags ||
  618. skb_headroom(skb) < XDP_PACKET_HEADROOM) {
  619. if (skb_pp_cow_data(rq->page_pool, pskb, XDP_PACKET_HEADROOM))
  620. goto drop;
  621. skb = *pskb;
  622. }
  623. /* SKB "head" area always has tailroom for skb_shared_info */
  624. frame_sz = skb_end_pointer(skb) - skb->head;
  625. frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  626. xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
  627. xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
  628. skb_headlen(skb), true);
  629. if (skb_is_nonlinear(skb)) {
  630. skb_shinfo(skb)->xdp_frags_size = skb->data_len;
  631. xdp_buff_set_frags_flag(xdp);
  632. } else {
  633. xdp_buff_clear_frags_flag(xdp);
  634. }
  635. *pskb = skb;
  636. return 0;
  637. drop:
  638. consume_skb(skb);
  639. *pskb = NULL;
  640. return -ENOMEM;
  641. }
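/* veth_convert_skb_to_xdp_buff() above guarantees that the buffer handed to
 * the XDP program is safe to write: skbs that are shared or cloned, carry
 * frags, or lack XDP_PACKET_HEADROOM of headroom are first copied into
 * page_pool-backed memory by skb_pp_cow_data(); only then is the xdp_buff
 * initialized over the (possibly new) head.
 */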
  642. static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
  643. struct sk_buff *skb,
  644. struct veth_xdp_tx_bq *bq,
  645. struct veth_stats *stats)
  646. {
  647. void *orig_data, *orig_data_end;
  648. struct bpf_prog *xdp_prog;
  649. struct veth_xdp_buff vxbuf;
  650. struct xdp_buff *xdp = &vxbuf.xdp;
  651. u32 act, metalen;
  652. int off;
  653. skb_prepare_for_gro(skb);
  654. rcu_read_lock();
  655. xdp_prog = rcu_dereference(rq->xdp_prog);
  656. if (unlikely(!xdp_prog)) {
  657. rcu_read_unlock();
  658. goto out;
  659. }
  660. __skb_push(skb, skb->data - skb_mac_header(skb));
  661. if (veth_convert_skb_to_xdp_buff(rq, xdp, &skb))
  662. goto drop;
  663. vxbuf.skb = skb;
  664. orig_data = xdp->data;
  665. orig_data_end = xdp->data_end;
  666. act = bpf_prog_run_xdp(xdp_prog, xdp);
  667. switch (act) {
  668. case XDP_PASS:
  669. break;
  670. case XDP_TX:
  671. veth_xdp_get(xdp);
  672. consume_skb(skb);
  673. xdp->rxq->mem = rq->xdp_mem;
  674. if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) {
  675. trace_xdp_exception(rq->dev, xdp_prog, act);
  676. stats->rx_drops++;
  677. goto err_xdp;
  678. }
  679. stats->xdp_tx++;
  680. rcu_read_unlock();
  681. goto xdp_xmit;
  682. case XDP_REDIRECT:
  683. veth_xdp_get(xdp);
  684. consume_skb(skb);
  685. xdp->rxq->mem = rq->xdp_mem;
  686. if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) {
  687. stats->rx_drops++;
  688. goto err_xdp;
  689. }
  690. stats->xdp_redirect++;
  691. rcu_read_unlock();
  692. goto xdp_xmit;
  693. default:
  694. bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act);
  695. fallthrough;
  696. case XDP_ABORTED:
  697. trace_xdp_exception(rq->dev, xdp_prog, act);
  698. fallthrough;
  699. case XDP_DROP:
  700. stats->xdp_drops++;
  701. goto xdp_drop;
  702. }
  703. rcu_read_unlock();
  704. /* check if bpf_xdp_adjust_head was used */
  705. off = orig_data - xdp->data;
  706. if (off > 0)
  707. __skb_push(skb, off);
  708. else if (off < 0)
  709. __skb_pull(skb, -off);
  710. skb_reset_mac_header(skb);
  711. /* check if bpf_xdp_adjust_tail was used */
  712. off = xdp->data_end - orig_data_end;
  713. if (off != 0)
  714. __skb_put(skb, off); /* positive on grow, negative on shrink */
  715. /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
  716. * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
  717. */
  718. if (xdp_buff_has_frags(xdp))
  719. skb->data_len = skb_shinfo(skb)->xdp_frags_size;
  720. else
  721. skb->data_len = 0;
  722. skb->protocol = eth_type_trans(skb, rq->dev);
  723. metalen = xdp->data - xdp->data_meta;
  724. if (metalen)
  725. skb_metadata_set(skb, metalen);
  726. out:
  727. return skb;
  728. drop:
  729. stats->rx_drops++;
  730. xdp_drop:
  731. rcu_read_unlock();
  732. kfree_skb(skb);
  733. return NULL;
  734. err_xdp:
  735. rcu_read_unlock();
  736. xdp_return_buff(xdp);
  737. xdp_xmit:
  738. return NULL;
  739. }
  740. static int veth_xdp_rcv(struct veth_rq *rq, int budget,
  741. struct veth_xdp_tx_bq *bq,
  742. struct veth_stats *stats)
  743. {
  744. int i, done = 0, n_xdpf = 0;
  745. void *xdpf[VETH_XDP_BATCH];
  746. for (i = 0; i < budget; i++) {
  747. void *ptr = __ptr_ring_consume(&rq->xdp_ring);
  748. if (!ptr)
  749. break;
  750. if (veth_is_xdp_frame(ptr)) {
  751. /* ndo_xdp_xmit */
  752. struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
  753. stats->xdp_bytes += xdp_get_frame_len(frame);
  754. frame = veth_xdp_rcv_one(rq, frame, bq, stats);
  755. if (frame) {
  756. /* XDP_PASS */
  757. xdpf[n_xdpf++] = frame;
  758. if (n_xdpf == VETH_XDP_BATCH) {
  759. veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf,
  760. bq, stats);
  761. n_xdpf = 0;
  762. }
  763. }
  764. } else {
  765. /* ndo_start_xmit */
  766. struct sk_buff *skb = ptr;
  767. stats->xdp_bytes += skb->len;
  768. skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
  769. if (skb) {
  770. if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
  771. netif_receive_skb(skb);
  772. else
  773. napi_gro_receive(&rq->xdp_napi, skb);
  774. }
  775. }
  776. done++;
  777. }
  778. if (n_xdpf)
  779. veth_xdp_rcv_bulk_skb(rq, xdpf, n_xdpf, bq, stats);
  780. u64_stats_update_begin(&rq->stats.syncp);
  781. rq->stats.vs.xdp_redirect += stats->xdp_redirect;
  782. rq->stats.vs.xdp_bytes += stats->xdp_bytes;
  783. rq->stats.vs.xdp_drops += stats->xdp_drops;
  784. rq->stats.vs.rx_drops += stats->rx_drops;
  785. rq->stats.vs.xdp_packets += done;
  786. u64_stats_update_end(&rq->stats.syncp);
  787. return done;
  788. }
  789. static int veth_poll(struct napi_struct *napi, int budget)
  790. {
  791. struct veth_rq *rq =
  792. container_of(napi, struct veth_rq, xdp_napi);
  793. struct veth_stats stats = {};
  794. struct veth_xdp_tx_bq bq;
  795. int done;
  796. bq.count = 0;
  797. xdp_set_return_frame_no_direct();
  798. done = veth_xdp_rcv(rq, budget, &bq, &stats);
  799. if (stats.xdp_redirect > 0)
  800. xdp_do_flush();
  801. if (done < budget && napi_complete_done(napi, done)) {
  802. /* Write rx_notify_masked before reading ptr_ring */
  803. smp_store_mb(rq->rx_notify_masked, false);
  804. if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
  805. if (napi_schedule_prep(&rq->xdp_napi)) {
  806. WRITE_ONCE(rq->rx_notify_masked, true);
  807. __napi_schedule(&rq->xdp_napi);
  808. }
  809. }
  810. }
  811. if (stats.xdp_tx > 0)
  812. veth_xdp_flush(rq, &bq);
  813. xdp_clear_return_frame_no_direct();
  814. return done;
  815. }
  816. static int veth_create_page_pool(struct veth_rq *rq)
  817. {
  818. struct page_pool_params pp_params = {
  819. .order = 0,
  820. .pool_size = VETH_RING_SIZE,
  821. .nid = NUMA_NO_NODE,
  822. .dev = &rq->dev->dev,
  823. };
  824. rq->page_pool = page_pool_create(&pp_params);
  825. if (IS_ERR(rq->page_pool)) {
  826. int err = PTR_ERR(rq->page_pool);
  827. rq->page_pool = NULL;
  828. return err;
  829. }
  830. return 0;
  831. }
  832. static int __veth_napi_enable_range(struct net_device *dev, int start, int end)
  833. {
  834. struct veth_priv *priv = netdev_priv(dev);
  835. int err, i;
  836. for (i = start; i < end; i++) {
  837. err = veth_create_page_pool(&priv->rq[i]);
  838. if (err)
  839. goto err_page_pool;
  840. }
  841. for (i = start; i < end; i++) {
  842. struct veth_rq *rq = &priv->rq[i];
  843. err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
  844. if (err)
  845. goto err_xdp_ring;
  846. }
  847. for (i = start; i < end; i++) {
  848. struct veth_rq *rq = &priv->rq[i];
  849. napi_enable(&rq->xdp_napi);
  850. rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
  851. }
  852. return 0;
  853. err_xdp_ring:
  854. for (i--; i >= start; i--)
  855. ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
  856. i = end;
  857. err_page_pool:
  858. for (i--; i >= start; i--) {
  859. page_pool_destroy(priv->rq[i].page_pool);
  860. priv->rq[i].page_pool = NULL;
  861. }
  862. return err;
  863. }
  864. static int __veth_napi_enable(struct net_device *dev)
  865. {
  866. return __veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
  867. }
  868. static void veth_napi_del_range(struct net_device *dev, int start, int end)
  869. {
  870. struct veth_priv *priv = netdev_priv(dev);
  871. int i;
  872. for (i = start; i < end; i++) {
  873. struct veth_rq *rq = &priv->rq[i];
  874. rcu_assign_pointer(priv->rq[i].napi, NULL);
  875. napi_disable(&rq->xdp_napi);
  876. __netif_napi_del(&rq->xdp_napi);
  877. }
  878. synchronize_net();
  879. for (i = start; i < end; i++) {
  880. struct veth_rq *rq = &priv->rq[i];
  881. rq->rx_notify_masked = false;
  882. ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
  883. }
  884. for (i = start; i < end; i++) {
  885. page_pool_destroy(priv->rq[i].page_pool);
  886. priv->rq[i].page_pool = NULL;
  887. }
  888. }
  889. static void veth_napi_del(struct net_device *dev)
  890. {
  891. veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
  892. }
  893. static bool veth_gro_requested(const struct net_device *dev)
  894. {
  895. return !!(dev->wanted_features & NETIF_F_GRO);
  896. }
  897. static int veth_enable_xdp_range(struct net_device *dev, int start, int end,
  898. bool napi_already_on)
  899. {
  900. struct veth_priv *priv = netdev_priv(dev);
  901. int err, i;
  902. for (i = start; i < end; i++) {
  903. struct veth_rq *rq = &priv->rq[i];
  904. if (!napi_already_on)
  905. netif_napi_add(dev, &rq->xdp_napi, veth_poll);
  906. err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
  907. if (err < 0)
  908. goto err_rxq_reg;
  909. err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
  910. MEM_TYPE_PAGE_SHARED,
  911. NULL);
  912. if (err < 0)
  913. goto err_reg_mem;
  914. /* Save original mem info as it can be overwritten */
  915. rq->xdp_mem = rq->xdp_rxq.mem;
  916. }
  917. return 0;
  918. err_reg_mem:
  919. xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
  920. err_rxq_reg:
  921. for (i--; i >= start; i--) {
  922. struct veth_rq *rq = &priv->rq[i];
  923. xdp_rxq_info_unreg(&rq->xdp_rxq);
  924. if (!napi_already_on)
  925. netif_napi_del(&rq->xdp_napi);
  926. }
  927. return err;
  928. }
  929. static void veth_disable_xdp_range(struct net_device *dev, int start, int end,
  930. bool delete_napi)
  931. {
  932. struct veth_priv *priv = netdev_priv(dev);
  933. int i;
  934. for (i = start; i < end; i++) {
  935. struct veth_rq *rq = &priv->rq[i];
  936. rq->xdp_rxq.mem = rq->xdp_mem;
  937. xdp_rxq_info_unreg(&rq->xdp_rxq);
  938. if (delete_napi)
  939. netif_napi_del(&rq->xdp_napi);
  940. }
  941. }
  942. static int veth_enable_xdp(struct net_device *dev)
  943. {
  944. bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
  945. struct veth_priv *priv = netdev_priv(dev);
  946. int err, i;
  947. if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
  948. err = veth_enable_xdp_range(dev, 0, dev->real_num_rx_queues, napi_already_on);
  949. if (err)
  950. return err;
  951. if (!napi_already_on) {
  952. err = __veth_napi_enable(dev);
  953. if (err) {
  954. veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
  955. return err;
  956. }
  957. }
  958. }
  959. for (i = 0; i < dev->real_num_rx_queues; i++) {
  960. rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
  961. rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
  962. }
  963. return 0;
  964. }
  965. static void veth_disable_xdp(struct net_device *dev)
  966. {
  967. struct veth_priv *priv = netdev_priv(dev);
  968. int i;
  969. for (i = 0; i < dev->real_num_rx_queues; i++)
  970. rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
  971. if (!netif_running(dev) || !veth_gro_requested(dev))
  972. veth_napi_del(dev);
  973. veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
  974. }
  975. static int veth_napi_enable_range(struct net_device *dev, int start, int end)
  976. {
  977. struct veth_priv *priv = netdev_priv(dev);
  978. int err, i;
  979. for (i = start; i < end; i++) {
  980. struct veth_rq *rq = &priv->rq[i];
  981. netif_napi_add(dev, &rq->xdp_napi, veth_poll);
  982. }
  983. err = __veth_napi_enable_range(dev, start, end);
  984. if (err) {
  985. for (i = start; i < end; i++) {
  986. struct veth_rq *rq = &priv->rq[i];
  987. netif_napi_del(&rq->xdp_napi);
  988. }
  989. return err;
  990. }
  991. return err;
  992. }
  993. static int veth_napi_enable(struct net_device *dev)
  994. {
  995. return veth_napi_enable_range(dev, 0, dev->real_num_rx_queues);
  996. }
  997. static void veth_disable_range_safe(struct net_device *dev, int start, int end)
  998. {
  999. struct veth_priv *priv = netdev_priv(dev);
  1000. if (start >= end)
  1001. return;
  1002. if (priv->_xdp_prog) {
  1003. veth_napi_del_range(dev, start, end);
  1004. veth_disable_xdp_range(dev, start, end, false);
  1005. } else if (veth_gro_requested(dev)) {
  1006. veth_napi_del_range(dev, start, end);
  1007. }
  1008. }
  1009. static int veth_enable_range_safe(struct net_device *dev, int start, int end)
  1010. {
  1011. struct veth_priv *priv = netdev_priv(dev);
  1012. int err;
  1013. if (start >= end)
  1014. return 0;
  1015. if (priv->_xdp_prog) {
  1016. /* these channels are freshly initialized, napi is not on them even
  1017. * when GRO is requested
  1018. */
  1019. err = veth_enable_xdp_range(dev, start, end, false);
  1020. if (err)
  1021. return err;
  1022. err = __veth_napi_enable_range(dev, start, end);
  1023. if (err) {
  1024. /* on error always delete the newly added napis */
  1025. veth_disable_xdp_range(dev, start, end, true);
  1026. return err;
  1027. }
  1028. } else if (veth_gro_requested(dev)) {
  1029. return veth_napi_enable_range(dev, start, end);
  1030. }
  1031. return 0;
  1032. }
  1033. static void veth_set_xdp_features(struct net_device *dev)
  1034. {
  1035. struct veth_priv *priv = netdev_priv(dev);
  1036. struct net_device *peer;
  1037. peer = rtnl_dereference(priv->peer);
  1038. if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) {
  1039. struct veth_priv *priv_peer = netdev_priv(peer);
  1040. xdp_features_t val = NETDEV_XDP_ACT_BASIC |
  1041. NETDEV_XDP_ACT_REDIRECT |
  1042. NETDEV_XDP_ACT_RX_SG;
  1043. if (priv_peer->_xdp_prog || veth_gro_requested(peer))
  1044. val |= NETDEV_XDP_ACT_NDO_XMIT |
  1045. NETDEV_XDP_ACT_NDO_XMIT_SG;
  1046. xdp_set_features_flag(dev, val);
  1047. } else {
  1048. xdp_clear_features_flag(dev);
  1049. }
  1050. }
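/* Note that a veth device advertises NDO_XMIT (i.e. it can be the target of
 * an XDP_REDIRECT) only while its peer runs NAPI, that is, while the peer
 * has an XDP program attached or GRO requested, because veth_xdp_xmit()
 * enqueues frames into the peer's xdp_ring, which exists only in that case.
 */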
  1051. static int veth_set_channels(struct net_device *dev,
  1052. struct ethtool_channels *ch)
  1053. {
  1054. struct veth_priv *priv = netdev_priv(dev);
  1055. unsigned int old_rx_count, new_rx_count;
  1056. struct veth_priv *peer_priv;
  1057. struct net_device *peer;
  1058. int err;
  1059. /* sanity check. Upper bounds are already enforced by the caller */
  1060. if (!ch->rx_count || !ch->tx_count)
  1061. return -EINVAL;
  1062. /* avoid breaking XDP, if that is enabled */
  1063. peer = rtnl_dereference(priv->peer);
  1064. peer_priv = peer ? netdev_priv(peer) : NULL;
  1065. if (priv->_xdp_prog && peer && ch->rx_count < peer->real_num_tx_queues)
  1066. return -EINVAL;
  1067. if (peer && peer_priv && peer_priv->_xdp_prog && ch->tx_count > peer->real_num_rx_queues)
  1068. return -EINVAL;
  1069. old_rx_count = dev->real_num_rx_queues;
  1070. new_rx_count = ch->rx_count;
  1071. if (netif_running(dev)) {
  1072. /* turn device off */
  1073. netif_carrier_off(dev);
  1074. if (peer)
  1075. netif_carrier_off(peer);
  1076. /* try to allocate new resources, as needed */
  1077. err = veth_enable_range_safe(dev, old_rx_count, new_rx_count);
  1078. if (err)
  1079. goto out;
  1080. }
  1081. err = netif_set_real_num_rx_queues(dev, ch->rx_count);
  1082. if (err)
  1083. goto revert;
  1084. err = netif_set_real_num_tx_queues(dev, ch->tx_count);
  1085. if (err) {
  1086. int err2 = netif_set_real_num_rx_queues(dev, old_rx_count);
  1087. /* this error condition could happen only if rx and tx change
  1088. * in opposite directions (e.g. the tx count rises while the rx count decreases)
  1089. * and we can't do anything to fully restore the original
  1090. * status
  1091. */
  1092. if (err2)
  1093. pr_warn("Can't restore rx queues config %d -> %d %d",
  1094. new_rx_count, old_rx_count, err2);
  1095. else
  1096. goto revert;
  1097. }
  1098. out:
  1099. if (netif_running(dev)) {
  1100. /* note that we need to swap the arguments WRT the enable part
  1101. * to identify the range we have to disable
  1102. */
  1103. veth_disable_range_safe(dev, new_rx_count, old_rx_count);
  1104. netif_carrier_on(dev);
  1105. if (peer)
  1106. netif_carrier_on(peer);
  1107. }
  1108. /* update XDP supported features */
  1109. veth_set_xdp_features(dev);
  1110. if (peer)
  1111. veth_set_xdp_features(peer);
  1112. return err;
  1113. revert:
  1114. new_rx_count = old_rx_count;
  1115. old_rx_count = ch->rx_count;
  1116. goto out;
  1117. }
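/* veth_set_channels() grows first and shrinks last: while the device is
 * running, any rx queues being added are fully initialized by
 * veth_enable_range_safe() before the real queue counts change, and only
 * after a successful resize are the now unused queues torn down (note the
 * swapped arguments passed to veth_disable_range_safe()).  On failure the
 * "revert" label swaps the counts so that the same call tears down the
 * queues that were just enabled.
 */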
  1118. static int veth_open(struct net_device *dev)
  1119. {
  1120. struct veth_priv *priv = netdev_priv(dev);
  1121. struct net_device *peer = rtnl_dereference(priv->peer);
  1122. int err;
  1123. if (!peer)
  1124. return -ENOTCONN;
  1125. if (priv->_xdp_prog) {
  1126. err = veth_enable_xdp(dev);
  1127. if (err)
  1128. return err;
  1129. } else if (veth_gro_requested(dev)) {
  1130. err = veth_napi_enable(dev);
  1131. if (err)
  1132. return err;
  1133. }
  1134. if (peer->flags & IFF_UP) {
  1135. netif_carrier_on(dev);
  1136. netif_carrier_on(peer);
  1137. }
  1138. veth_set_xdp_features(dev);
  1139. return 0;
  1140. }
  1141. static int veth_close(struct net_device *dev)
  1142. {
  1143. struct veth_priv *priv = netdev_priv(dev);
  1144. struct net_device *peer = rtnl_dereference(priv->peer);
  1145. netif_carrier_off(dev);
  1146. if (peer)
  1147. netif_carrier_off(peer);
  1148. if (priv->_xdp_prog)
  1149. veth_disable_xdp(dev);
  1150. else if (veth_gro_requested(dev))
  1151. veth_napi_del(dev);
  1152. return 0;
  1153. }
  1154. static int is_valid_veth_mtu(int mtu)
  1155. {
  1156. return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
  1157. }
  1158. static int veth_alloc_queues(struct net_device *dev)
  1159. {
  1160. struct veth_priv *priv = netdev_priv(dev);
  1161. int i;
  1162. priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
  1163. GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
  1164. if (!priv->rq)
  1165. return -ENOMEM;
  1166. for (i = 0; i < dev->num_rx_queues; i++) {
  1167. priv->rq[i].dev = dev;
  1168. u64_stats_init(&priv->rq[i].stats.syncp);
  1169. }
  1170. return 0;
  1171. }
  1172. static void veth_free_queues(struct net_device *dev)
  1173. {
  1174. struct veth_priv *priv = netdev_priv(dev);
  1175. kvfree(priv->rq);
  1176. }
  1177. static int veth_dev_init(struct net_device *dev)
  1178. {
  1179. netdev_lockdep_set_classes(dev);
  1180. return veth_alloc_queues(dev);
  1181. }
  1182. static void veth_dev_free(struct net_device *dev)
  1183. {
  1184. veth_free_queues(dev);
  1185. }
  1186. #ifdef CONFIG_NET_POLL_CONTROLLER
  1187. static void veth_poll_controller(struct net_device *dev)
  1188. {
  1189. /* veth only receives frames when its peer sends one.
  1190. * Since it has nothing to do with disabling irqs, we are guaranteed
  1191. * never to have pending data when we poll for it, so
  1192. * there is nothing to do here.
  1193. *
  1194. * We need this though so netpoll recognizes us as an interface that
  1195. * supports polling, which enables bridge devices in virt setups to
  1196. * still use netconsole
  1197. */
  1198. }
  1199. #endif /* CONFIG_NET_POLL_CONTROLLER */
  1200. static int veth_get_iflink(const struct net_device *dev)
  1201. {
  1202. struct veth_priv *priv = netdev_priv(dev);
  1203. struct net_device *peer;
  1204. int iflink;
  1205. rcu_read_lock();
  1206. peer = rcu_dereference(priv->peer);
  1207. iflink = peer ? READ_ONCE(peer->ifindex) : 0;
  1208. rcu_read_unlock();
  1209. return iflink;
  1210. }
  1211. static netdev_features_t veth_fix_features(struct net_device *dev,
  1212. netdev_features_t features)
  1213. {
  1214. struct veth_priv *priv = netdev_priv(dev);
  1215. struct net_device *peer;
  1216. peer = rtnl_dereference(priv->peer);
  1217. if (peer) {
  1218. struct veth_priv *peer_priv = netdev_priv(peer);
  1219. if (peer_priv->_xdp_prog)
  1220. features &= ~NETIF_F_GSO_SOFTWARE;
  1221. }
  1222. return features;
  1223. }
  1224. static int veth_set_features(struct net_device *dev,
  1225. netdev_features_t features)
  1226. {
  1227. netdev_features_t changed = features ^ dev->features;
  1228. struct veth_priv *priv = netdev_priv(dev);
  1229. struct net_device *peer;
  1230. int err;
  1231. if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
  1232. return 0;
  1233. peer = rtnl_dereference(priv->peer);
  1234. if (features & NETIF_F_GRO) {
  1235. err = veth_napi_enable(dev);
  1236. if (err)
  1237. return err;
  1238. if (peer)
  1239. xdp_features_set_redirect_target(peer, true);
  1240. } else {
  1241. if (peer)
  1242. xdp_features_clear_redirect_target(peer);
  1243. veth_napi_del(dev);
  1244. }
  1245. return 0;
  1246. }
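/* GRO (and with it the NAPI rx path) can be toggled at runtime from
 * userspace, e.g. with ethtool (interface name is an example):
 *   ethtool -K veth0 gro on
 * which lands in veth_set_features() above and enables NAPI even without an
 * XDP program attached.
 */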
  1247. static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
  1248. {
  1249. struct veth_priv *peer_priv, *priv = netdev_priv(dev);
  1250. struct net_device *peer;
  1251. if (new_hr < 0)
  1252. new_hr = 0;
  1253. rcu_read_lock();
  1254. peer = rcu_dereference(priv->peer);
  1255. if (unlikely(!peer))
  1256. goto out;
  1257. peer_priv = netdev_priv(peer);
  1258. priv->requested_headroom = new_hr;
  1259. new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
  1260. dev->needed_headroom = new_hr;
  1261. peer->needed_headroom = new_hr;
  1262. out:
  1263. rcu_read_unlock();
  1264. }
  1265. static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
  1266. struct netlink_ext_ack *extack)
  1267. {
  1268. struct veth_priv *priv = netdev_priv(dev);
  1269. struct bpf_prog *old_prog;
  1270. struct net_device *peer;
  1271. unsigned int max_mtu;
  1272. int err;
  1273. old_prog = priv->_xdp_prog;
  1274. priv->_xdp_prog = prog;
  1275. peer = rtnl_dereference(priv->peer);
  1276. if (prog) {
  1277. if (!peer) {
  1278. NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
  1279. err = -ENOTCONN;
  1280. goto err;
  1281. }
  1282. max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
  1283. peer->hard_header_len;
  1284. /* Allow increasing the max_mtu if the program supports
  1285. * XDP fragments.
  1286. */
  1287. if (prog->aux->xdp_has_frags)
  1288. max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
  1289. if (peer->mtu > max_mtu) {
  1290. NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
  1291. err = -ERANGE;
  1292. goto err;
  1293. }
  1294. if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
  1295. NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
  1296. err = -ENOSPC;
  1297. goto err;
  1298. }
  1299. if (dev->flags & IFF_UP) {
  1300. err = veth_enable_xdp(dev);
  1301. if (err) {
  1302. NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
  1303. goto err;
  1304. }
  1305. }
  1306. if (!old_prog) {
  1307. peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
  1308. peer->max_mtu = max_mtu;
  1309. }
  1310. xdp_features_set_redirect_target(peer, true);
  1311. }
  1312. if (old_prog) {
  1313. if (!prog) {
  1314. if (peer && !veth_gro_requested(dev))
  1315. xdp_features_clear_redirect_target(peer);
  1316. if (dev->flags & IFF_UP)
  1317. veth_disable_xdp(dev);
  1318. if (peer) {
  1319. peer->hw_features |= NETIF_F_GSO_SOFTWARE;
  1320. peer->max_mtu = ETH_MAX_MTU;
  1321. }
  1322. }
  1323. bpf_prog_put(old_prog);
  1324. }
  1325. if ((!!old_prog ^ !!prog) && peer)
  1326. netdev_update_features(peer);
  1327. return 0;
  1328. err:
  1329. priv->_xdp_prog = old_prog;
  1330. return err;
  1331. }
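/* An XDP program reaches veth_xdp_set() above via veth_xdp() below, which
 * handles XDP_SETUP_PROG, e.g. from userspace with iproute2 (object file and
 * section name are examples; exact syntax may vary by iproute2 version):
 *   ip link set dev veth0 xdp obj xdp_prog.o sec xdp
 * or with libbpf's bpf_xdp_attach() helper.
 */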
  1332. static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
  1333. {
  1334. switch (xdp->command) {
  1335. case XDP_SETUP_PROG:
  1336. return veth_xdp_set(dev, xdp->prog, xdp->extack);
  1337. default:
  1338. return -EINVAL;
  1339. }
  1340. }
  1341. static int veth_xdp_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
  1342. {
  1343. struct veth_xdp_buff *_ctx = (void *)ctx;
  1344. if (!_ctx->skb)
  1345. return -ENODATA;
  1346. *timestamp = skb_hwtstamps(_ctx->skb)->hwtstamp;
  1347. return 0;
  1348. }
  1349. static int veth_xdp_rx_hash(const struct xdp_md *ctx, u32 *hash,
  1350. enum xdp_rss_hash_type *rss_type)
  1351. {
  1352. struct veth_xdp_buff *_ctx = (void *)ctx;
  1353. struct sk_buff *skb = _ctx->skb;
  1354. if (!skb)
  1355. return -ENODATA;
  1356. *hash = skb_get_hash(skb);
  1357. *rss_type = skb->l4_hash ? XDP_RSS_TYPE_L4_ANY : XDP_RSS_TYPE_NONE;
  1358. return 0;
  1359. }
  1360. static int veth_xdp_rx_vlan_tag(const struct xdp_md *ctx, __be16 *vlan_proto,
  1361. u16 *vlan_tci)
  1362. {
  1363. const struct veth_xdp_buff *_ctx = (void *)ctx;
  1364. const struct sk_buff *skb = _ctx->skb;
  1365. int err;
  1366. if (!skb)
  1367. return -ENODATA;
  1368. err = __vlan_hwaccel_get_tag(skb, vlan_tci);
  1369. if (err)
  1370. return err;
  1371. *vlan_proto = skb->vlan_proto;
  1372. return err;
  1373. }
  1374. static const struct net_device_ops veth_netdev_ops = {
  1375. .ndo_init = veth_dev_init,
  1376. .ndo_open = veth_open,
  1377. .ndo_stop = veth_close,
  1378. .ndo_start_xmit = veth_xmit,
  1379. .ndo_get_stats64 = veth_get_stats64,
  1380. .ndo_set_rx_mode = veth_set_multicast_list,
  1381. .ndo_set_mac_address = eth_mac_addr,
  1382. #ifdef CONFIG_NET_POLL_CONTROLLER
  1383. .ndo_poll_controller = veth_poll_controller,
  1384. #endif
  1385. .ndo_get_iflink = veth_get_iflink,
  1386. .ndo_fix_features = veth_fix_features,
  1387. .ndo_set_features = veth_set_features,
  1388. .ndo_features_check = passthru_features_check,
  1389. .ndo_set_rx_headroom = veth_set_rx_headroom,
  1390. .ndo_bpf = veth_xdp,
  1391. .ndo_xdp_xmit = veth_ndo_xdp_xmit,
  1392. .ndo_get_peer_dev = veth_peer_dev,
  1393. };
  1394. static const struct xdp_metadata_ops veth_xdp_metadata_ops = {
  1395. .xmo_rx_timestamp = veth_xdp_rx_timestamp,
  1396. .xmo_rx_hash = veth_xdp_rx_hash,
  1397. .xmo_rx_vlan_tag = veth_xdp_rx_vlan_tag,
  1398. };
  1399. #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
  1400. NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
  1401. NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
  1402. NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
  1403. NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
  1404. static void veth_setup(struct net_device *dev)
  1405. {
  1406. ether_setup(dev);
  1407. dev->priv_flags &= ~IFF_TX_SKB_SHARING;
  1408. dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
  1409. dev->priv_flags |= IFF_NO_QUEUE;
  1410. dev->priv_flags |= IFF_PHONY_HEADROOM;
  1411. dev->priv_flags |= IFF_DISABLE_NETPOLL;
  1412. dev->lltx = true;
  1413. dev->netdev_ops = &veth_netdev_ops;
  1414. dev->xdp_metadata_ops = &veth_xdp_metadata_ops;
  1415. dev->ethtool_ops = &veth_ethtool_ops;
  1416. dev->features |= VETH_FEATURES;
  1417. dev->vlan_features = dev->features &
  1418. ~(NETIF_F_HW_VLAN_CTAG_TX |
  1419. NETIF_F_HW_VLAN_STAG_TX |
  1420. NETIF_F_HW_VLAN_CTAG_RX |
  1421. NETIF_F_HW_VLAN_STAG_RX);
  1422. dev->needs_free_netdev = true;
  1423. dev->priv_destructor = veth_dev_free;
  1424. dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
  1425. dev->max_mtu = ETH_MAX_MTU;
  1426. dev->hw_features = VETH_FEATURES;
  1427. dev->hw_enc_features = VETH_FEATURES;
  1428. dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
  1429. netif_set_tso_max_size(dev, GSO_MAX_SIZE);
  1430. }
  1431. /*
  1432. * netlink interface
  1433. */
  1434. static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
  1435. struct netlink_ext_ack *extack)
  1436. {
  1437. if (tb[IFLA_ADDRESS]) {
  1438. if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
  1439. return -EINVAL;
  1440. if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
  1441. return -EADDRNOTAVAIL;
  1442. }
  1443. if (tb[IFLA_MTU]) {
  1444. if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
  1445. return -EINVAL;
  1446. }
  1447. return 0;
  1448. }
  1449. static struct rtnl_link_ops veth_link_ops;
  1450. static void veth_disable_gro(struct net_device *dev)
  1451. {
  1452. dev->features &= ~NETIF_F_GRO;
  1453. dev->wanted_features &= ~NETIF_F_GRO;
  1454. netdev_update_features(dev);
  1455. }
  1456. static int veth_init_queues(struct net_device *dev, struct nlattr *tb[])
  1457. {
  1458. int err;
  1459. if (!tb[IFLA_NUM_TX_QUEUES] && dev->num_tx_queues > 1) {
  1460. err = netif_set_real_num_tx_queues(dev, 1);
  1461. if (err)
  1462. return err;
  1463. }
  1464. if (!tb[IFLA_NUM_RX_QUEUES] && dev->num_rx_queues > 1) {
  1465. err = netif_set_real_num_rx_queues(dev, 1);
  1466. if (err)
  1467. return err;
  1468. }
  1469. return 0;
  1470. }
  1471. static int veth_newlink(struct net *src_net, struct net_device *dev,
  1472. struct nlattr *tb[], struct nlattr *data[],
  1473. struct netlink_ext_ack *extack)
  1474. {
  1475. int err;
  1476. struct net_device *peer;
  1477. struct veth_priv *priv;
  1478. char ifname[IFNAMSIZ];
  1479. struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
  1480. unsigned char name_assign_type;
  1481. struct ifinfomsg *ifmp;
  1482. struct net *net;
  1483. /*
  1484. * create and register peer first
  1485. */
  1486. if (data != NULL && data[VETH_INFO_PEER] != NULL) {
  1487. struct nlattr *nla_peer;
  1488. nla_peer = data[VETH_INFO_PEER];
  1489. ifmp = nla_data(nla_peer);
  1490. err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
  1491. if (err < 0)
  1492. return err;
  1493. err = veth_validate(peer_tb, NULL, extack);
  1494. if (err < 0)
  1495. return err;
  1496. tbp = peer_tb;
  1497. } else {
  1498. ifmp = NULL;
  1499. tbp = tb;
  1500. }
  1501. if (ifmp && tbp[IFLA_IFNAME]) {
  1502. nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
  1503. name_assign_type = NET_NAME_USER;
  1504. } else {
  1505. snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
  1506. name_assign_type = NET_NAME_ENUM;
  1507. }
  1508. net = rtnl_link_get_net(src_net, tbp);
  1509. if (IS_ERR(net))
  1510. return PTR_ERR(net);
  1511. peer = rtnl_create_link(net, ifname, name_assign_type,
  1512. &veth_link_ops, tbp, extack);
  1513. if (IS_ERR(peer)) {
  1514. put_net(net);
  1515. return PTR_ERR(peer);
  1516. }
  1517. if (!ifmp || !tbp[IFLA_ADDRESS])
  1518. eth_hw_addr_random(peer);
  1519. if (ifmp && (dev->ifindex != 0))
  1520. peer->ifindex = ifmp->ifi_index;
  1521. netif_inherit_tso_max(peer, dev);
  1522. err = register_netdevice(peer);
  1523. put_net(net);
  1524. net = NULL;
  1525. if (err < 0)
  1526. goto err_register_peer;
  1527. /* keep GRO disabled by default to be consistent with the established
  1528. * veth behavior
  1529. */
  1530. veth_disable_gro(peer);
  1531. netif_carrier_off(peer);
  1532. err = rtnl_configure_link(peer, ifmp, 0, NULL);
  1533. if (err < 0)
  1534. goto err_configure_peer;
  1535. /*
  1536. * register dev last
  1537. *
  1538. * note that, since we've registered a new device, the dev's name
  1539. * should be re-allocated
  1540. */
  1541. if (tb[IFLA_ADDRESS] == NULL)
  1542. eth_hw_addr_random(dev);
  1543. if (tb[IFLA_IFNAME])
  1544. nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
  1545. else
  1546. snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
  1547. err = register_netdevice(dev);
  1548. if (err < 0)
  1549. goto err_register_dev;
  1550. netif_carrier_off(dev);
  1551. /*
  1552. * tie the devices together
  1553. */
  1554. priv = netdev_priv(dev);
  1555. rcu_assign_pointer(priv->peer, peer);
  1556. err = veth_init_queues(dev, tb);
  1557. if (err)
  1558. goto err_queues;
  1559. priv = netdev_priv(peer);
  1560. rcu_assign_pointer(priv->peer, dev);
  1561. err = veth_init_queues(peer, tb);
  1562. if (err)
  1563. goto err_queues;
  1564. veth_disable_gro(dev);
  1565. /* update XDP supported features */
  1566. veth_set_xdp_features(dev);
  1567. veth_set_xdp_features(peer);
  1568. return 0;
  1569. err_queues:
  1570. unregister_netdevice(dev);
  1571. err_register_dev:
  1572. /* nothing to do */
  1573. err_configure_peer:
  1574. unregister_netdevice(peer);
  1575. return err;
  1576. err_register_peer:
  1577. free_netdev(peer);
  1578. return err;
  1579. }
  1580. static void veth_dellink(struct net_device *dev, struct list_head *head)
  1581. {
  1582. struct veth_priv *priv;
  1583. struct net_device *peer;
  1584. priv = netdev_priv(dev);
  1585. peer = rtnl_dereference(priv->peer);
  1586. /* Note : dellink() is called from default_device_exit_batch(),
  1587. * before an RCU synchronization point. The devices are guaranteed
  1588. * not to be freed before one RCU grace period.
  1589. */
  1590. RCU_INIT_POINTER(priv->peer, NULL);
  1591. unregister_netdevice_queue(dev, head);
  1592. if (peer) {
  1593. priv = netdev_priv(peer);
  1594. RCU_INIT_POINTER(priv->peer, NULL);
  1595. unregister_netdevice_queue(peer, head);
  1596. }
  1597. }
  1598. static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
  1599. [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
  1600. };
  1601. static struct net *veth_get_link_net(const struct net_device *dev)
  1602. {
  1603. struct veth_priv *priv = netdev_priv(dev);
  1604. struct net_device *peer = rtnl_dereference(priv->peer);
  1605. return peer ? dev_net(peer) : dev_net(dev);
  1606. }
  1607. static unsigned int veth_get_num_queues(void)
  1608. {
  1609. /* enforce the same queue limit as rtnl_create_link */
  1610. int queues = num_possible_cpus();
  1611. if (queues > 4096)
  1612. queues = 4096;
  1613. return queues;
  1614. }
  1615. static struct rtnl_link_ops veth_link_ops = {
  1616. .kind = DRV_NAME,
  1617. .priv_size = sizeof(struct veth_priv),
  1618. .setup = veth_setup,
  1619. .validate = veth_validate,
  1620. .newlink = veth_newlink,
  1621. .dellink = veth_dellink,
  1622. .policy = veth_policy,
  1623. .maxtype = VETH_INFO_MAX,
  1624. .get_link_net = veth_get_link_net,
  1625. .get_num_tx_queues = veth_get_num_queues,
  1626. .get_num_rx_queues = veth_get_num_queues,
  1627. };
  1628. /*
  1629. * init/fini
  1630. */
  1631. static __init int veth_init(void)
  1632. {
  1633. return rtnl_link_register(&veth_link_ops);
  1634. }
  1635. static __exit void veth_exit(void)
  1636. {
  1637. rtnl_link_unregister(&veth_link_ops);
  1638. }
  1639. module_init(veth_init);
  1640. module_exit(veth_exit);
  1641. MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
  1642. MODULE_LICENSE("GPL v2");
  1643. MODULE_ALIAS_RTNL_LINK(DRV_NAME);