
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 */

#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/mpls.h>
#include <linux/sctp.h>
#include <linux/smp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/mpls.h>
#include <net/ndisc.h>
#include <net/nsh.h>
#include <net/pkt_cls.h>
#include <net/netfilter/nf_conntrack_zones.h>

#include "conntrack.h"
#include "datapath.h"
#include "flow.h"
#include "flow_netlink.h"
#include "vport.h"
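
/* Time of last use, in milliseconds on the same monotonic clock that
 * ktime_get_ts64() reads: the current time in ms minus how long the
 * flow has been idle (in jiffies, converted to ms).
 */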
u64 ovs_flow_used_time(unsigned long flow_jiffies)
{
	struct timespec64 cur_ts;
	u64 cur_ms, idle_ms;

	ktime_get_ts64(&cur_ts);
	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
	cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC +
		 cur_ts.tv_nsec / NSEC_PER_MSEC;

	return cur_ms - idle_ms;
}
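
/* The low 12 bits of the TCP doff/flags half-word, in network byte
 * order: the 4-bit data offset is masked off, keeping the three
 * reserved bits and the nine flag bits (NS, CWR, ECE, URG, ACK, PSH,
 * RST, SYN, FIN).
 */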
#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))

void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
{
	struct sw_flow_stats *stats;
	unsigned int cpu = smp_processor_id();
	int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);

	stats = rcu_dereference(flow->stats[cpu]);

	/* Check if we already have CPU-specific stats. */
	if (likely(stats)) {
		spin_lock(&stats->lock);
		/* Mark if we write on the pre-allocated stats. */
		if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
			flow->stats_last_writer = cpu;
	} else {
		stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
		spin_lock(&stats->lock);

		/* If the current CPU is the only writer on the
		 * pre-allocated stats, keep using them.
		 */
		if (unlikely(flow->stats_last_writer != cpu)) {
			/* A previous locker may have already allocated the
			 * stats, so we need to check again.  If CPU-specific
			 * stats were already allocated, we update the pre-
			 * allocated stats as we have already locked them.
			 */
			if (likely(flow->stats_last_writer != -1) &&
			    likely(!rcu_access_pointer(flow->stats[cpu]))) {
				/* Try to allocate CPU-specific stats. */
				struct sw_flow_stats *new_stats;

				new_stats =
					kmem_cache_alloc_node(flow_stats_cache,
							      GFP_NOWAIT |
							      __GFP_THISNODE |
							      __GFP_NOWARN |
							      __GFP_NOMEMALLOC,
							      numa_node_id());
				if (likely(new_stats)) {
					new_stats->used = jiffies;
					new_stats->packet_count = 1;
					new_stats->byte_count = len;
					new_stats->tcp_flags = tcp_flags;
					spin_lock_init(&new_stats->lock);

					rcu_assign_pointer(flow->stats[cpu],
							   new_stats);
					cpumask_set_cpu(cpu,
							flow->cpu_used_mask);
					goto unlock;
				}
			}
			flow->stats_last_writer = cpu;
		}
	}

	stats->used = jiffies;
	stats->packet_count++;
	stats->byte_count += len;
	stats->tcp_flags |= tcp_flags;
unlock:
	spin_unlock(&stats->lock);
}

/* Must be called with rcu_read_lock or ovs_mutex. */
void ovs_flow_stats_get(const struct sw_flow *flow,
			struct ovs_flow_stats *ovs_stats,
			unsigned long *used, __be16 *tcp_flags)
{
	int cpu;

	*used = 0;
	*tcp_flags = 0;
	memset(ovs_stats, 0, sizeof(*ovs_stats));

	/* We open code this to make sure cpu 0 is always considered */
	for (cpu = 0; cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
		struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);

		if (stats) {
			/* Local CPU may write on non-local stats, so we must
			 * block bottom-halves here.
			 */
			spin_lock_bh(&stats->lock);
			if (!*used || time_after(stats->used, *used))
				*used = stats->used;
			*tcp_flags |= stats->tcp_flags;
			ovs_stats->n_packets += stats->packet_count;
			ovs_stats->n_bytes += stats->byte_count;
			spin_unlock_bh(&stats->lock);
		}
	}
}

/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
	int cpu;

	/* We open code this to make sure cpu 0 is always considered */
	for (cpu = 0; cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
		struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);

		if (stats) {
			spin_lock_bh(&stats->lock);
			stats->used = 0;
			stats->packet_count = 0;
			stats->byte_count = 0;
			stats->tcp_flags = 0;
			spin_unlock_bh(&stats->lock);
		}
	}
}

static int check_header(struct sk_buff *skb, int len)
{
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;
	return 0;
}
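
/* check_header() distinguishes a packet that is genuinely too short
 * (-EINVAL) from one whose header merely could not be pulled into the
 * linear area (-ENOMEM). The *hdr_ok() helpers below collapse both
 * cases to false: they only answer whether the given header is fully
 * available at the current network/transport offset.
 */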

static bool arphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_network_offset(skb) +
				  sizeof(struct arp_eth_header));
}

static int check_iphdr(struct sk_buff *skb)
{
	unsigned int nh_ofs = skb_network_offset(skb);
	unsigned int ip_len;
	int err;

	err = check_header(skb, nh_ofs + sizeof(struct iphdr));
	if (unlikely(err))
		return err;

	ip_len = ip_hdrlen(skb);
	if (unlikely(ip_len < sizeof(struct iphdr) ||
		     skb->len < nh_ofs + ip_len))
		return -EINVAL;

	skb_set_transport_header(skb, nh_ofs + ip_len);
	return 0;
}

static bool tcphdr_ok(struct sk_buff *skb)
{
	int th_ofs = skb_transport_offset(skb);
	int tcp_len;

	if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
		return false;

	tcp_len = tcp_hdrlen(skb);
	if (unlikely(tcp_len < sizeof(struct tcphdr) ||
		     skb->len < th_ofs + tcp_len))
		return false;

	return true;
}

static bool udphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct udphdr));
}

static bool sctphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct sctphdr));
}

static bool icmphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct icmphdr));
}

/**
 * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags.
 *
 * @skb: buffer where extension header data starts in packet
 * @nh: ipv6 header
 * @ext_hdrs: flags are stored here
 *
 * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header
 * is unexpectedly encountered. (Two destination options headers may be
 * expected and would not cause this bit to be set.)
 *
 * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order
 * preferred (but not required) by RFC 2460:
 *
 * When more than one extension header is used in the same packet, it is
 * recommended that those headers appear in the following order:
 *      IPv6 header
 *      Hop-by-Hop Options header
 *      Destination Options header
 *      Routing header
 *      Fragment header
 *      Authentication header
 *      Encapsulating Security Payload header
 *      Destination Options header
 *      upper-layer header
 */
static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh,
			      u16 *ext_hdrs)
{
	u8 next_type = nh->nexthdr;
	unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
	int dest_options_header_count = 0;

	*ext_hdrs = 0;

	while (ipv6_ext_hdr(next_type)) {
		struct ipv6_opt_hdr _hdr, *hp;

		switch (next_type) {
		case IPPROTO_NONE:
			*ext_hdrs |= OFPIEH12_NONEXT;
			/* stop parsing */
			return;

		case IPPROTO_ESP:
			if (*ext_hdrs & OFPIEH12_ESP)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST |
					   OFPIEH12_ROUTER | OFPIEH12_FRAG |
					   OFPIEH12_AUTH | OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_ESP;
			break;
		case IPPROTO_AH:
			if (*ext_hdrs & OFPIEH12_AUTH)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs &
			     ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER |
			       OFPIEH12_FRAG | OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_AUTH;
			break;
		case IPPROTO_DSTOPTS:
			if (dest_options_header_count == 0) {
				if (*ext_hdrs &
				    ~(OFPIEH12_HOP | OFPIEH12_UNREP))
					*ext_hdrs |= OFPIEH12_UNSEQ;
				*ext_hdrs |= OFPIEH12_DEST;
			} else if (dest_options_header_count == 1) {
				if (*ext_hdrs &
				    ~(OFPIEH12_HOP | OFPIEH12_DEST |
				      OFPIEH12_ROUTER | OFPIEH12_FRAG |
				      OFPIEH12_AUTH | OFPIEH12_ESP |
				      OFPIEH12_UNREP)) {
					*ext_hdrs |= OFPIEH12_UNSEQ;
				}
			} else {
				*ext_hdrs |= OFPIEH12_UNREP;
			}
			dest_options_header_count++;
			break;
		case IPPROTO_FRAGMENT:
			if (*ext_hdrs & OFPIEH12_FRAG)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs & ~(OFPIEH12_HOP |
					   OFPIEH12_DEST |
					   OFPIEH12_ROUTER |
					   OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_FRAG;
			break;
		case IPPROTO_ROUTING:
			if (*ext_hdrs & OFPIEH12_ROUTER)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs & ~(OFPIEH12_HOP |
					   OFPIEH12_DEST |
					   OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_ROUTER;
			break;
		case IPPROTO_HOPOPTS:
			if (*ext_hdrs & OFPIEH12_HOP)
				*ext_hdrs |= OFPIEH12_UNREP;
			/* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6
			 * extension header is present as the first
			 * extension header in the packet.
			 */
			if (*ext_hdrs == 0)
				*ext_hdrs |= OFPIEH12_HOP;
			else
				*ext_hdrs |= OFPIEH12_UNSEQ;
			break;
		default:
			return;
		}

		hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
		if (!hp)
			break;
		next_type = hp->nexthdr;
		start += ipv6_optlen(hp);
	}
}
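
/* Returns the IPv6 header length (base header plus extension headers)
 * on success, 0 for a non-first fragment, or a negative errno. Both 0
 * and a positive length are "success": callers must treat only
 * negative values as errors.
 */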
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
	unsigned short frag_off;
	unsigned int payload_ofs = 0;
	unsigned int nh_ofs = skb_network_offset(skb);
	unsigned int nh_len;
	struct ipv6hdr *nh;
	int err, nexthdr, flags = 0;

	err = check_header(skb, nh_ofs + sizeof(*nh));
	if (unlikely(err))
		return err;

	nh = ipv6_hdr(skb);

	get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs);

	key->ip.proto = NEXTHDR_NONE;
	key->ip.tos = ipv6_get_dsfield(nh);
	key->ip.ttl = nh->hop_limit;
	key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
	key->ipv6.addr.src = nh->saddr;
	key->ipv6.addr.dst = nh->daddr;

	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);

	if (flags & IP6_FH_F_FRAG) {
		if (frag_off) {
			key->ip.frag = OVS_FRAG_TYPE_LATER;
			key->ip.proto = NEXTHDR_FRAGMENT;
			return 0;
		}
		key->ip.frag = OVS_FRAG_TYPE_FIRST;
	} else {
		key->ip.frag = OVS_FRAG_TYPE_NONE;
	}

	/* Delayed handling of error in ipv6_find_hdr() as it
	 * always sets flags and frag_off to a valid value which may be
	 * used to set key->ip.frag above.
	 */
	if (unlikely(nexthdr < 0))
		return -EPROTO;

	nh_len = payload_ofs - nh_ofs;
	skb_set_transport_header(skb, nh_ofs + nh_len);
	key->ip.proto = nexthdr;
	return nh_len;
}

static bool icmp6hdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct icmp6hdr));
}

/**
 * parse_vlan_tag - Parse vlan tag from vlan header.
 * @skb: skb containing frame to parse
 * @key_vh: pointer to parsed vlan tag
 * @untag_vlan: should the vlan header be removed from the frame
 *
 * Return: a negative errno on memory error.
 * %0 if it encounters a non-vlan or incomplete packet.
 * %1 after successfully parsing vlan tag.
 */
static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
			  bool untag_vlan)
{
	struct vlan_head *vh = (struct vlan_head *)skb->data;

	if (likely(!eth_type_vlan(vh->tpid)))
		return 0;

	if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16)))
		return 0;

	if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) +
				    sizeof(__be16))))
		return -ENOMEM;

	vh = (struct vlan_head *)skb->data;
	key_vh->tci = vh->tci | htons(VLAN_CFI_MASK);
	key_vh->tpid = vh->tpid;

	if (unlikely(untag_vlan)) {
		int offset = skb->data - skb_mac_header(skb);
		u16 tci;
		int err;

		__skb_push(skb, offset);
		err = __skb_vlan_pop(skb, &tci);
		__skb_pull(skb, offset);
		if (err)
			return err;
		__vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
	} else {
		__skb_pull(skb, sizeof(struct vlan_head));
	}
	return 1;
}
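
/* For double-tagged (QinQ) frames, parse_vlan() records the outer tag
 * in key->eth.vlan and the inner tag in key->eth.cvlan. If the outer
 * tag was hardware-accelerated, it is taken from the skb metadata
 * (skb_vlan_tag_present()) rather than from the packet data.
 */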

static void clear_vlan(struct sw_flow_key *key)
{
	key->eth.vlan.tci = 0;
	key->eth.vlan.tpid = 0;
	key->eth.cvlan.tci = 0;
	key->eth.cvlan.tpid = 0;
}

static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
	int res;

	if (skb_vlan_tag_present(skb)) {
		key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK);
		key->eth.vlan.tpid = skb->vlan_proto;
	} else {
		/* Parse outer vlan tag in the non-accelerated case. */
		res = parse_vlan_tag(skb, &key->eth.vlan, true);
		if (res <= 0)
			return res;
	}

	/* Parse inner vlan tag. */
	res = parse_vlan_tag(skb, &key->eth.cvlan, false);
	if (res <= 0)
		return res;

	return 0;
}
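
/* Distinguish DIX/Ethernet II framing from 802.3: a value of at least
 * ETH_P_802_3_MIN (0x0600) in the type/length field is an EtherType
 * (eth_proto_is_802_3()); anything smaller is an 802.3 length and the
 * payload is LLC. A SNAP header with a zero OUI carries the real
 * EtherType; everything else is reported as ETH_P_802_2.
 */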
static __be16 parse_ethertype(struct sk_buff *skb)
{
	struct llc_snap_hdr {
		u8  dsap;  /* Always 0xAA */
		u8  ssap;  /* Always 0xAA */
		u8  ctrl;
		u8  oui[3];
		__be16 ethertype;
	};
	struct llc_snap_hdr *llc;
	__be16 proto;

	proto = *(__be16 *) skb->data;
	__skb_pull(skb, sizeof(__be16));

	if (eth_proto_is_802_3(proto))
		return proto;

	if (skb->len < sizeof(struct llc_snap_hdr))
		return htons(ETH_P_802_2);

	if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
		return htons(0);

	llc = (struct llc_snap_hdr *) skb->data;
	if (llc->dsap != LLC_SAP_SNAP ||
	    llc->ssap != LLC_SAP_SNAP ||
	    (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
		return htons(ETH_P_802_2);

	__skb_pull(skb, sizeof(struct llc_snap_hdr));

	if (eth_proto_is_802_3(llc->ethertype))
		return llc->ethertype;

	return htons(ETH_P_802_2);
}

static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
			int nh_len)
{
	struct icmp6hdr *icmp = icmp6_hdr(skb);

	/* The ICMPv6 type and code fields use the 16-bit transport port
	 * fields, so we need to store them in 16-bit network byte order.
	 */
	key->tp.src = htons(icmp->icmp6_type);
	key->tp.dst = htons(icmp->icmp6_code);

	if (icmp->icmp6_code == 0 &&
	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
	     icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
		int icmp_len = skb->len - skb_transport_offset(skb);
		struct nd_msg *nd;
		int offset;

		memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd));

		/* In order to process neighbor discovery options, we need the
		 * entire packet.
		 */
		if (unlikely(icmp_len < sizeof(*nd)))
			return 0;

		if (unlikely(skb_linearize(skb)))
			return -ENOMEM;

		nd = (struct nd_msg *)skb_transport_header(skb);
		key->ipv6.nd.target = nd->target;

		icmp_len -= sizeof(*nd);
		offset = 0;
		while (icmp_len >= 8) {
			struct nd_opt_hdr *nd_opt =
				(struct nd_opt_hdr *)(nd->opt + offset);
			int opt_len = nd_opt->nd_opt_len * 8;

			if (unlikely(!opt_len || opt_len > icmp_len))
				return 0;

			/* Store the link layer address if the appropriate
			 * option is provided.  It is considered an error if
			 * the same link layer option is specified twice.
			 */
			if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
			    && opt_len == 8) {
				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
					goto invalid;
				ether_addr_copy(key->ipv6.nd.sll,
						&nd->opt[offset+sizeof(*nd_opt)]);
			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
				   && opt_len == 8) {
				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
					goto invalid;
				ether_addr_copy(key->ipv6.nd.tll,
						&nd->opt[offset+sizeof(*nd_opt)]);
			}

			icmp_len -= opt_len;
			offset += opt_len;
		}
	}

	return 0;

invalid:
	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));

	return 0;
}

static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
{
	struct nshhdr *nh;
	unsigned int nh_ofs = skb_network_offset(skb);
	u8 version, length;
	int err;

	err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN);
	if (unlikely(err))
		return err;

	nh = nsh_hdr(skb);
	version = nsh_get_ver(nh);
	length = nsh_hdr_len(nh);

	if (version != 0)
		return -EINVAL;

	err = check_header(skb, nh_ofs + length);
	if (unlikely(err))
		return err;

	nh = nsh_hdr(skb);
	key->nsh.base.flags = nsh_get_flags(nh);
	key->nsh.base.ttl = nsh_get_ttl(nh);
	key->nsh.base.mdtype = nh->mdtype;
	key->nsh.base.np = nh->np;
	key->nsh.base.path_hdr = nh->path_hdr;
	switch (key->nsh.base.mdtype) {
	case NSH_M_TYPE1:
		if (length != NSH_M_TYPE1_LEN)
			return -EINVAL;
		memcpy(key->nsh.context, nh->md1.context,
		       sizeof(nh->md1));
		break;
	case NSH_M_TYPE2:
		memset(key->nsh.context, 0,
		       sizeof(nh->md1));
		break;
	default:
		return -EINVAL;
	}

	return 0;
}
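
/* Note the two-step validation above: the fixed-size base header is
 * pulled first so that nsh_hdr_len() can be trusted, then the full
 * header (base plus metadata) is pulled before any of it is copied
 * into the key. MD type 1 carries a fixed-size context that is copied
 * verbatim; MD type 2 metadata is variable-length and is not
 * extracted, so the context fields are zeroed instead.
 */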

/**
 * key_extract_l3l4 - extracts L3/L4 header information.
 * @skb: sk_buff that contains the frame, with skb->data pointing to the
 *       L3 header
 * @key: output flow key
 *
 * Return: %0 if successful, otherwise a negative errno value.
 */
static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
	int error;

	/* Network layer. */
	if (key->eth.type == htons(ETH_P_IP)) {
		struct iphdr *nh;
		__be16 offset;

		error = check_iphdr(skb);
		if (unlikely(error)) {
			memset(&key->ip, 0, sizeof(key->ip));
			memset(&key->ipv4, 0, sizeof(key->ipv4));
			if (error == -EINVAL) {
				skb->transport_header = skb->network_header;
				error = 0;
			}
			return error;
		}

		nh = ip_hdr(skb);
		key->ipv4.addr.src = nh->saddr;
		key->ipv4.addr.dst = nh->daddr;

		key->ip.proto = nh->protocol;
		key->ip.tos = nh->tos;
		key->ip.ttl = nh->ttl;

		offset = nh->frag_off & htons(IP_OFFSET);
		if (offset) {
			key->ip.frag = OVS_FRAG_TYPE_LATER;
			memset(&key->tp, 0, sizeof(key->tp));
			return 0;
		}
		if (nh->frag_off & htons(IP_MF) ||
		    skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
			key->ip.frag = OVS_FRAG_TYPE_FIRST;
		else
			key->ip.frag = OVS_FRAG_TYPE_NONE;

		/* Transport layer. */
		if (key->ip.proto == IPPROTO_TCP) {
			if (tcphdr_ok(skb)) {
				struct tcphdr *tcp = tcp_hdr(skb);

				key->tp.src = tcp->source;
				key->tp.dst = tcp->dest;
				key->tp.flags = TCP_FLAGS_BE16(tcp);
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == IPPROTO_UDP) {
			if (udphdr_ok(skb)) {
				struct udphdr *udp = udp_hdr(skb);

				key->tp.src = udp->source;
				key->tp.dst = udp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == IPPROTO_SCTP) {
			if (sctphdr_ok(skb)) {
				struct sctphdr *sctp = sctp_hdr(skb);

				key->tp.src = sctp->source;
				key->tp.dst = sctp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == IPPROTO_ICMP) {
			if (icmphdr_ok(skb)) {
				struct icmphdr *icmp = icmp_hdr(skb);

				/* The ICMP type and code fields use the 16-bit
				 * transport port fields, so we need to store
				 * them in 16-bit network byte order. */
				key->tp.src = htons(icmp->type);
				key->tp.dst = htons(icmp->code);
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		}
	} else if (key->eth.type == htons(ETH_P_ARP) ||
		   key->eth.type == htons(ETH_P_RARP)) {
		struct arp_eth_header *arp;
		bool arp_available = arphdr_ok(skb);

		arp = (struct arp_eth_header *)skb_network_header(skb);

		if (arp_available &&
		    arp->ar_hrd == htons(ARPHRD_ETHER) &&
		    arp->ar_pro == htons(ETH_P_IP) &&
		    arp->ar_hln == ETH_ALEN &&
		    arp->ar_pln == 4) {

			/* We only match on the lower 8 bits of the opcode. */
			if (ntohs(arp->ar_op) <= 0xff)
				key->ip.proto = ntohs(arp->ar_op);
			else
				key->ip.proto = 0;

			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
			ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
			ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
		} else {
			memset(&key->ip, 0, sizeof(key->ip));
			memset(&key->ipv4, 0, sizeof(key->ipv4));
		}
	} else if (eth_p_mpls(key->eth.type)) {
		size_t label_count = 1;

		memset(&key->mpls, 0, sizeof(key->mpls));
		skb_set_inner_network_header(skb, skb->mac_len);
		while (1) {
			__be32 lse;

			error = check_header(skb, skb->mac_len +
					     label_count * MPLS_HLEN);
			if (unlikely(error))
				return 0;

			memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN);

			if (label_count <= MPLS_LABEL_DEPTH)
				memcpy(&key->mpls.lse[label_count - 1], &lse,
				       MPLS_HLEN);

			skb_set_inner_network_header(skb, skb->mac_len +
						     label_count * MPLS_HLEN);
			if (lse & htonl(MPLS_LS_S_MASK))
				break;

			label_count++;
		}
		if (label_count > MPLS_LABEL_DEPTH)
			label_count = MPLS_LABEL_DEPTH;

		key->mpls.num_labels_mask = GENMASK(label_count - 1, 0);
	} else if (key->eth.type == htons(ETH_P_IPV6)) {
		int nh_len;             /* IPv6 Header + Extensions */

		nh_len = parse_ipv6hdr(skb, key);
		if (unlikely(nh_len < 0)) {
			switch (nh_len) {
			case -EINVAL:
				memset(&key->ip, 0, sizeof(key->ip));
				memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
				fallthrough;
			case -EPROTO:
				skb->transport_header = skb->network_header;
				error = 0;
				break;
			default:
				error = nh_len;
			}
			return error;
		}

		if (key->ip.frag == OVS_FRAG_TYPE_LATER) {
			memset(&key->tp, 0, sizeof(key->tp));
			return 0;
		}
		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
			key->ip.frag = OVS_FRAG_TYPE_FIRST;

		/* Transport layer. */
		if (key->ip.proto == NEXTHDR_TCP) {
			if (tcphdr_ok(skb)) {
				struct tcphdr *tcp = tcp_hdr(skb);

				key->tp.src = tcp->source;
				key->tp.dst = tcp->dest;
				key->tp.flags = TCP_FLAGS_BE16(tcp);
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == NEXTHDR_UDP) {
			if (udphdr_ok(skb)) {
				struct udphdr *udp = udp_hdr(skb);

				key->tp.src = udp->source;
				key->tp.dst = udp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == NEXTHDR_SCTP) {
			if (sctphdr_ok(skb)) {
				struct sctphdr *sctp = sctp_hdr(skb);

				key->tp.src = sctp->source;
				key->tp.dst = sctp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == NEXTHDR_ICMP) {
			if (icmp6hdr_ok(skb)) {
				error = parse_icmpv6(skb, key, nh_len);
				if (error)
					return error;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		}
	} else if (key->eth.type == htons(ETH_P_NSH)) {
		error = parse_nsh(skb, key);
		if (error)
			return error;
	}
	return 0;
}

/**
 * key_extract - extracts a flow key from an Ethernet frame.
 * @skb: sk_buff that contains the frame, with skb->data pointing to the
 *       Ethernet header
 * @key: output flow key
 *
 * The caller must ensure that skb->len >= ETH_HLEN.
 *
 * Initializes @skb header fields as follows:
 *
 *    - skb->mac_header: the L2 header.
 *
 *    - skb->network_header: just past the L2 header, or just past the
 *      VLAN header, to the first byte of the L2 payload.
 *
 *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
 *      on output, then just past the IP header, if one is present and
 *      of a correct length, otherwise the same as skb->network_header.
 *      For other key->eth.type values it is left untouched.
 *
 *    - skb->protocol: the type of the data starting at skb->network_header.
 *      Equal to key->eth.type.
 *
 * Return: %0 if successful, otherwise a negative errno value.
 */
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
	struct ethhdr *eth;

	/* Flags are always used as part of stats */
	key->tp.flags = 0;

	skb_reset_mac_header(skb);

	/* Link layer. */
	clear_vlan(key);
	if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
		if (unlikely(eth_type_vlan(skb->protocol)))
			return -EINVAL;

		skb_reset_network_header(skb);
		key->eth.type = skb->protocol;
	} else {
		eth = eth_hdr(skb);
		ether_addr_copy(key->eth.src, eth->h_source);
		ether_addr_copy(key->eth.dst, eth->h_dest);

		__skb_pull(skb, 2 * ETH_ALEN);
		/* We are going to push all headers that we pull, so no need to
		 * update skb->csum here.
		 */

		if (unlikely(parse_vlan(skb, key)))
			return -ENOMEM;

		key->eth.type = parse_ethertype(skb);
		if (unlikely(key->eth.type == htons(0)))
			return -ENOMEM;

		/* Multiple tagged packets need to retain TPID to satisfy
		 * skb_vlan_pop(), which will later shift the ethertype into
		 * skb->protocol.
		 */
		if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
			skb->protocol = key->eth.cvlan.tpid;
		else
			skb->protocol = key->eth.type;

		skb_reset_network_header(skb);
		__skb_push(skb, skb->data - skb_mac_header(skb));
	}

	skb_reset_mac_len(skb);

	/* Fill out L3/L4 key info, if any */
	return key_extract_l3l4(skb, key);
}

/* Conntrack fragment handling expects L3 headers, so provide a helper
 * that starts extraction at the network layer.
 */
int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
	return key_extract_l3l4(skb, key);
}

int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
{
	int res;

	res = key_extract(skb, key);
	if (!res)
		key->mac_proto &= ~SW_FLOW_KEY_INVALID;

	return res;
}
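
/* Map the receiving device type to a MAC protocol for the flow key:
 * Ethernet devices carry Ethernet frames, and ARPHRD_NONE devices
 * (e.g. L3 tunnels) carry raw L3 packets unless the inner protocol is
 * ETH_P_TEB, which marks transparent Ethernet bridging.
 */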
static int key_extract_mac_proto(struct sk_buff *skb)
{
	switch (skb->dev->type) {
	case ARPHRD_ETHER:
		return MAC_PROTO_ETHERNET;
	case ARPHRD_NONE:
		if (skb->protocol == htons(ETH_P_TEB))
			return MAC_PROTO_ETHERNET;
		return MAC_PROTO_NONE;
	}
	WARN_ON_ONCE(1);
	return -EINVAL;
}

int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
			 struct sk_buff *skb, struct sw_flow_key *key)
{
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	struct tc_skb_ext *tc_ext;
#endif
	bool post_ct = false, post_ct_snat = false, post_ct_dnat = false;
	int res, err;
	u16 zone = 0;

	/* Extract metadata from packet. */
	if (tun_info) {
		key->tun_proto = ip_tunnel_info_af(tun_info);
		memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));

		if (tun_info->options_len) {
			BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
						   8)) - 1
					> sizeof(key->tun_opts));

			ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len),
						tun_info);
			key->tun_opts_len = tun_info->options_len;
		} else {
			key->tun_opts_len = 0;
		}
	} else {
		key->tun_proto = 0;
		key->tun_opts_len = 0;
		memset(&key->tun_key, 0, sizeof(key->tun_key));
	}

	key->phy.priority = skb->priority;
	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
	key->phy.skb_mark = skb->mark;
	key->ovs_flow_hash = 0;
	res = key_extract_mac_proto(skb);
	if (res < 0)
		return res;
	key->mac_proto = res;

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	if (tc_skb_ext_tc_enabled()) {
		tc_ext = skb_ext_find(skb, TC_SKB_EXT);
		key->recirc_id = tc_ext && !tc_ext->act_miss ?
				 tc_ext->chain : 0;
		OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
		post_ct = tc_ext ? tc_ext->post_ct : false;
		post_ct_snat = post_ct ? tc_ext->post_ct_snat : false;
		post_ct_dnat = post_ct ? tc_ext->post_ct_dnat : false;
		zone = post_ct ? tc_ext->zone : 0;
	} else {
		key->recirc_id = 0;
	}
#else
	key->recirc_id = 0;
#endif

	err = key_extract(skb, key);
	if (!err) {
		ovs_ct_fill_key(skb, key, post_ct);	/* Must be after key_extract(). */
		if (post_ct) {
			if (!skb_get_nfct(skb)) {
				key->ct_zone = zone;
			} else {
				if (!post_ct_dnat)
					key->ct_state &= ~OVS_CS_F_DST_NAT;
				if (!post_ct_snat)
					key->ct_state &= ~OVS_CS_F_SRC_NAT;
			}
		}
	}
	return err;
}
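
/* Userspace variant: the packet arrives in a netlink message rather
 * than from a device, so physical-port and tunnel metadata are taken
 * from the supplied OVS_KEY_ATTR_* attributes instead of the skb. This
 * is the path used when userspace injects a packet for execution.
 */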
int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
				   struct sk_buff *skb,
				   struct sw_flow_key *key, bool log)
{
	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
	u64 attrs = 0;
	int err;

	err = parse_flow_nlattrs(attr, a, &attrs, log);
	if (err)
		return -EINVAL;

	/* Extract metadata from netlink attributes. */
	err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
	if (err)
		return err;

	/* key_extract assumes that skb->protocol is set up for
	 * layer 3 packets, which is the case for other callers,
	 * in particular packets received from the network stack.
	 * Here the correct value can be set from the metadata
	 * extracted above.
	 * For L2 packets the key eth type will be zero; skb->protocol
	 * is then set to the correct value later, during key_extract.
	 */
	skb->protocol = key->eth.type;
	err = key_extract(skb, key);
	if (err)
		return err;

	/* Check that we have conntrack original direction tuple metadata only
	 * for packets for which it makes sense.  Otherwise the key may be
	 * corrupted due to overlapping key fields.
	 */
	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
	    key->eth.type != htons(ETH_P_IP))
		return -EINVAL;
	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
	    (key->eth.type != htons(ETH_P_IPV6) ||
	     sw_flow_key_is_nd(key)))
		return -EINVAL;

	return 0;
}