// SPDX-License-Identifier: GPL-2.0+
/*
 * IPv6 IOAM Lightweight Tunnel implementation
 *
 * Author:
 * Justin Iurman <justin.iurman@uliege.be>
 */

#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/ioam6.h>
#include <linux/ioam6_iptunnel.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <net/ioam6.h>
#include <net/netlink.h>
#include <net/ipv6.h>
#include <net/dst_cache.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
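
/* Trace-type bits selecting 4-octet ("short") and 8-octet ("wide") node
 * data fields; used below to derive the per-node data length (nodelen)
 * from the fields enabled in the trace type.
 */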
#define IOAM6_MASK_SHORT_FIELDS 0xff100000
#define IOAM6_MASK_WIDE_FIELDS 0xe00000

struct ioam6_lwt_encap {
	struct ipv6_hopopt_hdr eh;
	u8 pad[2];			/* 2-octet padding for 4n-alignment */
	struct ioam6_hdr ioamh;
	struct ioam6_trace_hdr traceh;
} __packed;

struct ioam6_lwt_freq {
	u32 k;
	u32 n;
};

struct ioam6_lwt {
	struct dst_cache cache;
	struct ioam6_lwt_freq freq;
	atomic_t pkt_cnt;
	u8 mode;
	bool has_tunsrc;
	struct in6_addr tunsrc;
	struct in6_addr tundst;
	struct ioam6_lwt_encap tuninfo;
};
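
/* Accepted range for the "k over n" insertion frequency attributes. */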
static const struct netlink_range_validation freq_range = {
	.min = IOAM6_IPTUNNEL_FREQ_MIN,
	.max = IOAM6_IPTUNNEL_FREQ_MAX,
};

static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
{
	return (struct ioam6_lwt *)lwt->data;
}

static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
{
	return &ioam6_lwt_state(lwt)->tuninfo;
}

static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
{
	return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
}

static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
	[IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_MODE]	= NLA_POLICY_RANGE(NLA_U8,
						   IOAM6_IPTUNNEL_MODE_MIN,
						   IOAM6_IPTUNNEL_MODE_MAX),
	[IOAM6_IPTUNNEL_SRC]	= NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_DST]	= NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_TRACE]	= NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)),
};
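
/* Reject an empty or oversized trace, as well as any reserved/unsupported
 * trace-type bit (12-21 and 23), then precompute the per-node data length
 * (nodelen, in 4-octet units) from the enabled fields.
 */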
static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
{
	u32 fields;

	if (!trace->type_be32 || !trace->remlen ||
	    trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
	    trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
	    trace->type.bit21 | trace->type.bit23)
		return false;

	trace->nodelen = 0;
	fields = be32_to_cpu(trace->type_be32);

	trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
				* (sizeof(__be32) / 4);
	trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
				* (sizeof(__be64) / 4);

	return true;
}

static int ioam6_build_state(struct net *net, struct nlattr *nla,
			     unsigned int family, const void *cfg,
			     struct lwtunnel_state **ts,
			     struct netlink_ext_ack *extack)
{
	struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
	struct ioam6_lwt_encap *tuninfo;
	struct ioam6_trace_hdr *trace;
	struct lwtunnel_state *lwt;
	struct ioam6_lwt *ilwt;
	int len_aligned, err;
	u32 freq_k, freq_n;
	u8 mode;

	if (family != AF_INET6)
		return -EINVAL;

	err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
			       ioam6_iptunnel_policy, extack);
	if (err < 0)
		return err;

	if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
	    (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
		NL_SET_ERR_MSG(extack, "freq: missing parameter");
		return -EINVAL;
	} else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
		freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
		freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
	} else {
		freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
		freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);

		if (freq_k > freq_n) {
			NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
			return -EINVAL;
		}
	}

	if (!tb[IOAM6_IPTUNNEL_MODE])
		mode = IOAM6_IPTUNNEL_MODE_INLINE;
	else
		mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]);

	if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_TRACE]) {
		NL_SET_ERR_MSG(extack, "missing trace");
		return -EINVAL;
	}

	trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
	if (!ioam6_validate_trace_hdr(trace)) {
		NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
				    "invalid trace validation");
		return -EINVAL;
	}
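
	/* A Hop-by-Hop extension header must be a multiple of 8 octets long
	 * (RFC 8200), so align the preallocated trace data accordingly; the
	 * leftover is filled with a PadN option further below.
	 */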
	len_aligned = ALIGN(trace->remlen * 4, 8);
	lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
	if (!lwt)
		return -ENOMEM;

	ilwt = ioam6_lwt_state(lwt);
	err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
	if (err)
		goto free_lwt;

	atomic_set(&ilwt->pkt_cnt, 0);
	ilwt->freq.k = freq_k;
	ilwt->freq.n = freq_n;

	ilwt->mode = mode;

	if (!tb[IOAM6_IPTUNNEL_SRC]) {
		ilwt->has_tunsrc = false;
	} else {
		ilwt->has_tunsrc = true;
		ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]);

		if (ipv6_addr_any(&ilwt->tunsrc)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC],
					    "invalid tunnel source address");
			err = -EINVAL;
			goto free_cache;
		}
	}

	if (tb[IOAM6_IPTUNNEL_DST]) {
		ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);

		if (ipv6_addr_any(&ilwt->tundst)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST],
					    "invalid tunnel dest address");
			err = -EINVAL;
			goto free_cache;
		}
	}
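
	/* Prebuild the Hop-by-Hop template once: eh.hdrlen is expressed in
	 * 8-octet units not counting the first 8 octets, and opt_len covers
	 * the IOAM option payload (the IOAM header minus its 2-octet TLV
	 * header, plus the trace header and preallocated trace data).
	 */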
	tuninfo = ioam6_lwt_info(lwt);
	tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
	tuninfo->pad[0] = IPV6_TLV_PADN;
	tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
	tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
	tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
				 + trace->remlen * 4;

	memcpy(&tuninfo->traceh, trace, sizeof(*trace));

	if (len_aligned - trace->remlen * 4) {
		tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
		tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
	}

	lwt->type = LWTUNNEL_ENCAP_IOAM6;
	lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;

	*ts = lwt;

	return 0;

free_cache:
	dst_cache_destroy(&ilwt->cache);
free_lwt:
	kfree(lwt);
	return err;
}
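
/* Locate the preallocated trace header inside the Hop-by-Hop option and
 * insert this node's data, if the IOAM namespace is known locally.
 */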
static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
{
	struct ioam6_trace_hdr *trace;
	struct ioam6_namespace *ns;

	trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
					   + sizeof(struct ipv6_hopopt_hdr) + 2
					   + sizeof(struct ioam6_hdr));

	ns = ioam6_namespace(net, trace->namespace_id);
	if (ns)
		ioam6_fill_trace_data(skb, ns, trace, false);

	return 0;
}
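
/* Inline mode: insert the Hop-by-Hop extension header between the
 * existing IPv6 header and its payload, then fill the trace data.
 */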
static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
			   struct ioam6_lwt_encap *tuninfo,
			   struct dst_entry *cache_dst)
{
	struct ipv6hdr *oldhdr, *hdr;
	int hdrlen, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;

	err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
	if (unlikely(err))
		return err;

	oldhdr = ipv6_hdr(skb);
	skb_pull(skb, sizeof(*oldhdr));
	skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));

	skb_push(skb, sizeof(*oldhdr) + hdrlen);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);

	hdr = ipv6_hdr(skb);
	memmove(hdr, oldhdr, sizeof(*oldhdr));
	tuninfo->eh.nexthdr = hdr->nexthdr;

	skb_set_transport_header(skb, sizeof(*hdr));
	skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);

	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));

	return ioam6_do_fill(net, skb);
}
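
/* Encap mode: prepend a new outer IPv6 header (ip6ip6) carrying the
 * Hop-by-Hop option; the outer source address is either the configured
 * one or derived from the output device.
 */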
static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
			  struct ioam6_lwt_encap *tuninfo,
			  bool has_tunsrc,
			  struct in6_addr *tunsrc,
			  struct in6_addr *tundst,
			  struct dst_entry *cache_dst)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr, *inner_hdr;
	int hdrlen, len, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
	len = sizeof(*hdr) + hdrlen;

	err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb));
	if (unlikely(err))
		return err;

	inner_hdr = ipv6_hdr(skb);

	skb_push(skb, len);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);
	skb_set_transport_header(skb, sizeof(*hdr));

	tuninfo->eh.nexthdr = NEXTHDR_IPV6;
	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr = ipv6_hdr(skb);
	memcpy(hdr, inner_hdr, sizeof(*hdr));

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
	hdr->daddr = *tundst;

	if (has_tunsrc)
		memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc));
	else
		ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr,
				   IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);

	skb_postpush_rcsum(skb, hdr, len);

	return ioam6_do_fill(net, skb);
}
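
/* lwtunnel output hook: sample packets with the "k over n" frequency,
 * apply the configured insertion mode, and re-route (caching the new
 * route) when encapsulation changed the destination address.
 */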
static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL;
	struct in6_addr orig_daddr;
	struct ioam6_lwt *ilwt;
	int err = -EINVAL;
	u32 pkt_cnt;

	if (skb->protocol != htons(ETH_P_IPV6))
		goto drop;

	ilwt = ioam6_lwt_state(dst->lwtstate);

	/* Check for insertion frequency (i.e., "k over n" insertions) */
	pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
	if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
		goto out;

	orig_daddr = ipv6_hdr(skb)->daddr;

	local_bh_disable();
	cache_dst = dst_cache_get(&ilwt->cache);
	local_bh_enable();

	switch (ilwt->mode) {
	case IOAM6_IPTUNNEL_MODE_INLINE:
do_inline:
		/* Direct insertion - if there is no Hop-by-Hop yet */
		if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
			goto out;

		err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_ENCAP:
do_encap:
		/* Encapsulation (ip6ip6) */
		err = ioam6_do_encap(net, skb, &ilwt->tuninfo,
				     ilwt->has_tunsrc, &ilwt->tunsrc,
				     &ilwt->tundst, cache_dst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_AUTO:
		/* Automatic (RFC8200 compliant):
		 * - local packets -> INLINE mode
		 * - in-transit packets -> ENCAP mode
		 */
		if (!skb->dev)
			goto do_inline;

		goto do_encap;
	default:
		goto drop;
	}

	if (unlikely(!cache_dst)) {
		struct ipv6hdr *hdr = ipv6_hdr(skb);
		struct flowi6 fl6;

		memset(&fl6, 0, sizeof(fl6));
		fl6.daddr = hdr->daddr;
		fl6.saddr = hdr->saddr;
		fl6.flowlabel = ip6_flowinfo(hdr);
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_proto = hdr->nexthdr;

		cache_dst = ip6_route_output(net, NULL, &fl6);
		if (cache_dst->error) {
			err = cache_dst->error;
			goto drop;
		}

		/* cache only if we don't create a dst reference loop */
		if (dst->lwtstate != cache_dst->lwtstate) {
			local_bh_disable();
			dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr);
			local_bh_enable();
		}

		err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev));
		if (unlikely(err))
			goto drop;
	}

	if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
		skb_dst_drop(skb);
		skb_dst_set(skb, cache_dst);
		return dst_output(net, sk, skb);
	}
out:
	dst_release(cache_dst);
	return dst->lwtstate->orig_output(net, sk, skb);
drop:
	dst_release(cache_dst);
	kfree_skb(skb);
	return err;
}

static void ioam6_destroy_state(struct lwtunnel_state *lwt)
{
	dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
}

static int ioam6_fill_encap_info(struct sk_buff *skb,
				 struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int err;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
	if (err)
		goto ret;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
	if (err)
		goto ret;

	err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
	if (err)
		goto ret;

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		if (ilwt->has_tunsrc) {
			err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC,
					       &ilwt->tunsrc);
			if (err)
				goto ret;
		}

		err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
		if (err)
			goto ret;
	}

	err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
		      &ilwt->tuninfo.traceh);
ret:
	return err;
}

static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int nlsize;

	nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
		 nla_total_size(sizeof(ilwt->freq.n)) +
		 nla_total_size(sizeof(ilwt->mode)) +
		 nla_total_size(sizeof(ilwt->tuninfo.traceh));

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		if (ilwt->has_tunsrc)
			nlsize += nla_total_size(sizeof(ilwt->tunsrc));

		nlsize += nla_total_size(sizeof(ilwt->tundst));
	}

	return nlsize;
}
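
/* Returns non-zero when the two encap states differ, as expected by the
 * lwtunnel cmp_encap contract (0 means equal).
 */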
static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
	struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
	struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
	struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);

	return (ilwt_a->freq.k != ilwt_b->freq.k ||
		ilwt_a->freq.n != ilwt_b->freq.n ||
		ilwt_a->mode != ilwt_b->mode ||
		ilwt_a->has_tunsrc != ilwt_b->has_tunsrc ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 ilwt_a->has_tunsrc &&
		 !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) ||
		trace_a->namespace_id != trace_b->namespace_id);
}

static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
	.build_state		= ioam6_build_state,
	.destroy_state		= ioam6_destroy_state,
	.output			= ioam6_output,
	.fill_encap		= ioam6_fill_encap_info,
	.get_encap_size		= ioam6_encap_nlsize,
	.cmp_encap		= ioam6_encap_cmp,
	.owner			= THIS_MODULE,
};

int __init ioam6_iptunnel_init(void)
{
	return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}

void ioam6_iptunnel_exit(void)
{
	lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}