// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* -
 * net/sched/act_ct.c  Connection Tracking action
 *
 * Authors:   Paul Blakey <paulb@mellanox.com>
 *            Yossi Kuperman <yossiku@mellanox.com>
 *            Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_cls.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/rhashtable.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/act_api.h>
#include <net/ip.h>
#include <net/ipv6_frag.h>
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>
#include <net/tc_wrapper.h>

#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_conntrack_act_ct.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <uapi/linux/netfilter/nf_nat.h>

static struct workqueue_struct *act_ct_wq;
static struct rhashtable zones_ht;
static DEFINE_MUTEX(zones_mutex);

struct zones_ht_key {
	struct net *net;
	u16 zone;
};

struct tcf_ct_flow_table {
	struct rhash_head node; /* In zones tables */

	struct rcu_work rwork;
	struct nf_flowtable nf_ft;
	refcount_t ref;
	struct zones_ht_key key;

	bool dying;
};

static const struct rhashtable_params zones_params = {
	.head_offset = offsetof(struct tcf_ct_flow_table, node),
	.key_offset = offsetof(struct tcf_ct_flow_table, key),
	.key_len = offsetofend(struct zones_ht_key, zone),
	.automatic_shrinking = true,
};

static struct flow_action_entry *
tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
{
	int i = flow_action->num_entries++;

	return &flow_action->entries[i];
}

static void tcf_ct_add_mangle_action(struct flow_action *action,
				     enum flow_action_mangle_base htype,
				     u32 offset,
				     u32 mask,
				     u32 val)
{
	struct flow_action_entry *entry;

	entry = tcf_ct_flow_table_flow_action_get_next(action);
	entry->id = FLOW_ACTION_MANGLE;
	entry->mangle.htype = htype;
	entry->mangle.mask = ~mask;
	entry->mangle.offset = offset;
	entry->mangle.val = val;
}

/* The following nat helper functions check if the inverted reverse tuple
 * (target) is different than the current dir tuple - meaning nat for ports
 * and/or ip is needed, and add the relevant mangle actions.
 */
static void
tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
				      struct nf_conntrack_tuple target,
				      struct flow_action *action)
{
	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
					 offsetof(struct iphdr, saddr),
					 0xFFFFFFFF,
					 be32_to_cpu(target.src.u3.ip));
	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
					 offsetof(struct iphdr, daddr),
					 0xFFFFFFFF,
					 be32_to_cpu(target.dst.u3.ip));
}

static void
tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
				   union nf_inet_addr *addr,
				   u32 offset)
{
	int i;

	for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
					 i * sizeof(u32) + offset,
					 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
}

static void
tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
				      struct nf_conntrack_tuple target,
				      struct flow_action *action)
{
	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
		tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
						   offsetof(struct ipv6hdr,
							    saddr));
	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
		tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
						   offsetof(struct ipv6hdr,
							    daddr));
}

static void
tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
				     struct nf_conntrack_tuple target,
				     struct flow_action *action)
{
	__be16 target_src = target.src.u.tcp.port;
	__be16 target_dst = target.dst.u.tcp.port;

	if (target_src != tuple->src.u.tcp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
					 offsetof(struct tcphdr, source),
					 0xFFFF, be16_to_cpu(target_src));
	if (target_dst != tuple->dst.u.tcp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
					 offsetof(struct tcphdr, dest),
					 0xFFFF, be16_to_cpu(target_dst));
}

static void
tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
				     struct nf_conntrack_tuple target,
				     struct flow_action *action)
{
	__be16 target_src = target.src.u.udp.port;
	__be16 target_dst = target.dst.u.udp.port;

	if (target_src != tuple->src.u.udp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
					 offsetof(struct udphdr, source),
					 0xFFFF, be16_to_cpu(target_src));
	if (target_dst != tuple->dst.u.udp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
					 offsetof(struct udphdr, dest),
					 0xFFFF, be16_to_cpu(target_dst));
}
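
/* Fill the CT metadata entry (mark, labels, cookie, direction) for an
 * offloaded flow.
 */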
static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
					      enum ip_conntrack_dir dir,
					      enum ip_conntrack_info ctinfo,
					      struct flow_action *action)
{
	struct nf_conn_labels *ct_labels;
	struct flow_action_entry *entry;
	u32 *act_ct_labels;

	entry = tcf_ct_flow_table_flow_action_get_next(action);
	entry->id = FLOW_ACTION_CT_METADATA;
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	entry->ct_metadata.mark = READ_ONCE(ct->mark);
#endif
	/* aligns with the CT reference on the SKB nf_ct_set */
	entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
	entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;

	act_ct_labels = entry->ct_metadata.labels;
	ct_labels = nf_ct_labels_find(ct);
	if (ct_labels)
		memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
	else
		memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
}

static int tcf_ct_flow_table_add_action_nat(struct net *net,
					    struct nf_conn *ct,
					    enum ip_conntrack_dir dir,
					    struct flow_action *action)
{
	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
	struct nf_conntrack_tuple target;

	if (!(ct->status & IPS_NAT_MASK))
		return 0;

	nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);

	switch (tuple->src.l3num) {
	case NFPROTO_IPV4:
		tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
						      action);
		break;
	case NFPROTO_IPV6:
		tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
						      action);
		break;
	default:
		return -EOPNOTSUPP;
	}

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
		break;
	case IPPROTO_UDP:
		tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
		break;
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}
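
/* Build the offload actions for one direction of a flow: NAT mangles first,
 * then CT metadata.
 */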
static int tcf_ct_flow_table_fill_actions(struct net *net,
					  struct flow_offload *flow,
					  enum flow_offload_tuple_dir tdir,
					  struct nf_flow_rule *flow_rule)
{
	struct flow_action *action = &flow_rule->rule->action;
	int num_entries = action->num_entries;
	struct nf_conn *ct = flow->ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	int i, err;

	switch (tdir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		dir = IP_CT_DIR_ORIGINAL;
		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
			IP_CT_ESTABLISHED : IP_CT_NEW;
		if (ctinfo == IP_CT_ESTABLISHED)
			set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		dir = IP_CT_DIR_REPLY;
		ctinfo = IP_CT_ESTABLISHED_REPLY;
		break;
	default:
		return -EOPNOTSUPP;
	}

	err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
	if (err)
		goto err_nat;

	tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
	return 0;

err_nat:
	/* Clear filled actions */
	for (i = num_entries; i < action->num_entries; i++)
		memset(&action->entries[i], 0, sizeof(action->entries[i]));
	action->num_entries = num_entries;

	return err;
}

static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
{
	return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
	       test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) &&
	       !test_bit(NF_FLOW_HW_PENDING, &flow->flags) &&
	       !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
}

static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft);

static void tcf_ct_nf_get(struct nf_flowtable *ft)
{
	struct tcf_ct_flow_table *ct_ft =
		container_of(ft, struct tcf_ct_flow_table, nf_ft);

	tcf_ct_flow_table_get_ref(ct_ft);
}

static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft);

static void tcf_ct_nf_put(struct nf_flowtable *ft)
{
	struct tcf_ct_flow_table *ct_ft =
		container_of(ft, struct tcf_ct_flow_table, nf_ft);

	tcf_ct_flow_table_put(ct_ft);
}

static struct nf_flowtable_type flowtable_ct = {
	.gc = tcf_ct_flow_is_outdated,
	.action = tcf_ct_flow_table_fill_actions,
	.get = tcf_ct_nf_get,
	.put = tcf_ct_nf_put,
	.owner = THIS_MODULE,
};
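
/* Look up or create the flow table for this (net, zone) pair and take a
 * reference on it for the action instance.
 */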
static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
{
	struct zones_ht_key key = { .net = net, .zone = params->zone };
	struct tcf_ct_flow_table *ct_ft;
	int err = -ENOMEM;

	mutex_lock(&zones_mutex);
	ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params);
	if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
		goto out_unlock;

	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
	if (!ct_ft)
		goto err_alloc;
	refcount_set(&ct_ft->ref, 1);

	ct_ft->key = key;
	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
	if (err)
		goto err_insert;

	ct_ft->nf_ft.type = &flowtable_ct;
	ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
			      NF_FLOWTABLE_COUNTER;
	err = nf_flow_table_init(&ct_ft->nf_ft);
	if (err)
		goto err_init;
	write_pnet(&ct_ft->nf_ft.net, net);

	__module_get(THIS_MODULE);
out_unlock:
	params->ct_ft = ct_ft;
	params->nf_ft = &ct_ft->nf_ft;
	mutex_unlock(&zones_mutex);

	return 0;

err_init:
	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
err_insert:
	kfree(ct_ft);
err_alloc:
	mutex_unlock(&zones_mutex);
	return err;
}

static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
{
	refcount_inc(&ct_ft->ref);
}

static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
{
	struct tcf_ct_flow_table *ct_ft;
	struct flow_block *block;

	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
			     rwork);
	nf_flow_table_free(&ct_ft->nf_ft);

	block = &ct_ft->nf_ft.flow_block;
	down_write(&ct_ft->nf_ft.flow_block_lock);
	WARN_ON(!list_empty(&block->cb_list));
	up_write(&ct_ft->nf_ft.flow_block_lock);
	kfree(ct_ft);

	module_put(THIS_MODULE);
}

static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
{
	if (refcount_dec_and_test(&ct_ft->ref)) {
		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
	}
}

static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
				 struct nf_conn_act_ct_ext *act_ct_ext, u8 dir)
{
	entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC;
	entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
}

static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry)
{
	struct nf_conn_act_ct_ext *act_ct_ext;

	act_ct_ext = nf_conn_act_ct_ext_find(entry->ct);
	if (act_ct_ext) {
		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
	}
}
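
/* Allocate a flow_offload entry for the connection and add it to the zone's
 * flow table; IPS_OFFLOAD_BIT guards against adding it twice.
 */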
static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
				  struct nf_conn *ct,
				  bool tcp, bool bidirectional)
{
	struct nf_conn_act_ct_ext *act_ct_ext;
	struct flow_offload *entry;
	int err;

	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
		return;

	entry = flow_offload_alloc(ct);
	if (!entry) {
		WARN_ON_ONCE(1);
		goto err_alloc;
	}

	if (tcp) {
		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	}
	if (bidirectional)
		__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);

	act_ct_ext = nf_conn_act_ct_ext_find(ct);
	if (act_ct_ext) {
		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
	}

	err = flow_offload_add(&ct_ft->nf_ft, entry);
	if (err)
		goto err_add;

	return;

err_add:
	flow_offload_free(entry);
err_alloc:
	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
}

static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
					   struct nf_conn *ct,
					   enum ip_conntrack_info ctinfo)
{
	bool tcp = false, bidirectional = true;

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		if ((ctinfo != IP_CT_ESTABLISHED &&
		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
		    ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
			return;

		tcp = true;
		break;
	case IPPROTO_UDP:
		if (!nf_ct_is_confirmed(ct))
			return;
		if (!test_bit(IPS_ASSURED_BIT, &ct->status))
			bidirectional = false;
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE: {
		struct nf_conntrack_tuple *tuple;

		if ((ctinfo != IP_CT_ESTABLISHED &&
		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
		    ct->status & IPS_NAT_MASK)
			return;

		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		/* No support for GRE v1 */
		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
			return;
		break;
	}
#endif
	default:
		return;
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & IPS_SEQ_ADJUST)
		return;

	tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
}
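
/* Parse the packet into a flow_offload_tuple for lookup; bail out on
 * fragments, unsupported protocols or an expiring TTL/hop limit.
 */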
static bool
tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
				  struct flow_offload_tuple *tuple,
				  struct tcphdr **tcph)
{
	struct flow_ports *ports;
	unsigned int thoff;
	struct iphdr *iph;
	size_t hdrsize;
	u8 ipproto;

	if (!pskb_network_may_pull(skb, sizeof(*iph)))
		return false;

	iph = ip_hdr(skb);
	thoff = iph->ihl * 4;

	if (ip_is_fragment(iph) ||
	    unlikely(thoff != sizeof(struct iphdr)))
		return false;

	ipproto = iph->protocol;
	switch (ipproto) {
	case IPPROTO_TCP:
		hdrsize = sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		hdrsize = sizeof(*ports);
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		hdrsize = sizeof(struct gre_base_hdr);
		break;
#endif
	default:
		return false;
	}

	if (iph->ttl <= 1)
		return false;

	if (!pskb_network_may_pull(skb, thoff + hdrsize))
		return false;

	switch (ipproto) {
	case IPPROTO_TCP:
		*tcph = (void *)(skb_network_header(skb) + thoff);
		fallthrough;
	case IPPROTO_UDP:
		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
		tuple->src_port = ports->source;
		tuple->dst_port = ports->dest;
		break;
	case IPPROTO_GRE: {
		struct gre_base_hdr *greh;

		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
			return false;
		break;
	}
	}

	iph = ip_hdr(skb);

	tuple->src_v4.s_addr = iph->saddr;
	tuple->dst_v4.s_addr = iph->daddr;
	tuple->l3proto = AF_INET;
	tuple->l4proto = ipproto;

	return true;
}

static bool
tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
				  struct flow_offload_tuple *tuple,
				  struct tcphdr **tcph)
{
	struct flow_ports *ports;
	struct ipv6hdr *ip6h;
	unsigned int thoff;
	size_t hdrsize;
	u8 nexthdr;

	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
		return false;

	ip6h = ipv6_hdr(skb);
	thoff = sizeof(*ip6h);

	nexthdr = ip6h->nexthdr;
	switch (nexthdr) {
	case IPPROTO_TCP:
		hdrsize = sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		hdrsize = sizeof(*ports);
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		hdrsize = sizeof(struct gre_base_hdr);
		break;
#endif
	default:
		return false;
	}

	if (ip6h->hop_limit <= 1)
		return false;

	if (!pskb_network_may_pull(skb, thoff + hdrsize))
		return false;

	switch (nexthdr) {
	case IPPROTO_TCP:
		*tcph = (void *)(skb_network_header(skb) + thoff);
		fallthrough;
	case IPPROTO_UDP:
		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
		tuple->src_port = ports->source;
		tuple->dst_port = ports->dest;
		break;
	case IPPROTO_GRE: {
		struct gre_base_hdr *greh;

		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
			return false;
		break;
	}
	}

	ip6h = ipv6_hdr(skb);

	tuple->src_v6 = ip6h->saddr;
	tuple->dst_v6 = ip6h->daddr;
	tuple->l3proto = AF_INET6;
	tuple->l4proto = nexthdr;

	return true;
}
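
/* Fast path: match the skb against an offloaded flow and restore its
 * conntrack state without running a full conntrack lookup.
 */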
static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
				     struct sk_buff *skb,
				     u8 family)
{
	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload_tuple tuple = {};
	enum ip_conntrack_info ctinfo;
	struct tcphdr *tcph = NULL;
	bool force_refresh = false;
	struct flow_offload *flow;
	struct nf_conn *ct;
	u8 dir;

	switch (family) {
	case NFPROTO_IPV4:
		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
			return false;
		break;
	case NFPROTO_IPV6:
		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
			return false;
		break;
	default:
		return false;
	}

	tuplehash = flow_offload_lookup(nf_ft, &tuple);
	if (!tuplehash)
		return false;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	ct = flow->ct;

	if (dir == FLOW_OFFLOAD_DIR_REPLY &&
	    !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
		/* Only offload reply direction after connection became
		 * assured.
		 */
		if (test_bit(IPS_ASSURED_BIT, &ct->status))
			set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
		else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
			/* If flow_table flow has already been updated to the
			 * established state, then don't refresh.
			 */
			return false;
		force_refresh = true;
	}

	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
		flow_offload_teardown(flow);
		return false;
	}

	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
			IP_CT_ESTABLISHED : IP_CT_NEW;
	else
		ctinfo = IP_CT_ESTABLISHED_REPLY;

	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
	tcf_ct_flow_ct_ext_ifidx_update(flow);
	flow_offload_refresh(nf_ft, flow, force_refresh);
	if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
		/* Process this flow in SW to allow promoting to ASSURED */
		return false;
	}

	nf_conntrack_get(&ct->ct_general);
	nf_ct_set(skb, ct, ctinfo);
	if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
		nf_ct_acct_update(ct, dir, skb->len);

	return true;
}

static int tcf_ct_flow_tables_init(void)
{
	return rhashtable_init(&zones_ht, &zones_params);
}

static void tcf_ct_flow_tables_uninit(void)
{
	rhashtable_destroy(&zones_ht);
}

static struct tc_action_ops act_ct_ops;

struct tc_ct_action_net {
	struct tc_action_net tn; /* Must be first */
};

/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
				   struct tcf_ct_params *p)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return false;
	if (!net_eq(net, read_pnet(&ct->ct_net)))
		goto drop_ct;
	if (nf_ct_zone(ct)->id != p->zone)
		goto drop_ct;
	if (p->helper) {
		struct nf_conn_help *help;

		help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
		if (help && rcu_access_pointer(help->helper) != p->helper)
			goto drop_ct;
	}

	/* Force conntrack entry direction. */
	if ((p->ct_action & TCA_CT_ACT_FORCE) &&
	    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
		if (nf_ct_is_confirmed(ct))
			nf_ct_kill(ct);

		goto drop_ct;
	}

	return true;

drop_ct:
	nf_ct_put(ct);
	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);

	return false;
}

static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
{
	u8 family = NFPROTO_UNSPEC;

	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		family = NFPROTO_IPV4;
		break;
	case htons(ETH_P_IPV6):
		family = NFPROTO_IPV6;
		break;
	default:
		break;
	}

	return family;
}

static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
{
	unsigned int len;

	len = skb_network_offset(skb) + sizeof(struct iphdr);
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	*frag = ip_is_fragment(ip_hdr(skb));
	return 0;
}

static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
{
	unsigned int flags = 0, len, payload_ofs = 0;
	unsigned short frag_off;
	int nexthdr;

	len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
	if (unlikely(nexthdr < 0))
		return -EPROTO;

	*frag = flags & IP6_FH_F_FRAG;
	return 0;
}
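
/* Defragment the packet if needed so conntrack sees it in full; the
 * resulting MRU is saved in the tc skb control block.
 */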
static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
				   u8 family, u16 zone, bool *defrag)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err = 0;
	bool frag;
	u8 proto;
	u16 mru;

	/* Previously seen (loopback)? Ignore. */
	ct = nf_ct_get(skb, &ctinfo);
	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
		return 0;

	if (family == NFPROTO_IPV4)
		err = tcf_ct_ipv4_is_fragment(skb, &frag);
	else
		err = tcf_ct_ipv6_is_fragment(skb, &frag);
	if (err || !frag)
		return err;

	err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
	if (err)
		return err;

	*defrag = true;
	tc_skb_cb(skb)->mru = mru;

	return 0;
}

static void tcf_ct_params_free(struct tcf_ct_params *params)
{
	if (params->helper) {
#if IS_ENABLED(CONFIG_NF_NAT)
		if (params->ct_action & TCA_CT_ACT_NAT)
			nf_nat_helper_put(params->helper);
#endif
		nf_conntrack_helper_put(params->helper);
	}
	if (params->ct_ft)
		tcf_ct_flow_table_put(params->ct_ft);
	if (params->tmpl) {
		if (params->put_labels)
			nf_connlabels_put(nf_ct_net(params->tmpl));

		nf_ct_put(params->tmpl);
	}

	kfree(params);
}

static void tcf_ct_params_free_rcu(struct rcu_head *head)
{
	struct tcf_ct_params *params;

	params = container_of(head, struct tcf_ct_params, rcu);
	tcf_ct_params_free(params);
}

static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	u32 new_mark;

	if (!mask)
		return;

	new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
	if (READ_ONCE(ct->mark) != new_mark) {
		WRITE_ONCE(ct->mark, new_mark);
		if (nf_ct_is_confirmed(ct))
			nf_conntrack_event_cache(IPCT_MARK, ct);
	}
#endif
}

static void tcf_ct_act_set_labels(struct nf_conn *ct,
				  u32 *labels,
				  u32 *labels_m)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
	size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);

	if (!memchr_inv(labels_m, 0, labels_sz))
		return;

	nf_connlabels_replace(ct, labels, labels_m, 4);
#endif
}

static int tcf_ct_act_nat(struct sk_buff *skb,
			  struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  int ct_action,
			  struct nf_nat_range2 *range,
			  bool commit)
{
#if IS_ENABLED(CONFIG_NF_NAT)
	int err, action = 0;

	if (!(ct_action & TCA_CT_ACT_NAT))
		return NF_ACCEPT;
	if (ct_action & TCA_CT_ACT_NAT_SRC)
		action |= BIT(NF_NAT_MANIP_SRC);
	if (ct_action & TCA_CT_ACT_NAT_DST)
		action |= BIT(NF_NAT_MANIP_DST);

	err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
	if (err != NF_ACCEPT)
		return err & NF_VERDICT_MASK;

	if (action & BIT(NF_NAT_MANIP_SRC))
		tc_skb_cb(skb)->post_ct_snat = 1;
	if (action & BIT(NF_NAT_MANIP_DST))
		tc_skb_cb(skb)->post_ct_dnat = 1;

	return err;
#else
	return NF_ACCEPT;
#endif
}
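
/* Packet path of the ct action: handle 'clear', defragment, run conntrack
 * (or reuse a cached/offloaded entry), apply NAT and the helper, optionally
 * commit, then consider the connection for flow table offload.
 */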
TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
				 struct tcf_result *res)
{
	struct net *net = dev_net(skb->dev);
	enum ip_conntrack_info ctinfo;
	struct tcf_ct *c = to_ct(a);
	struct nf_conn *tmpl = NULL;
	struct nf_hook_state state;
	bool cached, commit, clear;
	int nh_ofs, err, retval;
	struct tcf_ct_params *p;
	bool add_helper = false;
	bool skip_add = false;
	bool defrag = false;
	struct nf_conn *ct;
	u8 family;

	p = rcu_dereference_bh(c->params);

	retval = READ_ONCE(c->tcf_action);
	commit = p->ct_action & TCA_CT_ACT_COMMIT;
	clear = p->ct_action & TCA_CT_ACT_CLEAR;
	tmpl = p->tmpl;

	tcf_lastuse_update(&c->tcf_tm);
	tcf_action_update_bstats(&c->common, skb);

	if (clear) {
		tc_skb_cb(skb)->post_ct = false;
		ct = nf_ct_get(skb, &ctinfo);
		if (ct) {
			nf_ct_put(ct);
			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
		}

		goto out_clear;
	}

	family = tcf_ct_skb_nf_family(skb);
	if (family == NFPROTO_UNSPEC)
		goto drop;

	/* The conntrack module expects to be working at L3.
	 * We also try to pull the IPv4/6 header to linear area
	 */
	nh_ofs = skb_network_offset(skb);
	skb_pull_rcsum(skb, nh_ofs);
	err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
	if (err)
		goto out_frag;

	err = nf_ct_skb_network_trim(skb, family);
	if (err)
		goto drop;

	/* If we are recirculating packets to match on ct fields and
	 * committing with a separate ct action, then we don't need to
	 * actually run the packet through conntrack twice unless it's for a
	 * different zone.
	 */
	cached = tcf_ct_skb_nfct_cached(net, skb, p);
	if (!cached) {
		if (tcf_ct_flow_table_lookup(p, skb, family)) {
			skip_add = true;
			goto do_nat;
		}

		/* Associate skb with specified zone. */
		if (tmpl) {
			nf_conntrack_put(skb_nfct(skb));
			nf_conntrack_get(&tmpl->ct_general);
			nf_ct_set(skb, tmpl, IP_CT_NEW);
		}

		state.hook = NF_INET_PRE_ROUTING;
		state.net = net;
		state.pf = family;
		err = nf_conntrack_in(skb, &state);
		if (err != NF_ACCEPT)
			goto nf_error;
	}

do_nat:
	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		goto out_push;

	nf_ct_deliver_cached_events(ct);
	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);

	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
	if (err != NF_ACCEPT)
		goto nf_error;

	if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) {
		err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC);
		if (err)
			goto drop;
		add_helper = true;
		if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) {
			if (!nfct_seqadj_ext_add(ct))
				goto drop;
		}
	}

	if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) {
		err = nf_ct_helper(skb, ct, ctinfo, family);
		if (err != NF_ACCEPT)
			goto nf_error;
	}

	if (commit) {
		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);

		if (!nf_ct_is_confirmed(ct))
			nf_conn_act_ct_ext_add(skb, ct, ctinfo);

		/* This will take care of sending queued events
		 * even if the connection is already confirmed.
		 */
		err = nf_conntrack_confirm(skb);
		if (err != NF_ACCEPT)
			goto nf_error;

		/* The ct may be dropped if a clash has been resolved,
		 * so it's necessary to retrieve it from skb again to
		 * prevent UAF.
		 */
		ct = nf_ct_get(skb, &ctinfo);
		if (!ct)
			skip_add = true;
	}

	if (!skip_add)
		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);

out_push:
	skb_push_rcsum(skb, nh_ofs);

	tc_skb_cb(skb)->post_ct = true;
	tc_skb_cb(skb)->zone = p->zone;
out_clear:
	if (defrag)
		qdisc_skb_cb(skb)->pkt_len = skb->len;
	return retval;

out_frag:
	if (err != -EINPROGRESS)
		tcf_action_inc_drop_qstats(&c->common);
	return TC_ACT_CONSUMED;

drop:
	tcf_action_inc_drop_qstats(&c->common);
	return TC_ACT_SHOT;

nf_error:
	/* some verdicts store extra data in upper bits, such
	 * as errno or queue number.
	 */
	switch (err & NF_VERDICT_MASK) {
	case NF_DROP:
		goto drop;
	case NF_STOLEN:
		tcf_action_inc_drop_qstats(&c->common);
		return TC_ACT_CONSUMED;
	default:
		DEBUG_NET_WARN_ON_ONCE(1);
		goto drop;
	}
}

static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
	[TCA_CT_ACTION] = { .type = NLA_U16 },
	[TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
	[TCA_CT_ZONE] = { .type = NLA_U16 },
	[TCA_CT_MARK] = { .type = NLA_U32 },
	[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
	[TCA_CT_LABELS] = { .type = NLA_BINARY,
			    .len = 128 / BITS_PER_BYTE },
	[TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
				 .len = 128 / BITS_PER_BYTE },
	[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
	[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
	[TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
	[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
	[TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN },
	[TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 },
	[TCA_CT_HELPER_PROTO] = { .type = NLA_U8 },
};

static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
				  struct tc_ct *parm,
				  struct nlattr **tb,
				  struct netlink_ext_ack *extack)
{
	struct nf_nat_range2 *range;

	if (!(p->ct_action & TCA_CT_ACT_NAT))
		return 0;

	if (!IS_ENABLED(CONFIG_NF_NAT)) {
		NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
		return -EOPNOTSUPP;
	}

	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
		return 0;

	if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
	    (p->ct_action & TCA_CT_ACT_NAT_DST)) {
		NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
		return -EOPNOTSUPP;
	}

	range = &p->range;
	if (tb[TCA_CT_NAT_IPV4_MIN]) {
		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];

		p->ipv4_range = true;
		range->flags |= NF_NAT_RANGE_MAP_IPS;
		range->min_addr.ip =
			nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);

		range->max_addr.ip = max_attr ?
				     nla_get_in_addr(max_attr) :
				     range->min_addr.ip;
	} else if (tb[TCA_CT_NAT_IPV6_MIN]) {
		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];

		p->ipv4_range = false;
		range->flags |= NF_NAT_RANGE_MAP_IPS;
		range->min_addr.in6 =
			nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);

		range->max_addr.in6 = max_attr ?
				      nla_get_in6_addr(max_attr) :
				      range->min_addr.in6;
	}

	if (tb[TCA_CT_NAT_PORT_MIN]) {
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
		range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);

		range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
				       nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
				       range->min_proto.all;
	}

	return 0;
}

static void tcf_ct_set_key_val(struct nlattr **tb,
			       void *val, int val_type,
			       void *mask, int mask_type,
			       int len)
{
	if (!tb[val_type])
		return;
	nla_memcpy(val, tb[val_type], len);

	if (!mask)
		return;

	if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
		memset(mask, 0xff, len);
	else
		nla_memcpy(mask, tb[mask_type], len);
}
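
/* Parse the netlink attributes into the action parameters: ct_action flags,
 * NAT range, mark, labels, zone, conntrack template and helper.
 */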
static int tcf_ct_fill_params(struct net *net,
			      struct tcf_ct_params *p,
			      struct tc_ct *parm,
			      struct nlattr **tb,
			      struct netlink_ext_ack *extack)
{
	struct nf_conntrack_zone zone;
	int err, family, proto, len;
	bool put_labels = false;
	struct nf_conn *tmpl;
	char *name;

	p->zone = NF_CT_DEFAULT_ZONE_ID;

	tcf_ct_set_key_val(tb,
			   &p->ct_action, TCA_CT_ACTION,
			   NULL, TCA_CT_UNSPEC,
			   sizeof(p->ct_action));

	if (p->ct_action & TCA_CT_ACT_CLEAR)
		return 0;

	err = tcf_ct_fill_params_nat(p, parm, tb, extack);
	if (err)
		return err;

	if (tb[TCA_CT_MARK]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
			return -EOPNOTSUPP;
		}
		tcf_ct_set_key_val(tb,
				   &p->mark, TCA_CT_MARK,
				   &p->mark_mask, TCA_CT_MARK_MASK,
				   sizeof(p->mark));
	}

	if (tb[TCA_CT_LABELS]) {
		unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;

		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
			return -EOPNOTSUPP;
		}

		if (nf_connlabels_get(net, n_bits - 1)) {
			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
			return -EOPNOTSUPP;
		} else {
			put_labels = true;
		}

		tcf_ct_set_key_val(tb,
				   p->labels, TCA_CT_LABELS,
				   p->labels_mask, TCA_CT_LABELS_MASK,
				   sizeof(p->labels));
	}

	if (tb[TCA_CT_ZONE]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
			return -EOPNOTSUPP;
		}

		tcf_ct_set_key_val(tb,
				   &p->zone, TCA_CT_ZONE,
				   NULL, TCA_CT_UNSPEC,
				   sizeof(p->zone));
	}

	nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
	if (!tmpl) {
		NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
		return -ENOMEM;
	}
	p->tmpl = tmpl;
	if (tb[TCA_CT_HELPER_NAME]) {
		name = nla_data(tb[TCA_CT_HELPER_NAME]);
		len = nla_len(tb[TCA_CT_HELPER_NAME]);
		if (len > 16 || name[len - 1] != '\0') {
			NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name.");
			err = -EINVAL;
			goto err;
		}
		family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET;
		proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP;
		err = nf_ct_add_helper(tmpl, name, family, proto,
				       p->ct_action & TCA_CT_ACT_NAT, &p->helper);
		if (err) {
			NL_SET_ERR_MSG_MOD(extack, "Failed to add helper");
			goto err;
		}
	}

	p->put_labels = put_labels;

	if (p->ct_action & TCA_CT_ACT_COMMIT)
		__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
	return 0;
err:
	if (put_labels)
		nf_connlabels_put(net);

	nf_ct_put(p->tmpl);
	p->tmpl = NULL;
	return err;
}

static int tcf_ct_init(struct net *net, struct nlattr *nla,
		       struct nlattr *est, struct tc_action **a,
		       struct tcf_proto *tp, u32 flags,
		       struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id);
	bool bind = flags & TCA_ACT_FLAGS_BIND;
	struct tcf_ct_params *params = NULL;
	struct nlattr *tb[TCA_CT_MAX + 1];
	struct tcf_chain *goto_ch = NULL;
	struct tc_ct *parm;
	struct tcf_ct *c;
	int err, res = 0;
	u32 index;

	if (!nla) {
		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
		return -EINVAL;
	}

	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_CT_PARMS]) {
		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
		return -EINVAL;
	}
	parm = nla_data(tb[TCA_CT_PARMS]);
	index = parm->index;
	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0)
		return err;

	if (!err) {
		err = tcf_idr_create_from_flags(tn, index, est, a,
						&act_ct_ops, bind, flags);
		if (err) {
			tcf_idr_cleanup(tn, index);
			return err;
		}
		res = ACT_P_CREATED;
	} else {
		if (bind)
			return ACT_P_BOUND;

		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
			tcf_idr_release(*a, bind);
			return -EEXIST;
		}
	}
	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (err < 0)
		goto cleanup;

	c = to_ct(*a);

	params = kzalloc(sizeof(*params), GFP_KERNEL);
	if (unlikely(!params)) {
		err = -ENOMEM;
		goto cleanup;
	}

	err = tcf_ct_fill_params(net, params, parm, tb, extack);
	if (err)
		goto cleanup;

	err = tcf_ct_flow_table_get(net, params);
	if (err)
		goto cleanup;

	spin_lock_bh(&c->tcf_lock);
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	params = rcu_replace_pointer(c->params, params,
				     lockdep_is_held(&c->tcf_lock));
	spin_unlock_bh(&c->tcf_lock);

	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	if (params)
		call_rcu(&params->rcu, tcf_ct_params_free_rcu);

	return res;

cleanup:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	if (params)
		tcf_ct_params_free(params);
	tcf_idr_release(*a, bind);
	return err;
}

static void tcf_ct_cleanup(struct tc_action *a)
{
	struct tcf_ct_params *params;
	struct tcf_ct *c = to_ct(a);

	params = rcu_dereference_protected(c->params, 1);
	if (params)
		call_rcu(&params->rcu, tcf_ct_params_free_rcu);
}

static int tcf_ct_dump_key_val(struct sk_buff *skb,
			       void *val, int val_type,
			       void *mask, int mask_type,
			       int len)
{
	int err;

	if (mask && !memchr_inv(mask, 0, len))
		return 0;

	err = nla_put(skb, val_type, len, val);
	if (err)
		return err;

	if (mask_type != TCA_CT_UNSPEC) {
		err = nla_put(skb, mask_type, len, mask);
		if (err)
			return err;
	}

	return 0;
}

static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
{
	struct nf_nat_range2 *range = &p->range;

	if (!(p->ct_action & TCA_CT_ACT_NAT))
		return 0;

	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
		return 0;

	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
		if (p->ipv4_range) {
			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
					    range->min_addr.ip))
				return -1;
			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
					    range->max_addr.ip))
				return -1;
		} else {
			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
					     &range->min_addr.in6))
				return -1;
			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
					     &range->max_addr.in6))
				return -1;
		}
	}

	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
				 range->min_proto.all))
			return -1;
		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
				 range->max_proto.all))
			return -1;
	}

	return 0;
}

static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper)
{
	if (!helper)
		return 0;

	if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) ||
	    nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) ||
	    nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum))
		return -1;

	return 0;
}

static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
			      int bind, int ref)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tcf_ct *c = to_ct(a);
	struct tcf_ct_params *p;
	struct tc_ct opt = {
		.index   = c->tcf_index,
		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
		.bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
	};
	struct tcf_t t;

	spin_lock_bh(&c->tcf_lock);
	p = rcu_dereference_protected(c->params,
				      lockdep_is_held(&c->tcf_lock));
	opt.action = c->tcf_action;

	if (tcf_ct_dump_key_val(skb,
				&p->ct_action, TCA_CT_ACTION,
				NULL, TCA_CT_UNSPEC,
				sizeof(p->ct_action)))
		goto nla_put_failure;

	if (p->ct_action & TCA_CT_ACT_CLEAR)
		goto skip_dump;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
	    tcf_ct_dump_key_val(skb,
				&p->mark, TCA_CT_MARK,
				&p->mark_mask, TCA_CT_MARK_MASK,
				sizeof(p->mark)))
		goto nla_put_failure;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    tcf_ct_dump_key_val(skb,
				p->labels, TCA_CT_LABELS,
				p->labels_mask, TCA_CT_LABELS_MASK,
				sizeof(p->labels)))
		goto nla_put_failure;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    tcf_ct_dump_key_val(skb,
				&p->zone, TCA_CT_ZONE,
				NULL, TCA_CT_UNSPEC,
				sizeof(p->zone)))
		goto nla_put_failure;

	if (tcf_ct_dump_nat(skb, p))
		goto nla_put_failure;

	if (tcf_ct_dump_helper(skb, p->helper))
		goto nla_put_failure;

skip_dump:
	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	tcf_tm_dump(&t, &c->tcf_tm);
	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
		goto nla_put_failure;
	spin_unlock_bh(&c->tcf_lock);

	return skb->len;
nla_put_failure:
	spin_unlock_bh(&c->tcf_lock);
	nlmsg_trim(skb, b);
	return -1;
}

static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
			     u64 drops, u64 lastuse, bool hw)
{
	struct tcf_ct *c = to_ct(a);

	tcf_action_update_stats(a, bytes, packets, drops, hw);
	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
}

static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
				    u32 *index_inc, bool bind,
				    struct netlink_ext_ack *extack)
{
	if (bind) {
		struct flow_action_entry *entry = entry_data;

		if (tcf_ct_helper(act))
			return -EOPNOTSUPP;

		entry->id = FLOW_ACTION_CT;
		entry->ct.action = tcf_ct_action(act);
		entry->ct.zone = tcf_ct_zone(act);
		entry->ct.flow_table = tcf_ct_ft(act);
		*index_inc = 1;
	} else {
		struct flow_offload_action *fl_action = entry_data;

		fl_action->id = FLOW_ACTION_CT;
	}

	return 0;
}

static struct tc_action_ops act_ct_ops = {
	.kind = "ct",
	.id = TCA_ID_CT,
	.owner = THIS_MODULE,
	.act = tcf_ct_act,
	.dump = tcf_ct_dump,
	.init = tcf_ct_init,
	.cleanup = tcf_ct_cleanup,
	.stats_update = tcf_stats_update,
	.offload_act_setup = tcf_ct_offload_act_setup,
	.size = sizeof(struct tcf_ct),
};
MODULE_ALIAS_NET_ACT("ct");

static __net_init int ct_init_net(struct net *net)
{
	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);

	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
}

static void __net_exit ct_exit_net(struct list_head *net_list)
{
	tc_action_net_exit(net_list, act_ct_ops.net_id);
}

static struct pernet_operations ct_net_ops = {
	.init = ct_init_net,
	.exit_batch = ct_exit_net,
	.id   = &act_ct_ops.net_id,
	.size = sizeof(struct tc_ct_action_net),
};

static int __init ct_init_module(void)
{
	int err;

	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
	if (!act_ct_wq)
		return -ENOMEM;

	err = tcf_ct_flow_tables_init();
	if (err)
		goto err_tbl_init;

	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
	if (err)
		goto err_register;

	static_branch_inc(&tcf_frag_xmit_count);

	return 0;

err_register:
	tcf_ct_flow_tables_uninit();
err_tbl_init:
	destroy_workqueue(act_ct_wq);
	return err;
}

static void __exit ct_cleanup_module(void)
{
	static_branch_dec(&tcf_frag_xmit_count);
	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
	tcf_ct_flow_tables_uninit();
	destroy_workqueue(act_ct_wq);
}

module_init(ct_init_module);
module_exit(ct_cleanup_module);
MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
MODULE_DESCRIPTION("Connection tracking action");
MODULE_LICENSE("GPL v2");