  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * ARIA Cipher 64-way parallel algorithm (AVX512)
  4. *
  5. * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
  6. *
  7. */
  8. #include <linux/linkage.h>
  9. #include <asm/frame.h>
  10. #include <asm/asm-offsets.h>
  11. #include <linux/cfi_types.h>
  12. /* register macros */
  13. #define CTX %rdi
/*
 * BV8: pack eight 0/1 bits, listed least-significant first, into one
 * byte constant.  Lets a bit-matrix row be written as a readable bit list.
 */
#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)	\
	( (((a0) & 1) << 0) |			\
	  (((a1) & 1) << 1) |			\
	  (((a2) & 1) << 2) |			\
	  (((a3) & 1) << 3) |			\
	  (((a4) & 1) << 4) |			\
	  (((a5) & 1) << 5) |			\
	  (((a6) & 1) << 6) |			\
	  (((a7) & 1) << 7) )
/*
 * BM8X8: pack eight BV8 rows into a 64-bit bit-matrix constant.
 * Row l0 is placed in the most-significant byte (the row order the
 * GF2P8AFFINE* instructions consume).
 */
#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)	\
	( ((l7) << (0 * 8)) |			\
	  ((l6) << (1 * 8)) |			\
	  ((l5) << (2 * 8)) |			\
	  ((l4) << (3 * 8)) |			\
	  ((l3) << (4 * 8)) |			\
	  ((l2) << (5 * 8)) |			\
	  ((l1) << (6 * 8)) |			\
	  ((l0) << (7 * 8)) )
/*
 * add_le128: out = in + lo_counter as four independent 128-bit
 * little-endian integers (one per 128-bit lane of a zmm register).
 * A lane's low qword carried when the sum is unsigned-below
 * lo_counter (vpcmpuq predicate $1 = LT).  kaddb doubles the mask,
 * i.e. shifts each carry bit from the low-qword slot into the
 * high-qword slot, so hi_counter1 (per-lane 0:1) is added only to
 * the high qwords that need the carry.  Clobbers %k1.
 */
#define add_le128(out, in, lo_counter, hi_counter1)	\
	vpaddq lo_counter, in, out;			\
	vpcmpuq $1, lo_counter, out, %k1;		\
	kaddb %k1, %k1, %k1;				\
	vpaddq hi_counter1, out, out{%k1};
/*
 * filter_8bit: split-nibble table lookup — low nibbles are looked up
 * through lo_t, high nibbles (shifted down) through hi_t, both via
 * vpshufb, and the two halves are XOR-combined.
 *
 * NOTE(review): `vpandqn` looks like a typo for `vpandnq`; it is
 * harmless only while this macro is never expanded (the GFNI S-box
 * path below is used instead) — confirm before reusing this macro.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpandq x, mask4bit, tmp0;			\
	vpandqn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxorq tmp0, x, x;
/*
 * transpose_4x4: transpose the 4x4 matrix of 32-bit words held across
 * x0..x3, independently within each 128-bit lane.  t1/t2 are clobbered
 * as scratch.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;			\
	vpunpckldq x1, x0, x0;			\
						\
	vpunpckldq x3, x2, t1;			\
	vpunpckhdq x3, x2, x2;			\
						\
	vpunpckhqdq t1, x0, x1;			\
	vpunpcklqdq t1, x0, x0;			\
						\
	vpunpckhqdq x2, t2, x3;			\
	vpunpcklqdq x2, t2, x2;
/*
 * byteslice_16x16b: convert 16 block-per-register zmm registers into
 * byte-sliced form via a 16x16 byte-matrix transpose built from
 * transpose_4x4 passes plus a .Lshufb_16x16b byte shuffle.
 * st0/st1 are two 64-byte memory slots used to spill registers while
 * they serve as transpose scratch.
 * Does not adjust output byte order inside vectors.
 */
#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu64 d2, st0;				\
	vmovdqu64 d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 a0, st0;				\
	vmovdqu64 a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu64 st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu64 d3, st1;				\
	vmovdqu64 st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu64 d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 b0, st0;				\
	vmovdqu64 b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu64 st0, b0;				\
	vmovdqu64 st1, b1;				\
	/* does not adjust output bytes inside vectors */
/*
 * debyteslice_16x16b: inverse of byteslice_16x16b — convert byte-sliced
 * registers back to block-per-register layout.  Identical structure,
 * but the final transpose passes use the rotated register order
 * (c,d,a,b) to undo the slicing permutation.
 * st0/st1 are two 64-byte memory spill slots.
 * Does not adjust output byte order inside vectors.
 */
#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu64 d2, st0;				\
	vmovdqu64 d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 a0, st0;				\
	vmovdqu64 a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti64x2 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu64 st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu64 d3, st1;				\
	vmovdqu64 st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu64 d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu64 st0, d2;				\
	vmovdqu64 st1, d3;				\
							\
	vmovdqu64 b0, st0;				\
	vmovdqu64 b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu64 st0, b0;				\
	vmovdqu64 st1, b1;				\
	/* does not adjust output bytes inside vectors */
/*
 * inpack16_pre: load sixteen consecutive 64-byte vectors (64 ARIA
 * blocks) from memory at rio into x0..x7, y0..y7.
 */
#define inpack16_pre(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     rio)		\
	vmovdqu64 (0 * 64)(rio), x0;	\
	vmovdqu64 (1 * 64)(rio), x1;	\
	vmovdqu64 (2 * 64)(rio), x2;	\
	vmovdqu64 (3 * 64)(rio), x3;	\
	vmovdqu64 (4 * 64)(rio), x4;	\
	vmovdqu64 (5 * 64)(rio), x5;	\
	vmovdqu64 (6 * 64)(rio), x6;	\
	vmovdqu64 (7 * 64)(rio), x7;	\
	vmovdqu64 (8 * 64)(rio), y0;	\
	vmovdqu64 (9 * 64)(rio), y1;	\
	vmovdqu64 (10 * 64)(rio), y2;	\
	vmovdqu64 (11 * 64)(rio), y3;	\
	vmovdqu64 (12 * 64)(rio), y4;	\
	vmovdqu64 (13 * 64)(rio), y5;	\
	vmovdqu64 (14 * 64)(rio), y6;	\
	vmovdqu64 (15 * 64)(rio), y7;
/*
 * inpack16_post: byte-slice the sixteen loaded registers, then spill
 * the resulting state to the two 512-byte scratch areas mem_ab (x
 * half) and mem_cd (y half).  mem_ab/mem_cd double as the spill slots
 * for the byteslice itself.
 */
#define inpack16_post(x0, x1, x2, x3,		\
		      x4, x5, x6, x7,		\
		      y0, y1, y2, y3,		\
		      y4, y5, y6, y7,		\
		      mem_ab, mem_cd)		\
	byteslice_16x16b(x0, x1, x2, x3,	\
			 x4, x5, x6, x7,	\
			 y0, y1, y2, y3,	\
			 y4, y5, y6, y7,	\
			 (mem_ab), (mem_cd));	\
						\
	vmovdqu64 x0, 0 * 64(mem_ab);		\
	vmovdqu64 x1, 1 * 64(mem_ab);		\
	vmovdqu64 x2, 2 * 64(mem_ab);		\
	vmovdqu64 x3, 3 * 64(mem_ab);		\
	vmovdqu64 x4, 4 * 64(mem_ab);		\
	vmovdqu64 x5, 5 * 64(mem_ab);		\
	vmovdqu64 x6, 6 * 64(mem_ab);		\
	vmovdqu64 x7, 7 * 64(mem_ab);		\
	vmovdqu64 y0, 0 * 64(mem_cd);		\
	vmovdqu64 y1, 1 * 64(mem_cd);		\
	vmovdqu64 y2, 2 * 64(mem_cd);		\
	vmovdqu64 y3, 3 * 64(mem_cd);		\
	vmovdqu64 y4, 4 * 64(mem_cd);		\
	vmovdqu64 y5, 5 * 64(mem_cd);		\
	vmovdqu64 y6, 6 * 64(mem_cd);		\
	vmovdqu64 y7, 7 * 64(mem_cd);
  209. #define write_output(x0, x1, x2, x3, \
  210. x4, x5, x6, x7, \
  211. y0, y1, y2, y3, \
  212. y4, y5, y6, y7, \
  213. mem) \
  214. vmovdqu64 x0, 0 * 64(mem); \
  215. vmovdqu64 x1, 1 * 64(mem); \
  216. vmovdqu64 x2, 2 * 64(mem); \
  217. vmovdqu64 x3, 3 * 64(mem); \
  218. vmovdqu64 x4, 4 * 64(mem); \
  219. vmovdqu64 x5, 5 * 64(mem); \
  220. vmovdqu64 x6, 6 * 64(mem); \
  221. vmovdqu64 x7, 7 * 64(mem); \
  222. vmovdqu64 y0, 8 * 64(mem); \
  223. vmovdqu64 y1, 9 * 64(mem); \
  224. vmovdqu64 y2, 10 * 64(mem); \
  225. vmovdqu64 y3, 11 * 64(mem); \
  226. vmovdqu64 y4, 12 * 64(mem); \
  227. vmovdqu64 y5, 13 * 64(mem); \
  228. vmovdqu64 y6, 14 * 64(mem); \
  229. vmovdqu64 y7, 15 * 64(mem); \
/*
 * aria_store_state_8way: spill eight state registers to the scratch
 * area at mem_tmp, starting at 64-byte slot `idx`.
 */
#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp);	\
	vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp);	\
	vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp);	\
	vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp);	\
	vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp);	\
	vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp);	\
	vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp);	\
	vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
/*
 * aria_load_state_8way: reload eight state registers from the scratch
 * area at mem_tmp, starting at 64-byte slot `idx` (inverse of
 * aria_store_state_8way).
 */
#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0;	\
	vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1;	\
	vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2;	\
	vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3;	\
	vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4;	\
	vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5;	\
	vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6;	\
	vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
/*
 * aria_ark_16way: AddRoundKey for byte-sliced state.  Each of the 16
 * bytes of round key `round` (at rk + round*16) is broadcast to every
 * lane of t0 and XORed into the matching state register.  Key bytes
 * are taken in 3,2,1,0 / 7,6,5,4 / ... order within each 4-byte
 * group, matching the byte-sliced word layout.  t0 is scratch.
 */
#define aria_ark_16way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7,			\
		       t0, rk, round)			\
	/* AddRoundKey */				\
	vpbroadcastb ((round * 16) + 3)(rk), t0;	\
	vpxorq t0, x0, x0;				\
	vpbroadcastb ((round * 16) + 2)(rk), t0;	\
	vpxorq t0, x1, x1;				\
	vpbroadcastb ((round * 16) + 1)(rk), t0;	\
	vpxorq t0, x2, x2;				\
	vpbroadcastb ((round * 16) + 0)(rk), t0;	\
	vpxorq t0, x3, x3;				\
	vpbroadcastb ((round * 16) + 7)(rk), t0;	\
	vpxorq t0, x4, x4;				\
	vpbroadcastb ((round * 16) + 6)(rk), t0;	\
	vpxorq t0, x5, x5;				\
	vpbroadcastb ((round * 16) + 5)(rk), t0;	\
	vpxorq t0, x6, x6;				\
	vpbroadcastb ((round * 16) + 4)(rk), t0;	\
	vpxorq t0, x7, x7;				\
	vpbroadcastb ((round * 16) + 11)(rk), t0;	\
	vpxorq t0, y0, y0;				\
	vpbroadcastb ((round * 16) + 10)(rk), t0;	\
	vpxorq t0, y1, y1;				\
	vpbroadcastb ((round * 16) + 9)(rk), t0;	\
	vpxorq t0, y2, y2;				\
	vpbroadcastb ((round * 16) + 8)(rk), t0;	\
	vpxorq t0, y3, y3;				\
	vpbroadcastb ((round * 16) + 15)(rk), t0;	\
	vpxorq t0, y4, y4;				\
	vpbroadcastb ((round * 16) + 14)(rk), t0;	\
	vpxorq t0, y5, y5;				\
	vpbroadcastb ((round * 16) + 13)(rk), t0;	\
	vpxorq t0, y6, y6;				\
	vpbroadcastb ((round * 16) + 12)(rk), t0;	\
	vpxorq t0, y7, y7;
/*
 * aria_sbox_8way_gfni: apply ARIA's substitution layer to eight
 * byte-sliced registers using GFNI affine/affine-inverse transforms:
 *   x0/x4: GF inverse then AES affine      (S-box S1)
 *   x1/x5: GF inverse then S2 affine       (S-box S2)
 *   x2/x6: inverse AES affine, then GF inverse via identity matrix
 *   x3/x7: X2 affine, then GF inverse via identity matrix
 * t0..t4 hold the broadcast 8x8 bit-matrices; t5..t7 are unused here
 * (kept for a signature parallel to aria_sbox_16way_gfni).
 */
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7;
/*
 * aria_sbox_16way_gfni: same GFNI substitution layer as
 * aria_sbox_8way_gfni, applied to all sixteen state registers
 * (x half then y half) while the five bit-matrices are loaded only
 * once.  t0..t4 hold the matrices; t5..t7 are unused.
 */
#define aria_sbox_16way_gfni(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     t0, t1, t2, t3,		\
			     t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5;	\
	vgf2p8affineqb $(tf_inv_const), t1, y2, y2;	\
	vgf2p8affineqb $(tf_inv_const), t1, y6, y6;	\
	vgf2p8affineinvqb $0, t2, y2, y2;		\
	vgf2p8affineinvqb $0, t2, y6, y6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4;	\
	vgf2p8affineqb $(tf_x2_const), t4, y3, y3;	\
	vgf2p8affineqb $(tf_x2_const), t4, y7, y7;	\
	vgf2p8affineinvqb $0, t2, y3, y3;		\
	vgf2p8affineinvqb $0, t2, y7, y7;
/*
 * aria_diff_m: byte-rotation part of ARIA's diffusion on byte-sliced
 * data.  Because each register holds one byte position, rotr32(X, 8)
 * and rotr32(X, 16) become register permutations, so the whole step
 * reduces to XORs.  t0..t3 are scratch; the result is permuted back
 * into x0..x3 (note x1/x3 receive their values via t3/t1).
 */
#define aria_diff_m(x0, x1, x2, x3,	\
		    t0, t1, t2, t3)	\
	/* T = rotr32(X, 8); */		\
	/* X ^= T */			\
	vpxorq x0, x3, t0;		\
	vpxorq x1, x0, t1;		\
	vpxorq x2, x1, t2;		\
	vpxorq x3, x2, t3;		\
	/* X = T ^ rotr(X, 16); */	\
	vpxorq t2, x0, x0;		\
	vpxorq x1, t3, t3;		\
	vpxorq t0, x2, x2;		\
	vpxorq t1, x3, x1;		\
	vmovdqu64 t3, x3;
/*
 * aria_diff_word: word-level part of ARIA's diffusion — a fixed XOR
 * network among the four 32-bit words T0..T3 of the state
 * (T0 = x0..x3, T1 = x4..x7, T2 = y0..y3, T3 = y4..y7), each word
 * spread over four byte-sliced registers.  In-place; no scratch.
 */
#define aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7)	\
	/* t1 ^= t2; */			\
	vpxorq y0, x4, x4;		\
	vpxorq y1, x5, x5;		\
	vpxorq y2, x6, x6;		\
	vpxorq y3, x7, x7;		\
					\
	/* t2 ^= t3; */			\
	vpxorq y4, y0, y0;		\
	vpxorq y5, y1, y1;		\
	vpxorq y6, y2, y2;		\
	vpxorq y7, y3, y3;		\
					\
	/* t0 ^= t1; */			\
	vpxorq x4, x0, x0;		\
	vpxorq x5, x1, x1;		\
	vpxorq x6, x2, x2;		\
	vpxorq x7, x3, x3;		\
					\
	/* t3 ^= t1; */			\
	vpxorq x4, y4, y4;		\
	vpxorq x5, y5, y5;		\
	vpxorq x6, y6, y6;		\
	vpxorq x7, y7, y7;		\
					\
	/* t2 ^= t0; */			\
	vpxorq x0, y0, y0;		\
	vpxorq x1, y1, y1;		\
	vpxorq x2, y2, y2;		\
	vpxorq x3, y3, y3;		\
					\
	/* t1 ^= t2; */			\
	vpxorq y0, x4, x4;		\
	vpxorq y1, x5, x5;		\
	vpxorq y2, x6, x6;		\
	vpxorq y3, x7, x7;
  399. #define aria_fe_gfni(x0, x1, x2, x3, \
  400. x4, x5, x6, x7, \
  401. y0, y1, y2, y3, \
  402. y4, y5, y6, y7, \
  403. z0, z1, z2, z3, \
  404. z4, z5, z6, z7, \
  405. mem_tmp, rk, round) \
  406. aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
  407. y0, y1, y2, y3, y4, y5, y6, y7, \
  408. z0, rk, round); \
  409. \
  410. aria_sbox_16way_gfni(x2, x3, x0, x1, \
  411. x6, x7, x4, x5, \
  412. y2, y3, y0, y1, \
  413. y6, y7, y4, y5, \
  414. z0, z1, z2, z3, \
  415. z4, z5, z6, z7); \
  416. \
  417. aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
  418. aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
  419. aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
  420. aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
  421. aria_diff_word(x0, x1, x2, x3, \
  422. x4, x5, x6, x7, \
  423. y0, y1, y2, y3, \
  424. y4, y5, y6, y7); \
  425. /* aria_diff_byte() \
  426. * T3 = ABCD -> BADC \
  427. * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
  428. * T0 = ABCD -> CDAB \
  429. * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
  430. * T1 = ABCD -> DCBA \
  431. * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
  432. */ \
  433. aria_diff_word(x2, x3, x0, x1, \
  434. x7, x6, x5, x4, \
  435. y0, y1, y2, y3, \
  436. y5, y4, y7, y6); \
/*
 * aria_fo_gfni: one odd-type ARIA round (GFNI path) — AddRoundKey,
 * type-1 substitution layer (registers in natural order), then the
 * diffusion layer: aria_diff_m per word and aria_diff_word twice,
 * the second call with the register permutation described in the
 * aria_diff_byte() comment.  z0..z7 are scratch.
 */
#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     z0, z1, z2, z3,			\
		     z4, z5, z6, z7,			\
		     mem_tmp, rk, round)		\
	aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7,	\
		       z0, rk, round);			\
							\
	aria_sbox_16way_gfni(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     z0, z1, z2, z3,		\
			     z4, z5, z6, z7);		\
							\
	aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3);	\
	aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3);	\
	aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3);	\
	aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3);	\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);
/*
 * aria_ff_gfni: final ARIA round — AddRoundKey(round), type-2
 * substitution layer, then the final AddRoundKey(last_round)
 * whitening.  No diffusion layer in the last round.  z0..z7 scratch.
 */
#define aria_ff_gfni(x0, x1, x2, x3,		\
		     x4, x5, x6, x7,		\
		     y0, y1, y2, y3,		\
		     y4, y5, y6, y7,		\
		     z0, z1, z2, z3,		\
		     z4, z5, z6, z7,		\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_16way(x0, x1, x2, x3,		\
		       x4, x5, x6, x7,		\
		       y0, y1, y2, y3,		\
		       y4, y5, y6, y7,		\
		       z0, rk, round);		\
	aria_sbox_16way_gfni(x2, x3, x0, x1,	\
			     x6, x7, x4, x5,	\
			     y2, y3, y0, y1,	\
			     y6, y7, y4, y5,	\
			     z0, z1, z2, z3,	\
			     z4, z5, z6, z7);	\
	aria_ark_16way(x0, x1, x2, x3,		\
		       x4, x5, x6, x7,		\
		       y0, y1, y2, y3,		\
		       y4, y5, y6, y7,		\
		       z0, rk, last_round);
.section .rodata.cst64, "aM", @progbits, 64
.align 64
/* Low-qword counter offsets 0..3, one per 128-bit lane (CTR mode). */
.Lcounter0123_lo:
	.quad 0, 0
	.quad 1, 0
	.quad 2, 0
	.quad 3, 0

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
/* Shuffle gathering byte idx of each 32-bit word of a 128-bit lane. */
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16
/* Per-lane counter increments (low qword) used when building IVs. */
.Lcounter4444_lo:
	.quad 4, 0
.Lcounter8888_lo:
	.quad 8, 0
.Lcounter16161616_lo:
	.quad 16, 0
/* High-qword +1, applied under mask on 64-bit counter carry. */
.Lcounter1111_hi:
	.quad 0, 1
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
.section .rodata.cst8, "aM", @progbits, 8
.align 8
/*
 * 8x8 GF(2) bit-matrices (and their affine constants) consumed by the
 * GF2P8AFFINE* instructions in the S-box macros above.
 */
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
.text
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
	/* input:
	 * %r9: rk (round keys; enc or dec schedule chosen by caller)
	 * %rsi: dst (first 1 KiB doubles as byte-slicing scratch)
	 * %rdx: src
	 * %zmm0..%zmm15: 64 loaded blocks
	 * output: processed state in %zmm0..%zmm15, de-byte-sliced;
	 *         result register order matches write_output in callers
	 * clobbers: %rax, %r8, %zmm24-%zmm31, %k1
	 */
	FRAME_BEGIN

	movq %rsi, %rax;		/* scratch area for x half */
	leaq 8 * 64(%rax), %r8;		/* scratch area for y half */

	/* Byte-slice the loaded blocks. */
	inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
		      %zmm4, %zmm5, %zmm6, %zmm7,
		      %zmm8, %zmm9, %zmm10, %zmm11,
		      %zmm12, %zmm13, %zmm14,
		      %zmm15, %rax, %r8);

	/* Rounds 0-10 are common to all key sizes; odd/even rounds
	 * alternate register orderings instead of moving data. */
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 0);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 1);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 2);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 3);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 4);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 5);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 6);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 7);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 8);
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 9);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 10);

	/* Tail depends on key size: 12, 14 or 16 rounds total. */
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 11);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 13);
	aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
		     %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 14);
	aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
		     %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10,
		     %zmm12, %zmm13, %zmm14, %zmm15,
		     %zmm24, %zmm25, %zmm26, %zmm27,
		     %zmm28, %zmm29, %zmm30, %zmm31,
		     %rax, %r9, 15, 16);
.Laria_gfni_end:
	/* Convert back from byte-sliced to block-per-register form. */
	debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
			   %zmm8, %zmm13, %zmm2, %zmm7,
			   %zmm11, %zmm14, %zmm1, %zmm4,
			   %zmm10, %zmm15, %zmm0, %zmm5,
			   (%rax), (%r8));
	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 * Encrypts 64 blocks: load from src, run the common round
	 * engine with the encryption key schedule, store to dst
	 * (%rax is set to dst by the callee).
	 */
	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;	/* rk = encryption schedule */

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	/* Register order matches the de-byteslice output permutation. */
	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)
SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 * Identical to the encrypt entry point except that the
	 * decryption key schedule is passed to the round engine.
	 */
	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;	/* rk = decryption schedule */

	inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
		     %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rdx);

	call __aria_gfni_avx512_crypt_64way;

	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 * output: %zmm0..%zmm15 = 64 big-endian counter blocks
	 *         (iv+0 .. iv+63); *iv advanced by 64
	 * clobbers: %r10, %r11, %zmm19-%zmm25, %k1
	 */
	FRAME_BEGIN

	vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
	vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
	vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
	vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
	vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
	vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;

	/* load IV and byteswap */
	movq 8(%r8), %r11;		/* low 64 bits of counter */
	movq (%r8), %r10;		/* high 64 bits of counter */
	bswapq %r11;
	bswapq %r10;
	vbroadcasti64x2 (%r8), %zmm20;
	vpshufb %zmm19, %zmm20, %zmm20;	/* to little-endian for adds */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 64), %r11;
	ja .Lload_ctr_carry;

	/* Fast path: no 64-bit wrap within the next 64 counters, so
	 * plain qword adds on the low halves suffice. */
	/* construct IVs */
	vpaddq %zmm21, %zmm20, %zmm0;  /* +0:+1:+2:+3 */
	vpaddq %zmm22, %zmm0, %zmm1;   /* +4:+5:+6:+7 */
	vpaddq %zmm23, %zmm0, %zmm2;   /* +8:+9:+10:+11 */
	vpaddq %zmm23, %zmm1, %zmm3;   /* +12:+13:+14:+15 */
	vpaddq %zmm24, %zmm0, %zmm4;   /* +16... */
	vpaddq %zmm24, %zmm1, %zmm5;   /* +20... */
	vpaddq %zmm24, %zmm2, %zmm6;   /* +24... */
	vpaddq %zmm24, %zmm3, %zmm7;   /* +28... */
	vpaddq %zmm24, %zmm4, %zmm8;   /* +32... */
	vpaddq %zmm24, %zmm5, %zmm9;   /* +36... */
	vpaddq %zmm24, %zmm6, %zmm10;  /* +40... */
	vpaddq %zmm24, %zmm7, %zmm11;  /* +44... */
	vpaddq %zmm24, %zmm8, %zmm12;  /* +48... */
	vpaddq %zmm24, %zmm9, %zmm13;  /* +52... */
	vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
	vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
	jmp .Lload_ctr_done;

.Lload_ctr_carry:
	/* Slow path: full 128-bit adds propagating the carry into the
	 * high qword of each counter. */
	/* construct IVs */
	add_le128(%zmm0, %zmm20, %zmm21, %zmm25);  /* +0:+1:+2:+3 */
	add_le128(%zmm1, %zmm0, %zmm22, %zmm25);   /* +4:+5:+6:+7 */
	add_le128(%zmm2, %zmm0, %zmm23, %zmm25);   /* +8:+9:+10:+11 */
	add_le128(%zmm3, %zmm1, %zmm23, %zmm25);   /* +12:+13:+14:+15 */
	add_le128(%zmm4, %zmm0, %zmm24, %zmm25);   /* +16... */
	add_le128(%zmm5, %zmm1, %zmm24, %zmm25);   /* +20... */
	add_le128(%zmm6, %zmm2, %zmm24, %zmm25);   /* +24... */
	add_le128(%zmm7, %zmm3, %zmm24, %zmm25);   /* +28... */
	add_le128(%zmm8, %zmm4, %zmm24, %zmm25);   /* +32... */
	add_le128(%zmm9, %zmm5, %zmm24, %zmm25);   /* +36... */
	add_le128(%zmm10, %zmm6, %zmm24, %zmm25);  /* +40... */
	add_le128(%zmm11, %zmm7, %zmm24, %zmm25);  /* +44... */
	add_le128(%zmm12, %zmm8, %zmm24, %zmm25);  /* +48... */
	add_le128(%zmm13, %zmm9, %zmm24, %zmm25);  /* +52... */
	add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
	add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */

.Lload_ctr_done:
	/* Byte-swap IVs and update counter. */
	addq $64, %r11;
	adcq $0, %r10;
	vpshufb %zmm19, %zmm15, %zmm15;
	vpshufb %zmm19, %zmm14, %zmm14;
	vpshufb %zmm19, %zmm13, %zmm13;
	vpshufb %zmm19, %zmm12, %zmm12;
	vpshufb %zmm19, %zmm11, %zmm11;
	vpshufb %zmm19, %zmm10, %zmm10;
	vpshufb %zmm19, %zmm9, %zmm9;
	vpshufb %zmm19, %zmm8, %zmm8;
	bswapq %r11;
	bswapq %r10;
	vpshufb %zmm19, %zmm7, %zmm7;
	vpshufb %zmm19, %zmm6, %zmm6;
	vpshufb %zmm19, %zmm5, %zmm5;
	vpshufb %zmm19, %zmm4, %zmm4;
	vpshufb %zmm19, %zmm3, %zmm3;
	vpshufb %zmm19, %zmm2, %zmm2;
	vpshufb %zmm19, %zmm1, %zmm1;
	vpshufb %zmm19, %zmm0, %zmm0;
	/* Write the advanced counter back to *iv (big endian). */
	movq %r11, 8(%r8);
	movq %r10, (%r8);

	FRAME_END
	RET;
SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 * CTR mode: generate 64 counter blocks, encrypt them with the
	 * encryption schedule, XOR with src and store to dst.
	 */
	FRAME_BEGIN

	call __aria_gfni_avx512_ctr_gen_keystream_64way

	/* Preserve dst/src; point the round engine's scratch (%rsi)
	 * at the keystream buffer. */
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;

	leaq ARIA_CTX_enc_key(CTX), %r9;	/* CTR always encrypts */

	call __aria_gfni_avx512_crypt_64way;

	/* keystream ^= plaintext/ciphertext (register order matches
	 * the de-byteslice output permutation). */
	vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
	vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
	vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
	vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
	vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
	vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
	vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
	vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
	vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
	vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
	vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
	vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
	vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
	vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
	vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
	vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
	write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
		     %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
		     %zmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)