aes_ctrby8_avx-x86_64.S
  1. /* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
  2. /*
  3. * AES CTR mode by8 optimization with AVX instructions. (x86_64)
  4. *
  5. * Copyright(c) 2014 Intel Corporation.
  6. *
  7. * Contact Information:
  8. * James Guilford <james.guilford@intel.com>
  9. * Sean Gulley <sean.m.gulley@intel.com>
  10. * Chandramouli Narayanan <mouli@linux.intel.com>
  11. */
  12. /*
  13. * This is AES128/192/256 CTR mode optimization implementation. It requires
  14. * the support of Intel(R) AESNI and AVX instructions.
  15. *
  16. * This work was inspired by the AES CTR mode optimization published
  17. * in Intel Optimized IPSEC Cryptographic library.
  18. * Additional information on it can be found at:
  19. * https://github.com/intel/intel-ipsec-mb
  20. */
  21. #include <linux/linkage.h>
  22. #define VMOVDQ vmovdqu
  23. /*
  24. * Note: the "x" prefix in these aliases means "this is an xmm register". The
  25. * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
  26. * counter".
  27. */
  28. #define xdata0 %xmm0
  29. #define xdata1 %xmm1
  30. #define xdata2 %xmm2
  31. #define xdata3 %xmm3
  32. #define xdata4 %xmm4
  33. #define xdata5 %xmm5
  34. #define xdata6 %xmm6
  35. #define xdata7 %xmm7
  36. #define xcounter %xmm8 // CTR mode only
  37. #define xiv %xmm8 // XCTR mode only
  38. #define xbyteswap %xmm9 // CTR mode only
  39. #define xtmp %xmm9 // XCTR mode only
  40. #define xkey0 %xmm10
  41. #define xkey4 %xmm11
  42. #define xkey8 %xmm12
  43. #define xkey12 %xmm13
  44. #define xkeyA %xmm14
  45. #define xkeyB %xmm15
  46. #define p_in %rdi
  47. #define p_iv %rsi
  48. #define p_keys %rdx
  49. #define p_out %rcx
  50. #define num_bytes %r8
  51. #define counter %r9 // XCTR mode only
  52. #define tmp %r10
  53. #define DDQ_DATA 0
  54. #define XDATA 1
  55. #define KEY_128 1
  56. #define KEY_192 2
  57. #define KEY_256 3
  58. .section .rodata
  59. .align 16
  60. byteswap_const:
  61. .octa 0x000102030405060708090A0B0C0D0E0F
  62. ddq_low_msk:
  63. .octa 0x0000000000000000FFFFFFFFFFFFFFFF
  64. ddq_high_add_1:
  65. .octa 0x00000000000000010000000000000000
  66. ddq_add_1:
  67. .octa 0x00000000000000000000000000000001
  68. ddq_add_2:
  69. .octa 0x00000000000000000000000000000002
  70. ddq_add_3:
  71. .octa 0x00000000000000000000000000000003
  72. ddq_add_4:
  73. .octa 0x00000000000000000000000000000004
  74. ddq_add_5:
  75. .octa 0x00000000000000000000000000000005
  76. ddq_add_6:
  77. .octa 0x00000000000000000000000000000006
  78. ddq_add_7:
  79. .octa 0x00000000000000000000000000000007
  80. ddq_add_8:
  81. .octa 0x00000000000000000000000000000008
  82. .text
  83. /* helpers that generate a unique alias for an xmm register: */
  84. /* club XDATA, <id> binds var_xdata to %xmm<id> via setxdata */
  85. .macro setxdata n
  86. var_xdata = %xmm\n
  87. .endm
  88. /* club the numeric 'id' to the symbol 'name' */
  89. .macro club name, id
  90. .altmacro
  91. .if \name == XDATA
  92. setxdata %\id
  93. .endif
  94. .noaltmacro
  95. .endm
  96. /*
  97. * do_aes num_in_par load_keys key_len xctr
  98. * This increments p_in, but not p_out
  99. */
  100. .macro do_aes b, k, key_len, xctr
  101. .set by, \b
  102. .set load_keys, \k
  103. .set klen, \key_len
  104. .if (load_keys)
  105. vmovdqa 0*16(p_keys), xkey0
  106. .endif
  107. .if \xctr
  108. movq counter, xtmp
  109. .set i, 0
  110. .rept (by)
  111. club XDATA, i
  112. vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
  113. .set i, (i +1)
  114. .endr
  115. .set i, 0
  116. .rept (by)
  117. club XDATA, i
  118. vpxor xiv, var_xdata, var_xdata
  119. .set i, (i +1)
  120. .endr
  121. .else
  122. vpshufb xbyteswap, xcounter, xdata0
  123. .set i, 1
  124. .rept (by - 1)
  125. club XDATA, i
  126. vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
  127. vptest ddq_low_msk(%rip), var_xdata
  128. jnz 1f
  129. vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
  130. vpaddq ddq_high_add_1(%rip), xcounter, xcounter
  131. 1:
  132. vpshufb xbyteswap, var_xdata, var_xdata
  133. .set i, (i +1)
  134. .endr
  135. .endif
  136. vmovdqa 1*16(p_keys), xkeyA
  137. vpxor xkey0, xdata0, xdata0
  138. .if \xctr
  139. add $by, counter
  140. .else
  141. vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
  142. vptest ddq_low_msk(%rip), xcounter
  143. jnz 1f
  144. vpaddq ddq_high_add_1(%rip), xcounter, xcounter
  145. 1:
  146. .endif
  147. .set i, 1
  148. .rept (by - 1)
  149. club XDATA, i
  150. vpxor xkey0, var_xdata, var_xdata
  151. .set i, (i +1)
  152. .endr
  153. vmovdqa 2*16(p_keys), xkeyB
  154. .set i, 0
  155. .rept by
  156. club XDATA, i
  157. vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
  158. .set i, (i +1)
  159. .endr
  160. .if (klen == KEY_128)
  161. .if (load_keys)
  162. vmovdqa 3*16(p_keys), xkey4
  163. .endif
  164. .else
  165. vmovdqa 3*16(p_keys), xkeyA
  166. .endif
  167. .set i, 0
  168. .rept by
  169. club XDATA, i
  170. vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
  171. .set i, (i +1)
  172. .endr
  173. add $(16*by), p_in
  174. .if (klen == KEY_128)
  175. vmovdqa 4*16(p_keys), xkeyB
  176. .else
  177. .if (load_keys)
  178. vmovdqa 4*16(p_keys), xkey4
  179. .endif
  180. .endif
  181. .set i, 0
  182. .rept by
  183. club XDATA, i
  184. /* key 3 */
  185. .if (klen == KEY_128)
  186. vaesenc xkey4, var_xdata, var_xdata
  187. .else
  188. vaesenc xkeyA, var_xdata, var_xdata
  189. .endif
  190. .set i, (i +1)
  191. .endr
  192. vmovdqa 5*16(p_keys), xkeyA
  193. .set i, 0
  194. .rept by
  195. club XDATA, i
  196. /* key 4 */
  197. .if (klen == KEY_128)
  198. vaesenc xkeyB, var_xdata, var_xdata
  199. .else
  200. vaesenc xkey4, var_xdata, var_xdata
  201. .endif
  202. .set i, (i +1)
  203. .endr
  204. .if (klen == KEY_128)
  205. .if (load_keys)
  206. vmovdqa 6*16(p_keys), xkey8
  207. .endif
  208. .else
  209. vmovdqa 6*16(p_keys), xkeyB
  210. .endif
  211. .set i, 0
  212. .rept by
  213. club XDATA, i
  214. vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
  215. .set i, (i +1)
  216. .endr
  217. vmovdqa 7*16(p_keys), xkeyA
  218. .set i, 0
  219. .rept by
  220. club XDATA, i
  221. /* key 6 */
  222. .if (klen == KEY_128)
  223. vaesenc xkey8, var_xdata, var_xdata
  224. .else
  225. vaesenc xkeyB, var_xdata, var_xdata
  226. .endif
  227. .set i, (i +1)
  228. .endr
  229. .if (klen == KEY_128)
  230. vmovdqa 8*16(p_keys), xkeyB
  231. .else
  232. .if (load_keys)
  233. vmovdqa 8*16(p_keys), xkey8
  234. .endif
  235. .endif
  236. .set i, 0
  237. .rept by
  238. club XDATA, i
  239. vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
  240. .set i, (i +1)
  241. .endr
  242. .if (klen == KEY_128)
  243. .if (load_keys)
  244. vmovdqa 9*16(p_keys), xkey12
  245. .endif
  246. .else
  247. vmovdqa 9*16(p_keys), xkeyA
  248. .endif
  249. .set i, 0
  250. .rept by
  251. club XDATA, i
  252. /* key 8 */
  253. .if (klen == KEY_128)
  254. vaesenc xkeyB, var_xdata, var_xdata
  255. .else
  256. vaesenc xkey8, var_xdata, var_xdata
  257. .endif
  258. .set i, (i +1)
  259. .endr
  260. vmovdqa 10*16(p_keys), xkeyB
  261. .set i, 0
  262. .rept by
  263. club XDATA, i
  264. /* key 9 */
  265. .if (klen == KEY_128)
  266. vaesenc xkey12, var_xdata, var_xdata
  267. .else
  268. vaesenc xkeyA, var_xdata, var_xdata
  269. .endif
  270. .set i, (i +1)
  271. .endr
  272. .if (klen != KEY_128)
  273. vmovdqa 11*16(p_keys), xkeyA
  274. .endif
  275. .set i, 0
  276. .rept by
  277. club XDATA, i
  278. /* key 10 */
  279. .if (klen == KEY_128)
  280. vaesenclast xkeyB, var_xdata, var_xdata
  281. .else
  282. vaesenc xkeyB, var_xdata, var_xdata
  283. .endif
  284. .set i, (i +1)
  285. .endr
  286. .if (klen != KEY_128)
  287. .if (load_keys)
  288. vmovdqa 12*16(p_keys), xkey12
  289. .endif
  290. .set i, 0
  291. .rept by
  292. club XDATA, i
  293. vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
  294. .set i, (i +1)
  295. .endr
  296. .if (klen == KEY_256)
  297. vmovdqa 13*16(p_keys), xkeyA
  298. .endif
  299. .set i, 0
  300. .rept by
  301. club XDATA, i
  302. .if (klen == KEY_256)
  303. /* key 12 */
  304. vaesenc xkey12, var_xdata, var_xdata
  305. .else
  306. vaesenclast xkey12, var_xdata, var_xdata
  307. .endif
  308. .set i, (i +1)
  309. .endr
  310. .if (klen == KEY_256)
  311. vmovdqa 14*16(p_keys), xkeyB
  312. .set i, 0
  313. .rept by
  314. club XDATA, i
  315. /* key 13 */
  316. vaesenc xkeyA, var_xdata, var_xdata
  317. .set i, (i +1)
  318. .endr
  319. .set i, 0
  320. .rept by
  321. club XDATA, i
  322. /* key 14 */
  323. vaesenclast xkeyB, var_xdata, var_xdata
  324. .set i, (i +1)
  325. .endr
  326. .endif
  327. .endif
  328. .set i, 0
  329. .rept (by / 2)
  330. .set j, (i+1)
  331. VMOVDQ (i*16 - 16*by)(p_in), xkeyA
  332. VMOVDQ (j*16 - 16*by)(p_in), xkeyB
  333. club XDATA, i
  334. vpxor xkeyA, var_xdata, var_xdata
  335. club XDATA, j
  336. vpxor xkeyB, var_xdata, var_xdata
  337. .set i, (i+2)
  338. .endr
  339. .if (i < by)
  340. VMOVDQ (i*16 - 16*by)(p_in), xkeyA
  341. club XDATA, i
  342. vpxor xkeyA, var_xdata, var_xdata
  343. .endif
  344. .set i, 0
  345. .rept by
  346. club XDATA, i
  347. VMOVDQ var_xdata, i*16(p_out)
  348. .set i, (i+1)
  349. .endr
  350. .endm
  351. .macro do_aes_load val, key_len, xctr
  352. do_aes \val, 1, \key_len, \xctr
  353. .endm
  354. .macro do_aes_noload val, key_len, xctr
  355. do_aes \val, 0, \key_len, \xctr
  356. .endm
  357. /* main body of the AES (X)CTR routines, expanded once per key length/mode */
  358. .macro do_aes_ctrmain key_len, xctr
  359. cmp $16, num_bytes
  360. jb .Ldo_return2\xctr\key_len
  361. .if \xctr
  362. shr $4, counter
  363. vmovdqu (p_iv), xiv
  364. .else
  365. vmovdqa byteswap_const(%rip), xbyteswap
  366. vmovdqu (p_iv), xcounter
  367. vpshufb xbyteswap, xcounter, xcounter
  368. .endif
  369. mov num_bytes, tmp
  370. and $(7*16), tmp
  371. jz .Lmult_of_8_blks\xctr\key_len
  372. /* 1 <= tmp <= 7 */
  373. cmp $(4*16), tmp
  374. jg .Lgt4\xctr\key_len
  375. je .Leq4\xctr\key_len
  376. .Llt4\xctr\key_len:
  377. cmp $(2*16), tmp
  378. jg .Leq3\xctr\key_len
  379. je .Leq2\xctr\key_len
  380. .Leq1\xctr\key_len:
  381. do_aes_load 1, \key_len, \xctr
  382. add $(1*16), p_out
  383. and $(~7*16), num_bytes
  384. jz .Ldo_return2\xctr\key_len
  385. jmp .Lmain_loop2\xctr\key_len
  386. .Leq2\xctr\key_len:
  387. do_aes_load 2, \key_len, \xctr
  388. add $(2*16), p_out
  389. and $(~7*16), num_bytes
  390. jz .Ldo_return2\xctr\key_len
  391. jmp .Lmain_loop2\xctr\key_len
  392. .Leq3\xctr\key_len:
  393. do_aes_load 3, \key_len, \xctr
  394. add $(3*16), p_out
  395. and $(~7*16), num_bytes
  396. jz .Ldo_return2\xctr\key_len
  397. jmp .Lmain_loop2\xctr\key_len
  398. .Leq4\xctr\key_len:
  399. do_aes_load 4, \key_len, \xctr
  400. add $(4*16), p_out
  401. and $(~7*16), num_bytes
  402. jz .Ldo_return2\xctr\key_len
  403. jmp .Lmain_loop2\xctr\key_len
  404. .Lgt4\xctr\key_len:
  405. cmp $(6*16), tmp
  406. jg .Leq7\xctr\key_len
  407. je .Leq6\xctr\key_len
  408. .Leq5\xctr\key_len:
  409. do_aes_load 5, \key_len, \xctr
  410. add $(5*16), p_out
  411. and $(~7*16), num_bytes
  412. jz .Ldo_return2\xctr\key_len
  413. jmp .Lmain_loop2\xctr\key_len
  414. .Leq6\xctr\key_len:
  415. do_aes_load 6, \key_len, \xctr
  416. add $(6*16), p_out
  417. and $(~7*16), num_bytes
  418. jz .Ldo_return2\xctr\key_len
  419. jmp .Lmain_loop2\xctr\key_len
  420. .Leq7\xctr\key_len:
  421. do_aes_load 7, \key_len, \xctr
  422. add $(7*16), p_out
  423. and $(~7*16), num_bytes
  424. jz .Ldo_return2\xctr\key_len
  425. jmp .Lmain_loop2\xctr\key_len
  426. .Lmult_of_8_blks\xctr\key_len:
  427. .if (\key_len != KEY_128)
  428. vmovdqa 0*16(p_keys), xkey0
  429. vmovdqa 4*16(p_keys), xkey4
  430. vmovdqa 8*16(p_keys), xkey8
  431. vmovdqa 12*16(p_keys), xkey12
  432. .else
  433. vmovdqa 0*16(p_keys), xkey0
  434. vmovdqa 3*16(p_keys), xkey4
  435. vmovdqa 6*16(p_keys), xkey8
  436. vmovdqa 9*16(p_keys), xkey12
  437. .endif
  438. .align 16
  439. .Lmain_loop2\xctr\key_len:
  440. /* num_bytes is a multiple of 8 and >0 */
  441. do_aes_noload 8, \key_len, \xctr
  442. add $(8*16), p_out
  443. sub $(8*16), num_bytes
  444. jne .Lmain_loop2\xctr\key_len
  445. .Ldo_return2\xctr\key_len:
  446. .if !\xctr
  447. /* return updated IV */
  448. vpshufb xbyteswap, xcounter, xcounter
  449. vmovdqu xcounter, (p_iv)
  450. .endif
  451. RET
  452. .endm
  453. /*
  454. * routine to do AES128 CTR enc/decrypt "by8"
  455. * XMM registers are clobbered.
  456. * Saving/restoring must be done at a higher level
  457. * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
  458. * unsigned int num_bytes)
  459. */
  460. SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
  461. /* call the aes main loop */
  462. do_aes_ctrmain KEY_128 0
  463. SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
  464. /*
  465. * routine to do AES192 CTR enc/decrypt "by8"
  466. * XMM registers are clobbered.
  467. * Saving/restoring must be done at a higher level
  468. * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
  469. * unsigned int num_bytes)
  470. */
  471. SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
  472. /* call the aes main loop */
  473. do_aes_ctrmain KEY_192 0
  474. SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
  475. /*
  476. * routine to do AES256 CTR enc/decrypt "by8"
  477. * XMM registers are clobbered.
  478. * Saving/restoring must be done at a higher level
  479. * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
  480. * unsigned int num_bytes)
  481. */
  482. SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
  483. /* call the aes main loop */
  484. do_aes_ctrmain KEY_256 0
  485. SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
  486. /*
  487. * routine to do AES128 XCTR enc/decrypt "by8"
  488. * XMM registers are clobbered.
  489. * Saving/restoring must be done at a higher level
  490. * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
  491. * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
  492. */
  493. SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
  494. /* call the aes main loop */
  495. do_aes_ctrmain KEY_128 1
  496. SYM_FUNC_END(aes_xctr_enc_128_avx_by8)
  497. /*
  498. * routine to do AES192 XCTR enc/decrypt "by8"
  499. * XMM registers are clobbered.
  500. * Saving/restoring must be done at a higher level
  501. * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
  502. * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
  503. */
  504. SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
  505. /* call the aes main loop */
  506. do_aes_ctrmain KEY_192 1
  507. SYM_FUNC_END(aes_xctr_enc_192_avx_by8)
  508. /*
  509. * routine to do AES256 XCTR enc/decrypt "by8"
  510. * XMM registers are clobbered.
  511. * Saving/restoring must be done at a higher level
  512. * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
  513. * u8* out, unsigned int num_bytes, unsigned int byte_ctr)
  514. */
  515. SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
  516. /* call the aes main loop */
  517. do_aes_ctrmain KEY_256 1
  518. SYM_FUNC_END(aes_xctr_enc_256_avx_by8)