aes_ctrby8_avx-x86_64.S

/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is the AES128/192/256 CTR mode optimization implementation. It
 * requires the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/linkage.h>
#include <asm/inst.h>

#define VMOVDQ		vmovdqu

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define tmp		%r10

#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
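/*
 * Register usage: xdata0..xdata7 hold up to eight counter blocks being
 * encrypted in parallel ("by8").  xcounter keeps the counter byte-swapped
 * into little-endian order so vpaddq can increment it; xbyteswap is the
 * shuffle mask converting to/from big-endian block order.  xkey0, xkey4,
 * xkey8 and xkey12 cache four round keys across main-loop iterations;
 * xkeyA/xkeyB are scratch for the remaining round keys and, in the final
 * XOR stage, for loaded input blocks.  p_in, p_iv, p_keys, p_out and
 * num_bytes are the five SysV ABI argument registers; tmp is scratch.
 */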
.section .rodata
.align 16
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008
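/*
 * The ddq_add_N constants are the 128-bit integers 1..8, added to the
 * (little-endian) xcounter to derive the counter of the N-th block ahead.
 * vpaddq adds the low and high qwords as independent 64-bit lanes, so a
 * carry out of the low qword must be patched up by hand: ddq_low_msk
 * selects the low qword for the vptest wrap check, and ddq_high_add_1
 * adds 1 to the high qword.
 */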
.text

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* club the numeric 'id' to the symbol 'name' */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
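/*
 * Example: "club XDATA, 3" expands (via .altmacro, so that %\id becomes
 * the numeral 3) to "setxdata 3", which defines var_xdata = %xmm3.  The
 * unrolled loops below use var_xdata to address xdata0..xdata7 by index.
 */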
/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in but not p_out.
 */
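/*
 * One pass of do_aes encrypts "by" (1..8) counter blocks: it derives
 * counters xcounter+0 .. xcounter+by-1 and byte-swaps each into block
 * order, advances xcounter by "by", runs the 10/12/14 AES rounds on all
 * blocks in parallel one round key at a time, then XORs each keystream
 * block with its input block and stores the result.  A C-like sketch of
 * the data flow (not the instruction scheduling):
 *
 *	for (i = 0; i < by; i++) {
 *		block[i] = bswap128(counter + i);
 *		block[i] = AES_encrypt(round_keys, block[i]);
 *		out[i] = in[i] ^ block[i];
 *	}
 *	counter += by;
 */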
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	vpshufb	xbyteswap, xcounter, xdata0

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i + 1)
	.endr
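/*
 * Carry handling above: vptest sets ZF when (var_xdata & ddq_low_msk) is
 * all-zero, i.e. when the low qword of the new counter is exactly 0.
 * Since the per-block offsets grow by 1, the low qword can only cross
 * 2^64 by landing exactly on 0, so the test catches the wrap precisely.
 * When it fires, 1 is added to the high qword of both the derived block
 * and xcounter itself, so later blocks, and the "xcounter += by" step
 * below (which repeats the same vptest), inherit the carry.
 */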
	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata	/* key 1 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata	/* key 2 */
		.set i, (i + 1)
	.endr

	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata	/* key 5 */
		.set i, (i + 1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata	/* key 7 */
		.set i, (i + 1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i + 1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast	xkey12, var_xdata, var_xdata
			.endif
			.set i, (i + 1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i + 1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i + 1)
			.endr
		.endif
	.endif
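	/*
	 * p_in was already advanced by 16*by back at round 2 (presumably
	 * to hide the pointer-update latency), so the input blocks are
	 * re-addressed below with negative offsets.  xkeyA/xkeyB are dead
	 * at this point and are reused as scratch for the loaded input.
	 */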
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm
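/*
 * do_aes_load reloads the four cached round keys (xkey0/4/8/12) from
 * memory; do_aes_noload assumes they are already live in registers, as
 * arranged before entering the by8 main loop.
 */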
/* main body of the aes ctr routines */
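/*
 * do_aes_ctrmain first strips off the trailing num_bytes/16 mod 8 whole
 * blocks with a single 1..7-block pass (loading the cached keys on the
 * way), then runs the remaining multiple-of-8-blocks bulk through the
 * by8 main loop, and finally writes the updated IV back through p_iv.
 * Any sub-16-byte tail is masked off and left to the caller.
 */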
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* tmp = 16 * n with 1 <= n <= 7 (the leftover blocks) */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lmult_of_8_blks\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
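	/*
	 * The cached key registers hold different round keys per key
	 * size: AES-128 (11 round keys) caches rounds 0/3/6/9, while
	 * AES-192/256 cache rounds 0/4/8/12; do_aes picks the matching
	 * register through its klen conditionals.
	 */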
	.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a positive multiple of 8 blocks (128 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	ret
.endm
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

ENDPROC(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

ENDPROC(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

ENDPROC(aes_ctr_enc_256_avx_by8)
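/*
 * For reference, a C-side view of these entry points.  The declarations
 * below are a sketch of how a caller (e.g. the kernel's
 * aesni-intel_glue.c) would bind them; parameter names are illustrative:
 *
 *	asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
 *			void *keys, u8 *out, unsigned int num_bytes);
 *	asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
 *			void *keys, u8 *out, unsigned int num_bytes);
 *	asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
 *			void *keys, u8 *out, unsigned int num_bytes);
 *
 * Since CTR mode XORs a keystream into the data, the same routine both
 * encrypts and decrypts; "iv" is updated in place, so consecutive calls
 * continue the counter stream.
 */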