/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */
  25. #include <linux/linkage.h>
  26. #include <asm/frame.h>
  27. #include "glue_helper-asm-avx.S"
  28. .file "twofish-avx-x86_64-asm_64.S"
  29. .section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
  30. .align 16
  31. .Lbswap128_mask:
  32. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  33. .section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
  34. .align 16
  35. .Lxts_gf128mul_and_shl1_mask:
  36. .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
  37. .text
  38. /* structure of crypto context */
  39. #define s0 0
  40. #define s1 1024
  41. #define s2 2048
  42. #define s3 3072
  43. #define w 4096
  44. #define k 4128
  45. /**********************************************************************
  46. 8-way AVX twofish
  47. **********************************************************************/
  48. #define CTX %rdi
  49. #define RA1 %xmm0
  50. #define RB1 %xmm1
  51. #define RC1 %xmm2
  52. #define RD1 %xmm3
  53. #define RA2 %xmm4
  54. #define RB2 %xmm5
  55. #define RC2 %xmm6
  56. #define RD2 %xmm7
  57. #define RX0 %xmm8
  58. #define RY0 %xmm9
  59. #define RX1 %xmm10
  60. #define RY1 %xmm11
  61. #define RK1 %xmm12
  62. #define RK2 %xmm13
  63. #define RT %xmm14
  64. #define RR %xmm15
  65. #define RID1 %r13
  66. #define RID1d %r13d
  67. #define RID2 %rsi
  68. #define RID2d %esi
  69. #define RGI1 %rdx
  70. #define RGI1bl %dl
  71. #define RGI1bh %dh
  72. #define RGI2 %rcx
  73. #define RGI2bl %cl
  74. #define RGI2bh %ch
  75. #define RGI3 %rax
  76. #define RGI3bl %al
  77. #define RGI3bh %ah
  78. #define RGI4 %rbx
  79. #define RGI4bl %bl
  80. #define RGI4bh %bh
  81. #define RGS1 %r8
  82. #define RGS1d %r8d
  83. #define RGS2 %r9
  84. #define RGS2d %r9d
  85. #define RGS3 %r10
  86. #define RGS3d %r10d
  87. #define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
  88. movzbl src ## bl, RID1d; \
  89. movzbl src ## bh, RID2d; \
  90. shrq $16, src; \
  91. movl t0(CTX, RID1, 4), dst ## d; \
  92. movl t1(CTX, RID2, 4), RID2d; \
  93. movzbl src ## bl, RID1d; \
  94. xorl RID2d, dst ## d; \
  95. movzbl src ## bh, RID2d; \
  96. interleave_op(il_reg); \
  97. xorl t2(CTX, RID1, 4), dst ## d; \
  98. xorl t3(CTX, RID2, 4), dst ## d;
  99. #define dummy(d) /* do nothing */
  100. #define shr_next(reg) \
  101. shrq $16, reg;
  102. #define G(gi1, gi2, x, t0, t1, t2, t3) \
  103. lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
  104. lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
  105. \
  106. lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
  107. shlq $32, RGS2; \
  108. orq RGS1, RGS2; \
  109. lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
  110. shlq $32, RGS1; \
  111. orq RGS1, RGS3;
  112. #define round_head_2(a, b, x1, y1, x2, y2) \
  113. vmovq b ## 1, RGI3; \
  114. vpextrq $1, b ## 1, RGI4; \
  115. \
  116. G(RGI1, RGI2, x1, s0, s1, s2, s3); \
  117. vmovq a ## 2, RGI1; \
  118. vpextrq $1, a ## 2, RGI2; \
  119. vmovq RGS2, x1; \
  120. vpinsrq $1, RGS3, x1, x1; \
  121. \
  122. G(RGI3, RGI4, y1, s1, s2, s3, s0); \
  123. vmovq b ## 2, RGI3; \
  124. vpextrq $1, b ## 2, RGI4; \
  125. vmovq RGS2, y1; \
  126. vpinsrq $1, RGS3, y1, y1; \
  127. \
  128. G(RGI1, RGI2, x2, s0, s1, s2, s3); \
  129. vmovq RGS2, x2; \
  130. vpinsrq $1, RGS3, x2, x2; \
  131. \
  132. G(RGI3, RGI4, y2, s1, s2, s3, s0); \
  133. vmovq RGS2, y2; \
  134. vpinsrq $1, RGS3, y2, y2;
  135. #define encround_tail(a, b, c, d, x, y, prerotate) \
  136. vpaddd x, y, x; \
  137. vpaddd x, RK1, RT;\
  138. prerotate(b); \
  139. vpxor RT, c, c; \
  140. vpaddd y, x, y; \
  141. vpaddd y, RK2, y; \
  142. vpsrld $1, c, RT; \
  143. vpslld $(32 - 1), c, c; \
  144. vpor c, RT, c; \
  145. vpxor d, y, d; \
  146. #define decround_tail(a, b, c, d, x, y, prerotate) \
  147. vpaddd x, y, x; \
  148. vpaddd x, RK1, RT;\
  149. prerotate(a); \
  150. vpxor RT, c, c; \
  151. vpaddd y, x, y; \
  152. vpaddd y, RK2, y; \
  153. vpxor d, y, d; \
  154. vpsrld $1, d, y; \
  155. vpslld $(32 - 1), d, d; \
  156. vpor d, y, d; \
  157. #define rotate_1l(x) \
  158. vpslld $1, x, RR; \
  159. vpsrld $(32 - 1), x, x; \
  160. vpor x, RR, x;
  161. #define preload_rgi(c) \
  162. vmovq c, RGI1; \
  163. vpextrq $1, c, RGI2;
  164. #define encrypt_round(n, a, b, c, d, preload, prerotate) \
  165. vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
  166. vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
  167. round_head_2(a, b, RX0, RY0, RX1, RY1); \
  168. encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
  169. preload(c ## 1); \
  170. encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
  171. #define decrypt_round(n, a, b, c, d, preload, prerotate) \
  172. vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
  173. vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
  174. round_head_2(a, b, RX0, RY0, RX1, RY1); \
  175. decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
  176. preload(c ## 1); \
  177. decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
  178. #define encrypt_cycle(n) \
  179. encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
  180. encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
  181. #define encrypt_cycle_last(n) \
  182. encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
  183. encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
  184. #define decrypt_cycle(n) \
  185. decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
  186. decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
  187. #define decrypt_cycle_last(n) \
  188. decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
  189. decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
  190. #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  191. vpunpckldq x1, x0, t0; \
  192. vpunpckhdq x1, x0, t2; \
  193. vpunpckldq x3, x2, t1; \
  194. vpunpckhdq x3, x2, x3; \
  195. \
  196. vpunpcklqdq t1, t0, x0; \
  197. vpunpckhqdq t1, t0, x1; \
  198. vpunpcklqdq x3, t2, x2; \
  199. vpunpckhqdq x3, t2, x3;
  200. #define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
  201. vpxor x0, wkey, x0; \
  202. vpxor x1, wkey, x1; \
  203. vpxor x2, wkey, x2; \
  204. vpxor x3, wkey, x3; \
  205. \
  206. transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
  207. #define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
  208. transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  209. \
  210. vpxor x0, wkey, x0; \
  211. vpxor x1, wkey, x1; \
  212. vpxor x2, wkey, x2; \
  213. vpxor x3, wkey, x3;
  214. .align 8
  215. __twofish_enc_blk8:
  216. /* input:
  217. * %rdi: ctx, CTX
  218. * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
  219. * output:
  220. * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
  221. */
  222. vmovdqu w(CTX), RK1;
  223. pushq %r13;
  224. pushq %rbx;
  225. pushq %rcx;
  226. inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
  227. preload_rgi(RA1);
  228. rotate_1l(RD1);
  229. inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
  230. rotate_1l(RD2);
  231. encrypt_cycle(0);
  232. encrypt_cycle(1);
  233. encrypt_cycle(2);
  234. encrypt_cycle(3);
  235. encrypt_cycle(4);
  236. encrypt_cycle(5);
  237. encrypt_cycle(6);
  238. encrypt_cycle_last(7);
  239. vmovdqu (w+4*4)(CTX), RK1;
  240. popq %rcx;
  241. popq %rbx;
  242. popq %r13;
  243. outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
  244. outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
  245. ret;
  246. ENDPROC(__twofish_enc_blk8)
  247. .align 8
  248. __twofish_dec_blk8:
  249. /* input:
  250. * %rdi: ctx, CTX
  251. * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
  252. * output:
  253. * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
  254. */
  255. vmovdqu (w+4*4)(CTX), RK1;
  256. pushq %r13;
  257. pushq %rbx;
  258. inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
  259. preload_rgi(RC1);
  260. rotate_1l(RA1);
  261. inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
  262. rotate_1l(RA2);
  263. decrypt_cycle(7);
  264. decrypt_cycle(6);
  265. decrypt_cycle(5);
  266. decrypt_cycle(4);
  267. decrypt_cycle(3);
  268. decrypt_cycle(2);
  269. decrypt_cycle(1);
  270. decrypt_cycle_last(0);
  271. vmovdqu (w)(CTX), RK1;
  272. popq %rbx;
  273. popq %r13;
  274. outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
  275. outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
  276. ret;
  277. ENDPROC(__twofish_dec_blk8)
  278. ENTRY(twofish_ecb_enc_8way)
  279. /* input:
  280. * %rdi: ctx, CTX
  281. * %rsi: dst
  282. * %rdx: src
  283. */
  284. FRAME_BEGIN
  285. movq %rsi, %r11;
  286. load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
  287. call __twofish_enc_blk8;
  288. store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
  289. FRAME_END
  290. ret;
  291. ENDPROC(twofish_ecb_enc_8way)
  292. ENTRY(twofish_ecb_dec_8way)
  293. /* input:
  294. * %rdi: ctx, CTX
  295. * %rsi: dst
  296. * %rdx: src
  297. */
  298. FRAME_BEGIN
  299. movq %rsi, %r11;
  300. load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
  301. call __twofish_dec_blk8;
  302. store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
  303. FRAME_END
  304. ret;
  305. ENDPROC(twofish_ecb_dec_8way)
  306. ENTRY(twofish_cbc_dec_8way)
  307. /* input:
  308. * %rdi: ctx, CTX
  309. * %rsi: dst
  310. * %rdx: src
  311. */
  312. FRAME_BEGIN
  313. pushq %r12;
  314. movq %rsi, %r11;
  315. movq %rdx, %r12;
  316. load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
  317. call __twofish_dec_blk8;
  318. store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
  319. popq %r12;
  320. FRAME_END
  321. ret;
  322. ENDPROC(twofish_cbc_dec_8way)
  323. ENTRY(twofish_ctr_8way)
  324. /* input:
  325. * %rdi: ctx, CTX
  326. * %rsi: dst
  327. * %rdx: src
  328. * %rcx: iv (little endian, 128bit)
  329. */
  330. FRAME_BEGIN
  331. pushq %r12;
  332. movq %rsi, %r11;
  333. movq %rdx, %r12;
  334. load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
  335. RD2, RX0, RX1, RY0);
  336. call __twofish_enc_blk8;
  337. store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
  338. popq %r12;
  339. FRAME_END
  340. ret;
  341. ENDPROC(twofish_ctr_8way)
  342. ENTRY(twofish_xts_enc_8way)
  343. /* input:
  344. * %rdi: ctx, CTX
  345. * %rsi: dst
  346. * %rdx: src
  347. * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
  348. */
  349. FRAME_BEGIN
  350. movq %rsi, %r11;
  351. /* regs <= src, dst <= IVs, regs <= regs xor IVs */
  352. load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
  353. RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
  354. call __twofish_enc_blk8;
  355. /* dst <= regs xor IVs(in dst) */
  356. store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
  357. FRAME_END
  358. ret;
  359. ENDPROC(twofish_xts_enc_8way)
  360. ENTRY(twofish_xts_dec_8way)
  361. /* input:
  362. * %rdi: ctx, CTX
  363. * %rsi: dst
  364. * %rdx: src
  365. * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
  366. */
  367. FRAME_BEGIN
  368. movq %rsi, %r11;
  369. /* regs <= src, dst <= IVs, regs <= regs xor IVs */
  370. load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
  371. RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
  372. call __twofish_dec_blk8;
  373. /* dst <= regs xor IVs(in dst) */
  374. store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
  375. FRAME_END
  376. ret;
  377. ENDPROC(twofish_xts_dec_8way)