/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
  6. #include <linux/linkage.h>
  7. .section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
  8. .align 32
  9. IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
  10. .octa 0x5BE0CD191F83D9AB9B05688C510E527F
  11. .section .rodata.cst16.ROT16, "aM", @progbits, 16
  12. .align 16
  13. ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
  14. .section .rodata.cst16.ROR328, "aM", @progbits, 16
  15. .align 16
  16. ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
  17. .section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
  18. .align 64
  19. SIGMA:
  20. .byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
  21. .byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
  22. .byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
  23. .byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
  24. .byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
  25. .byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
  26. .byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
  27. .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
  28. .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
  29. .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
  30. #ifdef CONFIG_AS_AVX512
  31. .section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
  32. .align 64
  33. SIGMA2:
  34. .long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
  35. .long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
  36. .long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
  37. .long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
  38. .long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
  39. .long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
  40. .long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
  41. .long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
  42. .long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
  43. .long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
  44. #endif /* CONFIG_AS_AVX512 */
  45. .text
  46. SYM_FUNC_START(blake2s_compress_ssse3)
  47. testq %rdx,%rdx
  48. je .Lendofloop
  49. movdqu (%rdi),%xmm0
  50. movdqu 0x10(%rdi),%xmm1
  51. movdqa ROT16(%rip),%xmm12
  52. movdqa ROR328(%rip),%xmm13
  53. movdqu 0x20(%rdi),%xmm14
  54. movq %rcx,%xmm15
  55. leaq SIGMA+0xa0(%rip),%r8
  56. jmp .Lbeginofloop
  57. .align 32
  58. .Lbeginofloop:
  59. movdqa %xmm0,%xmm10
  60. movdqa %xmm1,%xmm11
  61. paddq %xmm15,%xmm14
  62. movdqa IV(%rip),%xmm2
  63. movdqa %xmm14,%xmm3
  64. pxor IV+0x10(%rip),%xmm3
  65. leaq SIGMA(%rip),%rcx
  66. .Lroundloop:
  67. movzbl (%rcx),%eax
  68. movd (%rsi,%rax,4),%xmm4
  69. movzbl 0x1(%rcx),%eax
  70. movd (%rsi,%rax,4),%xmm5
  71. movzbl 0x2(%rcx),%eax
  72. movd (%rsi,%rax,4),%xmm6
  73. movzbl 0x3(%rcx),%eax
  74. movd (%rsi,%rax,4),%xmm7
  75. punpckldq %xmm5,%xmm4
  76. punpckldq %xmm7,%xmm6
  77. punpcklqdq %xmm6,%xmm4
  78. paddd %xmm4,%xmm0
  79. paddd %xmm1,%xmm0
  80. pxor %xmm0,%xmm3
  81. pshufb %xmm12,%xmm3
  82. paddd %xmm3,%xmm2
  83. pxor %xmm2,%xmm1
  84. movdqa %xmm1,%xmm8
  85. psrld $0xc,%xmm1
  86. pslld $0x14,%xmm8
  87. por %xmm8,%xmm1
  88. movzbl 0x4(%rcx),%eax
  89. movd (%rsi,%rax,4),%xmm5
  90. movzbl 0x5(%rcx),%eax
  91. movd (%rsi,%rax,4),%xmm6
  92. movzbl 0x6(%rcx),%eax
  93. movd (%rsi,%rax,4),%xmm7
  94. movzbl 0x7(%rcx),%eax
  95. movd (%rsi,%rax,4),%xmm4
  96. punpckldq %xmm6,%xmm5
  97. punpckldq %xmm4,%xmm7
  98. punpcklqdq %xmm7,%xmm5
  99. paddd %xmm5,%xmm0
  100. paddd %xmm1,%xmm0
  101. pxor %xmm0,%xmm3
  102. pshufb %xmm13,%xmm3
  103. paddd %xmm3,%xmm2
  104. pxor %xmm2,%xmm1
  105. movdqa %xmm1,%xmm8
  106. psrld $0x7,%xmm1
  107. pslld $0x19,%xmm8
  108. por %xmm8,%xmm1
  109. pshufd $0x93,%xmm0,%xmm0
  110. pshufd $0x4e,%xmm3,%xmm3
  111. pshufd $0x39,%xmm2,%xmm2
  112. movzbl 0x8(%rcx),%eax
  113. movd (%rsi,%rax,4),%xmm6
  114. movzbl 0x9(%rcx),%eax
  115. movd (%rsi,%rax,4),%xmm7
  116. movzbl 0xa(%rcx),%eax
  117. movd (%rsi,%rax,4),%xmm4
  118. movzbl 0xb(%rcx),%eax
  119. movd (%rsi,%rax,4),%xmm5
  120. punpckldq %xmm7,%xmm6
  121. punpckldq %xmm5,%xmm4
  122. punpcklqdq %xmm4,%xmm6
  123. paddd %xmm6,%xmm0
  124. paddd %xmm1,%xmm0
  125. pxor %xmm0,%xmm3
  126. pshufb %xmm12,%xmm3
  127. paddd %xmm3,%xmm2
  128. pxor %xmm2,%xmm1
  129. movdqa %xmm1,%xmm8
  130. psrld $0xc,%xmm1
  131. pslld $0x14,%xmm8
  132. por %xmm8,%xmm1
  133. movzbl 0xc(%rcx),%eax
  134. movd (%rsi,%rax,4),%xmm7
  135. movzbl 0xd(%rcx),%eax
  136. movd (%rsi,%rax,4),%xmm4
  137. movzbl 0xe(%rcx),%eax
  138. movd (%rsi,%rax,4),%xmm5
  139. movzbl 0xf(%rcx),%eax
  140. movd (%rsi,%rax,4),%xmm6
  141. punpckldq %xmm4,%xmm7
  142. punpckldq %xmm6,%xmm5
  143. punpcklqdq %xmm5,%xmm7
  144. paddd %xmm7,%xmm0
  145. paddd %xmm1,%xmm0
  146. pxor %xmm0,%xmm3
  147. pshufb %xmm13,%xmm3
  148. paddd %xmm3,%xmm2
  149. pxor %xmm2,%xmm1
  150. movdqa %xmm1,%xmm8
  151. psrld $0x7,%xmm1
  152. pslld $0x19,%xmm8
  153. por %xmm8,%xmm1
  154. pshufd $0x39,%xmm0,%xmm0
  155. pshufd $0x4e,%xmm3,%xmm3
  156. pshufd $0x93,%xmm2,%xmm2
  157. addq $0x10,%rcx
  158. cmpq %r8,%rcx
  159. jnz .Lroundloop
  160. pxor %xmm2,%xmm0
  161. pxor %xmm3,%xmm1
  162. pxor %xmm10,%xmm0
  163. pxor %xmm11,%xmm1
  164. addq $0x40,%rsi
  165. decq %rdx
  166. jnz .Lbeginofloop
  167. movdqu %xmm0,(%rdi)
  168. movdqu %xmm1,0x10(%rdi)
  169. movdqu %xmm14,0x20(%rdi)
  170. .Lendofloop:
  171. RET
  172. SYM_FUNC_END(blake2s_compress_ssse3)
  173. #ifdef CONFIG_AS_AVX512
  174. SYM_FUNC_START(blake2s_compress_avx512)
  175. vmovdqu (%rdi),%xmm0
  176. vmovdqu 0x10(%rdi),%xmm1
  177. vmovdqu 0x20(%rdi),%xmm4
  178. vmovq %rcx,%xmm5
  179. vmovdqa IV(%rip),%xmm14
  180. vmovdqa IV+16(%rip),%xmm15
  181. jmp .Lblake2s_compress_avx512_mainloop
  182. .align 32
  183. .Lblake2s_compress_avx512_mainloop:
  184. vmovdqa %xmm0,%xmm10
  185. vmovdqa %xmm1,%xmm11
  186. vpaddq %xmm5,%xmm4,%xmm4
  187. vmovdqa %xmm14,%xmm2
  188. vpxor %xmm15,%xmm4,%xmm3
  189. vmovdqu (%rsi),%ymm6
  190. vmovdqu 0x20(%rsi),%ymm7
  191. addq $0x40,%rsi
  192. leaq SIGMA2(%rip),%rax
  193. movb $0xa,%cl
  194. .Lblake2s_compress_avx512_roundloop:
  195. addq $0x40,%rax
  196. vmovdqa -0x40(%rax),%ymm8
  197. vmovdqa -0x20(%rax),%ymm9
  198. vpermi2d %ymm7,%ymm6,%ymm8
  199. vpermi2d %ymm7,%ymm6,%ymm9
  200. vmovdqa %ymm8,%ymm6
  201. vmovdqa %ymm9,%ymm7
  202. vpaddd %xmm8,%xmm0,%xmm0
  203. vpaddd %xmm1,%xmm0,%xmm0
  204. vpxor %xmm0,%xmm3,%xmm3
  205. vprord $0x10,%xmm3,%xmm3
  206. vpaddd %xmm3,%xmm2,%xmm2
  207. vpxor %xmm2,%xmm1,%xmm1
  208. vprord $0xc,%xmm1,%xmm1
  209. vextracti128 $0x1,%ymm8,%xmm8
  210. vpaddd %xmm8,%xmm0,%xmm0
  211. vpaddd %xmm1,%xmm0,%xmm0
  212. vpxor %xmm0,%xmm3,%xmm3
  213. vprord $0x8,%xmm3,%xmm3
  214. vpaddd %xmm3,%xmm2,%xmm2
  215. vpxor %xmm2,%xmm1,%xmm1
  216. vprord $0x7,%xmm1,%xmm1
  217. vpshufd $0x93,%xmm0,%xmm0
  218. vpshufd $0x4e,%xmm3,%xmm3
  219. vpshufd $0x39,%xmm2,%xmm2
  220. vpaddd %xmm9,%xmm0,%xmm0
  221. vpaddd %xmm1,%xmm0,%xmm0
  222. vpxor %xmm0,%xmm3,%xmm3
  223. vprord $0x10,%xmm3,%xmm3
  224. vpaddd %xmm3,%xmm2,%xmm2
  225. vpxor %xmm2,%xmm1,%xmm1
  226. vprord $0xc,%xmm1,%xmm1
  227. vextracti128 $0x1,%ymm9,%xmm9
  228. vpaddd %xmm9,%xmm0,%xmm0
  229. vpaddd %xmm1,%xmm0,%xmm0
  230. vpxor %xmm0,%xmm3,%xmm3
  231. vprord $0x8,%xmm3,%xmm3
  232. vpaddd %xmm3,%xmm2,%xmm2
  233. vpxor %xmm2,%xmm1,%xmm1
  234. vprord $0x7,%xmm1,%xmm1
  235. vpshufd $0x39,%xmm0,%xmm0
  236. vpshufd $0x4e,%xmm3,%xmm3
  237. vpshufd $0x93,%xmm2,%xmm2
  238. decb %cl
  239. jne .Lblake2s_compress_avx512_roundloop
  240. vpxor %xmm10,%xmm0,%xmm0
  241. vpxor %xmm11,%xmm1,%xmm1
  242. vpxor %xmm2,%xmm0,%xmm0
  243. vpxor %xmm3,%xmm1,%xmm1
  244. decq %rdx
  245. jne .Lblake2s_compress_avx512_mainloop
  246. vmovdqu %xmm0,(%rdi)
  247. vmovdqu %xmm1,0x10(%rdi)
  248. vmovdqu %xmm4,0x20(%rdi)
  249. vzeroupper
  250. RET
  251. SYM_FUNC_END(blake2s_compress_avx512)
  252. #endif /* CONFIG_AS_AVX512 */