sha256-avx-asm.S 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
  1. ########################################################################
  2. # Implement fast SHA-256 with AVX1 instructions. (x86_64)
  3. #
  4. # Copyright (C) 2013 Intel Corporation.
  5. #
  6. # Authors:
  7. # James Guilford <james.guilford@intel.com>
  8. # Kirk Yap <kirk.s.yap@intel.com>
  9. # Tim Chen <tim.c.chen@linux.intel.com>
  10. #
  11. # This software is available to you under a choice of one of two
  12. # licenses. You may choose to be licensed under the terms of the GNU
  13. # General Public License (GPL) Version 2, available from the file
  14. # COPYING in the main directory of this source tree, or the
  15. # OpenIB.org BSD license below:
  16. #
  17. # Redistribution and use in source and binary forms, with or
  18. # without modification, are permitted provided that the following
  19. # conditions are met:
  20. #
  21. # - Redistributions of source code must retain the above
  22. # copyright notice, this list of conditions and the following
  23. # disclaimer.
  24. #
  25. # - Redistributions in binary form must reproduce the above
  26. # copyright notice, this list of conditions and the following
  27. # disclaimer in the documentation and/or other materials
  28. # provided with the distribution.
  29. #
  30. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  31. # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  32. # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  33. # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  34. # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  35. # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  36. # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  37. # SOFTWARE.
  38. ########################################################################
  39. #
  40. # This code is described in an Intel White-Paper:
  41. # "Fast SHA-256 Implementations on Intel Architecture Processors"
  42. #
  43. # To find it, surf to http://www.intel.com/p/en_US/embedded
  44. # and search for that title.
  45. #
  46. ########################################################################
  47. # This code schedules 1 block at a time, with 4 lanes per block
  48. ########################################################################
#include <linux/linkage.h>
#include <linux/cfi_types.h>

## Input/output buffers are not assumed to be 16-byte aligned, so all
## data loads go through unaligned vector moves.
#define VMOVDQ vmovdqu
################################ Define Macros

# addm [mem], reg
# Add reg into mem: mem += reg, and leave the sum in reg as well
# (used to fold the working variables back into the digest in memory).
.macro addm p1 p2
	add	\p1, \p2		# reg += mem
	mov	\p2, \p1		# mem = reg
.endm
# MY_ROR amt, reg
# Rotate 32-bit reg right by amt bits.  Implemented as
# "shld $(32-amt), reg, reg": shifting a register left into itself by
# (32-amt) is equivalent to rotating it right by amt.
# NOTE(review): presumably shld was chosen over plain ror for performance
# on the targeted microarchitectures — confirm before changing.
.macro MY_ROR p1 p2
	shld	$(32-(\p1)), \p2, \p2
.endm
################################
# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm from (possibly unaligned) mem and byte-swap each dword
# (big-endian message words -> little-endian registers) via vpshufb
# with the supplied flip mask.
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ	\p2, \p1		# unaligned 16-byte load
	vpshufb	\p3, \p1, \p1		# byte-swap each of the 4 dwords
.endm
################################
# Message schedule: X0..X3 hold the current 16 dwords of W[], 4 per register.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# Scratch vector registers for computing s0/s1 of the schedule.
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9				# holds W[i] + K[i] on its way to the stack
XTMP5 = %xmm11

SHUF_00BA = %xmm10			# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12			# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13			# dword byte-swap mask for vpshufb

# Function arguments (SysV AMD64).
NUM_BLKS = %rdx				# 3rd arg
INP = %rsi				# 2nd arg
CTX = %rdi				# 1st arg
SRND = %rsi				# round-group counter; clobbers INP
					# (INP is saved to the stack first)

# SHA-256 working variables a..h and scalar temporaries y0..y2.
c = %ecx
d = %r8d
e = %edx
TBL = %r12				# pointer into the K256 constant table
a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
y0 = %r13d
y1 = %r14d
y2 = %r15d

# Stack frame layout (relative to %rsp after alignment):
#   _INP_END : pointer to end of input data
#   _INP     : saved current input pointer (INP is reused as SRND)
#   _XFER    : 16-byte slot holding W[i]+K[i] for the next 4 rounds
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0
_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
# rotate_Xs
# Rotate the assembler symbols X0...X3 (X0 <- X1 <- X2 <- X3 <- old X0)
# so each FOUR_ROUNDS_AND_SCHED invocation operates on the next window
# of the message schedule without moving any data.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
# ROTATE_ARGS
# Rotate the assembler symbols a...h (h <- g <- ... <- a <- old h),
# implementing the per-round shuffle of the SHA-256 working variables
# purely at assembly time — no register moves are emitted.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
# FOUR_ROUNDS_AND_SCHED
# Perform 4 SHA-256 rounds (reading W[i]+K[i] from the _XFER stack slot)
# while simultaneously computing the next 4 message-schedule words
# W[i+16] = s1(W[i+14]) + W[i+9] + s0(W[i+1]) + W[i], interleaving the
# vector schedule computation with the scalar round logic to fill
# execution ports.  Instruction ordering is deliberate — do not reorder.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = (ror 7) ^ (<< (32-18))
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (ror 17) ^ (ror 19) {xBxA}
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (ror 17) ^ (ror 19) {xDxC}
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
# DO_ROUND round
# One plain SHA-256 round (no message scheduling); used for the final
# 16 rounds where no further W words are needed.
# Input word: [rsp + _XFER + round * 4] already holds W[i] + K[i].
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER	# byte offset of W[i]+K[i] on the stack
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = (a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
##
## Processes `blocks` 64-byte message blocks, updating the 8-dword digest
## in *state in place.  Saves/restores all callee-saved GPRs it uses and
## keeps %rsp 16-byte aligned for the vmovdqa stores to _XFER.
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_avx)
	## prologue: save callee-saved registers, set up aligned stack frame
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes (blocks * 64)
	jz	.Ldone_hash		# zero blocks: nothing to do
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest a..h from the state
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

.Lloop0:
	lea	K256(%rip), TBL		# restart constant table for each block

	## byte swap first 16 dwords of the message block into X0..X3
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)		# save INP; %rsi is reused as SRND below

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	## rounds 0..47: each iteration does 16 rounds with scheduling,
	## staging W[i]+K[i] through the aligned _XFER slot
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	## rounds 48..63: no more scheduling needed, just consume X0..X3
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0			# shift remaining schedule words down
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	## fold working variables back into the digest in memory
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP		# restore input pointer
	add	$64, INP		# advance to next 64-byte block
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:
	## epilogue: restore stack and callee-saved registers
	mov	%rbp, %rsp
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)
# SHA-256 round constants K[0..63] (FIPS 180-4), merged by section name
# so duplicate copies across objects collapse to one.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# vpshufb control: byte-swap each of the 4 dwords in an xmm register.
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA (0xFF bytes zero the high lanes)
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100

.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00 (0xFF bytes zero the low lanes)
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF