########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
# James Guilford <james.guilford@intel.com>
# Kirk Yap <kirk.s.yap@intel.com>
# Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
add \p1, \p2
mov \p2, \p1
.endm
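
# In C terms the macro behaves roughly as follows (a sketch only; p1 is a
# memory word, p2 a register):
#
#	p2 += *p1;	/* reg-mem add        */
#	*p1  = p2;	/* store the sum back */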
  61. ################################
  62. X0 = %ymm4
  63. X1 = %ymm5
  64. X2 = %ymm6
  65. X3 = %ymm7
  66. # XMM versions of above
  67. XWORD0 = %xmm4
  68. XWORD1 = %xmm5
  69. XWORD2 = %xmm6
  70. XWORD3 = %xmm7
  71. XTMP0 = %ymm0
  72. XTMP1 = %ymm1
  73. XTMP2 = %ymm2
  74. XTMP3 = %ymm3
  75. XTMP4 = %ymm8
  76. XFER = %ymm9
  77. XTMP5 = %ymm11
  78. SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
  79. SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
  80. BYTE_FLIP_MASK = %ymm13
  81. X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
  82. NUM_BLKS = %rdx # 3rd arg
  83. INP = %rsi # 2nd arg
  84. CTX = %rdi # 1st arg
  85. c = %ecx
  86. d = %r8d
  87. e = %edx # clobbers NUM_BLKS
  88. y3 = %esi # clobbers INP
  89. SRND = CTX # SRND is same register as CTX
  90. a = %eax
  91. b = %ebx
  92. f = %r9d
  93. g = %r10d
  94. h = %r11d
  95. old_h = %r11d
  96. T1 = %r12d
  97. y0 = %r13d
  98. y1 = %r14d
  99. y2 = %r15d
  100. _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
  101. _XMM_SAVE_SIZE = 0
  102. _INP_END_SIZE = 8
  103. _INP_SIZE = 8
  104. _CTX_SIZE = 8
  105. _RSP_SIZE = 8
  106. _XFER = 0
  107. _XMM_SAVE = _XFER + _XFER_SIZE
  108. _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
  109. _INP = _INP_END + _INP_END_SIZE
  110. _CTX = _INP + _INP_SIZE
  111. _RSP = _CTX + _CTX_SIZE
  112. STACK_SIZE = _RSP + _RSP_SIZE
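
# Stack frame sketch (offsets from the 32-byte-aligned %rsp), shown as a
# C-style struct for illustration only; the _XMM_SAVE area is currently empty:
#
#	struct frame {
#		u32 xfer[2 * 64];	/* _XFER:    w[i]+K[i] for both blocks */
#		u64 inp_end;		/* _INP_END: pointer to the last block */
#		u64 inp;		/* _INP:     saved input pointer       */
#		u64 ctx;		/* _CTX:     saved digest pointer      */
#		u64 rsp_save;		/* _RSP:     original %rsp to restore  */
#	};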
# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
old_h = h
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
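
# Neither macro moves any data: they only rebind assembler symbols, so an
# unrolled round permutes names instead of shuffling registers. For example,
# with the assignments above:
#
#	round 0: a = %eax,  b = %ebx,  ... h = %r11d
#	round 1: a = %r11d, b = %eax,  ... h = %r10d  (after one ROTATE_ARGS)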
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
addl \disp(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpsrld $7, XTMP1, XTMP2
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpslld $(32-7), XTMP1, XTMP3
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
vpsrld $18, XTMP1, XTMP2
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS

################################### RND N + 1 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
offset = \disp + 1*4
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
rorx $6, e, y1 # y1 = (e >> 6) # S1
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
vpslld $(32-18), XTMP1, XTMP1
and b, y3 # y3 = (a|c)&b # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
vpxor XTMP1, XTMP3, XTMP3
rorx $2, a, T1 # T1 = (a >> 2) # S0
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
ROTATE_ARGS

################################### RND N + 2 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
offset = \disp + 2*4
addl offset(%rsp, SRND), h # h = k + w + h # --
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
rorx $11, e, y1 # y1 = e >> 11 # S1B
or c, y3 # y3 = a|c # MAJA
mov f, y2 # y2 = f # CH
xor g, y2 # y2 = f^g # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
and e, y2 # y2 = (f^g)&e # CH
rorx $6, e, y1 # y1 = (e >> 6) # S1
vpxor XTMP3, XTMP2, XTMP2
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $22, a, y1 # y1 = a >> 22 # S0A
vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS

################################### RND N + 3 ############################
mov a, y3 # y3 = a # MAJA
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
offset = \disp + 3*4
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
mov f, y2 # y2 = f # CH
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
xor g, y2 # y2 = f^g # CH
vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add h, d # d = k + w + h + d # --
and b, y3 # y3 = (a|c)&b # MAJA
vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
vpxor XTMP3, XTMP2, XTMP2
rorx $22, a, y1 # y1 = a >> 22 # S0A
add y0, y2 # y2 = S1 + CH # --
vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
rorx $2, a, T1 # T1 = (a >> 2) # S0
vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and c, T1 # T1 = a&c # MAJB
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
rotate_Xs
.endm
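
# For reference, one unrolled round plus one message-schedule step computes
# the standard SHA-256 equations below, written as a C sketch of what the
# interleaved scalar/vector code above implements (ror32() is a 32-bit
# rotate right):
#
#	/* schedule: FOUR_ROUNDS_AND_SCHED produces four new dwords per call */
#	s0   = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
#	s1   = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
#	w[i] = w[i-16] + s0 + w[i-7] + s1;
#
#	/* compression round; w[i] + K[i] is already parked in _XFER */
#	S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	ch  = ((f ^ g) & e) ^ g;		/* == (e & f) ^ (~e & g)    */
#	t1  = h + S1 + ch + k[i] + w[i];
#	S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	maj = ((a | c) & b) | (a & c);		/* == (a&b) ^ (a&c) ^ (b&c) */
#	d  += t1;
#	h   = t1 + S0 + maj;			/* then the a..h names rotate */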
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
addl \disp(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS

################################### RND N + 1 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*1 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS

################################### RND N + 2 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*2 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
ROTATE_ARGS

################################### RND N + 3 ###########################
add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
mov f, y2 # y2 = f # CH
rorx $25, e, y0 # y0 = e >> 25 # S1A
rorx $11, e, y1 # y1 = e >> 11 # S1B
xor g, y2 # y2 = f^g # CH
xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
rorx $6, e, y1 # y1 = (e >> 6) # S1
and e, y2 # y2 = (f^g)&e # CH
add y3, old_h # h = t1 + S0 + MAJ # --
xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
rorx $13, a, T1 # T1 = a >> 13 # S0B
xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
rorx $22, a, y1 # y1 = a >> 22 # S0A
mov a, y3 # y3 = a # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
rorx $2, a, T1 # T1 = (a >> 2) # S0
offset = 4*3 + \disp
addl offset(%rsp, SRND), h # h = k + w + h # --
or c, y3 # y3 = a|c # MAJA
xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
mov a, T1 # T1 = a # MAJB
and b, y3 # y3 = (a|c)&b # MAJA
and c, T1 # T1 = a&c # MAJB
add y0, y2 # y2 = S1 + CH # --
add h, d # d = k + w + h + d # --
or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
add y1, h # h = k + w + h + S0 # --
add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
add y3, h # h = t1 + S0 + MAJ # --
ROTATE_ARGS
.endm
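
# DO_4ROUNDS is the same compression round with the vector scheduling removed;
# it covers the last 16 rounds of the first block and every round of the
# second block. The h += S1 + CH and h += MAJ updates of each round are
# deferred into the following round (the adds to old_h), overlapping them with
# the next round's rorx chain.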
########################################################################
## void sha256_transform_rorx(UINT32 digest[8], const void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
ENTRY(sha256_transform_rorx)
.align 32
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15

mov %rsp, %rax
subq $STACK_SIZE, %rsp
and $-32, %rsp # align rsp to 32 byte boundary
mov %rax, _RSP(%rsp)

shl $6, NUM_BLKS # convert to bytes
jz done_hash
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
mov NUM_BLKS, _INP_END(%rsp)

cmp NUM_BLKS, INP
je only_one_block

## load initial digest
mov (CTX), a
mov 4*1(CTX), b
mov 4*2(CTX), c
mov 4*3(CTX), d
mov 4*4(CTX), e
mov 4*5(CTX), f
mov 4*6(CTX), g
mov 4*7(CTX), h

vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00

mov CTX, _CTX(%rsp)

loop0:
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP), XTMP0
VMOVDQ 1*32(INP), XTMP1
VMOVDQ 2*32(INP), XTMP2
VMOVDQ 3*32(INP), XTMP3

## byte swap data
vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

## transpose data into high/low halves
vperm2i128 $0x20, XTMP2, XTMP0, X0
vperm2i128 $0x31, XTMP2, XTMP0, X1
vperm2i128 $0x20, XTMP3, XTMP1, X2
vperm2i128 $0x31, XTMP3, XTMP1, X3
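
# After the transpose, each X register carries matching dwords from both
# blocks: the low 128-bit lane holds four message words of the first block
# and the high lane holds the corresponding four words of the second block,
# so one ymm operation schedules both blocks at once.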
last_block_enter:
add $64, INP
mov INP, _INP(%rsp)

## schedule 48 input dwords, by doing 3 rounds of 12 each
xor SRND, SRND

.align 16
loop1:
vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32

vpaddd K256+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32

vpaddd K256+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32

vpaddd K256+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32

add $4*32, SRND
cmp $3*4*32, SRND
jb loop1
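
# SRND advances by 128 bytes per loop1 iteration and is used as a byte offset
# twice per step: into K256 (each 32-byte row supplies four round constants,
# duplicated across both lanes) and into the _XFER save area, where w+k for
# both lanes is parked. The low halves feed the scalar rounds of the first
# block here; the high halves are reused by loop3 for the second block.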
loop2:
## Do last 16 rounds with no scheduling
vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32

vpaddd K256+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32

add $2*32, SRND
vmovdqa X2, X0
vmovdqa X3, X1

cmp $4*4*32, SRND
jb loop2

mov _CTX(%rsp), CTX
mov _INP(%rsp), INP

addm (4*0)(CTX), a
addm (4*1)(CTX), b
addm (4*2)(CTX), c
addm (4*3)(CTX), d
addm (4*4)(CTX), e
addm (4*5)(CTX), f
addm (4*6)(CTX), g
addm (4*7)(CTX), h

cmp _INP_END(%rsp), INP
ja done_hash

#### Do second block using previously scheduled results
xor SRND, SRND
.align 16
loop3:
DO_4ROUNDS _XFER + 0*32 + 16
DO_4ROUNDS _XFER + 1*32 + 16
add $2*32, SRND
cmp $4*4*32, SRND
jb loop3
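
# The "+ 16" displacement selects the high 128-bit half of each saved XFER
# row, i.e. the w+k values belonging to the second block, so these rounds
# need no message scheduling of their own.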
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
add $64, INP

addm (4*0)(CTX), a
addm (4*1)(CTX), b
addm (4*2)(CTX), c
addm (4*3)(CTX), d
addm (4*4)(CTX), e
addm (4*5)(CTX), f
addm (4*6)(CTX), g
addm (4*7)(CTX), h

cmp _INP_END(%rsp), INP
jb loop0
ja done_hash

do_last_block:
VMOVDQ 0*16(INP), XWORD0
VMOVDQ 1*16(INP), XWORD1
VMOVDQ 2*16(INP), XWORD2
VMOVDQ 3*16(INP), XWORD3

vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

jmp last_block_enter

only_one_block:
## load initial digest
mov (4*0)(CTX), a
mov (4*1)(CTX), b
mov (4*2)(CTX), c
mov (4*3)(CTX), d
mov (4*4)(CTX), e
mov (4*5)(CTX), f
mov (4*6)(CTX), g
mov (4*7)(CTX), h

vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00

mov CTX, _CTX(%rsp)
jmp do_last_block
done_hash:
mov _RSP(%rsp), %rsp

popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
ret
ENDPROC(sha256_transform_rorx)
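
# The routine is driven from C glue code. A minimal caller sketch matching
# the register comments above (illustrative only; it is not the kernel's
# actual glue in sha256_ssse3_glue.c, and 'state', 'buf' and 'nblocks' are
# hypothetical names):
#
#	extern void sha256_transform_rorx(u32 digest[8], const void *data,
#					  u64 num_blks);
#
#	u32 state[8] = {
#		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
#		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
#	};
#
#	/* buf must hold whole 64-byte blocks; padding is the caller's job */
#	sha256_transform_rorx(state, buf, nblocks);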
.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
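
# Each K256 row above appears twice so that a single 32-byte vpaddd load
# feeds the same four round constants to both 128-bit lanes, i.e. to both
# message blocks processed in parallel.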
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif