sha3-ce-core.S 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
  4. *
  5. * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
  6. *
  7. * This program is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License version 2 as
  9. * published by the Free Software Foundation.
  10. */
  11. #include <linux/linkage.h>
  12. #include <asm/assembler.h>
  13. .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
  14. .set .Lv\b\().2d, \b
  15. .set .Lv\b\().16b, \b
  16. .endr
  17. /*
  18. * ARMv8.2 Crypto Extensions instructions
  19. */
  20. .macro eor3, rd, rn, rm, ra
  21. .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
  22. .endm
  23. .macro rax1, rd, rn, rm
  24. .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
  25. .endm
  26. .macro bcax, rd, rn, rm, ra
  27. .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
  28. .endm
  29. .macro xar, rd, rn, rm, imm6
  30. .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
  31. .endm
  32. /*
  33. * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
  34. */
  35. .text
  36. SYM_FUNC_START(sha3_ce_transform)
  37. /* load state */
  38. add x8, x0, #32
  39. ld1 { v0.1d- v3.1d}, [x0]
  40. ld1 { v4.1d- v7.1d}, [x8], #32
  41. ld1 { v8.1d-v11.1d}, [x8], #32
  42. ld1 {v12.1d-v15.1d}, [x8], #32
  43. ld1 {v16.1d-v19.1d}, [x8], #32
  44. ld1 {v20.1d-v23.1d}, [x8], #32
  45. ld1 {v24.1d}, [x8]
  46. 0: sub w2, w2, #1
  47. mov w8, #24
  48. adr_l x9, .Lsha3_rcon
  49. /* load input */
  50. ld1 {v25.8b-v28.8b}, [x1], #32
  51. ld1 {v29.8b-v31.8b}, [x1], #24
  52. eor v0.8b, v0.8b, v25.8b
  53. eor v1.8b, v1.8b, v26.8b
  54. eor v2.8b, v2.8b, v27.8b
  55. eor v3.8b, v3.8b, v28.8b
  56. eor v4.8b, v4.8b, v29.8b
  57. eor v5.8b, v5.8b, v30.8b
  58. eor v6.8b, v6.8b, v31.8b
  59. tbnz x3, #6, 2f // SHA3-512
  60. ld1 {v25.8b-v28.8b}, [x1], #32
  61. ld1 {v29.8b-v30.8b}, [x1], #16
  62. eor v7.8b, v7.8b, v25.8b
  63. eor v8.8b, v8.8b, v26.8b
  64. eor v9.8b, v9.8b, v27.8b
  65. eor v10.8b, v10.8b, v28.8b
  66. eor v11.8b, v11.8b, v29.8b
  67. eor v12.8b, v12.8b, v30.8b
  68. tbnz x3, #4, 1f // SHA3-384 or SHA3-224
  69. // SHA3-256
  70. ld1 {v25.8b-v28.8b}, [x1], #32
  71. eor v13.8b, v13.8b, v25.8b
  72. eor v14.8b, v14.8b, v26.8b
  73. eor v15.8b, v15.8b, v27.8b
  74. eor v16.8b, v16.8b, v28.8b
  75. b 3f
  76. 1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
  77. // SHA3-224
  78. ld1 {v25.8b-v28.8b}, [x1], #32
  79. ld1 {v29.8b}, [x1], #8
  80. eor v13.8b, v13.8b, v25.8b
  81. eor v14.8b, v14.8b, v26.8b
  82. eor v15.8b, v15.8b, v27.8b
  83. eor v16.8b, v16.8b, v28.8b
  84. eor v17.8b, v17.8b, v29.8b
  85. b 3f
  86. // SHA3-512
  87. 2: ld1 {v25.8b-v26.8b}, [x1], #16
  88. eor v7.8b, v7.8b, v25.8b
  89. eor v8.8b, v8.8b, v26.8b
  90. 3: sub w8, w8, #1
  91. eor3 v29.16b, v4.16b, v9.16b, v14.16b
  92. eor3 v26.16b, v1.16b, v6.16b, v11.16b
  93. eor3 v28.16b, v3.16b, v8.16b, v13.16b
  94. eor3 v25.16b, v0.16b, v5.16b, v10.16b
  95. eor3 v27.16b, v2.16b, v7.16b, v12.16b
  96. eor3 v29.16b, v29.16b, v19.16b, v24.16b
  97. eor3 v26.16b, v26.16b, v16.16b, v21.16b
  98. eor3 v28.16b, v28.16b, v18.16b, v23.16b
  99. eor3 v25.16b, v25.16b, v15.16b, v20.16b
  100. eor3 v27.16b, v27.16b, v17.16b, v22.16b
  101. rax1 v30.2d, v29.2d, v26.2d // bc[0]
  102. rax1 v26.2d, v26.2d, v28.2d // bc[2]
  103. rax1 v28.2d, v28.2d, v25.2d // bc[4]
  104. rax1 v25.2d, v25.2d, v27.2d // bc[1]
  105. rax1 v27.2d, v27.2d, v29.2d // bc[3]
  106. eor v0.16b, v0.16b, v30.16b
  107. xar v29.2d, v1.2d, v25.2d, (64 - 1)
  108. xar v1.2d, v6.2d, v25.2d, (64 - 44)
  109. xar v6.2d, v9.2d, v28.2d, (64 - 20)
  110. xar v9.2d, v22.2d, v26.2d, (64 - 61)
  111. xar v22.2d, v14.2d, v28.2d, (64 - 39)
  112. xar v14.2d, v20.2d, v30.2d, (64 - 18)
  113. xar v31.2d, v2.2d, v26.2d, (64 - 62)
  114. xar v2.2d, v12.2d, v26.2d, (64 - 43)
  115. xar v12.2d, v13.2d, v27.2d, (64 - 25)
  116. xar v13.2d, v19.2d, v28.2d, (64 - 8)
  117. xar v19.2d, v23.2d, v27.2d, (64 - 56)
  118. xar v23.2d, v15.2d, v30.2d, (64 - 41)
  119. xar v15.2d, v4.2d, v28.2d, (64 - 27)
  120. xar v28.2d, v24.2d, v28.2d, (64 - 14)
  121. xar v24.2d, v21.2d, v25.2d, (64 - 2)
  122. xar v8.2d, v8.2d, v27.2d, (64 - 55)
  123. xar v4.2d, v16.2d, v25.2d, (64 - 45)
  124. xar v16.2d, v5.2d, v30.2d, (64 - 36)
  125. xar v5.2d, v3.2d, v27.2d, (64 - 28)
  126. xar v27.2d, v18.2d, v27.2d, (64 - 21)
  127. xar v3.2d, v17.2d, v26.2d, (64 - 15)
  128. xar v25.2d, v11.2d, v25.2d, (64 - 10)
  129. xar v26.2d, v7.2d, v26.2d, (64 - 6)
  130. xar v30.2d, v10.2d, v30.2d, (64 - 3)
  131. bcax v20.16b, v31.16b, v22.16b, v8.16b
  132. bcax v21.16b, v8.16b, v23.16b, v22.16b
  133. bcax v22.16b, v22.16b, v24.16b, v23.16b
  134. bcax v23.16b, v23.16b, v31.16b, v24.16b
  135. bcax v24.16b, v24.16b, v8.16b, v31.16b
  136. ld1r {v31.2d}, [x9], #8
  137. bcax v17.16b, v25.16b, v19.16b, v3.16b
  138. bcax v18.16b, v3.16b, v15.16b, v19.16b
  139. bcax v19.16b, v19.16b, v16.16b, v15.16b
  140. bcax v15.16b, v15.16b, v25.16b, v16.16b
  141. bcax v16.16b, v16.16b, v3.16b, v25.16b
  142. bcax v10.16b, v29.16b, v12.16b, v26.16b
  143. bcax v11.16b, v26.16b, v13.16b, v12.16b
  144. bcax v12.16b, v12.16b, v14.16b, v13.16b
  145. bcax v13.16b, v13.16b, v29.16b, v14.16b
  146. bcax v14.16b, v14.16b, v26.16b, v29.16b
  147. bcax v7.16b, v30.16b, v9.16b, v4.16b
  148. bcax v8.16b, v4.16b, v5.16b, v9.16b
  149. bcax v9.16b, v9.16b, v6.16b, v5.16b
  150. bcax v5.16b, v5.16b, v30.16b, v6.16b
  151. bcax v6.16b, v6.16b, v4.16b, v30.16b
  152. bcax v3.16b, v27.16b, v0.16b, v28.16b
  153. bcax v4.16b, v28.16b, v1.16b, v0.16b
  154. bcax v0.16b, v0.16b, v2.16b, v1.16b
  155. bcax v1.16b, v1.16b, v27.16b, v2.16b
  156. bcax v2.16b, v2.16b, v28.16b, v27.16b
  157. eor v0.16b, v0.16b, v31.16b
  158. cbnz w8, 3b
  159. cond_yield 4f, x8, x9
  160. cbnz w2, 0b
  161. /* save state */
  162. 4: st1 { v0.1d- v3.1d}, [x0], #32
  163. st1 { v4.1d- v7.1d}, [x0], #32
  164. st1 { v8.1d-v11.1d}, [x0], #32
  165. st1 {v12.1d-v15.1d}, [x0], #32
  166. st1 {v16.1d-v19.1d}, [x0], #32
  167. st1 {v20.1d-v23.1d}, [x0], #32
  168. st1 {v24.1d}, [x0]
  169. mov w0, w2
  170. ret
  171. SYM_FUNC_END(sha3_ce_transform)
  172. .section ".rodata", "a"
  173. .align 8
  174. .Lsha3_rcon:
  175. .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
  176. .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
  177. .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
  178. .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
  179. .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
  180. .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
  181. .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
  182. .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008