vgetrandom-chacha.S 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
  4. */
  5. #include <asm/asm.h>
  6. #include <asm/regdef.h>
  7. #include <linux/linkage.h>
  8. .text
  /*
   * OP_4REG insn dst0 dst1 dst2 dst3 src0 src1 src2 src3
   *
   * Emit "insn dstN, dstN, srcN" for N = 0, 1, 2, 3, in that order, i.e. apply
   * the same three-operand instruction to four destination/source pairs, with
   * each destination also serving as the first source operand. The src
   * arguments are pasted verbatim as the last operand, so they may be
   * registers or immediates, whichever the instruction accepts.
   */
  9. .macro OP_4REG insn dst0 dst1 dst2 dst3 src0 src1 src2 src3
  10. \insn \dst0, \dst0, \src0
  11. \insn \dst1, \dst1, \src1
  12. \insn \dst2, \dst2, \src2
  13. \insn \dst3, \dst3, \src3
  14. .endm
  15. /*
  16. * Very basic LoongArch implementation of ChaCha20. Produces a given positive
  17. * number of blocks of output with a nonce of 0, taking an input key and
  18. * 8-byte counter. Importantly does not spill cipher state to the stack. Its
  19. * arguments are:
  20. *
  21. * a0: output bytes
  22. * a1: 32-byte key input
  23. * a2: 8-byte counter input/output
  24. * a3: number of 64-byte blocks to write to output
      *
      * The block count must be non-zero: the block loop tests nblocks only
      * after a block has been written and nblocks has been decremented.
      * On return the counter at a2 has been advanced once per block written.
  25. */
  26. SYM_FUNC_START(__arch_chacha20_blocks_nostack)
  27. /* We don't need a frame pointer, so fp is used as one more saved register */
  28. #define s9 fp
      /* Argument registers */
  29. #define output a0
  30. #define key a1
  31. #define counter a2
  32. #define nblocks a3
      /* Double-round loop counter */
  33. #define i a4
      /*
       * The 16 words of ChaCha state: state[0..9] live in s0-s9 (saved and
       * restored by this function), state[10..15] live in a5-a7 and t0-t2.
       */
  34. #define state0 s0
  35. #define state1 s1
  36. #define state2 s2
  37. #define state3 s3
  38. #define state4 s4
  39. #define state5 s5
  40. #define state6 s6
  41. #define state7 s7
  42. #define state8 s8
  43. #define state9 s9
  44. #define state10 a5
  45. #define state11 a6
  46. #define state12 a7
  47. #define state13 t0
  48. #define state14 t1
  49. #define state15 t2
      /* Low and high 32-bit halves of the block counter, kept across blocks */
  50. #define cnt_lo t3
  51. #define cnt_hi t4
      /* Copies of the constant words, added back to state[0..3] after the rounds */
  52. #define copy0 t5
  53. #define copy1 t6
  54. #define copy2 t7
  55. /* Reuse i as copy3 */
  56. #define copy3 i
  57. /* Packs to be used with OP_4REG */
  58. #define line0 state0, state1, state2, state3
  59. #define line1 state4, state5, state6, state7
  60. #define line2 state8, state9, state10, state11
  61. #define line3 state12, state13, state14, state15
      /*
       * Rows 1, 2 and 3 rotated left by one, two and three positions, so that
       * pairing them with line0 makes OP_4REG work on the diagonals.
       */
  62. #define line1_perm state5, state6, state7, state4
  63. #define line2_perm state10, state11, state8, state9
  64. #define line3_perm state15, state12, state13, state14
  65. #define copy copy0, copy1, copy2, copy3
      /*
       * Rotate amounts for rotri.w (rotate right). On 32-bit words, rotating
       * right by 16, 20, 24, 25 equals rotating left by 16, 12, 8, 7.
       */
  66. #define _16 16, 16, 16, 16
  67. #define _20 20, 20, 20, 20
  68. #define _24 24, 24, 24, 24
  69. #define _25 25, 25, 25, 25
  70. /*
  71. * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
  72. * This does not violate the stack-less requirement: no sensitive data
  73. * is spilled onto the stack.
      *
      * The frame size is the (negative) size of ten registers masked with
      * STACK_ALIGN, which is defined outside this file (presumably the stack
      * alignment mask; confirm in asm/asm.h). The epilogue negates the same
      * expression to release the frame.
  74. */
  75. PTR_ADDI sp, sp, (-SZREG * 10) & STACK_ALIGN
  76. REG_S s0, sp, 0
  77. REG_S s1, sp, SZREG
  78. REG_S s2, sp, SZREG * 2
  79. REG_S s3, sp, SZREG * 3
  80. REG_S s4, sp, SZREG * 4
  81. REG_S s5, sp, SZREG * 5
  82. REG_S s6, sp, SZREG * 6
  83. REG_S s7, sp, SZREG * 7
  84. REG_S s8, sp, SZREG * 8
  85. REG_S s9, sp, SZREG * 9
      /*
       * copy[0,1,2] = "expa", "nd 3", "2-by" as little-endian words. copy[3]
       * cannot be set up here because its register doubles as the loop
       * counter i.
       */
  86. li.w copy0, 0x61707865
  87. li.w copy1, 0x3320646e
  88. li.w copy2, 0x79622d32
      /* Load the 8-byte counter as two 32-bit halves */
  89. ld.w cnt_lo, counter, 0
  90. ld.w cnt_hi, counter, 4
      /* One iteration of .Lblock produces one 64-byte output block */
  91. .Lblock:
  92. /* state[0,1,2,3] = "expand 32-byte k" */
  93. move state0, copy0
  94. move state1, copy1
  95. move state2, copy2
  96. li.w state3, 0x6b206574
  97. /* state[4,5,..,11] = key */
  98. ld.w state4, key, 0
  99. ld.w state5, key, 4
  100. ld.w state6, key, 8
  101. ld.w state7, key, 12
  102. ld.w state8, key, 16
  103. ld.w state9, key, 20
  104. ld.w state10, key, 24
  105. ld.w state11, key, 28
  106. /* state[12,13] = counter */
  107. move state12, cnt_lo
  108. move state13, cnt_hi
  109. /* state[14,15] = 0 */
  110. move state14, zero
  111. move state15, zero
       /* 10 iterations of (odd round + even round) = 20 rounds */
  112. li.w i, 10
  113. .Lpermute:
  114. /* odd round: four quarter-rounds on the columns */
  115. OP_4REG add.w line0, line1
  116. OP_4REG xor line3, line0
  117. OP_4REG rotri.w line3, _16
  118. OP_4REG add.w line2, line3
  119. OP_4REG xor line1, line2
  120. OP_4REG rotri.w line1, _20
  121. OP_4REG add.w line0, line1
  122. OP_4REG xor line3, line0
  123. OP_4REG rotri.w line3, _24
  124. OP_4REG add.w line2, line3
  125. OP_4REG xor line1, line2
  126. OP_4REG rotri.w line1, _25
  127. /* even round: the same sequence on the diagonals, via the _perm packs */
  128. OP_4REG add.w line0, line1_perm
  129. OP_4REG xor line3_perm, line0
  130. OP_4REG rotri.w line3_perm, _16
  131. OP_4REG add.w line2_perm, line3_perm
  132. OP_4REG xor line1_perm, line2_perm
  133. OP_4REG rotri.w line1_perm, _20
  134. OP_4REG add.w line0, line1_perm
  135. OP_4REG xor line3_perm, line0
  136. OP_4REG rotri.w line3_perm, _24
  137. OP_4REG add.w line2_perm, line3_perm
  138. OP_4REG xor line1_perm, line2_perm
  139. OP_4REG rotri.w line1_perm, _25
  140. addi.w i, i, -1
  141. bnez i, .Lpermute
  142. /*
  143. * copy[3] = "te k", materialize it here because copy[3] shares the
  144. * same register with i which just became dead.
  145. */
  146. li.w copy3, 0x6b206574
  147. /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
  148. OP_4REG add.w line0, copy
  149. st.w state0, output, 0
  150. st.w state1, output, 4
  151. st.w state2, output, 8
  152. st.w state3, output, 12
  153. /* from now on state[0,1,2,3] are scratch registers */
  154. /* state[0,1,2,3] = key[0,1,2,3] (first 16 bytes of the key) */
  155. ld.w state0, key, 0
  156. ld.w state1, key, 4
  157. ld.w state2, key, 8
  158. ld.w state3, key, 12
  159. /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
  160. OP_4REG add.w line1, line0
  161. st.w state4, output, 16
  162. st.w state5, output, 20
  163. st.w state6, output, 24
  164. st.w state7, output, 28
  165. /* state[0,1,2,3] = key[4,5,6,7] (last 16 bytes of the key) */
  166. ld.w state0, key, 16
  167. ld.w state1, key, 20
  168. ld.w state2, key, 24
  169. ld.w state3, key, 28
  170. /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
  171. OP_4REG add.w line2, line0
  172. st.w state8, output, 32
  173. st.w state9, output, 36
  174. st.w state10, output, 40
  175. st.w state11, output, 44
  176. /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
  177. add.w state12, state12, cnt_lo
  178. add.w state13, state13, cnt_hi
  179. st.w state12, output, 48
  180. st.w state13, output, 52
       /* state[14,15] started as zero, so they are stored without an addition */
  181. st.w state14, output, 56
  182. st.w state15, output, 60
  183. /* ++counter */
  184. addi.w cnt_lo, cnt_lo, 1
       /* carry into the high half: scratch state0 = 1 iff cnt_lo wrapped to 0 */
  185. sltui state0, cnt_lo, 1
  186. add.w cnt_hi, cnt_hi, state0
  187. /* output += 64 */
  188. PTR_ADDI output, output, 64
  189. /* --nblocks */
  190. PTR_ADDI nblocks, nblocks, -1
  191. bnez nblocks, .Lblock
  192. /* counter = [cnt_lo, cnt_hi] */
  193. st.w cnt_lo, counter, 0
  194. st.w cnt_hi, counter, 4
  195. /*
  196. * Zero out the potentially sensitive regs, in case nothing uses these
  197. * again. At this point copy[0,1,2,3] just contain "expand 32-byte k" and
  198. * state[0,...,9] are s0-s9, which we'll restore in the epilogue, so we
  199. * only need to zero state[10,...,15].
  200. */
  201. move state10, zero
  202. move state11, zero
  203. move state12, zero
  204. move state13, zero
  205. move state14, zero
  206. move state15, zero
       /* Restore s0-s9 and release the frame allocated in the prologue */
  207. REG_L s0, sp, 0
  208. REG_L s1, sp, SZREG
  209. REG_L s2, sp, SZREG * 2
  210. REG_L s3, sp, SZREG * 3
  211. REG_L s4, sp, SZREG * 4
  212. REG_L s5, sp, SZREG * 5
  213. REG_L s6, sp, SZREG * 6
  214. REG_L s7, sp, SZREG * 7
  215. REG_L s8, sp, SZREG * 8
  216. REG_L s9, sp, SZREG * 9
  217. PTR_ADDI sp, sp, -((-SZREG * 10) & STACK_ALIGN)
  218. jr ra
  219. SYM_FUNC_END(__arch_chacha20_blocks_nostack)