/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')
#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

// Scalar register aliases.  a0-a4 are the function arguments (see the
// prototype comments near the bottom of this file); the rest are derived
// values kept live across the main loop.
#define KEYP		a0	// const struct crypto_aes_ctx *key
#define INP		a1	// const u8 *in
#define OUTP		a2	// u8 *out
#define LEN		a3	// size_t len (bytes)
#define TWEAKP		a4	// u8 tweak[16]

#define LEN32		a5	// main-loop length in 32-bit words (LEN / 4)
#define TAIL_LEN	a6	// bytes in the partial block (LEN & 15), 0 if none
#define VL		a7	// vl selected for the current iteration (e32 elements)
#define VLMAX		t4	// max vl for e32/m4, i.e. 4 * N

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS		v16	// LMUL=4 (most of the time)
#define TWEAKS_BREV	v20	// LMUL=4 (most of the time)
#define MULTS_BREV	v24	// LMUL=4 (most of the time)
#define TMP0		v28
#define TMP1		v29
#define TMP2		v30
#define TMP3		v31
// xts_init initializes the following values:
//
//	TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
//	TWEAKS_BREV: same as TWEAKS, but bit-reversed
//	MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte.  The zvkg extension provides the vgmul
// instruction which does multiplication in this field.  Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring.  Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
//
// Clobbers t0, t1, and v0-v12.
.macro	xts_init

	// Load the first tweak T.
	vsetivli	zero, 4, e32, m1, ta, ma
	vle32.v		TWEAKS, (TWEAKP)

	// If there's only one block (or no blocks at all), then skip the tweak
	// sequence computation because (at most) T itself is needed.
	li		t0, 16
	ble		LEN, t0, .Linit_single_block\@

	// Save a copy of T bit-reversed in v12.
	vbrev8.v	v12, TWEAKS

	//
	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
	// that N <= 128.  Though, this code actually requires N < 64 (or
	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
	// values here and in the x^N computation later.
	//
	vsetvli		VL, LEN32, e32, m4, ta, ma
	srli		t0, VL, 2	// t0 = N (num blocks)
	// Generate two sequences, each with N 32-bit values:
	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
	vsetvli		zero, t0, e32, m1, ta, ma
	vmv.v.i		v0, 1
	vid.v		v1
	// Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
	// as two sequences, each with 2*N 32-bit values:
	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
	vsetvli		zero, t0, e64, m2, ta, ma
	vzext.vf2	v2, v0
	vzext.vf2	v4, v1
	slli		t1, t0, 1	// t1 = 2*N
	vsetvli		zero, t1, e32, m2, ta, ma
	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
	// widening to 64 bits per element.  When reinterpreted as N 128-bit
	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
	vwsll.vv	v8, v2, v4

	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
	// multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		TWEAKS_BREV, 0
	vaesz.vs	TWEAKS_BREV, v12	// broadcast v12 to each 128-bit group
	vbrev8.v	v8, v8
	vgmul.vv	TWEAKS_BREV, v8

	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
	li		t1, 1
	sll		t1, t1, t0	// t1 = 1 << N
	vsetivli	zero, 2, e64, m1, ta, ma
	vmv.v.i		v0, 0
	vsetivli	zero, 1, e64, m1, tu, ma	// tu: keep the high half zero
	vmv.v.x		v0, t1
	vbrev8.v	v0, v0
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vaesz.vs	MULTS_BREV, v0

	j		.Linit_done\@

.Linit_single_block\@:
	vbrev8.v	TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm
// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
// the multiplier required to advance the tweak by one.  Clobbers t0.
.macro	load_x
	li		t0, 0x40
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vsetivli	zero, 1, e8, m1, tu, ma	// tu: only the first byte is written
	vmv.v.x		MULTS_BREV, t0
.endm
// Process the message with AES-XTS and return to the function's caller.
// \enc is 1 to encrypt or 0 to decrypt; \keylen is the AES key length in bits
// (128, 192, or 256).  Expects the round keys in v1-v15 and the tweak state
// from xts_init.  Clobbers t0 and t1.
.macro	__aes_xts_crypt	enc, keylen

	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
	beqz		LEN32, .Lcts_without_main_loop\@

	vsetvli		VLMAX, zero, e32, m4, ta, ma
1:
	vsetvli		VL, LEN32, e32, m4, ta, ma
2:
	// Encrypt or decrypt VL/4 blocks.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TWEAKS
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TWEAKS
	vse32.v		TMP0, (OUTP)

	// Update the pointers and the remaining length.
	slli		t0, VL, 2
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	sub		LEN32, LEN32, VL

	// Check whether more blocks remain.
	beqz		LEN32, .Lmain_loop_done\@

	// Compute the next sequence of tweaks by multiplying the previous
	// sequence by x^N.  Store the result in both bit-reversed order and
	// regular order (i.e. with the bit reversal undone).
	vgmul.vv	TWEAKS_BREV, MULTS_BREV
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Since we compute the tweak multipliers x^N in advance, we require
	// that each iteration process the same length except possibly the last.
	// This conflicts slightly with the behavior allowed by RISC-V Vector
	// Extension, where CPUs can select a lower length for both of the last
	// two iterations.  E.g., vl might take the sequence of values
	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
	// can use x^4 again instead of computing x^3.  Therefore, we explicitly
	// keep the vl at VLMAX if there is at least VLMAX remaining.
	bge		LEN32, VLMAX, 2b
	j		1b

.Lmain_loop_done\@:
	load_x

	// Compute the next tweak.
	addi		t0, VL, -4
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
	vsetivli	zero, 4, e32, m1, ta, ma
	vgmul.vv	TWEAKS_BREV, MULTS_BREV	// Advance to next tweak

	bnez		TAIL_LEN, .Lcts\@

	// Update *TWEAKP to contain the next tweak.
	vbrev8.v	TWEAKS, TWEAKS_BREV
	vse32.v		TWEAKS, (TWEAKP)
	ret

.Lcts_without_main_loop\@:
	load_x
.Lcts\@:
	// TWEAKS_BREV now contains the next tweak.  Compute the one after that.
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.v		TMP0, TWEAKS_BREV
	vgmul.vv	TMP0, MULTS_BREV

	// Undo the bit reversal of the next two tweaks and store them in TMP1
	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
	vbrev8.v	TMP1, TWEAKS_BREV
	vbrev8.v	TMP2, TMP0
.else
	vbrev8.v	TMP1, TMP0
	vbrev8.v	TMP2, TWEAKS_BREV
.endif

	// Encrypt/decrypt the last full block.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TMP1
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP1

	// Swap the first TAIL_LEN bytes of the above result with the tail.
	// Note that to support in-place encryption/decryption, the load from
	// the input tail must happen before the store to the output tail.
	addi		t0, INP, 16
	addi		t1, OUTP, 16
	vmv.v.v		TMP3, TMP0
	vsetvli		zero, TAIL_LEN, e8, m1, tu, ma
	vle8.v		TMP0, (t0)
	vse8.v		TMP3, (t1)

	// Encrypt/decrypt again and store the last full block.
	vsetivli	zero, 4, e32, m1, ta, ma
	vxor.vv		TMP0, TMP0, TMP2
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP2
	vse32.v		TMP0, (OUTP)
	ret
.endm
// Shared body of the encryption and decryption entry points.
// \enc is 1 to encrypt, 0 to decrypt.
.macro	aes_xts_crypt	enc

	// Check whether the length is a multiple of the AES block size.
	andi		TAIL_LEN, LEN, 15
	beqz		TAIL_LEN, 1f

	// The length isn't a multiple of the AES block size, so ciphertext
	// stealing will be required.  Ciphertext stealing involves special
	// handling of the partial block and the last full block, so subtract
	// the length of both from the length to be processed in the main loop.
	sub		LEN, LEN, TAIL_LEN
	addi		LEN, LEN, -16
1:
	srli		LEN32, LEN, 2
	// LEN and LEN32 now contain the total length of the blocks that will be
	// processed in the main loop, in bytes and 32-bit words respectively.

	xts_init
	// aes_begin (defined in aes-macros.S, not visible here) presumably
	// loads the round keys and branches to 128f or 192f for those key
	// lengths, with 256-bit keys falling through — confirm in aes-macros.S.
	aes_begin	KEYP, 128f, 192f
	__aes_xts_crypt	\enc, 256
128:
	__aes_xts_crypt	\enc, 128
192:
	__aes_xts_crypt	\enc, 192
.endm
// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//					 const u8 *in, u8 *out, size_t len,
//					 u8 tweak[16]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)
// Same prototype and calling convention as the encryption function above,
// but performs AES-XTS decryption (the decrypt path of aes_crypt is used,
// and the CTS tweak order is swapped accordingly).
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)