/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')
#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

// Function argument registers (RISC-V psABI a0-a7), matching the C prototype
// documented above the entry points, plus derived lengths computed below.
#define KEYP		a0	// const struct crypto_aes_ctx *key
#define INP		a1	// const u8 *in
#define OUTP		a2	// u8 *out
#define LEN		a3	// size_t len
#define TWEAKP		a4	// u8 tweak[16]
#define LEN32		a5	// main-loop length in 32-bit words (LEN >> 2)
#define TAIL_LEN	a6	// bytes in partial trailing block (LEN & 15)
#define VL		a7	// vector length selected by vsetvli
#define VLMAX		t4	// maximum vl at e32/m4

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS		v16	// LMUL=4 (most of the time)
#define TWEAKS_BREV	v20	// LMUL=4 (most of the time)
#define MULTS_BREV	v24	// LMUL=4 (most of the time)
#define TMP0		v28
#define TMP1		v29
#define TMP2		v30
#define TMP3		v31
// xts_init initializes the following values:
//
//	TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
//	TWEAKS_BREV: same as TWEAKS, but bit-reversed
//	MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte.  The zvkg extension provides the vgmul
// instruction which does multiplication in this field.  Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring.  Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
.macro	xts_init

	// Load the first tweak T.
	vsetivli	zero, 4, e32, m1, ta, ma
	vle32.v		TWEAKS, (TWEAKP)

	// If there's only one block (or no blocks at all), then skip the tweak
	// sequence computation because (at most) T itself is needed.
	li		t0, 16
	ble		LEN, t0, .Linit_single_block\@

	// Save a copy of T bit-reversed in v12.
	vbrev8.v	v12, TWEAKS

	//
	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
	// that N <= 128.  Though, this code actually requires N < 64 (or
	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
	// values here and in the x^N computation later.
	//
	vsetvli		VL, LEN32, e32, m4, ta, ma
	srli		t0, VL, 2	// t0 = N (num blocks)
	// Generate two sequences, each with N 32-bit values:
	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
	vsetvli		zero, t0, e32, m1, ta, ma
	vmv.v.i		v0, 1
	vid.v		v1
	// Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
	// as two sequences, each with 2*N 32-bit values:
	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
	vsetvli		zero, t0, e64, m2, ta, ma
	vzext.vf2	v2, v0
	vzext.vf2	v4, v1
	slli		t1, t0, 1	// t1 = 2*N
	vsetvli		zero, t1, e32, m2, ta, ma
	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
	// widening to 64 bits per element.  When reinterpreted as N 128-bit
	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
	vwsll.vv	v8, v2, v4

	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
	// multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		TWEAKS_BREV, 0
	vaesz.vs	TWEAKS_BREV, v12
	vbrev8.v	v8, v8
	vgmul.vv	TWEAKS_BREV, v8

	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
	li		t1, 1
	sll		t1, t1, t0	// t1 = 1 << N
	vsetivli	zero, 2, e64, m1, ta, ma
	vmv.v.i		v0, 0
	vsetivli	zero, 1, e64, m1, tu, ma
	vmv.v.x		v0, t1
	vbrev8.v	v0, v0
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vaesz.vs	MULTS_BREV, v0

	j		.Linit_done\@

.Linit_single_block\@:
	vbrev8.v	TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm
// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
// the multiplier required to advance the tweak by one.
.macro	load_x
	li		t0, 0x40
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vsetivli	zero, 1, e8, m1, tu, ma
	vmv.v.x		MULTS_BREV, t0
.endm
// XTS-encrypt or XTS-decrypt the message, assuming xts_init has already run.
// \enc is 1 to encrypt or 0 to decrypt; \keylen is the AES key length in bits
// (128, 192, or 256), forwarded to aes_crypt.
.macro	__aes_xts_crypt	enc, keylen

	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
	beqz		LEN32, .Lcts_without_main_loop\@

	vsetvli		VLMAX, zero, e32, m4, ta, ma
1:
	vsetvli		VL, LEN32, e32, m4, ta, ma
2:
	// Encrypt or decrypt VL/4 blocks.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TWEAKS
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TWEAKS
	vse32.v		TMP0, (OUTP)

	// Update the pointers and the remaining length.
	slli		t0, VL, 2
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	sub		LEN32, LEN32, VL

	// Check whether more blocks remain.
	beqz		LEN32, .Lmain_loop_done\@

	// Compute the next sequence of tweaks by multiplying the previous
	// sequence by x^N.  Store the result in both bit-reversed order and
	// regular order (i.e. with the bit reversal undone).
	vgmul.vv	TWEAKS_BREV, MULTS_BREV
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Since we compute the tweak multipliers x^N in advance, we require
	// that each iteration process the same length except possibly the last.
	// This conflicts slightly with the behavior allowed by RISC-V Vector
	// Extension, where CPUs can select a lower length for both of the last
	// two iterations.  E.g., vl might take the sequence of values
	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
	// can use x^4 again instead of computing x^3.  Therefore, we explicitly
	// keep the vl at VLMAX if there is at least VLMAX remaining.
	bge		LEN32, VLMAX, 2b
	j		1b

.Lmain_loop_done\@:
	load_x

	// Compute the next tweak.
	addi		t0, VL, -4
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
	vsetivli	zero, 4, e32, m1, ta, ma
	vgmul.vv	TWEAKS_BREV, MULTS_BREV		// Advance to next tweak

	bnez		TAIL_LEN, .Lcts\@

	// Update *TWEAKP to contain the next tweak.
	vbrev8.v	TWEAKS, TWEAKS_BREV
	vse32.v		TWEAKS, (TWEAKP)
	ret

.Lcts_without_main_loop\@:
	load_x
.Lcts\@:
	// TWEAKS_BREV now contains the next tweak.  Compute the one after that.
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.v		TMP0, TWEAKS_BREV
	vgmul.vv	TMP0, MULTS_BREV
	// Undo the bit reversal of the next two tweaks and store them in TMP1
	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
	vbrev8.v	TMP1, TWEAKS_BREV
	vbrev8.v	TMP2, TMP0
.else
	vbrev8.v	TMP1, TMP0
	vbrev8.v	TMP2, TWEAKS_BREV
.endif

	// Encrypt/decrypt the last full block.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TMP1
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP1

	// Swap the first TAIL_LEN bytes of the above result with the tail.
	// Note that to support in-place encryption/decryption, the load from
	// the input tail must happen before the store to the output tail.
	addi		t0, INP, 16
	addi		t1, OUTP, 16
	vmv.v.v		TMP3, TMP0
	vsetvli		zero, TAIL_LEN, e8, m1, tu, ma
	vle8.v		TMP0, (t0)
	vse8.v		TMP3, (t1)

	// Encrypt/decrypt again and store the last full block.
	vsetivli	zero, 4, e32, m1, ta, ma
	vxor.vv		TMP0, TMP0, TMP2
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP2
	vse32.v		TMP0, (OUTP)

	ret
.endm
// Top-level entry macro: set up the lengths and tweak sequence, then dispatch
// to the __aes_xts_crypt variant matching the AES key length determined by
// aes_begin.  \enc is 1 to encrypt or 0 to decrypt.
.macro	aes_xts_crypt	enc

	// Check whether the length is a multiple of the AES block size.
	andi		TAIL_LEN, LEN, 15
	beqz		TAIL_LEN, 1f

	// The length isn't a multiple of the AES block size, so ciphertext
	// stealing will be required.  Ciphertext stealing involves special
	// handling of the partial block and the last full block, so subtract
	// the length of both from the length to be processed in the main loop.
	sub		LEN, LEN, TAIL_LEN
	addi		LEN, LEN, -16
1:
	srli		LEN32, LEN, 2
	// LEN and LEN32 now contain the total length of the blocks that will be
	// processed in the main loop, in bytes and 32-bit words respectively.

	xts_init
	aes_begin	KEYP, 128f, 192f
	__aes_xts_crypt	\enc, 256
128:
	__aes_xts_crypt	\enc, 128
192:
	__aes_xts_crypt	\enc, 192
.endm
// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//				          const u8 *in, u8 *out, size_t len,
//				          u8 tweak[16]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)
// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)