/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15
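
/* RX0/RX1, RKEY and RIV reuse v12-v15 and thus alias RTMP4-RTMP7. */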
#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */
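
/*
 * SM4_PREPARE(): load the 256-byte crypto_sm4_sbox table into v16-v31.
 * Each of the four tbl/tbx lookups in the round macros below then covers
 * one 64-byte quarter of the S-box.
 */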
#define SM4_PREPARE()					\
	adr_l	x5, crypto_sm4_sbox;			\
	ld1	{v16.16b-v19.16b}, [x5], #64;		\
	ld1	{v20.16b-v23.16b}, [x5], #64;		\
	ld1	{v24.16b-v27.16b}, [x5], #64;		\
	ld1	{v28.16b-v31.16b}, [x5];
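
/*
 * transpose_4x4(): transpose a 4x4 matrix of 32-bit words. On entry each
 * vector holds the four words of one block; on exit vector N holds word N
 * of all four blocks (the column-major layout the round macros work on).
 */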
#define transpose_4x4(s0, s1, s2, s3)			\
	zip1	RTMP0.4s, s0.4s, s1.4s;			\
	zip1	RTMP1.4s, s2.4s, s3.4s;			\
	zip2	RTMP2.4s, s0.4s, s1.4s;			\
	zip2	RTMP3.4s, s2.4s, s3.4s;			\
	zip1	s0.2d, RTMP0.2d, RTMP1.2d;		\
	zip2	s1.2d, RTMP0.2d, RTMP1.2d;		\
	zip1	s2.2d, RTMP2.2d, RTMP3.2d;		\
	zip2	s3.2d, RTMP2.2d, RTMP3.2d;
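
/* The same transpose applied independently to two groups of four blocks. */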
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1	RTMP0.4s, s0.4s, s1.4s;			\
	zip1	RTMP1.4s, s2.4s, s3.4s;			\
	zip2	RTMP2.4s, s0.4s, s1.4s;			\
	zip2	RTMP3.4s, s2.4s, s3.4s;			\
	zip1	RTMP4.4s, s4.4s, s5.4s;			\
	zip1	RTMP5.4s, s6.4s, s7.4s;			\
	zip2	RTMP6.4s, s4.4s, s5.4s;			\
	zip2	RTMP7.4s, s6.4s, s7.4s;			\
	zip1	s0.2d, RTMP0.2d, RTMP1.2d;		\
	zip2	s1.2d, RTMP0.2d, RTMP1.2d;		\
	zip1	s2.2d, RTMP2.2d, RTMP3.2d;		\
	zip2	s3.2d, RTMP2.2d, RTMP3.2d;		\
	zip1	s4.2d, RTMP4.2d, RTMP5.2d;		\
	zip2	s5.2d, RTMP4.2d, RTMP5.2d;		\
	zip1	s6.2d, RTMP6.2d, RTMP7.2d;		\
	zip2	s7.2d, RTMP6.2d, RTMP7.2d;
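
/*
 * rotate_clockwise_4x4(): undo the 4x4 transpose while reversing the word
 * order within each block, i.e. the transpose combined with SM4's final
 * reverse transform R(A0, A1, A2, A3) = (A3, A2, A1, A0).
 */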
#define rotate_clockwise_4x4(s0, s1, s2, s3)		\
	zip1	RTMP0.4s, s1.4s, s0.4s;			\
	zip2	RTMP1.4s, s1.4s, s0.4s;			\
	zip1	RTMP2.4s, s3.4s, s2.4s;			\
	zip2	RTMP3.4s, s3.4s, s2.4s;			\
	zip1	s0.2d, RTMP2.2d, RTMP0.2d;		\
	zip2	s1.2d, RTMP2.2d, RTMP0.2d;		\
	zip1	s2.2d, RTMP3.2d, RTMP1.2d;		\
	zip2	s3.2d, RTMP3.2d, RTMP1.2d;
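
/* Two-group variant of rotate_clockwise_4x4. */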
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
	zip1	RTMP0.4s, s1.4s, s0.4s;			\
	zip1	RTMP2.4s, s3.4s, s2.4s;			\
	zip2	RTMP1.4s, s1.4s, s0.4s;			\
	zip2	RTMP3.4s, s3.4s, s2.4s;			\
	zip1	RTMP4.4s, s5.4s, s4.4s;			\
	zip1	RTMP6.4s, s7.4s, s6.4s;			\
	zip2	RTMP5.4s, s5.4s, s4.4s;			\
	zip2	RTMP7.4s, s7.4s, s6.4s;			\
	zip1	s0.2d, RTMP2.2d, RTMP0.2d;		\
	zip2	s1.2d, RTMP2.2d, RTMP0.2d;		\
	zip1	s2.2d, RTMP3.2d, RTMP1.2d;		\
	zip2	s3.2d, RTMP3.2d, RTMP1.2d;		\
	zip1	s4.2d, RTMP6.2d, RTMP4.2d;		\
	zip2	s5.2d, RTMP6.2d, RTMP4.2d;		\
	zip1	s6.2d, RTMP7.2d, RTMP5.2d;		\
	zip2	s7.2d, RTMP7.2d, RTMP5.2d;
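
/*
 * ROUND4(): one SM4 round on four blocks in column-major form:
 *
 *	s0 ^= T(rk ^ s1 ^ s2 ^ s3)
 *
 * where T is the byte-wise S-box substitution (tbl/tbx over v16-v31)
 * followed by the linear diffusion
 *
 *	L(x) = x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *
 * computed below from rotations by 8, 16 and 24 plus one rotation by 2.
 */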
#define ROUND4(round, s0, s1, s2, s3)			\
	dup	RX0.4s, RKEY.s[round];			\
	/* rk ^ s1 ^ s2 ^ s3 */				\
	eor	RTMP1.16b, s2.16b, s3.16b;		\
	eor	RX0.16b, RX0.16b, s1.16b;		\
	eor	RX0.16b, RX0.16b, RTMP1.16b;		\
							\
	/* sbox, non-linear part */			\
	movi	RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
							\
	/* linear part */				\
	shl	RTMP1.4s, RTMP0.4s, #8;			\
	shl	RTMP2.4s, RTMP0.4s, #16;		\
	shl	RTMP3.4s, RTMP0.4s, #24;		\
	sri	RTMP1.4s, RTMP0.4s, #(32-8);		\
	sri	RTMP2.4s, RTMP0.4s, #(32-16);		\
	sri	RTMP3.4s, RTMP0.4s, #(32-24);		\
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor	RTMP1.16b, RTMP1.16b, RTMP0.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP2.16b;	\
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */ \
	eor	RTMP3.16b, RTMP3.16b, RTMP0.16b;	\
	shl	RTMP2.4s, RTMP1.4s, #2;			\
	sri	RTMP2.4s, RTMP1.4s, #(32-2);		\
	eor	RTMP3.16b, RTMP3.16b, RTMP2.16b;	\
	/* s0 ^= RTMP3 */				\
	eor	s0.16b, s0.16b, RTMP3.16b;
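
/*
 * SM4_CRYPT_BLK4_BE(): all 32 SM4 rounds (8 iterations of 4 ROUND4 calls
 * with rotated word roles) on four blocks whose words are already
 * big-endian. Afterwards the words are byte-swapped back and the reverse
 * transform is applied. x0 advances 16 bytes per iteration and is rewound
 * by 8 * 16 = 128 bytes at the end so it points at the round keys again.
 */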
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)		\
	mov	x6, 8;					\
4:							\
	ld1	{RKEY.4s}, [x0], #16;			\
	subs	x6, x6, #1;				\
							\
	ROUND4(0, b0, b1, b2, b3);			\
	ROUND4(1, b1, b2, b3, b0);			\
	ROUND4(2, b2, b3, b0, b1);			\
	ROUND4(3, b3, b0, b1, b2);			\
							\
	bne	4b;					\
							\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
							\
	rotate_clockwise_4x4(b0, b1, b2, b3);		\
							\
	/* repoint to rkey */				\
	sub	x0, x0, #128;
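
/* As above, but byte-swap the little-endian input words first. */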
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)			\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
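
/*
 * ROUND8(): the ROUND4 computation applied to two independent groups of
 * four blocks (s0-s3 and t0-t3), with the two dependency chains
 * interleaved so their instruction latencies can overlap.
 */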
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)	\
	/* rk ^ s1 ^ s2 ^ s3 */				\
	dup	RX0.4s, RKEY.s[round];			\
	eor	RTMP0.16b, s2.16b, s3.16b;		\
	mov	RX1.16b, RX0.16b;			\
	eor	RTMP1.16b, t2.16b, t3.16b;		\
	eor	RX0.16b, RX0.16b, s1.16b;		\
	eor	RX1.16b, RX1.16b, t1.16b;		\
	eor	RX0.16b, RX0.16b, RTMP0.16b;		\
	eor	RX1.16b, RX1.16b, RTMP1.16b;		\
							\
	/* sbox, non-linear part */			\
	movi	RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl	RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	tbl	RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	sub	RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	sub	RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;	\
	sub	RX0.16b, RX0.16b, RTMP3.16b;		\
	sub	RX1.16b, RX1.16b, RTMP3.16b;		\
	tbx	RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	tbx	RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;	\
							\
	/* linear part */				\
	shl	RX0.4s, RTMP0.4s, #8;			\
	shl	RX1.4s, RTMP1.4s, #8;			\
	shl	RTMP2.4s, RTMP0.4s, #16;		\
	shl	RTMP3.4s, RTMP1.4s, #16;		\
	sri	RX0.4s, RTMP0.4s, #(32 - 8);		\
	sri	RX1.4s, RTMP1.4s, #(32 - 8);		\
	sri	RTMP2.4s, RTMP0.4s, #(32 - 16);		\
	sri	RTMP3.4s, RTMP1.4s, #(32 - 16);		\
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor	RX0.16b, RX0.16b, RTMP0.16b;		\
	eor	RX1.16b, RX1.16b, RTMP1.16b;		\
	eor	RX0.16b, RX0.16b, RTMP2.16b;		\
	eor	RX1.16b, RX1.16b, RTMP3.16b;		\
	/* RTMP0/1 = x ^ rol32(x, 24) ^ rol32(RX, 2) */	\
	shl	RTMP2.4s, RTMP0.4s, #24;		\
	shl	RTMP3.4s, RTMP1.4s, #24;		\
	sri	RTMP2.4s, RTMP0.4s, #(32 - 24);		\
	sri	RTMP3.4s, RTMP1.4s, #(32 - 24);		\
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	shl	RTMP2.4s, RX0.4s, #2;			\
	shl	RTMP3.4s, RX1.4s, #2;			\
	sri	RTMP2.4s, RX0.4s, #(32 - 2);		\
	sri	RTMP3.4s, RX1.4s, #(32 - 2);		\
	eor	RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor	RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	/* s0/t0 ^= RTMP0/1 */				\
	eor	s0.16b, s0.16b, RTMP0.16b;		\
	eor	t0.16b, t0.16b, RTMP1.16b;
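
/*
 * SM4_CRYPT_BLK8_norotate(): encrypt eight blocks but leave the result in
 * column-major order. CBC decryption uses this variant because the
 * two-group rotate macro would clobber RTMP4-RTMP7, and RIV lives in v15
 * (RTMP7).
 */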
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
	rev32	b4.16b, b4.16b;				\
	rev32	b5.16b, b5.16b;				\
	rev32	b6.16b, b6.16b;				\
	rev32	b7.16b, b7.16b;				\
							\
	mov	x6, 8;					\
8:							\
	ld1	{RKEY.4s}, [x0], #16;			\
	subs	x6, x6, #1;				\
							\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);	\
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);	\
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);	\
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);	\
							\
	bne	8b;					\
							\
	rev32	b0.16b, b0.16b;				\
	rev32	b1.16b, b1.16b;				\
	rev32	b2.16b, b2.16b;				\
	rev32	b3.16b, b3.16b;				\
	rev32	b4.16b, b4.16b;				\
	rev32	b5.16b, b5.16b;				\
	rev32	b6.16b, b6.16b;				\
	rev32	b7.16b, b7.16b;				\
							\
	/* repoint to rkey */				\
	sub	x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)	\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7); \
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);
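
/*
 * Equivalent C prototype, inferred from the register comment below and
 * the AAPCS64 calling convention (see the NEON glue code for the
 * authoritative declaration):
 *
 *	void sm4_neon_crypt(const u32 *rkey, u8 *dst, const u8 *src,
 *			    unsigned int nblocks);
 *
 * Processes nblocks 16-byte blocks; whether this encrypts or decrypts
 * depends only on which key schedule is passed in.
 */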
.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub	w3, w3, #8
	tbnz	w3, #31, .Lcrypt_4x
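
	/* ld4 de-interleaves on load: vN.s[j] = word N of block j, so the
	 * blocks arrive already transposed into column-major form. */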
	ld4	{v0.4s-v3.4s}, [x2], #64
	ld4	{v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w3, .Lcrypt_end
	b	.Lcrypt_loop_8x

.Lcrypt_4x:
	add	w3, w3, #8
	cmp	w3, #4
	blt	.Lcrypt_tail

	sub	w3, w3, #4

	ld4	{v0.4s-v3.4s}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w3, .Lcrypt_end

.Lcrypt_tail:
	cmp	w3, #2
	ld1	{v0.16b}, [x2], #16
	blt	.Lcrypt_tail_load_done
	ld1	{v1.16b}, [x2], #16
	beq	.Lcrypt_tail_load_done
	ld1	{v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)
	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp	w3, #2
	st1	{v0.16b}, [x1], #16
	blt	.Lcrypt_end
	st1	{v1.16b}, [x1], #16
	beq	.Lcrypt_end
	st1	{v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)
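
/*
 * Equivalent C prototype, inferred as above (parameter names are
 * illustrative):
 *
 *	void sm4_neon_cbc_dec(const u32 *rkey_dec, u8 *dst, const u8 *src,
 *			      u8 *iv, unsigned int nblocks);
 *
 * Each plaintext block is the decrypted ciphertext XORed with the
 * previous ciphertext block (or the IV); the last ciphertext block is
 * written back as the new IV.
 */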
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1	{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lcbc_dec_4x

	ld4	{v0.4s-v3.4s}, [x2], #64
	ld4	{v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)
	/* rotate_clockwise_4x4_2x would use RTMP4-RTMP7, and RIV lives in
	 * v15 (RTMP7), so rotate the two groups separately to avoid
	 * overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub	x2, x2, #64

	eor	v0.16b, v0.16b, RIV.16b

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1	{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor	v1.16b, v1.16b, RTMP0.16b
	eor	v2.16b, v2.16b, RTMP1.16b
	eor	v3.16b, v3.16b, RTMP2.16b
	eor	v4.16b, v4.16b, RTMP3.16b
	eor	v5.16b, v5.16b, RTMP4.16b
	eor	v6.16b, v6.16b, RTMP5.16b
	eor	v7.16b, v7.16b, RTMP6.16b

	mov	RIV.16b, RTMP7.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcbc_dec_end
	b	.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lcbc_dec_tail

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v4.16b, v0.16b
	rev32	v5.16b, v1.16b
	rev32	v6.16b, v2.16b
	rev32	v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)
	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor	v4.16b, v4.16b, RIV.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v6.16b, v6.16b, v1.16b
	eor	v7.16b, v7.16b, v2.16b

	mov	RIV.16b, v3.16b

	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp	w4, #2
	ld1	{v0.16b}, [x2], #16
	blt	.Lcbc_dec_tail_load_done
	ld1	{v1.16b}, [x2], #16
	beq	.Lcbc_dec_tail_load_done
	ld1	{v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32	v4.16b, v0.16b
	rev32	v5.16b, v1.16b
	rev32	v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)
	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp	w4, #2
	eor	v4.16b, v4.16b, RIV.16b
	mov	RIV.16b, v0.16b
	st1	{v4.16b}, [x1], #16
	blt	.Lcbc_dec_end

	eor	v5.16b, v5.16b, v0.16b
	mov	RIV.16b, v1.16b
	st1	{v5.16b}, [x1], #16
	beq	.Lcbc_dec_end

	eor	v6.16b, v6.16b, v1.16b
	mov	RIV.16b, v2.16b
	st1	{v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)
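
/*
 * Equivalent C prototype, inferred as above (parameter names are
 * illustrative):
 *
 *	void sm4_neon_ctr_crypt(const u32 *rkey_enc, u8 *dst, const u8 *src,
 *				u8 *ctr, unsigned int nblocks);
 *
 * Encrypts a big-endian 128-bit counter stream and XORs it into src; the
 * incremented counter is stored back through ctr.
 */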
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp	x7, x8, [x3]
	rev	x7, x7
	rev	x8, x8

.Lctr_crypt_loop_8x:
	sub	w4, w4, #8
	tbnz	w4, #31, .Lctr_crypt_4x
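
/*
 * inc_le128(): write the current counter into vctr as a big-endian
 * 128-bit block, then post-increment the counter held in x7 (high half)
 * and x8 (low half) with carry. Note that adds/adc clobber the condition
 * flags.
 */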
#define inc_le128(vctr)				\
	mov	vctr.d[1], x8;			\
	mov	vctr.d[0], x7;			\
	adds	x8, x8, #1;			\
	rev64	vctr.16b, vctr.16b;		\
	adc	x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1	{RTMP4.16b-RTMP7.16b}, [x2], #64

	eor	v0.16b, v0.16b, RTMP0.16b
	eor	v1.16b, v1.16b, RTMP1.16b
	eor	v2.16b, v2.16b, RTMP2.16b
	eor	v3.16b, v3.16b, RTMP3.16b
	eor	v4.16b, v4.16b, RTMP4.16b
	eor	v5.16b, v5.16b, RTMP5.16b
	eor	v6.16b, v6.16b, RTMP6.16b
	eor	v7.16b, v7.16b, RTMP7.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lctr_crypt_end
	b	.Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add	w4, w4, #8
	cmp	w4, #4
	blt	.Lctr_crypt_tail

	sub	w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1	{v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)
	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v4.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 clobbers the flags (including the sign bit), so w4 is
	 * re-compared after each increment */
	ld1	{v4.16b}, [x2], #16
	inc_le128(v0)

	cmp	w4, #2
	blt	.Lctr_crypt_tail_load_done

	ld1	{v5.16b}, [x2], #16
	inc_le128(v1)

	cmp	w4, #2
	beq	.Lctr_crypt_tail_load_done

	ld1	{v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)
	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp	w4, #2
	eor	v0.16b, v0.16b, v4.16b
	st1	{v0.16b}, [x1], #16
	blt	.Lctr_crypt_end

	eor	v1.16b, v1.16b, v5.16b
	st1	{v1.16b}, [x1], #16
	beq	.Lctr_crypt_end

	eor	v2.16b, v2.16b, v6.16b
	st1	{v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)