/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
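
/*
 * The sm4e/sm4ekey instructions are emitted as raw .inst words so that
 * the file assembles even with toolchains that lack SM4 support; the
 * .Lv<n> symbols defined by the .irp above supply the register numbers
 * for the Rd/Rn/Rm fields.  As a worked example of the encoding,
 * "sm4e v0.4s, v24.4s" becomes 0xcec08400 | (24 << 5) | 0 = 0xcec08700.
 */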

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21

.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc
	 *   x2: rkey_dec
	 *   x3: fk array
	 *   x4: ck array
	 */
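	/*
	 * The sm4ekey chain below derives the eight encryption key
	 * vectors; the SM4 decryption schedule is simply the encryption
	 * round keys in reverse order.  The tbl with .Lbswap128_mask
	 * reverses the four 32-bit words within each vector, and storing
	 * v7..v0 reverses the vector order.  A rough C sketch of the
	 * reversal (illustrative names, not from this file):
	 *
	 *	u32 rkey_enc[32], rkey_dec[32];
	 *
	 *	for (int i = 0; i < 32; i++)
	 *		rkey_dec[i] = rkey_enc[31 - i];
	 */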
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];

	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE(x0)
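	/*
	 * Consume 8 blocks per iteration while they last, then fall back
	 * to a 4-block pass and finally to single blocks.  The
	 * "sub; tbnz #31" idiom branches when the subtraction went
	 * negative (bit 31 is the sign bit), i.e. when fewer than 8
	 * blocks remain.
	 */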
.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
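	/*
	 * CBC encryption chains each plaintext block through the
	 * previous ciphertext block, C[i] = E(P[i] ^ C[i-1]), so the
	 * blocks are inherently encrypted serially; the 4x loop below
	 * only amortizes loads and stores.  A minimal C sketch of the
	 * mode (xor_block/sm4_crypt_block are illustrative stand-ins
	 * for the macros used here):
	 *
	 *	for (i = 0; i < nblocks; i++) {
	 *		xor_block(iv, src + i * 16);	// P[i] ^ C[i-1]
	 *		sm4_crypt_block(rkey, iv);	// C[i]
	 *		memcpy(dst + i * 16, iv, 16);	// iv carries C[i]
	 *	}
	 */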
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
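	/*
	 * Unlike encryption, CBC decryption parallelizes:
	 * P[i] = D(C[i]) ^ C[i-1].  Up to 8 blocks are decrypted at
	 * once, with the original ciphertext kept in v0-v7 to serve as
	 * the chaining values; the rev32 into v8-v15 plus the _BE crypt
	 * variants simply avoid a redundant byte-swap inside the crypt
	 * macros.
	 */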
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
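	/*
	 * CBC ciphertext stealing over the final two blocks (the
	 * swapped-block layout used by the kernel's cts template):
	 * E[n-1] = E(P[n-1] ^ C[n-2]) is computed first, the short tail
	 * Pn is zero-padded, xored with E[n-1] and encrypted to form
	 * the last-but-one output block, and the first nbytes-16 bytes
	 * of E[n-1] become the final partial block Cn.  w4 covers just
	 * these two blocks, so x5 = nbytes - 16 is the tail length.
	 */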
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 */
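	/*
	 * Inverse of the stealing layout above: X = D(C[n-1]) is
	 * computed first; its leading nbytes-16 bytes xored with the
	 * partial Cn recover Pn, while tbx overwrites those same bytes
	 * of X with Cn to rebuild E[n-1], which is then decrypted and
	 * xored with the IV to recover P[n-1].
	 */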
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)

	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
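	/*
	 * CTR mode: the keystream is E(counter) and the 128-bit
	 * big-endian counter increments once per block.  It is kept
	 * byte-reversed in x7 (high) and x8 (low) so that a plain
	 * adds/adc pair implements the 128-bit increment; inc_le128
	 * below then rebuilds the big-endian vector form.  Hedged C
	 * sketch of the increment (illustrative names):
	 *
	 *	u64 hi, lo;	// host-endian halves of the counter
	 *
	 *	if (++lo == 0)	// adds x8, x8, #1
	 *		hi++;	// adc x7, x7, xzr
	 */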
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)

#define tweak_next(vt, vin, RTMP)					\
		sshr		RTMP.2d, vin.2d, #63;			\
		and		RTMP.16b, RTMP.16b, RMASK.16b;		\
		add		vt.2d, vin.2d, vin.2d;			\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor		vt.16b, vt.16b, RTMP.16b;
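
/*
 * The tweak_next macro above multiplies the 128-bit XTS tweak by x in
 * GF(2^128) modulo x^128 + x^7 + x^2 + x + 1: RMASK holds the 0x1
 * cross-lane carry bit and the 0x87 reduction byte, sshr/and extract
 * the per-lane carries, ext swaps them across the 64-bit lanes, and
 * add+eor perform the shift and conditional reduction.  C equivalent
 * for a little-endian tweak t[0] (low), t[1] (high), offered as an
 * illustrative sketch:
 *
 *	u64 carry = t[1] >> 63;
 *
 *	t[1] = (t[1] << 1) | (t[0] >> 63);
 *	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 */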

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
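	/*
	 * XTS: C[i] = E(P[i] ^ T[i]) ^ T[i], with T[0] obtained by
	 * encrypting the IV under the second key schedule in x5 (the
	 * caller passes x5 = 0 when the tweak is already encrypted) and
	 * T[i+1] = T[i] * x via tweak_next.  When nbytes is not a
	 * multiple of 16, the ands/csel below hold back one full block
	 * so it can be combined with the tail by ciphertext stealing at
	 * .Lxts_enc_cts.
	 */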
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)

	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV
	 */
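	/*
	 * Decryption mirrors the encryption path above,
	 * P[i] = D(C[i] ^ T[i]) ^ T[i].  The one asymmetry is in the
	 * stealing tail: the last full block is processed with the
	 * next tweak (v9) and the rebuilt final block with the current
	 * one (v8), the reverse of the encryption order.
	 */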
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b,  v8.16b
	eor		v1.16b, v1.16b,  v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)

	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: digest
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before
	 *   w5: enc_after
	 */
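	/*
	 * CBC-MAC style update: mac = E(mac ^ M[i]) for each block.
	 * enc_before encrypts the incoming digest once before absorbing
	 * data, and enc_after selects whether the final block is
	 * encrypted or only xored in, letting the C glue compose
	 * CBC-MAC, CMAC and XCBC from this one primitive.  Hedged C
	 * sketch of the main loop (illustrative names):
	 *
	 *	for (i = 0; i < nblocks; i++) {
	 *		xor_block(mac, src + i * 16);	// mac ^= M[i]
	 *		sm4_crypt_block(rkey, mac);	// mac = E(mac)
	 *	}
	 */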
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x

.Lmac_end:
	cbnz		w5, .Lmac_ret

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)

	.section	".rodata", "a"
	.align 4
.Lbswap128_mask:
	.byte		0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte		0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
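
/*
 * .Lcts_permute_table below drives the overlapping loads and stores of
 * the stealing paths.  For a tail length n (0 < n < 16), loading 16
 * bytes at table + n yields (16 - n) 0xff bytes followed by the indices
 * 0 .. n-1, while loading at table + 32 - n yields the indices
 * 16-n .. 15 followed by 0xff padding.  Out-of-range 0xff indices make
 * tbl produce zero bytes and leave tbx destination bytes untouched,
 * which implements the select, zero-pad and merge steps used above.
 */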
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff